diff options
author | Mateusz Kowalczyk <fuuzetsu@fuuzetsu.co.uk> | 2013-07-09 14:11:22 +0100 |
---|---|---|
committer | Austin Seipp <austin@well-typed.com> | 2014-01-12 14:48:35 -0600 |
commit | c1228df0339d041b455bb993786a9ed6322c5e01 (patch) | |
tree | 6d42c42934820868fa931919bcdd9f45b228c222 /vendor/attoparsec-0.10.4.0/Data/Attoparsec/ByteString/Char8.hs | |
parent | a2f3551c276cc77d3c93f048b77cab96a5e648ed (diff) |
Add ByteString version of Attoparsec
Diffstat (limited to 'vendor/attoparsec-0.10.4.0/Data/Attoparsec/ByteString/Char8.hs')
-rw-r--r-- | vendor/attoparsec-0.10.4.0/Data/Attoparsec/ByteString/Char8.hs | 549 |
1 files changed, 549 insertions, 0 deletions
diff --git a/vendor/attoparsec-0.10.4.0/Data/Attoparsec/ByteString/Char8.hs b/vendor/attoparsec-0.10.4.0/Data/Attoparsec/ByteString/Char8.hs new file mode 100644 index 00000000..3bbe51f0 --- /dev/null +++ b/vendor/attoparsec-0.10.4.0/Data/Attoparsec/ByteString/Char8.hs @@ -0,0 +1,549 @@ +{-# LANGUAGE BangPatterns, FlexibleInstances, TypeFamilies, + TypeSynonymInstances, GADTs #-} +{-# OPTIONS_GHC -fno-warn-orphans #-} + +-- | +-- Module : Data.Attoparsec.ByteString.Char8 +-- Copyright : Bryan O'Sullivan 2007-2011 +-- License : BSD3 +-- +-- Maintainer : bos@serpentine.com +-- Stability : experimental +-- Portability : unknown +-- +-- Simple, efficient, character-oriented combinator parsing for +-- 'B.ByteString' strings, loosely based on the Parsec library. + +module Data.Attoparsec.ByteString.Char8 + ( + -- * Character encodings + -- $encodings + + -- * Parser types + Parser + , A.Result + , A.IResult(..) + , I.compareResults + + -- * Running parsers + , A.parse + , A.feed + , A.parseOnly + , A.parseTest + , A.parseWith + + -- ** Result conversion + , A.maybeResult + , A.eitherResult + + -- * Combinators + , (I.<?>) + , I.try + , module Data.Attoparsec.Combinator + + -- * Parsing individual characters + , char + , char8 + , anyChar + , notChar + , peekChar + , satisfy + + -- ** Special character parsers + , digit + , letter_iso8859_15 + , letter_ascii + , space + + -- ** Fast predicates + , isDigit + , isDigit_w8 + , isAlpha_iso8859_15 + , isAlpha_ascii + , isSpace + , isSpace_w8 + + -- *** Character classes + , inClass + , notInClass + + -- * Efficient string handling + , I.string + , stringCI + , skipSpace + , skipWhile + , I.take + , scan + , takeWhile + , takeWhile1 + , takeTill + + -- ** String combinators + -- $specalt + , (.*>) + , (<*.) + + -- ** Consume all remaining input + , I.takeByteString + , I.takeLazyByteString + + -- * Text parsing + , I.endOfLine + , isEndOfLine + , isHorizontalSpace + + -- * Numeric parsers + , decimal + , hexadecimal + , signed + , double + , Number(..) + , number + , rational + + -- * State observation and manipulation functions + , I.endOfInput + , I.atEnd + ) where + +import Control.Applicative ((*>), (<*), (<$>), (<|>)) +import Data.Attoparsec.ByteString.FastSet (charClass, memberChar) +import Data.Attoparsec.ByteString.Internal (Parser, (<?>)) +import Data.Attoparsec.Combinator +import Data.Attoparsec.Number (Number(..)) +import Data.Bits (Bits, (.|.), shiftL) +import Data.ByteString.Internal (c2w, w2c) +import Data.Int (Int8, Int16, Int32, Int64) +import Data.Ratio ((%)) +import Data.String (IsString(..)) +import Data.Word (Word8, Word16, Word32, Word64, Word) +import Prelude hiding (takeWhile) +import qualified Data.Attoparsec.ByteString as A +import qualified Data.Attoparsec.ByteString.Internal as I +import qualified Data.Attoparsec.Internal as I +import qualified Data.ByteString as B8 +import qualified Data.ByteString.Char8 as B + +instance (a ~ B.ByteString) => IsString (Parser a) where + fromString = I.string . B.pack + +-- $encodings +-- +-- This module is intended for parsing text that is +-- represented using an 8-bit character set, e.g. ASCII or +-- ISO-8859-15. It /does not/ make any attempt to deal with character +-- encodings, multibyte characters, or wide characters. In +-- particular, all attempts to use characters above code point U+00FF +-- will give wrong answers. +-- +-- Code points below U+0100 are simply translated to and from their +-- numeric values, so e.g. the code point U+00A4 becomes the byte +-- @0xA4@ (which is the Euro symbol in ISO-8859-15, but the generic +-- currency sign in ISO-8859-1). Haskell 'Char' values above U+00FF +-- are truncated, so e.g. U+1D6B7 is truncated to the byte @0xB7@. + +-- ASCII-specific but fast, oh yes. +toLower :: Word8 -> Word8 +toLower w | w >= 65 && w <= 90 = w + 32 + | otherwise = w + +-- | Satisfy a literal string, ignoring case. +stringCI :: B.ByteString -> Parser B.ByteString +stringCI = I.stringTransform (B8.map toLower) +{-# INLINE stringCI #-} + +-- | Consume input as long as the predicate returns 'True', and return +-- the consumed input. +-- +-- This parser requires the predicate to succeed on at least one byte +-- of input: it will fail if the predicate never returns 'True' or if +-- there is no input left. +takeWhile1 :: (Char -> Bool) -> Parser B.ByteString +takeWhile1 p = I.takeWhile1 (p . w2c) +{-# INLINE takeWhile1 #-} + +-- | The parser @satisfy p@ succeeds for any byte for which the +-- predicate @p@ returns 'True'. Returns the byte that is actually +-- parsed. +-- +-- >digit = satisfy isDigit +-- > where isDigit c = c >= '0' && c <= '9' +satisfy :: (Char -> Bool) -> Parser Char +satisfy = I.satisfyWith w2c +{-# INLINE satisfy #-} + +-- | Match a letter, in the ISO-8859-15 encoding. +letter_iso8859_15 :: Parser Char +letter_iso8859_15 = satisfy isAlpha_iso8859_15 <?> "letter_iso8859_15" +{-# INLINE letter_iso8859_15 #-} + +-- | Match a letter, in the ASCII encoding. +letter_ascii :: Parser Char +letter_ascii = satisfy isAlpha_ascii <?> "letter_ascii" +{-# INLINE letter_ascii #-} + +-- | A fast alphabetic predicate for the ISO-8859-15 encoding +-- +-- /Note/: For all character encodings other than ISO-8859-15, and +-- almost all Unicode code points above U+00A3, this predicate gives +-- /wrong answers/. +isAlpha_iso8859_15 :: Char -> Bool +isAlpha_iso8859_15 c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '\166' && moby c) + where moby = notInClass "\167\169\171-\179\182\183\185\187\191\215\247" + {-# NOINLINE moby #-} +{-# INLINE isAlpha_iso8859_15 #-} + +-- | A fast alphabetic predicate for the ASCII encoding +-- +-- /Note/: For all character encodings other than ASCII, and +-- almost all Unicode code points above U+007F, this predicate gives +-- /wrong answers/. +isAlpha_ascii :: Char -> Bool +isAlpha_ascii c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') +{-# INLINE isAlpha_ascii #-} + +-- | Parse a single digit. +digit :: Parser Char +digit = satisfy isDigit <?> "digit" +{-# INLINE digit #-} + +-- | A fast digit predicate. +isDigit :: Char -> Bool +isDigit c = c >= '0' && c <= '9' +{-# INLINE isDigit #-} + +-- | A fast digit predicate. +isDigit_w8 :: Word8 -> Bool +isDigit_w8 w = w >= 48 && w <= 57 +{-# INLINE isDigit_w8 #-} + +-- | Match any character. +anyChar :: Parser Char +anyChar = satisfy $ const True +{-# INLINE anyChar #-} + +-- | Match any character. Returns 'Nothing' if end of input has been +-- reached. Does not consume any input. +-- +-- /Note/: Because this parser does not fail, do not use it with +-- combinators such as 'many', because such parsers loop until a +-- failure occurs. Careless use will thus result in an infinite loop. +peekChar :: Parser (Maybe Char) +peekChar = (fmap w2c) `fmap` I.peekWord8 +{-# INLINE peekChar #-} + +-- | Fast predicate for matching ASCII space characters. +-- +-- /Note/: This predicate only gives correct answers for the ASCII +-- encoding. For instance, it does not recognise U+00A0 (non-breaking +-- space) as a space character, even though it is a valid ISO-8859-15 +-- byte. For a Unicode-aware and only slightly slower predicate, +-- use 'Data.Char.isSpace' +isSpace :: Char -> Bool +isSpace c = (c == ' ') || ('\t' <= c && c <= '\r') +{-# INLINE isSpace #-} + +-- | Fast 'Word8' predicate for matching ASCII space characters. +isSpace_w8 :: Word8 -> Bool +isSpace_w8 w = (w == 32) || (9 <= w && w <= 13) +{-# INLINE isSpace_w8 #-} + + +-- | Parse a space character. +-- +-- /Note/: This parser only gives correct answers for the ASCII +-- encoding. For instance, it does not recognise U+00A0 (non-breaking +-- space) as a space character, even though it is a valid ISO-8859-15 +-- byte. +space :: Parser Char +space = satisfy isSpace <?> "space" +{-# INLINE space #-} + +-- | Match a specific character. +char :: Char -> Parser Char +char c = satisfy (== c) <?> [c] +{-# INLINE char #-} + +-- | Match a specific character, but return its 'Word8' value. +char8 :: Char -> Parser Word8 +char8 c = I.satisfy (== c2w c) <?> [c] +{-# INLINE char8 #-} + +-- | Match any character except the given one. +notChar :: Char -> Parser Char +notChar c = satisfy (/= c) <?> "not " ++ [c] +{-# INLINE notChar #-} + +-- | Match any character in a set. +-- +-- >vowel = inClass "aeiou" +-- +-- Range notation is supported. +-- +-- >halfAlphabet = inClass "a-nA-N" +-- +-- To add a literal \'-\' to a set, place it at the beginning or end +-- of the string. +inClass :: String -> Char -> Bool +inClass s = (`memberChar` mySet) + where mySet = charClass s +{-# INLINE inClass #-} + +-- | Match any character not in a set. +notInClass :: String -> Char -> Bool +notInClass s = not . inClass s +{-# INLINE notInClass #-} + +-- | Consume input as long as the predicate returns 'True', and return +-- the consumed input. +-- +-- This parser does not fail. It will return an empty string if the +-- predicate returns 'False' on the first byte of input. +-- +-- /Note/: Because this parser does not fail, do not use it with +-- combinators such as 'many', because such parsers loop until a +-- failure occurs. Careless use will thus result in an infinite loop. +takeWhile :: (Char -> Bool) -> Parser B.ByteString +takeWhile p = I.takeWhile (p . w2c) +{-# INLINE takeWhile #-} + +-- | A stateful scanner. The predicate consumes and transforms a +-- state argument, and each transformed state is passed to successive +-- invocations of the predicate on each byte of the input until one +-- returns 'Nothing' or the input ends. +-- +-- This parser does not fail. It will return an empty string if the +-- predicate returns 'Nothing' on the first byte of input. +-- +-- /Note/: Because this parser does not fail, do not use it with +-- combinators such as 'many', because such parsers loop until a +-- failure occurs. Careless use will thus result in an infinite loop. +scan :: s -> (s -> Char -> Maybe s) -> Parser B.ByteString +scan s0 p = I.scan s0 (\s -> p s . w2c) +{-# INLINE scan #-} + +-- | Consume input as long as the predicate returns 'False' +-- (i.e. until it returns 'True'), and return the consumed input. +-- +-- This parser does not fail. It will return an empty string if the +-- predicate returns 'True' on the first byte of input. +-- +-- /Note/: Because this parser does not fail, do not use it with +-- combinators such as 'many', because such parsers loop until a +-- failure occurs. Careless use will thus result in an infinite loop. +takeTill :: (Char -> Bool) -> Parser B.ByteString +takeTill p = I.takeTill (p . w2c) +{-# INLINE takeTill #-} + +-- | Skip past input for as long as the predicate returns 'True'. +skipWhile :: (Char -> Bool) -> Parser () +skipWhile p = I.skipWhile (p . w2c) +{-# INLINE skipWhile #-} + +-- | Skip over white space. +skipSpace :: Parser () +skipSpace = I.skipWhile isSpace_w8 +{-# INLINE skipSpace #-} + +-- $specalt +-- +-- The '.*>' and '<*.' combinators are intended for use with the +-- @OverloadedStrings@ language extension. They simplify the common +-- task of matching a statically known string, then immediately +-- parsing something else. +-- +-- An example makes this easier to understand: +-- +-- @{-\# LANGUAGE OverloadedStrings #-} +-- +-- shoeSize = \"Shoe size: \" '.*>' 'decimal' +-- @ +-- +-- If we were to try to use '*>' above instead, the type checker would +-- not be able to tell which 'IsString' instance to use for the text +-- in quotes. We would have to be explicit, using either a type +-- signature or the 'I.string' parser. + +-- | Type-specialized version of '*>' for 'B.ByteString'. +(.*>) :: B.ByteString -> Parser a -> Parser a +s .*> f = I.string s *> f + +-- | Type-specialized version of '<*' for 'B.ByteString'. +(<*.) :: Parser a -> B.ByteString -> Parser a +f <*. s = f <* I.string s + +-- | A predicate that matches either a carriage return @\'\\r\'@ or +-- newline @\'\\n\'@ character. +isEndOfLine :: Word8 -> Bool +isEndOfLine w = w == 13 || w == 10 +{-# INLINE isEndOfLine #-} + +-- | A predicate that matches either a space @\' \'@ or horizontal tab +-- @\'\\t\'@ character. +isHorizontalSpace :: Word8 -> Bool +isHorizontalSpace w = w == 32 || w == 9 +{-# INLINE isHorizontalSpace #-} + +-- | Parse and decode an unsigned hexadecimal number. The hex digits +-- @\'a\'@ through @\'f\'@ may be upper or lower case. +-- +-- This parser does not accept a leading @\"0x\"@ string. +hexadecimal :: (Integral a, Bits a) => Parser a +hexadecimal = B8.foldl' step 0 `fmap` I.takeWhile1 isHexDigit + where + isHexDigit w = (w >= 48 && w <= 57) || + (w >= 97 && w <= 102) || + (w >= 65 && w <= 70) + step a w | w >= 48 && w <= 57 = (a `shiftL` 4) .|. fromIntegral (w - 48) + | w >= 97 = (a `shiftL` 4) .|. fromIntegral (w - 87) + | otherwise = (a `shiftL` 4) .|. fromIntegral (w - 55) +{-# SPECIALISE hexadecimal :: Parser Int #-} +{-# SPECIALISE hexadecimal :: Parser Int8 #-} +{-# SPECIALISE hexadecimal :: Parser Int16 #-} +{-# SPECIALISE hexadecimal :: Parser Int32 #-} +{-# SPECIALISE hexadecimal :: Parser Int64 #-} +{-# SPECIALISE hexadecimal :: Parser Integer #-} +{-# SPECIALISE hexadecimal :: Parser Word #-} +{-# SPECIALISE hexadecimal :: Parser Word8 #-} +{-# SPECIALISE hexadecimal :: Parser Word16 #-} +{-# SPECIALISE hexadecimal :: Parser Word32 #-} +{-# SPECIALISE hexadecimal :: Parser Word64 #-} + +-- | Parse and decode an unsigned decimal number. +decimal :: Integral a => Parser a +decimal = B8.foldl' step 0 `fmap` I.takeWhile1 isDig + where isDig w = w >= 48 && w <= 57 + step a w = a * 10 + fromIntegral (w - 48) +{-# SPECIALISE decimal :: Parser Int #-} +{-# SPECIALISE decimal :: Parser Int8 #-} +{-# SPECIALISE decimal :: Parser Int16 #-} +{-# SPECIALISE decimal :: Parser Int32 #-} +{-# SPECIALISE decimal :: Parser Int64 #-} +{-# SPECIALISE decimal :: Parser Integer #-} +{-# SPECIALISE decimal :: Parser Word #-} +{-# SPECIALISE decimal :: Parser Word8 #-} +{-# SPECIALISE decimal :: Parser Word16 #-} +{-# SPECIALISE decimal :: Parser Word32 #-} +{-# SPECIALISE decimal :: Parser Word64 #-} + +-- | Parse a number with an optional leading @\'+\'@ or @\'-\'@ sign +-- character. +signed :: Num a => Parser a -> Parser a +{-# SPECIALISE signed :: Parser Int -> Parser Int #-} +{-# SPECIALISE signed :: Parser Int8 -> Parser Int8 #-} +{-# SPECIALISE signed :: Parser Int16 -> Parser Int16 #-} +{-# SPECIALISE signed :: Parser Int32 -> Parser Int32 #-} +{-# SPECIALISE signed :: Parser Int64 -> Parser Int64 #-} +{-# SPECIALISE signed :: Parser Integer -> Parser Integer #-} +signed p = (negate <$> (char8 '-' *> p)) + <|> (char8 '+' *> p) + <|> p + +-- | Parse a rational number. +-- +-- This parser accepts an optional leading sign character, followed by +-- at least one decimal digit. The syntax similar to that accepted by +-- the 'read' function, with the exception that a trailing @\'.\'@ or +-- @\'e\'@ /not/ followed by a number is not consumed. +-- +-- Examples with behaviour identical to 'read', if you feed an empty +-- continuation to the first result: +-- +-- >rational "3" == Done 3.0 "" +-- >rational "3.1" == Done 3.1 "" +-- >rational "3e4" == Done 30000.0 "" +-- >rational "3.1e4" == Done 31000.0, "" +-- +-- Examples with behaviour identical to 'read': +-- +-- >rational ".3" == Fail "input does not start with a digit" +-- >rational "e3" == Fail "input does not start with a digit" +-- +-- Examples of differences from 'read': +-- +-- >rational "3.foo" == Done 3.0 ".foo" +-- >rational "3e" == Done 3.0 "e" +-- +-- This function does not accept string representations of \"NaN\" or +-- \"Infinity\". +rational :: Fractional a => Parser a +{-# SPECIALIZE rational :: Parser Double #-} +{-# SPECIALIZE rational :: Parser Float #-} +{-# SPECIALIZE rational :: Parser Rational #-} +rational = floaty $ \real frac fracDenom -> fromRational $ + real % 1 + frac % fracDenom + +-- | Parse a rational number. +-- +-- The syntax accepted by this parser is the same as for 'rational'. +-- +-- /Note/: This function is almost ten times faster than 'rational', +-- but is slightly less accurate. +-- +-- The 'Double' type supports about 16 decimal places of accuracy. +-- For 94.2% of numbers, this function and 'rational' give identical +-- results, but for the remaining 5.8%, this function loses precision +-- around the 15th decimal place. For 0.001% of numbers, this +-- function will lose precision at the 13th or 14th decimal place. +-- +-- This function does not accept string representations of \"NaN\" or +-- \"Infinity\". +double :: Parser Double +double = floaty asDouble + +asDouble :: Integer -> Integer -> Integer -> Double +asDouble real frac fracDenom = + fromIntegral real + fromIntegral frac / fromIntegral fracDenom +{-# INLINE asDouble #-} + +-- | Parse a number, attempting to preserve both speed and precision. +-- +-- The syntax accepted by this parser is the same as for 'rational'. +-- +-- /Note/: This function is almost ten times faster than 'rational'. +-- On integral inputs, it gives perfectly accurate answers, and on +-- floating point inputs, it is slightly less accurate than +-- 'rational'. +-- +-- This function does not accept string representations of \"NaN\" or +-- \"Infinity\". +number :: Parser Number +number = floaty $ \real frac fracDenom -> + if frac == 0 && fracDenom == 0 + then I real + else D (asDouble real frac fracDenom) +{-# INLINE number #-} + +data T = T !Integer !Int + +floaty :: Fractional a => (Integer -> Integer -> Integer -> a) -> Parser a +{-# INLINE floaty #-} +floaty f = do + let minus = 45 + plus = 43 + !positive <- ((== plus) <$> I.satisfy (\c -> c == minus || c == plus)) <|> + return True + real <- decimal + let tryFraction = do + let dot = 46 + _ <- I.satisfy (==dot) + ds <- I.takeWhile isDigit_w8 + case I.parseOnly decimal ds of + Right n -> return $ T n (B.length ds) + _ -> fail "no digits after decimal" + T fraction fracDigits <- tryFraction <|> return (T 0 0) + let littleE = 101 + bigE = 69 + e w = w == littleE || w == bigE + power <- (I.satisfy e *> signed decimal) <|> return (0::Int) + let n = if fracDigits == 0 + then if power == 0 + then fromIntegral real + else fromIntegral real * (10 ^^ power) + else if power == 0 + then f real fraction (10 ^ fracDigits) + else f real fraction (10 ^ fracDigits) * (10 ^^ power) + return $ if positive + then n + else -n |