1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
|
{-# LANGUAGE CPP #-}
#if __GLASGOW_HASKELL__ >= 702
{-# LANGUAGE Trustworthy #-}
#endif
-- |
-- Module : Data.Attoparsec.ByteString
-- Copyright : Bryan O'Sullivan 2007-2015
-- License : BSD3
--
-- Maintainer : bos@serpentine.com
-- Stability : experimental
-- Portability : unknown
--
-- Simple, efficient combinator parsing for 'B.ByteString' strings,
-- loosely based on the Parsec library.
module Data.Attoparsec.ByteString
(
-- * Differences from Parsec
-- $parsec
-- * Incremental input
-- $incremental
-- * Performance considerations
-- $performance
-- * Parser types
I.Parser
, Result
, T.IResult(..)
, I.compareResults
-- * Running parsers
, parse
, feed
, I.parseOnly
, parseWith
, parseTest
-- ** Result conversion
, maybeResult
, eitherResult
-- * Parsing individual bytes
, I.word8
, I.anyWord8
, I.notWord8
, I.satisfy
, I.satisfyWith
, I.skip
-- ** Lookahead
, I.peekWord8
, I.peekWord8'
-- ** Byte classes
, I.inClass
, I.notInClass
-- * Efficient string handling
, I.string
, I.skipWhile
, I.take
, I.scan
, I.runScanner
, I.takeWhile
, I.takeWhile1
, I.takeTill
-- ** Consume all remaining input
, I.takeByteString
, I.takeLazyByteString
-- * Combinators
, try
, (<?>)
, choice
, count
, option
, many'
, many1
, many1'
, manyTill
, manyTill'
, sepBy
, sepBy'
, sepBy1
, sepBy1'
, skipMany
, skipMany1
, eitherP
, I.match
-- * State observation and manipulation functions
, I.endOfInput
, I.atEnd
) where
import Data.Attoparsec.Combinator
import Data.List (intercalate)
import qualified Data.Attoparsec.ByteString.Internal as I
import qualified Data.Attoparsec.Internal as I
import qualified Data.ByteString as B
import Data.Attoparsec.ByteString.Internal (Result, parse)
import qualified Data.Attoparsec.Internal.Types as T
-- $parsec
--
-- Compared to Parsec 3, attoparsec makes several tradeoffs. It is
-- not intended for, or ideal for, all possible uses.
--
-- * While attoparsec can consume input incrementally, Parsec cannot.
-- Incremental input is a huge deal for efficient and secure network
-- and system programming, since it gives much more control to users
-- of the library over matters such as resource usage and the I/O
-- model to use.
--
-- * Much of the performance advantage of attoparsec is gained via
-- high-performance parsers such as 'I.takeWhile' and 'I.string'.
-- If you use complicated combinators that return lists of bytes or
-- characters, there is less performance difference between the two
-- libraries.
--
-- * Unlike Parsec 3, attoparsec does not support being used as a
-- monad transformer.
--
-- * attoparsec is specialised to deal only with strict 'B.ByteString'
-- input. Efficiency concerns rule out both lists and lazy
-- bytestrings. The usual use for lazy bytestrings would be to
-- allow consumption of very large input without a large footprint.
-- For this need, attoparsec's incremental input provides an
-- excellent substitute, with much more control over when input
-- takes place. If you must use lazy bytestrings, see the
-- "Data.Attoparsec.ByteString.Lazy" module, which feeds lazy chunks
-- to a regular parser.
--
-- * Parsec parsers can produce more helpful error messages than
-- attoparsec parsers. This is a matter of focus: attoparsec avoids
-- the extra book-keeping in favour of higher performance.
-- $incremental
--
-- attoparsec supports incremental input, meaning that you can feed it
-- a bytestring that represents only part of the expected total amount
-- of data to parse. If your parser reaches the end of a fragment of
-- input and could consume more input, it will suspend parsing and
-- return a 'T.Partial' continuation.
--
-- Supplying the 'T.Partial' continuation with a bytestring will
-- resume parsing at the point where it was suspended, with the
-- bytestring you supplied used as new input at the end of the
-- existing input. You must be prepared for the result of the resumed
-- parse to be another 'T.Partial' continuation.
--
-- To indicate that you have no more input, supply the 'T.Partial'
-- continuation with an empty bytestring.
--
-- Remember that some parsing combinators will not return a result
-- until they reach the end of input. They may thus cause 'T.Partial'
-- results to be returned.
--
-- If you do not need support for incremental input, consider using
-- the 'I.parseOnly' function to run your parser. It will never
-- prompt for more input.
--
-- /Note/: incremental input does /not/ imply that attoparsec will
-- release portions of its internal state for garbage collection as it
-- proceeds. Its internal representation is equivalent to a single
-- 'B.ByteString': if you feed incremental input to a parser, it will
-- require memory proportional to the amount of input you supply.
-- (This is necessary to support arbitrary backtracking.)
-- $performance
--
-- If you write an attoparsec-based parser carefully, it can be
-- realistic to expect it to perform similarly to a hand-rolled C
-- parser (measuring megabytes parsed per second).
--
-- To actually achieve high performance, there are a few guidelines
-- that it is useful to follow.
--
-- Use the 'B.ByteString'-oriented parsers whenever possible,
-- e.g. 'I.takeWhile1' instead of 'many1' 'I.anyWord8'. There is
-- about a factor of 100 difference in performance between the two
-- kinds of parser.
--
-- For very simple byte-testing predicates, write them by hand instead
-- of using 'I.inClass' or 'I.notInClass'. For instance, both of
-- these predicates test for an end-of-line byte, but the first is
-- much faster than the second:
--
-- >endOfLine_fast w = w == 13 || w == 10
-- >endOfLine_slow = inClass "\r\n"
--
-- Make active use of benchmarking and profiling tools to measure,
-- find the problems with, and improve the performance of your parser.
-- | Apply a parser to the given input and 'print' the resulting
-- 'Result' on standard output.  The result is printed as-is, whatever
-- constructor 'parse' produced.
parseTest :: (Show a) => I.Parser a -> B.ByteString -> IO ()
parseTest parser input = print outcome
  where
    -- The raw result of a single 'parse' call; no further input is fed.
    outcome = parse parser input
-- | Run a parser on an initial chunk of input, calling a monadic
-- action to obtain further input each time the parser suspends.
parseWith :: Monad m
          => m B.ByteString
          -- ^ Action executed whenever the parser needs more input.
          -- It must return a 'B.empty' string once no more input is
          -- available.
          -> I.Parser a
          -- ^ The parser to run.
          -> B.ByteString
          -- ^ Initial input for the parser.
          -> m (Result a)
parseWith refill p s = loop (parse p s)
  where
    -- Keep resuming the parse for as long as it reports 'T.Partial';
    -- any other result is final and is returned unchanged.
    loop result = case result of
      T.Partial resume -> refill >>= loop . resume
      _                -> return result
{-# INLINE parseWith #-}
-- | Collapse a 'Result' into a 'Maybe'.  Only a 'T.Done' result
-- yields 'Just'; every other result, including 'T.Partial', is
-- treated as failure and yields 'Nothing'.
maybeResult :: Result r -> Maybe r
maybeResult result = case result of
  T.Done _ value -> Just value
  _              -> Nothing
-- | Collapse a 'Result' into an 'Either'.  A 'T.Done' result yields
-- 'Right'.  A 'T.Fail' result yields 'Left' with its error message,
-- prefixed by the context list (joined with @\" > \"@) when that list
-- is non-empty.  A 'T.Partial' result is treated as failure.
eitherResult :: Result r -> Either String r
eitherResult result = case result of
  T.Done _ value -> Right value
  T.Fail _ contexts message
    -- No context recorded: report the bare message.
    | null contexts -> Left message
    | otherwise     -> Left (intercalate " > " contexts ++ ": " ++ message)
  _ -> Left "Result: incomplete input"
|