1 // pest. The Elegant Parser
2 // Copyright (c) 2018 Dragoș Tiselice
3 //
4 // Licensed under the Apache License, Version 2.0
5 // <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
6 // license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. All files in the project carrying such notice may not be copied,
8 // modified, or distributed except according to those terms.
9 #![doc(
10     html_root_url = "https://docs.rs/pest_derive",
11     html_logo_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg",
12     html_favicon_url = "https://raw.githubusercontent.com/pest-parser/pest/master/pest-logo.svg"
13 )]
14 #![warn(missing_docs, rust_2018_idioms, unused_qualifications)]
15 //! # pest. The Elegant Parser
16 //!
17 //! pest is a general purpose parser written in Rust with a focus on accessibility, correctness,
18 //! and performance. It uses parsing expression grammars (or [PEG]) as input, which are similar in
19 //! spirit to regular expressions, but which offer the enhanced expressivity needed to parse
20 //! complex languages.
21 //!
22 //! [PEG]: https://en.wikipedia.org/wiki/Parsing_expression_grammar
23 //!
24 //! ## Getting started
25 //!
26 //! The recommended way to start parsing with pest is to read the official [book].
27 //!
28 //! Other helpful resources:
29 //!
30 //! * API reference on [docs.rs]
31 //! * play with grammars and share them on our [fiddle]
32 //! * find previous common questions answered or ask questions on [GitHub Discussions]
33 //! * leave feedback, ask questions, or greet us on [Gitter] or [Discord]
34 //!
35 //! [book]: https://pest.rs/book
36 //! [docs.rs]: https://docs.rs/pest
37 //! [fiddle]: https://pest.rs/#editor
38 //! [Gitter]: https://gitter.im/pest-parser/pest
39 //! [Discord]: https://discord.gg/XEGACtWpT2
40 //! [GitHub Discussions]: https://github.com/pest-parser/pest/discussions
41 //!
42 //!
43 //! ## `.pest` files
44 //!
45 //! Grammar definitions reside in custom `.pest` files located in the `src` directory. Their path is
46 //! relative to `src` and is specified between the `derive` attribute and empty `struct` that
47 //! `Parser` will be derived on.
48 //!
49 //! ```ignore
50 //! #[derive(Parser)]
51 //! #[grammar = "path/to/my_grammar.pest"] // relative to src
52 //! struct MyParser;
53 //! ```
54 //!
55 //! ## Inline grammars
56 //!
57 //! Grammars can also be inlined by using the `#[grammar_inline = "..."]` attribute.
58 //!
59 //! ## Grammar
60 //!
61 //! A grammar is a series of rules separated by whitespace, possibly containing comments.
62 //!
63 //! ### Comments
64 //!
65 //! Comments start with `//` and end at the end of the line.
66 //!
67 //! ```text
68 //! // a comment
69 //! ```
70 //!
71 //! ### Rules
72 //!
73 //! Rules have the following form:
74 //!
75 //! ```ignore
76 //! name = optional_modifier { expression }
77 //! ```
78 //!
79 //! The name of the rule is formed from alphanumeric characters or `_` with the condition that the
80 //! first character is not a digit and is used to create token pairs. When the rule starts being
81 //! parsed, the starting part of the token is being produced, with the ending part being produced
82 //! when the rule finishes parsing.
83 //!
84 //! The following token pair notation `a(b(), c())` denotes the tokens: start `a`, start `b`, end
85 //! `b`, start `c`, end `c`, end `a`.
86 //!
87 //! #### Modifiers
88 //!
89 //! Modifiers are optional and can be one of `_`, `@`, `$`, or `!`. These modifiers change the
90 //! behavior of the rules.
91 //!
92 //! 1. Silent (`_`)
93 //!
94 //!     Silent rules do not create token pairs during parsing, nor are they error-reported.
95 //!
96 //!     ```ignore
97 //!     a = _{ "a" }
98 //!     b =  { a ~ "b" }
99 //!     ```
100 //!
101 //!     Parsing `"ab"` produces the token pair `b()`.
102 //!
103 //! 2. Atomic (`@`)
104 //!
105 //!     Atomic rules do not accept whitespace or comments within their expressions and have a
106 //!     cascading effect on any rule they call. I.e. rules that are not atomic but are called by atomic
107 //!     rules behave atomically.
108 //!
109 //!     Any rules called by atomic rules do not generate token pairs.
110 //!
111 //!     ```ignore
112 //!     a =  { "a" }
113 //!     b = @{ a ~ "b" }
114 //!
115 //!     WHITESPACE = _{ " " }
116 //!     ```
117 //!
118 //!     Parsing `"ab"` produces the token pair `b()`, while `"a   b"` produces an error.
119 //!
120 //! 3. Compound-atomic (`$`)
121 //!
//!     Compound-atomic rules are identical to atomic rules with the exception that rules called by
//!     them are not forbidden from generating token pairs.
124 //!
125 //!     ```ignore
126 //!     a =  { "a" }
127 //!     b = ${ a ~ "b" }
128 //!
129 //!     WHITESPACE = _{ " " }
130 //!     ```
131 //!
132 //!     Parsing `"ab"` produces the token pairs `b(a())`, while `"a   b"` produces an error.
133 //!
134 //! 4. Non-atomic (`!`)
135 //!
//!     Non-atomic rules are identical to normal rules with the exception that they stop the
//!     cascading effect of atomic and compound-atomic rules.
138 //!
139 //!     ```ignore
140 //!     a =  { "a" }
141 //!     b = !{ a ~ "b" }
142 //!     c = @{ b }
143 //!
144 //!     WHITESPACE = _{ " " }
145 //!     ```
146 //!
//!     Parsing either `"ab"` or `"a   b"` produces the token pairs `c(a())`.
148 //!
149 //! #### Expressions
150 //!
151 //! Expressions can be either terminals or non-terminals.
152 //!
153 //! 1. Terminals
154 //!
155 //! | Terminal   | Usage                                                          |
156 //! |------------|----------------------------------------------------------------|
157 //! | `"a"`      | matches the exact string `"a"`                                 |
158 //! | `^"a"`     | matches the exact string `"a"` case insensitively (ASCII only) |
159 //! | `'a'..'z'` | matches one character between `'a'` and `'z'`                  |
160 //! | `a`        | matches rule `a`                                               |
161 //!
162 //! Strings and characters follow
163 //! [Rust's escape mechanisms](https://doc.rust-lang.org/reference/tokens.html#byte-escapes), while
164 //! identifiers can contain alphanumeric characters and underscores (`_`), as long as they do not
165 //! start with a digit.
166 //!
167 //! 2. Non-terminals
168 //!
169 //! | Non-terminal          | Usage                                                      |
170 //! |-----------------------|------------------------------------------------------------|
171 //! | `(e)`                 | matches `e`                                                |
172 //! | `e1 ~ e2`             | matches the sequence `e1` `e2`                             |
173 //! | <code>e1 \| e2</code> | matches either `e1` or `e2`                                |
174 //! | `e*`                  | matches `e` zero or more times                             |
175 //! | `e+`                  | matches `e` one or more times                              |
176 //! | `e{n}`                | matches `e` exactly `n` times                              |
177 //! | `e{, n}`              | matches `e` at most `n` times                              |
178 //! | `e{n,}`               | matches `e` at least `n` times                             |
179 //! | `e{m, n}`             | matches `e` between `m` and `n` times inclusively          |
180 //! | `e?`                  | optionally matches `e`                                     |
181 //! | `&e`                  | matches `e` without making progress                        |
182 //! | `!e`                  | matches if `e` doesn't match without making progress       |
//! | `PUSH(e)`             | matches `e` and pushes its captured string down the stack  |
184 //!
185 //! where `e`, `e1`, and `e2` are expressions.
186 //!
187 //! Matching is greedy, without backtracking.  Note the difference in behavior for
188 //! these two rules in matching identifiers that don't end in an underscore:
189 //!
190 //! ```ignore
191 //! // input: ab_bb_b
192 //!
193 //! identifier = @{ "a" ~ ("b"|"_")* ~ "b" }
194 //! // matches:      a     b_bb_b       nothing -> error!
195 //!
196 //! identifier = @{ "a" ~ ("_"* ~ "b")* }
197 //! // matches:      a     b, _bb, _b   in three repetitions
198 //! ```
199 //!
200 //! Expressions can modify the stack only if they match the input. For example,
201 //! if `e1` in the compound expression `e1 | e2` does not match the input, then
202 //! it does not modify the stack, so `e2` sees the stack in the same state as
203 //! `e1` did. Repetitions and optionals (`e*`, `e+`, `e{, n}`, `e{n,}`,
204 //! `e{m,n}`, `e?`) can modify the stack each time `e` matches. The `!e` and `&e`
205 //! expressions are a special case; they never modify the stack.
//!
//! Many languages have "keyword" tokens (e.g. `if`, `for`, `while`) as well as general
//! tokens (e.g. identifiers) that match any word. In order to match a keyword, you
//! generally need to require that it is not immediately followed by another letter or
//! digit (otherwise it would be matched as an identifier).
210 //!
211 //! ## Special rules
212 //!
213 //! Special rules can be called within the grammar. They are:
214 //!
215 //! * `WHITESPACE` - runs between rules and sub-rules
216 //! * `COMMENT` - runs between rules and sub-rules
217 //! * `ANY` - matches exactly one `char`
218 //! * `SOI` - (start-of-input) matches only when a `Parser` is still at the starting position
219 //! * `EOI` - (end-of-input) matches only when a `Parser` has reached its end
220 //! * `POP` - pops a string from the stack and matches it
221 //! * `POP_ALL` - pops the entire state of the stack and matches it
222 //! * `PEEK` - peeks a string from the stack and matches it
223 //! * `PEEK[a..b]` - peeks part of the stack and matches it
224 //! * `PEEK_ALL` - peeks the entire state of the stack and matches it
225 //! * `DROP` - drops the top of the stack (fails to match if the stack is empty)
226 //!
227 //! `WHITESPACE` and `COMMENT` should be defined manually if needed. All other rules cannot be
228 //! overridden.
229 //!
230 //! ## `WHITESPACE` and `COMMENT`
231 //!
232 //! When defined, these rules get matched automatically in sequences (`~`) and repetitions
233 //! (`*`, `+`) between expressions. Atomic rules and those rules called by atomic rules are exempt
234 //! from this behavior.
235 //!
236 //! These rules should be defined so as to match one whitespace character and one comment only since
237 //! they are run in repetitions.
238 //!
239 //! If both `WHITESPACE` and `COMMENT` are defined, this grammar:
240 //!
241 //! ```ignore
242 //! a = { b ~ c }
243 //! ```
244 //!
245 //! is effectively transformed into this one behind the scenes:
246 //!
247 //! ```ignore
248 //! a = { b ~ WHITESPACE* ~ (COMMENT ~ WHITESPACE*)* ~ c }
249 //! ```
250 //!
251 //! ## `PUSH`, `POP`, `DROP`, and `PEEK`
252 //!
253 //! `PUSH(e)` simply pushes the captured string of the expression `e` down a stack. This stack can
254 //! then later be used to match grammar based on its content with `POP` and `PEEK`.
255 //!
256 //! `PEEK` always matches the string at the top of stack. So, if the stack contains `["b", "a"]`
257 //! (`"a"` being on top), this grammar:
258 //!
259 //! ```ignore
260 //! a = { PEEK }
261 //! ```
262 //!
//! is effectively transformed into this one at parse time:
264 //!
265 //! ```ignore
266 //! a = { "a" }
267 //! ```
268 //!
269 //! `POP` works the same way with the exception that it pops the string off of the stack if the
270 //! match worked. With the stack from above, if `POP` matches `"a"`, the stack will be mutated
271 //! to `["b"]`.
272 //!
273 //! `DROP` makes it possible to remove the string at the top of the stack
274 //! without matching it. If the stack is nonempty, `DROP` drops the top of the
275 //! stack. If the stack is empty, then `DROP` fails to match.
276 //!
277 //! ### Advanced peeking
278 //!
//! `PEEK[start..end]` and `PEEK_ALL` allow peeking deeper into the stack. The syntax works exactly
280 //! like Rust’s exclusive slice syntax. Additionally, negative indices can be used to indicate an
281 //! offset from the top. If the end lies before or at the start, the expression matches (as does
282 //! a `PEEK_ALL` on an empty stack). With the stack `["c", "b", "a"]` (`"a"` on top):
283 //!
284 //! ```ignore
285 //! fill = PUSH("c") ~ PUSH("b") ~ PUSH("a")
286 //! v = { PEEK_ALL } = { "a" ~ "b" ~ "c" }  // top to bottom
287 //! w = { PEEK[..] } = { "c" ~ "b" ~ "a" }  // bottom to top
288 //! x = { PEEK[1..2] } = { PEEK[1..-1] } = { "b" }
//! y = { PEEK[..-2] } = { PEEK[0..1] } = { "c" }
//! z = { PEEK[1..] } = { PEEK[-2..3] } = { "b" ~ "a" }
291 //! n = { PEEK[2..-2] } = { PEEK[2..1] } = { "" }
292 //! ```
293 //!
294 //! For historical reasons, `PEEK_ALL` matches from top to bottom, while `PEEK[start..end]` matches
295 //! from bottom to top. There is currently no syntax to match a slice of the stack top to bottom.
296 //!
297 //! ## `Rule`
298 //!
299 //! All rules defined or used in the grammar populate a generated `enum` called `Rule`. This
300 //! implements `pest`'s `RuleType` and can be used throughout the API.
301 //!
302 //! ## `Built-in rules`
303 //!
304 //! Pest also comes with a number of built-in rules for convenience. They are:
305 //!
306 //! * `ASCII_DIGIT` - matches a numeric character from 0..9
307 //! * `ASCII_NONZERO_DIGIT` - matches a numeric character from 1..9
308 //! * `ASCII_BIN_DIGIT` - matches a numeric character from 0..1
309 //! * `ASCII_OCT_DIGIT` - matches a numeric character from 0..7
310 //! * `ASCII_HEX_DIGIT` - matches a numeric character from 0..9 or a..f or A..F
311 //! * `ASCII_ALPHA_LOWER` - matches a character from a..z
312 //! * `ASCII_ALPHA_UPPER` - matches a character from A..Z
313 //! * `ASCII_ALPHA` - matches a character from a..z or A..Z
314 //! * `ASCII_ALPHANUMERIC` - matches a character from a..z or A..Z or 0..9
315 //! * `ASCII` - matches a character from \x00..\x7f
316 //! * `NEWLINE` - matches either "\n" or "\r\n" or "\r"
317 
318 use proc_macro::TokenStream;
319 
320 /// The main method that's called by the proc macro
321 /// (a wrapper around `pest_generator::derive_parser`)
322 #[proc_macro_derive(Parser, attributes(grammar, grammar_inline))]
derive_parser(input: TokenStream) -> TokenStream323 pub fn derive_parser(input: TokenStream) -> TokenStream {
324     pest_generator::derive_parser(input.into(), true).into()
325 }
326