11//! A parser of the EBNF-like grammar.
22
33use super :: { Characters , Expression , ExpressionKind , Grammar , Production } ;
4- use regex:: { Captures , Regex } ;
54use std:: fmt;
65use std:: fmt:: Display ;
76use std:: path:: Path ;
8- use std:: sync:: LazyLock ;
97
/// Cursor over the grammar text being parsed.
struct Parser<'a> {
    /// The text being parsed; methods slice it starting at `index`.
    input: &'a str,
    /// Byte offset of the parse head within `input`.
    index: usize,
}
1412
13+ #[ derive( Debug ) ]
1514pub struct Error {
1615 message : String ,
1716 line : String ,
@@ -76,18 +75,6 @@ impl Parser<'_> {
7675 & self . input [ i..i + upper]
7776 }
7877
79- /// If the input matches the given regex, it is returned and the head is moved forward.
80- ///
81- /// Note that regexes must start with `^`.
82- fn take_re ( & mut self , re : & Regex ) -> Option < Captures < ' _ > > {
83- if let Some ( cap) = re. captures ( & self . input [ self . index ..] ) {
84- self . index += cap[ 0 ] . len ( ) ;
85- Some ( cap)
86- } else {
87- None
88- }
89- }
90-
9178 /// Returns whether or not the given string is next, and advances the head if it is.
9279 fn take_str ( & mut self , s : & str ) -> bool {
9380 if self . input [ self . index ..] . starts_with ( s) {
@@ -168,13 +155,12 @@ impl Parser<'_> {
168155 }
169156
170157 fn parse_expression ( & mut self ) -> Result < Option < Expression > > {
171- static ALT_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^ *\| *" ) . unwrap ( ) ) ;
172-
173158 let mut es = Vec :: new ( ) ;
174159 loop {
175160 let Some ( e) = self . parse_seq ( ) ? else { break } ;
176161 es. push ( e) ;
177- if self . take_re ( & ALT_RE ) . is_none ( ) {
162+ _ = self . space0 ( ) ;
163+ if !self . take_str ( "|" ) {
178164 break ;
179165 }
180166 }
@@ -268,21 +254,28 @@ impl Parser<'_> {
268254 Some ( ExpressionKind :: Nt ( nt) )
269255 }
270256
257+ /// Parse terminal within backticks.
271258 fn parse_terminal ( & mut self ) -> Result < ExpressionKind > {
272- static TERMINAL_RE : LazyLock < Regex > =
273- LazyLock :: new ( || Regex :: new ( r"^`([^`\n]+)`" ) . unwrap ( ) ) ;
274- match self . take_re ( & TERMINAL_RE ) {
275- Some ( cap) => Ok ( ExpressionKind :: Terminal ( cap[ 1 ] . to_string ( ) ) ) ,
276- None => bail ! ( self , "unterminated terminal, expected closing backtick" ) ,
259+ Ok ( ExpressionKind :: Terminal ( self . parse_terminal_str ( ) ?) )
260+ }
261+
262+ /// Parse string within backticks.
263+ fn parse_terminal_str ( & mut self ) -> Result < String > {
264+ self . expect ( "`" , "expected opening backtick" ) ?;
265+ let term = self . take_while ( & |x| ![ '\n' , '`' ] . contains ( & x) ) . to_string ( ) ;
266+ if term. is_empty ( ) {
267+ bail ! ( self , "expected terminal" ) ;
277268 }
269+ self . expect ( "`" , "expected closing backtick" ) ?;
270+ Ok ( term)
278271 }
279272
280273 fn parse_charset ( & mut self ) -> Result < ExpressionKind > {
281274 self . expect ( "[" , "expected opening [" ) ?;
282275 let mut characters = Vec :: new ( ) ;
283276 loop {
284277 self . space0 ( ) ;
285- let Some ( ch) = self . parse_characters ( ) else {
278+ let Some ( ch) = self . parse_characters ( ) ? else {
286279 break ;
287280 } ;
288281 characters. push ( ch) ;
@@ -295,27 +288,49 @@ impl Parser<'_> {
295288 Ok ( ExpressionKind :: Charset ( characters) )
296289 }
297290
298- fn parse_characters ( & mut self ) -> Option < Characters > {
299- static RANGE_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^`(.)`-`(.)`" ) . unwrap ( ) ) ;
300- static TERMINAL_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( "^`([^`\n ]+)`" ) . unwrap ( ) ) ;
301- if let Some ( cap) = self . take_re ( & RANGE_RE ) {
302- let a = cap[ 1 ] . chars ( ) . next ( ) . unwrap ( ) ;
303- let b = cap[ 2 ] . chars ( ) . next ( ) . unwrap ( ) ;
304- Some ( Characters :: Range ( a, b) )
305- } else if let Some ( cap) = self . take_re ( & TERMINAL_RE ) {
306- Some ( Characters :: Terminal ( cap[ 1 ] . to_string ( ) ) )
291+ /// Parse an element of a character class, e.g.
292+ /// `` `a`-`b` `` | `` `term` `` | `` NonTerminal ``.
293+ fn parse_characters ( & mut self ) -> Result < Option < Characters > > {
294+ if let Some ( b'`' ) = self . peek ( ) {
295+ let recov = self . index ;
296+ let a = self . parse_terminal_str ( ) ?;
297+ if let Some ( b'-' ) = self . peek ( ) {
298+ //~^ Parse `` `a`-`b` `` character range.
299+ if a. len ( ) > 1 {
300+ self . index = recov + 1 ;
301+ bail ! ( self , "invalid start terminal in range" ) ;
302+ }
303+ self . expect ( "-" , "expected `-`" ) . unwrap ( ) ;
304+ let recov = self . index ;
305+ let b = self . parse_terminal_str ( ) ?;
306+ if b. len ( ) > 1 {
307+ self . index = recov + 1 ;
308+ bail ! ( self , "invalid end terminal in range" ) ;
309+ }
310+ let a = a. chars ( ) . next ( ) . unwrap ( ) ;
311+ let b = b. chars ( ) . next ( ) . unwrap ( ) ;
312+ Ok ( Some ( Characters :: Range ( a, b) ) )
313+ } else {
314+ //~^ Parse terminal in backticks.
315+ Ok ( Some ( Characters :: Terminal ( a) ) )
316+ }
317+ } else if let Some ( name) = self . parse_name ( ) {
318+ //~^ Parse nonterminal identifier.
319+ Ok ( Some ( Characters :: Named ( name) ) )
307320 } else {
308- let name = self . parse_name ( ) ?;
309- Some ( Characters :: Named ( name) )
321+ Ok ( None )
310322 }
311323 }
312324
325+ /// Parse e.g. `<prose text>`.
313326 fn parse_prose ( & mut self ) -> Result < ExpressionKind > {
314- static PROSE_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^<([^>\n]+)>" ) . unwrap ( ) ) ;
315- match self . take_re ( & PROSE_RE ) {
316- Some ( cap ) => Ok ( ExpressionKind :: Prose ( cap [ 1 ] . to_string ( ) ) ) ,
317- None => bail ! ( self , "unterminated prose, expected closing `>`" ) ,
327+ self . expect ( "<" , "expected opening `<`" ) ? ;
328+ let text = self . take_while ( & |x| ! [ '\n' , '>' ] . contains ( & x ) ) . to_string ( ) ;
329+ if text . is_empty ( ) {
330+ bail ! ( self , "expected prose text" ) ;
318331 }
332+ self . expect ( ">" , "expected closing `>`" ) ?;
333+ Ok ( ExpressionKind :: Prose ( text) )
319334 }
320335
321336 fn parse_grouped ( & mut self ) -> Result < ExpressionKind > {
@@ -344,13 +359,19 @@ impl Parser<'_> {
344359 Ok ( ExpressionKind :: NegExpression ( box_kind ( kind) ) )
345360 }
346361
362+ /// Parse e.g. `F00F` after `U+`.
347363 fn parse_unicode ( & mut self ) -> Result < ExpressionKind > {
348- static UNICODE_RE : LazyLock < Regex > = LazyLock :: new ( || Regex :: new ( r"^[A-Z0-9]{4}" ) . unwrap ( ) ) ;
349-
350- match self . take_re ( & UNICODE_RE ) {
351- Some ( s) => Ok ( ExpressionKind :: Unicode ( s[ 0 ] . to_string ( ) ) ) ,
352- None => bail ! ( self , "expected 4 hexadecimal uppercase digits after U+" ) ,
364+ let mut xs = Vec :: with_capacity ( 4 ) ;
365+ for _ in 0 ..4 {
366+ match self . peek ( ) {
367+ Some ( x @ ( b'0' ..=b'9' | b'A' ..=b'F' ) ) => {
368+ xs. push ( x) ;
369+ self . index += 1 ;
370+ }
371+ _ => bail ! ( self , "expected 4 uppercase hexidecimal digits after `U+`" ) ,
372+ }
353373 }
374+ Ok ( ExpressionKind :: Unicode ( String :: from_utf8 ( xs) . unwrap ( ) ) )
354375 }
355376
356377 /// Parse `?` after expression.
@@ -428,16 +449,17 @@ impl Parser<'_> {
428449 Ok ( Some ( self . input [ start..self . index - 1 ] . to_string ( ) ) )
429450 }
430451
452+ /// Parse footnote reference, e.g. `[^id]`.
431453 fn parse_footnote ( & mut self ) -> Result < Option < String > > {
432- static FOOTNOTE_RE : LazyLock < Regex > =
433- LazyLock :: new ( || Regex :: new ( r"^([^\]\n]+)]" ) . unwrap ( ) ) ;
434454 if !self . take_str ( "[^" ) {
435455 return Ok ( None ) ;
436456 }
437- match self . take_re ( & FOOTNOTE_RE ) {
438- Some ( cap ) => Ok ( Some ( cap [ 1 ] . to_string ( ) ) ) ,
439- None => bail ! ( self , "unterminated footnote, expected closing `]`" ) ,
457+ let id = self . take_while ( & |x| ! [ '\n' , ']' ] . contains ( & x ) ) . to_string ( ) ;
458+ if id . is_empty ( ) {
459+ bail ! ( self , "expected footnote id" ) ;
440460 }
461+ self . expect ( "]" , "expected closing `]`" ) ?;
462+ Ok ( Some ( id) )
441463 }
442464}
443465
0 commit comments