@@ -104,7 +104,11 @@ impl Parser {
104104 // Starts at the beginning of the input and consumes until either the end
105105 // of input or an error.
106106 fn parse_expr ( mut self ) -> Result < Expr > {
107- while !self . eof ( ) {
107+ loop {
108+ self . ignore_space ( ) ;
109+ if self . eof ( ) {
110+ break ;
111+ }
108112 let build_expr = match self . cur ( ) {
109113 '\\' => try!( self . parse_escape ( ) ) ,
110114 '|' => { let e = try!( self . alternate ( ) ) ; self . bump ( ) ; e }
@@ -177,7 +181,7 @@ impl Parser {
177181 return Err ( self . err ( ErrorKind :: UnexpectedEscapeEof ) ) ;
178182 }
179183 let c = self . cur ( ) ;
180- if is_punct ( c) {
184+ if is_punct ( c) || ( self . flags . ignore_space && c . is_whitespace ( ) ) {
181185 let c = self . bump ( ) ;
182186 return Ok ( try!( self . lit ( c) ) ) ;
183187 }
@@ -234,6 +238,7 @@ impl Parser {
234238 let chari = self . chari ;
235239 let mut name: CaptureName = None ;
236240 self . bump ( ) ;
241+ self . ignore_space ( ) ;
237242 if self . bump_if ( "?P<" ) {
238243 let n = try!( self . parse_group_name ( ) ) ;
239244 if self . names . iter ( ) . any ( |n2| n2 == & n) {
@@ -370,13 +375,16 @@ impl Parser {
370375 return Err ( self . err ( ErrorKind :: RepeaterUnexpectedExpr ( e) ) ) ;
371376 }
372377 self . bump ( ) ;
373- let min = try!( self . parse_decimal ( |c| c != ',' && c != '}' ) ) ;
378+ self . ignore_space ( ) ;
379+ let min = try!( self . parse_decimal ( ) ) ;
374380 let mut max_opt = Some ( min) ;
381+ self . ignore_space ( ) ;
375382 if self . bump_if ( ',' ) {
383+ self . ignore_space ( ) ;
376384 if self . peek_is ( '}' ) {
377385 max_opt = None ;
378386 } else {
379- let max = try!( self . parse_decimal ( |c| c != '}' ) ) ;
387+ let max = try!( self . parse_decimal ( ) ) ;
380388 if min > max {
381389 // e.g., a{2,1}
382390 return Err ( self . err ( ErrorKind :: InvalidRepeatRange {
@@ -387,6 +395,7 @@ impl Parser {
387395 max_opt = Some ( max) ;
388396 }
389397 }
398+ self . ignore_space ( ) ;
390399 if !self . bump_if ( '}' ) {
391400 Err ( self . err ( ErrorKind :: UnclosedRepeat ) )
392401 } else {
@@ -423,8 +432,8 @@ impl Parser {
423432 //
424433 // Start: `1`
425434 // End: `,` (where `until == ','`)
426- fn parse_decimal < B : Bumpable > ( & mut self , until : B ) -> Result < u32 > {
427- match self . bump_get ( until ) {
435+ fn parse_decimal ( & mut self ) -> Result < u32 > {
436+ match self . bump_get ( |c| is_ascii_word ( c ) || c . is_whitespace ( ) ) {
428437 // e.g., a{}
429438 None => Err ( self . err ( ErrorKind :: MissingBase10 ) ) ,
430439 Some ( n) => {
@@ -472,6 +481,7 @@ impl Parser {
472481 // Start: `{`
473482 // End: `b`
474483 fn parse_hex ( & mut self ) -> Result < Build > {
484+ self . ignore_space ( ) ;
475485 if self . bump_if ( '{' ) {
476486 self . parse_hex_many_digits ( )
477487 } else {
@@ -486,9 +496,11 @@ impl Parser {
486496 fn parse_hex_many_digits ( & mut self ) -> Result < Build > {
487497 use std:: char;
488498
489- let s = self . bump_get ( |c| c != '}' ) . unwrap_or ( "" . into ( ) ) ;
499+ self . ignore_space ( ) ;
500+ let s = self . bump_get ( is_ascii_word) . unwrap_or ( "" . into ( ) ) ;
490501 let n = try!( u32:: from_str_radix ( & s, 16 )
491502 . map_err ( |_| self . err ( ErrorKind :: InvalidBase16 ( s) ) ) ) ;
503+ self . ignore_space ( ) ;
492504 if !self . bump_if ( '}' ) {
493505 // e.g., a\x{d
494506 return Err ( self . err ( ErrorKind :: UnclosedHex ) ) ;
@@ -530,12 +542,16 @@ impl Parser {
530542 // End: `+`
531543 fn parse_class ( & mut self ) -> Result < Build > {
532544 self . bump ( ) ;
545+ self . ignore_space ( ) ;
533546 let negated = self . bump_if ( '^' ) ;
547+ self . ignore_space ( ) ;
534548 let mut class = CharClass :: empty ( ) ;
535549 while self . bump_if ( '-' ) {
550+ self . ignore_space ( ) ;
536551 class. ranges . push ( ClassRange :: one ( '-' ) ) ;
537552 }
538553 loop {
554+ self . ignore_space ( ) ;
539555 if self . eof ( ) {
540556 // e.g., [a
541557 return Err ( self . err ( ErrorKind :: UnexpectedClassEof ) ) ;
@@ -631,11 +647,13 @@ impl Parser {
631647 // End: `]`
632648 fn parse_class_range ( & mut self , class : & mut CharClass , start : char )
633649 -> Result < ( ) > {
650+ self . ignore_space ( ) ;
634651 if !self . bump_if ( '-' ) {
635652 // Not a range, so just push a singleton range.
636653 class. ranges . push ( ClassRange :: one ( start) ) ;
637654 return Ok ( ( ) ) ;
638655 }
656+ self . ignore_space ( ) ;
639657 if self . eof ( ) {
640658 // e.g., [a-
641659 return Err ( self . err ( ErrorKind :: UnexpectedClassEof ) ) ;
@@ -730,9 +748,12 @@ impl Parser {
730748 //
731749 // `negate` is true when the class name is used with `\P`.
732750 fn parse_unicode_class ( & mut self , neg : bool ) -> Result < CharClass > {
751+ self . ignore_space ( ) ;
733752 let name =
734753 if self . bump_if ( '{' ) {
735- let n = self . bump_get ( |c| c != '}' ) . unwrap_or ( "" . into ( ) ) ;
754+ self . ignore_space ( ) ;
755+ let n = self . bump_get ( is_ascii_word) . unwrap_or ( "" . into ( ) ) ;
756+ self . ignore_space ( ) ;
736757 if n. is_empty ( ) || !self . bump_if ( '}' ) {
737758 // e.g., \p{Greek
738759 return Err ( self . err ( ErrorKind :: UnclosedUnicodeName ) ) ;
@@ -796,7 +817,31 @@ impl Parser {
796817// Auxiliary helper methods.
797818impl Parser {
798819 fn chars ( & self ) -> Chars {
799- Chars :: new ( & self . chars [ self . chari ..] , self . flags . ignore_space )
820+ Chars :: new ( & self . chars [ self . chari ..] )
821+ }
822+
823+ fn ignore_space ( & mut self ) {
824+ if !self . flags . ignore_space {
825+ return ;
826+ }
827+ while !self . eof ( ) {
828+ match self . cur ( ) {
829+ '#' => {
830+ self . bump ( ) ;
831+ while !self . eof ( ) {
832+ match self . bump ( ) {
833+ '\n' => break ,
834+ _ => continue ,
835+ }
836+ }
837+ } ,
838+ c => if !c. is_whitespace ( ) {
839+ return ;
840+ } else {
841+ self . bump ( ) ;
842+ }
843+ }
844+ }
800845 }
801846
802847 fn bump ( & mut self ) -> char {
@@ -924,48 +969,22 @@ impl Parser {
924969struct Chars < ' a > {
925970 chars : & ' a [ char ] ,
926971 cur : usize ,
927- ignore_space : bool ,
928972}
929973
930974impl < ' a > Iterator for Chars < ' a > {
931975 type Item = char ;
932976 fn next ( & mut self ) -> Option < char > {
933- if !self . ignore_space {
934- let x = self . c ( ) ;
935- self . advance ( ) ;
936- return x;
937- }
938- while let Some ( c) = self . c ( ) {
939- self . advance ( ) ;
940- match c {
941- '\\' => return match self . c ( ) {
942- Some ( '#' ) => { self . advance ( ) ; Some ( '#' ) }
943- _ => Some ( '\\' )
944- } ,
945- '#' => loop {
946- match self . c ( ) {
947- Some ( c) => {
948- self . advance ( ) ;
949- if c == '\n' {
950- break ;
951- }
952- } ,
953- None => return None
954- }
955- } ,
956- _ => if !c. is_whitespace ( ) { return Some ( c) ; }
957- }
958- }
959- None
977+ let x = self . c ( ) ;
978+ self . advance ( ) ;
979+ return x;
960980 }
961981}
962982
963983impl < ' a > Chars < ' a > {
964- fn new ( chars : & [ char ] , ignore_space : bool ) -> Chars {
984+ fn new ( chars : & [ char ] ) -> Chars {
965985 Chars {
966986 chars : chars,
967987 cur : 0 ,
968- ignore_space : ignore_space,
969988 }
970989 }
971990
@@ -1221,6 +1240,13 @@ fn is_valid_capture_char(c: char) -> bool {
12211240 || ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' )
12221241}
12231242
1243+ fn is_ascii_word ( c : char ) -> bool {
1244+ match c {
1245+ 'a' ... 'z' | 'A' ... 'Z' | '_' | '0' ... '9' => true ,
1246+ _ => false ,
1247+ }
1248+ }
1249+
12241250/// Returns true if the give character has significance in a regex.
12251251pub fn is_punct ( c : char ) -> bool {
12261252 match c {
@@ -2368,10 +2394,49 @@ mod tests {
23682394 }
23692395
23702396 #[ test]
2371- fn ignore_space_escape ( ) {
2372- assert_eq ! ( p( r"(?x)\ d" ) , Expr :: Class ( class( PERLD ) ) ) ;
2373- assert_eq ! ( p( r"(?x)\
2374- D" ) , Expr :: Class ( class( PERLD ) . negate( ) ) ) ;
2397+ fn ignore_space_escape_octal ( ) {
2398+ assert_eq ! ( p( r"(?x)\12 3" ) , Expr :: Concat ( vec![
2399+ lit( '\n' ) ,
2400+ lit( '3' ) ,
2401+ ] ) ) ;
2402+ }
2403+
2404+ #[ test]
2405+ fn ignore_space_escape_hex ( ) {
2406+ assert_eq ! ( p( r"(?x)\x { 53 }" ) , lit( 'S' ) ) ;
2407+ assert_eq ! ( p( r"(?x)\x # comment
2408+ { # comment
2409+ 53 # comment
2410+ } # comment" ) , lit( 'S' ) ) ;
2411+ }
2412+
2413+ #[ test]
2414+ fn ignore_space_escape_hex2 ( ) {
2415+ assert_eq ! ( p( r"(?x)\x 53" ) , lit( 'S' ) ) ;
2416+ assert_eq ! ( p( r"(?x)\x # comment
2417+ 53 # comment" ) , lit( 'S' ) ) ;
2418+ }
2419+
2420+ #[ test]
2421+ fn ignore_space_escape_unicode_name ( ) {
2422+ assert_eq ! ( p( r"(?x)\p # comment
2423+ { # comment
2424+ Yi # comment
2425+ } # comment" ) , Expr :: Class ( class( YI ) ) ) ;
2426+ }
2427+
2428+ #[ test]
2429+ fn ignore_space_repeat_counted ( ) {
2430+ assert_eq ! ( p( "(?x)a # comment
2431+ { # comment
2432+ 5 # comment
2433+ , # comment
2434+ 10 # comment
2435+ }" ) , Expr :: Repeat {
2436+ e: b( lit( 'a' ) ) ,
2437+ r: Repeater :: Range { min: 5 , max: Some ( 10 ) } ,
2438+ greedy: true ,
2439+ } ) ;
23752440 }
23762441
23772442 #[ test]
@@ -2424,6 +2489,14 @@ mod tests {
24242489 ] ) ) ;
24252490 }
24262491
2492+ #[ test]
2493+ fn ignore_space_escape_space ( ) {
2494+ assert_eq ! ( p( r"(?x)a\ # hi there" ) , Expr :: Concat ( vec![
2495+ lit( 'a' ) ,
2496+ lit( ' ' ) ,
2497+ ] ) ) ;
2498+ }
2499+
24272500 // Test every single possible error case.
24282501
24292502 macro_rules! test_err {
@@ -2815,4 +2888,19 @@ mod tests {
28152888 test_err ! ( "(?P<a>.)(?P<a>.)" , 14 ,
28162889 ErrorKind :: DuplicateCaptureName ( "a" . into( ) ) ) ;
28172890 }
2891+
2892+ #[ test]
2893+ fn error_ignore_space_escape_hex ( ) {
2894+ test_err ! ( r"(?x)\x{ 5 3 }" , 10 , ErrorKind :: UnclosedHex ) ;
2895+ }
2896+
2897+ #[ test]
2898+ fn error_ignore_space_escape_hex2 ( ) {
2899+ test_err ! ( r"(?x)\x 5 3" , 9 , ErrorKind :: InvalidBase16 ( "5 " . into( ) ) ) ;
2900+ }
2901+
2902+ #[ test]
2903+ fn error_ignore_space_escape_unicode_name ( ) {
2904+ test_err ! ( r"(?x)\p{Y i}" , 9 , ErrorKind :: UnclosedUnicodeName ) ;
2905+ }
28182906}
0 commit comments