@@ -191,6 +191,17 @@ static TOKEN_DISPATCH: [TokenType; 128] = {
191
191
table
192
192
} ;
193
193
194
+ // Thread-local SIMD vectors for whitespace processing
195
+ thread_local ! {
196
+ static SPACE_VEC : u8x16 = u8x16:: splat( b' ' ) ;
197
+ static TAB_VEC : u8x16 = u8x16:: splat( b'\t' ) ;
198
+ static NEWLINE_VEC : u8x16 = u8x16:: splat( b'\n' ) ;
199
+ static CARRIAGE_RETURN_VEC : u8x16 = u8x16:: splat( b'\r' ) ;
200
+ static FORM_FEED_VEC : u8x16 = u8x16:: splat( 0x0c ) ; // Form feed
201
+ static VERT_TAB_VEC : u8x16 = u8x16:: splat( 0x0b ) ; // Vertical tab
202
+ static SLASH_VEC : u8x16 = u8x16:: splat( b'/' ) ; // For detecting comments
203
+ }
204
+
194
205
impl < ' a > Lexer < ' a > {
195
206
/// Create a new lexer from a string input
196
207
#[ inline( always) ]
@@ -502,100 +513,80 @@ impl<'a> Lexer<'a> {
502
513
#[ inline]
503
514
fn process_whitespace_simd ( & mut self ) -> bool {
504
515
// Need at least 16 bytes to use SIMD
505
- let rest_len = self . cursor . rest ( ) . len ( ) ;
506
- if rest_len < 16 || self . cursor . position ( ) + 16 > rest_len as u32 {
516
+ if self . cursor . position ( ) + 16 > self . cursor . rest ( ) . len ( ) as u32 {
507
517
return false ;
508
518
}
509
519
510
- // Get current 16 bytes and load them directly into SIMD vector
511
- let input = self . cursor . rest ( ) ;
512
- let data = unsafe {
513
- // SAFETY: We've checked that we have at least 16 bytes
514
- let mut bytes = [ 0u8 ; 16 ] ;
515
- std:: ptr:: copy_nonoverlapping ( input. as_ptr ( ) , bytes. as_mut_ptr ( ) , 16 ) ;
516
- u8x16:: new ( bytes)
517
- } ;
518
-
519
- // Handle special characters separately for better branch prediction
520
- let first_byte = unsafe { * input. get_unchecked ( 0 ) } ;
521
-
522
- // Check for special cases that need individual handling
523
- match first_byte {
524
- b'\n' => {
525
- self . cursor . advance ( ) ;
526
- self . had_line_break = LineBreak :: Present ;
527
- return true ;
528
- }
529
- b'\r' => {
530
- self . cursor . advance ( ) ;
531
- if let Some ( b'\n' ) = self . cursor . peek ( ) {
532
- self . cursor . advance ( ) ;
533
- }
534
- self . had_line_break = LineBreak :: Present ;
535
- return true ;
536
- }
537
- b'/' => {
538
- // Check if this could be a comment start
539
- if let Some ( b'/' ) | Some ( b'*' ) = self . cursor . peek_at ( 1 ) {
540
- return false ; // Let the caller handle comments
541
- }
542
- return false ; // Not a whitespace
543
- }
544
- 0xe2 => {
545
- // Check for line separator (U+2028) and paragraph separator (U+2029)
546
- let bytes = self . cursor . peek_n ( 3 ) ;
547
- if bytes. len ( ) == 3
548
- && bytes[ 0 ] == 0xe2
549
- && bytes[ 1 ] == 0x80
550
- && ( bytes[ 2 ] == 0xa8 || bytes[ 2 ] == 0xa9 )
551
- {
552
- self . cursor . advance_n ( 3 ) ;
553
- self . had_line_break = LineBreak :: Present ;
554
- return true ;
555
- }
556
- return false ;
557
- }
558
- _ => { }
559
- }
520
+ // Use thread-local SIMD vectors for common whitespace characters
521
+ let space_vec = SPACE_VEC . with ( |v| * v) ;
522
+ let tab_vec = TAB_VEC . with ( |v| * v) ;
523
+ let newline_vec = NEWLINE_VEC . with ( |v| * v) ;
524
+ let carriage_return_vec = CARRIAGE_RETURN_VEC . with ( |v| * v) ;
525
+ let form_feed_vec = FORM_FEED_VEC . with ( |v| * v) ;
526
+ let vert_tab_vec = VERT_TAB_VEC . with ( |v| * v) ;
527
+ let slash_vec = SLASH_VEC . with ( |v| * v) ;
560
528
561
- // Create SIMD vectors for common whitespace characters
562
- let space_vec = u8x16 :: splat ( b' ' ) ;
563
- let tab_vec = u8x16 :: splat ( b'\t' ) ;
564
- let form_feed_vec = u8x16 :: splat ( 0x0c ) ; // Form feed
565
- let vert_tab_vec = u8x16:: splat ( 0x0b ) ; // Vertical tab
529
+ // Get current 16 bytes
530
+ let input = self . cursor . rest ( ) ;
531
+ let mut data = [ 0u8 ; 16 ] ;
532
+ data . copy_from_slice ( unsafe { input . get_unchecked ( 0 .. 16 ) } ) ;
533
+ let chunk = u8x16:: new ( data ) ;
566
534
567
- // Fast path for regular whitespace (space, tab, form feed, vertical tab)
568
535
// Compare with our whitespace vectors
569
- let is_space = data. cmp_eq ( space_vec) ;
570
- let is_tab = data. cmp_eq ( tab_vec) ;
571
- let is_ff = data. cmp_eq ( form_feed_vec) ;
572
- let is_vt = data. cmp_eq ( vert_tab_vec) ;
536
+ let is_space = chunk. cmp_eq ( space_vec) ;
537
+ let is_tab = chunk. cmp_eq ( tab_vec) ;
538
+ let is_newline = chunk. cmp_eq ( newline_vec) ;
539
+ let is_cr = chunk. cmp_eq ( carriage_return_vec) ;
540
+ let is_ff = chunk. cmp_eq ( form_feed_vec) ;
541
+ let is_vt = chunk. cmp_eq ( vert_tab_vec) ;
542
+ let is_slash = chunk. cmp_eq ( slash_vec) ;
573
543
574
544
// Combine masks for regular whitespace
575
545
let is_basic_ws = is_space | is_tab | is_ff | is_vt;
576
546
577
- // Convert SIMD mask to array to process consecutive whitespace
578
- let ws_array = is_basic_ws. to_array ( ) ;
547
+ // Convert masks to arrays
548
+ let is_basic_ws_arr = is_basic_ws. to_array ( ) ;
549
+ let is_newline_arr = is_newline. to_array ( ) ;
550
+ let is_cr_arr = is_cr. to_array ( ) ;
551
+ let is_slash_arr = is_slash. to_array ( ) ;
579
552
580
- // If the first byte is whitespace, process consecutive whitespace
581
- if ws_array[ 0 ] != 0 {
582
- // Count consecutive whitespace characters
583
- let mut count = 0 ;
584
- for ws_char in ws_array {
585
- if ws_char == 0 {
586
- break ;
587
- }
588
- count += 1 ;
553
+ // Check the first byte only - we'll process one character at a time
554
+ // This is more efficient than trying to process the entire chunk at once
555
+ // when we need to handle special cases like CR+LF and comments
556
+
557
+ if unsafe { * is_basic_ws_arr. get_unchecked ( 0 ) } != 0 {
558
+ // Regular whitespace - just advance
559
+ self . cursor . advance ( ) ;
560
+ return true ;
561
+ }
562
+
563
+ if unsafe { * is_newline_arr. get_unchecked ( 0 ) } != 0 {
564
+ // Newline - need to set had_line_break
565
+ self . cursor . advance ( ) ;
566
+ self . had_line_break = LineBreak :: Present ;
567
+ return true ;
568
+ }
569
+
570
+ if unsafe { * is_cr_arr. get_unchecked ( 0 ) } != 0 {
571
+ // Carriage return - need to check for CRLF sequence
572
+ self . cursor . advance ( ) ;
573
+ if let Some ( b'\n' ) = self . cursor . peek ( ) {
574
+ self . cursor . advance ( ) ;
589
575
}
576
+ self . had_line_break = LineBreak :: Present ;
577
+ return true ;
578
+ }
590
579
591
- // Skip all consecutive basic whitespace characters at once
592
- if count > 0 {
593
- self . cursor . advance_n ( count ) ;
594
- return true ;
580
+ if unsafe { * is_slash_arr . get_unchecked ( 0 ) } != 0 {
581
+ // Potential comment - need to check next character
582
+ if let Some ( b'/' ) | Some ( b'*' ) = self . cursor . peek_at ( 1 ) {
583
+ return false ; // Let the caller handle comments
595
584
}
585
+ // Not a comment, just a slash
586
+ return false ;
596
587
}
597
588
598
- // No whitespace found
589
+ // Not whitespace or a special character
599
590
false
600
591
}
601
592
0 commit comments