@@ -2,7 +2,7 @@ use super::Operator;
2
2
use crate :: {
3
3
buffer:: { BufferError , BufferWindow , BufferWindowBuilder } ,
4
4
data:: is_boundary,
5
- util:: { contains_zero_byte, count_chunk, repeat_byte} ,
5
+ util:: { contains_zero_byte, count_chunk, leading_whitespace , repeat_byte} ,
6
6
Scalar ,
7
7
} ;
8
8
use std:: io:: Read ;
@@ -134,8 +134,7 @@ where
134
134
self . buf . position ( )
135
135
}
136
136
137
- #[ inline]
138
- unsafe fn next_opt ( & mut self ) -> ( Option < Token > , Option < ReaderError > ) {
137
+ unsafe fn next_opt_fallback ( & mut self ) -> ( Option < Token > , Option < ReaderError > ) {
139
138
#[ derive( Debug ) ]
140
139
enum ParseState {
141
140
None ,
@@ -155,21 +154,7 @@ where
155
154
156
155
' inner: loop {
157
156
match * ptr {
158
- c @ b' ' | c @ b'\t' => {
159
- ptr = ptr. add ( 1 ) ;
160
- loop {
161
- if ptr == end {
162
- break ' eof ( 0 , 0 ) ;
163
- }
164
-
165
- if * ptr != c {
166
- break ;
167
- }
168
-
169
- ptr = ptr. add ( 1 )
170
- }
171
- }
172
- b'\n' | b'\r' | b';' => {
157
+ b' ' | b'\t' | b'\n' | b'\r' | b';' => {
173
158
ptr = ptr. add ( 1 ) ;
174
159
break ' inner;
175
160
}
@@ -425,6 +410,91 @@ where
425
410
}
426
411
}
427
412
413
+ #[ inline]
414
+ unsafe fn next_opt ( & mut self ) -> ( Option < Token > , Option < ReaderError > ) {
415
+ let mut ptr = self . buf . start ;
416
+ let end = self . buf . end ;
417
+
418
+ if end. offset_from ( ptr) < 9 {
419
+ return self . next_opt_fallback ( ) ;
420
+ }
421
+
422
+ // 3.4 million newlines followed by an average of 3.3 tabs
423
+ let data = ptr. cast :: < u64 > ( ) . read_unaligned ( ) . to_le ( ) ;
424
+ ptr = ptr. add ( leading_whitespace ( data) as usize ) ;
425
+
426
+ // Eagerly check for brackets, there'll be millions of them
427
+ if * ptr == b'{' {
428
+ self . buf . advance_to ( ptr. add ( 1 ) ) ;
429
+ return ( Some ( Token :: Open ) , None ) ;
430
+ } else if * ptr == b'}' {
431
+ self . buf . advance_to ( ptr. add ( 1 ) ) ;
432
+ return ( Some ( Token :: Close ) , None ) ;
433
+ }
434
+ // unquoted values are the most frequent type of values in
435
+ // text so if we see something that is alphanumeric or a
436
+ // dash (for negative numbers) we eagerly attempt to match
437
+ // against it. Loop unrolling is used to minimize the number
438
+ // of access to the boundary lookup table.
439
+ else if matches ! ( * ptr, b'a' ..=b'z' | b'0' ..=b'9' | b'A' ..=b'Z' | b'-' ) {
440
+ let start_ptr = ptr;
441
+ let mut opt_ptr = start_ptr. add ( 1 ) ;
442
+ while end. offset_from ( opt_ptr) > 8 {
443
+ for _ in 0 ..8 {
444
+ if is_boundary ( * opt_ptr) {
445
+ self . buf . advance_to ( opt_ptr) ;
446
+
447
+ // for space delimited arrays, advance one
448
+ if * opt_ptr == b' ' {
449
+ self . buf . advance ( 1 ) ;
450
+ }
451
+
452
+ let scalar = self . buf . get ( start_ptr..opt_ptr) ;
453
+ return ( Some ( Token :: Unquoted ( scalar) ) , None ) ;
454
+ }
455
+ opt_ptr = opt_ptr. add ( 1 ) ;
456
+ }
457
+ }
458
+
459
+ // optimization failed, fallback to inner parsing loop
460
+ } else if * ptr == b'\"' {
461
+ let start_ptr = ptr. add ( 1 ) ;
462
+ let mut opt_ptr = start_ptr;
463
+ let mut escaped = false ;
464
+ while end. offset_from ( opt_ptr) > 8 {
465
+ let data = opt_ptr. cast :: < u64 > ( ) . read_unaligned ( ) . to_le ( ) ;
466
+ escaped |= contains_zero_byte ( data ^ repeat_byte ( b'\\' ) ) ;
467
+
468
+ // http://0x80.pl/notesen/2023-03-06-swar-find-any.html#faster-swar-procedure
469
+ let mask = repeat_byte ( 0x7f ) ;
470
+ let lobits = data & mask;
471
+ let x0 = ( lobits ^ repeat_byte ( b'\"' ) ) + mask;
472
+ let t0 = x0 | data;
473
+ let t1 = t0 & repeat_byte ( 0x80 ) ;
474
+ let t2 = t1 ^ repeat_byte ( 0x80 ) ;
475
+
476
+ if t2 != 0 {
477
+ let quote_ind = t2. trailing_zeros ( ) >> 3 ;
478
+
479
+ if !escaped {
480
+ opt_ptr = opt_ptr. add ( quote_ind as usize ) ;
481
+ self . buf . advance_to ( opt_ptr. add ( 1 ) ) ;
482
+ let scalar = self . buf . get ( start_ptr..opt_ptr) ;
483
+ return ( Some ( Token :: Quoted ( scalar) ) , None ) ;
484
+ } else {
485
+ break ;
486
+ }
487
+ } else {
488
+ opt_ptr = opt_ptr. add ( 8 ) ;
489
+ }
490
+ }
491
+
492
+ // optimization failed, fallback to inner parsing loop
493
+ }
494
+
495
+ self . next_opt_fallback ( )
496
+ }
497
+
428
498
/// Advance a given number of bytes and return them.
429
499
///
430
500
/// The internal buffer must be large enough to accommodate all bytes.
0 commit comments