@@ -88,6 +88,10 @@ impl Debug for TextBuffer<'_> {
88
88
/// '\x0C', Form feed
89
89
pub ( crate ) const WHITESPACE_BYTES : & [ u8 ] = b" \t \r \n \x09 \x0B \x0C " ;
90
90
91
/// Whitespace bytes that do not terminate a line: horizontal tab (spelled both as `\t` and
/// `\x09`, which are the same byte), vertical tab (`\x0B`) and form feed (`\x0C`).
/// NOTE(review): the plain space byte that `WHITESPACE_BYTES` begins with does not appear to
/// be listed here -- confirm whether that omission is intentional.
+ pub ( crate ) const NON_NEWLINE_BYTES : & [ u8 ] = b"\t \x09 \x0B \x0C " ;
92
+
93
/// Bytes treated as line terminators when tracking row/column positions: carriage return
/// (`\r`) and line feed (`\n`).
+ pub ( crate ) const NEWLINE_BYTES : & [ u8 ] = b"\r \n " ;
94
+
91
95
/// A slice of unsigned bytes that can be cheaply copied and which defines methods for parsing
92
96
/// the various encoding elements of a text Ion stream.
93
97
///
@@ -106,6 +110,8 @@ pub struct TextBuffer<'top> {
106
110
// offset: 6
107
111
data : & ' top [ u8 ] ,
108
112
offset : usize ,
113
+ row : usize ,
114
+ prev_newline_offset : usize ,
109
115
pub ( crate ) context : EncodingContextRef < ' top > ,
110
116
is_final_data : bool ,
111
117
}
@@ -141,6 +147,8 @@ impl<'top> TextBuffer<'top> {
141
147
context,
142
148
data,
143
149
offset,
150
+ row : 1 ,
151
+ prev_newline_offset : 0 ,
144
152
is_final_data,
145
153
}
146
154
}
@@ -206,6 +214,22 @@ impl<'top> TextBuffer<'top> {
206
214
self . offset
207
215
}
208
216
217
+ /// Returns the row position for this buffer.
218
+ /// _Note: Row positions are calculated based on newline characters `\n` and `\r`._
219
+ pub fn row ( & self ) -> usize {
220
+ self . row
221
+ }
222
+
223
/// Returns the 1-based column position for this buffer, measured in bytes.
///
/// The column is the distance between the current offset and the offset recorded for the
/// start of the current line (`prev_newline_offset`), plus one.
///
/// The recorded line-start offset can be larger than the current offset: `reset` rewinds
/// `offset` while keeping `prev_newline_offset`, and location metadata may be updated from
/// bytes that have not been consumed yet. In that situation the subtraction saturates and
/// `1` is returned instead of panicking (debug builds) or wrapping (release builds).
pub fn column(&self) -> usize {
    self.offset.saturating_sub(self.prev_newline_offset) + 1
}
228
+
229
+ pub fn prev_newline_offset ( & self ) -> usize {
230
+ self . prev_newline_offset
231
+ }
232
+
209
233
/// Returns the number of bytes in the buffer.
210
234
pub fn len ( & self ) -> usize {
211
235
self . data . len ( )
@@ -245,16 +269,44 @@ impl<'top> TextBuffer<'top> {
245
269
246
270
/// Matches one or more whitespace characters.
247
271
pub fn match_whitespace1 ( & mut self ) -> IonMatchResult < ' top > {
248
- take_while ( 1 .., WHITESPACE_BYTES ) . parse_next ( self )
272
+ let result = take_while ( 1 .., WHITESPACE_BYTES ) . parse_next ( self ) ?;
273
+ self . update_location_metadata ( result. data ) ;
274
+ Ok ( * self )
275
+ }
276
+
277
+ /// Updates the location metadata based on the matched whitespace bytes in the consumed buffer
278
+ fn update_location_metadata ( & mut self , data : & ' top [ u8 ] ) {
279
+ if !data. is_empty ( ) {
280
+ // If the bytes contain '\r\n' in this order then this must be coming from windows line ending pattern and hence should be counted as 1.
281
+ let crlf_count = data. windows ( 2 ) . filter ( |window| window == b"\r \n " ) . count ( ) ;
282
+
283
+ // Subtract the crlf_count from total count of all newline characters to get the correct number of newline match count.
284
+ let newline_match_count = data. iter ( ) . filter ( |b| NEWLINE_BYTES . contains ( b) ) . count ( ) - crlf_count;
285
+
286
+ // Gets index for the last occurrence of the newline byte and subtracts from the result length to get non newline bytes length
287
+ let last_index_newline_byte = data. iter ( ) . rposition ( |b| NEWLINE_BYTES . contains ( b) ) . unwrap_or ( 0 ) ;
288
+ let non_newline_match_length = data. len ( ) - last_index_newline_byte - 1 ;
289
+ self . row += newline_match_count;
290
+
291
+ // Stores this newline offset as previous newline offset for calculating column position since this has already been matched/parsed
292
+ if self . offset < non_newline_match_length {
293
+ // this means that the input is not yet consumed hence get the input length + the current offset
294
+ self . prev_newline_offset = self . offset + data. len ( ) - non_newline_match_length;
295
+ } else {
296
+ self . prev_newline_offset = self . offset - non_newline_match_length;
297
+
298
+ }
299
+ }
249
300
}
250
301
251
302
/// Matches zero or more whitespace characters.
252
303
pub fn match_whitespace0 ( & mut self ) -> IonMatchResult < ' top > {
253
- take_while ( 0 .., WHITESPACE_BYTES ) . parse_next ( self )
304
+ let result = take_while ( 0 .., WHITESPACE_BYTES ) . parse_next ( self ) ?;
305
+ self . update_location_metadata ( result. data ) ;
306
+ Ok ( * self )
254
307
}
255
308
256
309
/// Matches any amount of contiguous comments and whitespace, including none.
257
- #[ inline]
258
310
pub fn match_optional_comments_and_whitespace ( & mut self ) -> IonMatchResult < ' top > {
259
311
pub fn full_match_optional_comments_and_whitespace < ' t > (
260
312
input : & mut TextBuffer < ' t > ,
@@ -446,6 +498,7 @@ impl<'top> TextBuffer<'top> {
446
498
)
447
499
. map ( |( maybe_annotations, value) | input. apply_annotations ( maybe_annotations, value) )
448
500
. parse_next ( self )
501
+ // let length = result.with_span().map(|(_output, consumed)| consumed.len());
449
502
}
450
503
451
504
/// Matches a struct field name. That is:
@@ -1588,6 +1641,8 @@ impl<'top> TextBuffer<'top> {
1588
1641
let delimiter_head = delimiter. as_bytes ( ) [ 0 ] ;
1589
1642
// Whether we've encountered any escapes while looking for the delimiter
1590
1643
let mut contained_escapes = false ;
1644
+ // This input may contain newline characters hence update the location metadata.
1645
+ self . update_location_metadata ( self . bytes ( ) ) ;
1591
1646
// The input left to search
1592
1647
let mut remaining = * self ;
1593
1648
loop {
@@ -1987,11 +2042,20 @@ impl<'data> Stream for TextBuffer<'data> {
1987
2042
}
1988
2043
1989
2044
fn checkpoint ( & self ) -> Self :: Checkpoint {
1990
- * self
2045
+ let mut checkpoint = * self ;
2046
+ // Reset row, column at checkpoint
2047
+ checkpoint. row = 0 ;
2048
+ checkpoint. prev_newline_offset = 0 ;
2049
+ checkpoint
1991
2050
}
1992
2051
1993
2052
fn reset ( & mut self , checkpoint : & Self :: Checkpoint ) {
2053
+ let current_row = self . row ;
2054
+ let prev_column_value = self . prev_newline_offset ;
2055
+
1994
2056
* self = * checkpoint;
2057
+ self . row = current_row + checkpoint. row ;
2058
+ self . prev_newline_offset = prev_column_value + checkpoint. prev_newline_offset ;
1995
2059
}
1996
2060
1997
2061
fn raw ( & self ) -> & dyn Debug {
@@ -2157,22 +2221,23 @@ mod tests {
2157
2221
self
2158
2222
}
2159
2223
2160
- fn try_match < ' data , P , O > ( & ' data self , parser : P ) -> IonParseResult < ' data , usize >
2224
+ fn try_match < ' data , P , O > ( & ' data self , parser : P ) -> IonParseResult < ' data , ( TextBuffer < ' data > , usize ) >
2161
2225
where
2162
2226
P : Parser < TextBuffer < ' data > , O , IonParseError < ' data > > ,
2163
2227
{
2164
2228
let mut buffer = TextBuffer :: new ( self . context . get_ref ( ) , self . input . as_bytes ( ) , true ) ;
2165
- match_length ( parser) . parse_next ( & mut buffer)
2229
+ let matched_length = match_length ( parser) . parse_next ( & mut buffer) ?;
2230
+ Ok ( ( buffer, matched_length) )
2166
2231
}
2167
2232
2168
2233
fn expect_match < ' data , P , O > ( & ' data self , parser : P )
2169
2234
where
2170
2235
P : Parser < TextBuffer < ' data > , O , IonParseError < ' data > > ,
2171
2236
{
2172
- let result = self . try_match ( parser) ;
2173
- let match_length = result. unwrap_or_else ( |e| {
2237
+ let result = self . try_match ( parser) . unwrap_or_else ( |e| {
2174
2238
panic ! ( "Unexpected parse fail for input <{}>\n {e}" , self . input)
2175
2239
} ) ;
2240
+ let match_length = result. 1 ;
2176
2241
// Inputs have a trailing newline and `0` that should _not_ be part of the match
2177
2242
assert_eq ! (
2178
2243
match_length,
@@ -2183,6 +2248,26 @@ mod tests {
2183
2248
) ;
2184
2249
}
2185
2250
2251
+ fn expect_match_location < ' data , P , O > ( & ' data self , parser : P , expected_location : ( usize , usize ) )
2252
+ where
2253
+ P : Parser < TextBuffer < ' data > , O , IonParseError < ' data > > ,
2254
+ {
2255
+ let result = self . try_match ( parser) . unwrap_or_else ( |e| {
2256
+ panic ! ( "Unexpected parse fail for input <{}>\n {e}" , self . input)
2257
+ } ) ;
2258
+ let match_length = result. 1 ;
2259
+ // Inputs have a trailing newline and `0` that should _not_ be part of the match
2260
+ assert_eq ! (
2261
+ match_length,
2262
+ self . input. len( ) ,
2263
+ "\n Input: '{}'\n Matched: '{}'\n " ,
2264
+ self . input,
2265
+ & self . input[ ..match_length]
2266
+ ) ;
2267
+ // Assert the location metadata
2268
+ assert_eq ! ( expected_location, ( result. 0 . row( ) , result. 0 . column( ) ) ) ;
2269
+ }
2270
+
2186
2271
fn expect_mismatch < ' data , P , O > ( & ' data self , parser : P )
2187
2272
where
2188
2273
P : Parser < TextBuffer < ' data > , O , IonParseError < ' data > > ,
@@ -2192,7 +2277,7 @@ mod tests {
2192
2277
// input will be rejected outright.
2193
2278
2194
2279
match result {
2195
- Ok ( match_length) => {
2280
+ Ok ( ( _ , match_length) ) => {
2196
2281
assert_ne ! (
2197
2282
match_length,
2198
2283
self . input. len( ) ,
@@ -2817,6 +2902,21 @@ mod tests {
2817
2902
assert ! ( contains_escapes) ;
2818
2903
}
2819
2904
2905
// Parameterized check of location tracking over whitespace: each case passes its input to
// `match_whitespace0` via `expect_match_location`, which asserts that the whole input matched
// and that the buffer's `(row, column)` afterwards equals `expected_location`.
+ #[ rstest]
2906
+ #[ case:: newlines( "\n \r " , ( 3 , 1 ) ) ]
2907
+ #[ case:: crlf( "\r \n \r \n " , ( 3 , 1 ) ) ]
2908
+ #[ case:: mixed( "\r \n \n \r \n " , ( 4 , 1 ) ) ]
2909
+ #[ case:: tabs( "\n \t \t \t " , ( 2 , 4 ) ) ]
2910
+ #[ case:: mix_tabs_and_newlines( "\n \t \n " , ( 3 , 1 ) ) ]
2911
+ fn expect_whitespace ( #[ case] input : & str , #[ case] expected_location : ( usize , usize ) ) {
2912
+ MatchTest :: new_1_0 ( input) . expect_match_location ( match_length ( TextBuffer :: match_whitespace0) , expected_location) ;
2913
+ }
2914
+
2915
// Checks location tracking across a long (triple-quoted) string whose body contains newline
// and tab escapes: after `match_string` the buffer is expected to report row 3, column 11.
+ #[ test]
2916
+ fn expect_newline_long_text ( ) {
2917
+ MatchTest :: new_1_0 ( "'''long \n \r \n \t hello'''" ) . expect_match_location ( match_length ( TextBuffer :: match_string) , ( 3 , 11 ) ) ;
2918
+ }
2919
+
2820
2920
#[ test]
2821
2921
fn expect_foo ( ) {
2822
2922
MatchTest :: new_1_0 ( "\" hello\" " ) . expect_match ( match_length ( TextBuffer :: match_string) ) ;
0 commit comments