Skip to content

Commit baf0a91

Browse files
committed
Adds location information to text buffer
1 parent a34f51f commit baf0a91

File tree

1 file changed

+109
-9
lines changed

1 file changed

+109
-9
lines changed

src/lazy/text/buffer.rs

Lines changed: 109 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ impl Debug for TextBuffer<'_> {
8888
/// '\x0C', Form feed
8989
pub(crate) const WHITESPACE_BYTES: &[u8] = b" \t\r\n\x09\x0B\x0C";
9090

91+
/// Whitespace bytes that do not end a line: space, horizontal tab (`\t`, 0x09),
/// vertical tab (0x0B) and form feed (0x0C).
///
/// This is `WHITESPACE_BYTES` without the line-ending bytes. The space byte was previously
/// missing and the tab byte was listed twice (`\t` and `\x09` are the same byte).
pub(crate) const NON_NEWLINE_BYTES: &[u8] = b" \t\x0B\x0C";

/// Bytes that end a line: carriage return (`\r`) and line feed (`\n`).
pub(crate) const NEWLINE_BYTES: &[u8] = b"\r\n";
94+
9195
/// A slice of unsigned bytes that can be cheaply copied and which defines methods for parsing
9296
/// the various encoding elements of a text Ion stream.
9397
///
@@ -106,6 +110,8 @@ pub struct TextBuffer<'top> {
106110
// offset: 6
107111
data: &'top [u8],
108112
offset: usize,
113+
row: usize,
114+
prev_newline_offset: usize,
109115
pub(crate) context: EncodingContextRef<'top>,
110116
is_final_data: bool,
111117
}
@@ -141,6 +147,8 @@ impl<'top> TextBuffer<'top> {
141147
context,
142148
data,
143149
offset,
150+
row: 1,
151+
prev_newline_offset: 0,
144152
is_final_data,
145153
}
146154
}
@@ -206,6 +214,22 @@ impl<'top> TextBuffer<'top> {
206214
self.offset
207215
}
208216

217+
/// Returns the row position for this buffer.
218+
/// _Note: Row positions are calculated based on newline characters `\n` and `\r`._
219+
pub fn row(&self) -> usize {
220+
self.row
221+
}
222+
223+
/// Returns the column position for this buffer.
224+
/// _Note: Column positions are calculated based on current offset and previous newline byte offset._
225+
pub fn column(&self) -> usize {
226+
self.offset - self.prev_newline_offset + 1
227+
}
228+
229+
pub fn prev_newline_offset(&self) -> usize {
230+
self.prev_newline_offset
231+
}
232+
209233
/// Returns the number of bytes in the buffer.
210234
pub fn len(&self) -> usize {
211235
self.data.len()
@@ -245,16 +269,44 @@ impl<'top> TextBuffer<'top> {
245269

246270
/// Matches one or more whitespace characters.
247271
pub fn match_whitespace1(&mut self) -> IonMatchResult<'top> {
248-
take_while(1.., WHITESPACE_BYTES).parse_next(self)
272+
let result = take_while(1.., WHITESPACE_BYTES).parse_next(self)?;
273+
self.update_location_metadata(result.data);
274+
Ok(*self)
275+
}
276+
277+
/// Updates the location metadata based on the matched whitespace bytes in the consumed buffer
278+
fn update_location_metadata(&mut self, data: &'top [u8]) {
279+
if !data.is_empty() {
280+
// If the bytes contain '\r\n' in this order then this must be coming from windows line ending pattern and hence should be counted as 1.
281+
let crlf_count = data.windows(2).filter(|window| window == b"\r\n").count();
282+
283+
// Subtract the crlf_count from total count of all newline characters to get the correct number of newline match count.
284+
let newline_match_count = data.iter().filter(|b| NEWLINE_BYTES.contains(b)).count() - crlf_count;
285+
286+
// Gets index for the last occurrence of the newline byte and subtracts from the result length to get non newline bytes length
287+
let last_index_newline_byte = data.iter().rposition(|b| NEWLINE_BYTES.contains(b)).unwrap_or(0);
288+
let non_newline_match_length = data.len() - last_index_newline_byte - 1;
289+
self.row += newline_match_count;
290+
291+
// Stores this newline offset as previous newline offset for calculating column position since this has already been matched/parsed
292+
if self.offset < non_newline_match_length {
293+
// this means that the input is not yet consumed hence get the input length + the current offset
294+
self.prev_newline_offset = self.offset + data.len() - non_newline_match_length;
295+
} else {
296+
self.prev_newline_offset = self.offset - non_newline_match_length;
297+
298+
}
299+
}
249300
}
250301

251302
/// Matches zero or more whitespace characters.
252303
pub fn match_whitespace0(&mut self) -> IonMatchResult<'top> {
253-
take_while(0.., WHITESPACE_BYTES).parse_next(self)
304+
let result = take_while(0.., WHITESPACE_BYTES).parse_next(self)?;
305+
self.update_location_metadata(result.data);
306+
Ok(*self)
254307
}
255308

256309
/// Matches any amount of contiguous comments and whitespace, including none.
257-
#[inline]
258310
pub fn match_optional_comments_and_whitespace(&mut self) -> IonMatchResult<'top> {
259311
pub fn full_match_optional_comments_and_whitespace<'t>(
260312
input: &mut TextBuffer<'t>,
@@ -446,6 +498,7 @@ impl<'top> TextBuffer<'top> {
446498
)
447499
.map(|(maybe_annotations, value)| input.apply_annotations(maybe_annotations, value))
448500
.parse_next(self)
501+
// let length = result.with_span().map(|(_output, consumed)| consumed.len());
449502
}
450503

451504
/// Matches a struct field name. That is:
@@ -1588,6 +1641,8 @@ impl<'top> TextBuffer<'top> {
15881641
let delimiter_head = delimiter.as_bytes()[0];
15891642
// Whether we've encountered any escapes while looking for the delimiter
15901643
let mut contained_escapes = false;
1644+
// This input may contain newline characters hence update the location metadata.
1645+
self.update_location_metadata(self.bytes());
15911646
// The input left to search
15921647
let mut remaining = *self;
15931648
loop {
@@ -1987,11 +2042,20 @@ impl<'data> Stream for TextBuffer<'data> {
19872042
}
19882043

19892044
fn checkpoint(&self) -> Self::Checkpoint {
1990-
*self
2045+
let mut checkpoint = *self;
2046+
// Reset row, column at checkpoint
2047+
checkpoint.row = 0;
2048+
checkpoint.prev_newline_offset = 0;
2049+
checkpoint
19912050
}
19922051

19932052
fn reset(&mut self, checkpoint: &Self::Checkpoint) {
2053+
let current_row = self.row;
2054+
let prev_column_value = self.prev_newline_offset;
2055+
19942056
*self = *checkpoint;
2057+
self.row = current_row + checkpoint.row;
2058+
self.prev_newline_offset = prev_column_value + checkpoint.prev_newline_offset;
19952059
}
19962060

19972061
fn raw(&self) -> &dyn Debug {
@@ -2157,22 +2221,23 @@ mod tests {
21572221
self
21582222
}
21592223

2160-
fn try_match<'data, P, O>(&'data self, parser: P) -> IonParseResult<'data, usize>
2224+
fn try_match<'data, P, O>(&'data self, parser: P) -> IonParseResult<'data, (TextBuffer<'data>, usize)>
21612225
where
21622226
P: Parser<TextBuffer<'data>, O, IonParseError<'data>>,
21632227
{
21642228
let mut buffer = TextBuffer::new(self.context.get_ref(), self.input.as_bytes(), true);
2165-
match_length(parser).parse_next(&mut buffer)
2229+
let matched_length = match_length(parser).parse_next(&mut buffer)?;
2230+
Ok((buffer, matched_length))
21662231
}
21672232

21682233
fn expect_match<'data, P, O>(&'data self, parser: P)
21692234
where
21702235
P: Parser<TextBuffer<'data>, O, IonParseError<'data>>,
21712236
{
2172-
let result = self.try_match(parser);
2173-
let match_length = result.unwrap_or_else(|e| {
2237+
let result = self.try_match(parser).unwrap_or_else(|e| {
21742238
panic!("Unexpected parse fail for input <{}>\n{e}", self.input)
21752239
});
2240+
let match_length = result.1;
21762241
// Inputs have a trailing newline and `0` that should _not_ be part of the match
21772242
assert_eq!(
21782243
match_length,
@@ -2183,6 +2248,26 @@ mod tests {
21832248
);
21842249
}
21852250

2251+
fn expect_match_location<'data, P, O>(&'data self, parser: P, expected_location: (usize, usize))
2252+
where
2253+
P: Parser<TextBuffer<'data>, O, IonParseError<'data>>,
2254+
{
2255+
let result = self.try_match(parser).unwrap_or_else(|e| {
2256+
panic!("Unexpected parse fail for input <{}>\n{e}", self.input)
2257+
});
2258+
let match_length = result.1;
2259+
// Inputs have a trailing newline and `0` that should _not_ be part of the match
2260+
assert_eq!(
2261+
match_length,
2262+
self.input.len(),
2263+
"\nInput: '{}'\nMatched: '{}'\n",
2264+
self.input,
2265+
&self.input[..match_length]
2266+
);
2267+
// Assert the location metadata
2268+
assert_eq!(expected_location, (result.0.row(), result.0.column()));
2269+
}
2270+
21862271
fn expect_mismatch<'data, P, O>(&'data self, parser: P)
21872272
where
21882273
P: Parser<TextBuffer<'data>, O, IonParseError<'data>>,
@@ -2192,7 +2277,7 @@ mod tests {
21922277
// input will be rejected outright.
21932278

21942279
match result {
2195-
Ok(match_length) => {
2280+
Ok((_, match_length)) => {
21962281
assert_ne!(
21972282
match_length,
21982283
self.input.len(),
@@ -2817,6 +2902,21 @@ mod tests {
28172902
assert!(contains_escapes);
28182903
}
28192904

2905+
#[rstest]
2906+
#[case::newlines("\n\r", (3,1))]
2907+
#[case::crlf("\r\n\r\n", (3,1))]
2908+
#[case::mixed("\r\n\n\r\n", (4,1))]
2909+
#[case::tabs("\n\t\t\t", (2,4))]
2910+
#[case::mix_tabs_and_newlines("\n\t\n", (3,1))]
2911+
fn expect_whitespace(#[case] input: &str, #[case] expected_location: (usize, usize)) {
2912+
MatchTest::new_1_0(input).expect_match_location(match_length(TextBuffer::match_whitespace0), expected_location);
2913+
}
2914+
2915+
/// Checks the `(row, column)` reported after matching a long string that contains both a `\n`
/// and a `\r\n` line ending.
#[test]
fn expect_newline_long_text() {
    let test = MatchTest::new_1_0("'''long \n\r\n\t hello'''");
    test.expect_match_location(match_length(TextBuffer::match_string), (3, 11));
}
2919+
28202920
#[test]
28212921
fn expect_foo() {
28222922
MatchTest::new_1_0("\"hello\"").expect_match(match_length(TextBuffer::match_string));

0 commit comments

Comments
 (0)