From beac4083447e836d139bd4d092596fbc636a40a2 Mon Sep 17 00:00:00 2001 From: Ed Page Date: Thu, 27 Jul 2023 15:15:56 -0500 Subject: [PATCH] Pull newline counting out of inner loop --- gix-config/src/parse/nom/mod.rs | 77 +++-- gix-config/src/parse/nom/tests.rs | 463 ++++++++++++++---------------- 2 files changed, 246 insertions(+), 294 deletions(-) diff --git a/gix-config/src/parse/nom/mod.rs b/gix-config/src/parse/nom/mod.rs index 34b4552fdd4..3d2ee699186 100644 --- a/gix-config/src/parse/nom/mod.rs +++ b/gix-config/src/parse/nom/mod.rs @@ -18,18 +18,18 @@ use crate::parse::{error::ParseNode, section, Comment, Error, Event}; /// Attempt to zero-copy parse the provided bytes, passing results to `dispatch`. pub fn from_bytes<'i>(mut input: &'i [u8], mut dispatch: impl FnMut(Event<'i>)) -> Result<(), Error> { + let start = input.checkpoint(); + let bom = unicode_bom::Bom::from(input); input.next_slice(bom.len()); - let mut newlines = 0; let _ = fold_repeat( 0.., alt(( comment.map(Event::Comment), take_spaces1.map(|whitespace| Event::Whitespace(Cow::Borrowed(whitespace))), |i: &mut &'i [u8]| { - let (newline, counter) = take_newlines1.parse_next(i)?; - newlines += counter; + let newline = take_newlines1.parse_next(i)?; let o = Event::Newline(Cow::Borrowed(newline)); Ok(o) }, @@ -51,24 +51,22 @@ pub fn from_bytes<'i>(mut input: &'i [u8], mut dispatch: impl FnMut(Event<'i>)) let mut node = ParseNode::SectionHeader; - let res = fold_repeat( - 1.., - |i: &mut &'i [u8]| section(i, &mut node, &mut dispatch), - || (), - |_acc, additional_newlines| { - newlines += additional_newlines; - }, - ) - .parse_next(&mut input); - res.map_err(|_| Error { - line_number: newlines, - last_attempted_parser: node, - parsed_until: input.as_bstr().into(), + let res = repeat(1.., |i: &mut &'i [u8]| section(i, &mut node, &mut dispatch)) + .map(|()| ()) + .parse_next(&mut input); + res.map_err(|_| { + let newlines = newlines_from(input, start); + Error { + line_number: newlines, + last_attempted_parser: node, + parsed_until: input.as_bstr().into(), + } })?; // This needs to happen after we collect sections, otherwise the line number // will be off. if !input.is_empty() { + let newlines = newlines_from(input, start); return Err(Error { line_number: newlines, last_attempted_parser: node, @@ -79,6 +77,13 @@ pub fn from_bytes<'i>(mut input: &'i [u8], mut dispatch: impl FnMut(Event<'i>)) Ok(()) } +fn newlines_from(input: &[u8], start: winnow::stream::Checkpoint<&[u8]>) -> usize { + let offset = input.offset_from(&start); + let mut start_input = input; + start_input.reset(start); + start_input.next_slice(offset).iter().filter(|c| **c == b'\n').count() +} + fn comment<'i>(i: &mut &'i [u8]) -> PResult, NomError<&'i [u8]>> { ( one_of([';', '#']).map(|tag| tag as u8), @@ -95,7 +100,7 @@ fn section<'i>( i: &mut &'i [u8], node: &mut ParseNode, dispatch: &mut impl FnMut(Event<'i>), -) -> PResult> { +) -> PResult<(), NomError<&'i [u8]>> { let start = i.checkpoint(); let header = section_header(i).map_err(|e| { i.reset(start); @@ -103,8 +108,6 @@ fn section<'i>( })?; dispatch(Event::SectionHeader(header)); - let mut newlines = 0; - // This would usually be a many0(alt(...)), the manual loop allows us to // optimize vec insertions loop { @@ -114,14 +117,11 @@ fn section<'i>( dispatch(Event::Whitespace(Cow::Borrowed(v.as_bstr()))); } - if let Some((v, new_newlines)) = opt(take_newlines1).parse_next(i)? { - newlines += new_newlines; + if let Some(v) = opt(take_newlines1).parse_next(i)? { dispatch(Event::Newline(Cow::Borrowed(v.as_bstr()))); } - if let Ok(new_newlines) = key_value_pair(i, node, dispatch) { - newlines += new_newlines; - } + let _ = key_value_pair(i, node, dispatch); if let Some(comment) = opt(comment).parse_next(i)? { dispatch(Event::Comment(comment)); @@ -132,7 +132,7 @@ fn section<'i>( } } - Ok(newlines) + Ok(()) } fn section_header<'i>(i: &mut &'i [u8]) -> PResult, NomError<&'i [u8]>> { @@ -211,7 +211,7 @@ fn key_value_pair<'i>( i: &mut &'i [u8], node: &mut ParseNode, dispatch: &mut impl FnMut(Event<'i>), -) -> PResult> { +) -> PResult<(), NomError<&'i [u8]>> { *node = ParseNode::Name; let name = config_name.parse_next(i)?; @@ -222,8 +222,7 @@ fn key_value_pair<'i>( } *node = ParseNode::Value; - let newlines = config_value(i, dispatch)?; - Ok(newlines) + config_value(i, dispatch) } /// Parses the config name of a config pair. Assumes the input has already been @@ -238,30 +237,28 @@ fn config_name<'i>(i: &mut &'i [u8]) -> PResult<&'i BStr, NomError<&'i [u8]>> { .parse_next(i) } -fn config_value<'i>(i: &mut &'i [u8], dispatch: &mut impl FnMut(Event<'i>)) -> PResult> { +fn config_value<'i>(i: &mut &'i [u8], dispatch: &mut impl FnMut(Event<'i>)) -> PResult<(), NomError<&'i [u8]>> { if opt('=').parse_next(i)?.is_some() { dispatch(Event::KeyValueSeparator); if let Some(whitespace) = opt(take_spaces1).parse_next(i)? { dispatch(Event::Whitespace(Cow::Borrowed(whitespace))); } - let newlines = value_impl(i, dispatch)?; - Ok(newlines) + value_impl(i, dispatch) } else { // This is a special way of denoting 'empty' values which a lot of code depends on. // Hence, rather to fix this everywhere else, leave it here and fix it where it matters, namely // when it's about differentiating between a missing key-value separator, and one followed by emptiness. dispatch(Event::Value(Cow::Borrowed("".into()))); - Ok(0) + Ok(()) } } /// Handles parsing of known-to-be values. This function handles both single /// line values as well as values that are continuations. -fn value_impl<'i>(i: &mut &'i [u8], dispatch: &mut impl FnMut(Event<'i>)) -> PResult> { +fn value_impl<'i>(i: &mut &'i [u8], dispatch: &mut impl FnMut(Event<'i>)) -> PResult<(), NomError<&'i [u8]>> { let start_checkpoint = i.checkpoint(); let mut value_start_checkpoint = i.checkpoint(); let mut value_end = None; - let mut newlines = 0; // This is required to ignore comment markers if they're in a quote. let mut is_in_quotes = false; @@ -301,7 +298,6 @@ fn value_impl<'i>(i: &mut &'i [u8], dispatch: &mut impl FnMut(Event<'i>)) -> PRe match c { b'\n' => { partial_value_found = true; - newlines += 1; i.reset(value_start_checkpoint); @@ -337,7 +333,7 @@ fn value_impl<'i>(i: &mut &'i [u8], dispatch: &mut impl FnMut(Event<'i>)) -> PRe let last_value_index = i.offset_from(&value_start_checkpoint); if last_value_index == 0 { dispatch(Event::Value(Cow::Borrowed("".into()))); - return Ok(newlines); + return Ok(()); } else { last_value_index } @@ -360,7 +356,7 @@ fn value_impl<'i>(i: &mut &'i [u8], dispatch: &mut impl FnMut(Event<'i>)) -> PRe dispatch(Event::Value(Cow::Borrowed(remainder_value.as_bstr()))); } - Ok(newlines) + Ok(()) } fn take_spaces1<'i>(i: &mut &'i [u8]) -> PResult<&'i BStr, NomError<&'i [u8]>> { @@ -369,9 +365,10 @@ fn take_spaces1<'i>(i: &mut &'i [u8]) -> PResult<&'i BStr, NomError<&'i [u8]>> { .parse_next(i) } -fn take_newlines1<'i>(i: &mut &'i [u8]) -> PResult<(&'i BStr, usize), NomError<&'i [u8]>> { +fn take_newlines1<'i>(i: &mut &'i [u8]) -> PResult<&'i BStr, NomError<&'i [u8]>> { repeat(1.., alt(("\r\n", "\n"))) - .with_recognized() - .map(|(count, newlines): (usize, &[u8])| (newlines.as_bstr(), count)) + .map(|()| ()) + .recognize() + .map(|newlines: &[u8]| newlines.as_bstr()) .parse_next(i) } diff --git a/gix-config/src/parse/nom/tests.rs b/gix-config/src/parse/nom/tests.rs index b050f318733..730a1cb46c9 100644 --- a/gix-config/src/parse/nom/tests.rs +++ b/gix-config/src/parse/nom/tests.rs @@ -176,7 +176,7 @@ mod section { Event, Section, }; - fn section<'a>(mut i: &'a [u8], node: &mut ParseNode) -> winnow::IResult<&'a [u8], (Section<'a>, usize)> { + fn section<'a>(mut i: &'a [u8], node: &mut ParseNode) -> winnow::IResult<&'a [u8], Section<'a>> { let mut header = None; let mut events = section::Events::default(); super::section(&mut i, node, &mut |e| match &header { @@ -185,19 +185,16 @@ mod section { } Some(_) => events.push(e), }) - .map(|o| { + .map(|_| { ( i, - ( - Section { - header: match header.expect("header set") { - Event::SectionHeader(header) => header, - _ => unreachable!("unexpected"), - }, - events, + Section { + header: match header.expect("header set") { + Event::SectionHeader(header) => header, + _ => unreachable!("unexpected"), }, - o, - ), + events, + }, ) }) } @@ -207,22 +204,19 @@ mod section { let mut node = ParseNode::SectionHeader; assert_eq!( section(b"[a] k = \r\n", &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("a", None), - events: vec![ - whitespace_event(" "), - name_event("k"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_event(""), - newline_custom_event("\r\n") - ] - .into(), - }, - 1 - )), + fully_consumed(Section { + header: parsed_section_header("a", None), + events: vec![ + whitespace_event(" "), + name_event("k"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_event(""), + newline_custom_event("\r\n") + ] + .into(), + }), ); } @@ -231,41 +225,35 @@ mod section { let mut node = ParseNode::SectionHeader; assert_eq!( section(b"[a] k = v\r\n", &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("a", None), - events: vec![ - whitespace_event(" "), - name_event("k"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_event("v"), - newline_custom_event("\r\n") - ] - .into(), - }, - 1 - )), + fully_consumed(Section { + header: parsed_section_header("a", None), + events: vec![ + whitespace_event(" "), + name_event("k"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_event("v"), + newline_custom_event("\r\n") + ] + .into(), + }), ); assert_eq!( section(b"[a] k = \r\n", &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("a", None), - events: vec![ - whitespace_event(" "), - name_event("k"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_event(""), - newline_custom_event("\r\n") - ] - .into(), - }, - 1 - )), + fully_consumed(Section { + header: parsed_section_header("a", None), + events: vec![ + whitespace_event(" "), + name_event("k"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_event(""), + newline_custom_event("\r\n") + ] + .into(), + }), ); } @@ -274,13 +262,10 @@ mod section { let mut node = ParseNode::SectionHeader; assert_eq!( section(b"[test]", &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("test", None), - events: Default::default() - }, - 0 - )), + fully_consumed(Section { + header: parsed_section_header("test", None), + events: Default::default() + }), ); } @@ -293,33 +278,30 @@ mod section { d = "lol""#; assert_eq!( section(section_data, &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("hello", None), - events: vec![ - newline_event(), - whitespace_event(" "), - name_event("a"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_event("b"), - newline_event(), - whitespace_event(" "), - name_event("c"), - value_event(""), - newline_event(), - whitespace_event(" "), - name_event("d"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_event("\"lol\"") - ] - .into() - }, - 3 - )) + fully_consumed(Section { + header: parsed_section_header("hello", None), + events: vec![ + newline_event(), + whitespace_event(" "), + name_event("a"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_event("b"), + newline_event(), + whitespace_event(" "), + name_event("c"), + value_event(""), + newline_event(), + whitespace_event(" "), + name_event("d"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_event("\"lol\"") + ] + .into() + }) ); } @@ -329,38 +311,32 @@ mod section { let section_data = b"[a] k="; assert_eq!( section(section_data, &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("a", None), - events: vec![ - whitespace_event(" "), - name_event("k"), - Event::KeyValueSeparator, - value_event(""), - ] - .into() - }, - 0 - )) + fully_consumed(Section { + header: parsed_section_header("a", None), + events: vec![ + whitespace_event(" "), + name_event("k"), + Event::KeyValueSeparator, + value_event(""), + ] + .into() + }) ); let section_data = b"[a] k=\n"; assert_eq!( section(section_data, &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("a", None), - events: vec![ - whitespace_event(" "), - name_event("k"), - Event::KeyValueSeparator, - value_event(""), - newline_event(), - ] - .into() - }, - 1 - )) + fully_consumed(Section { + header: parsed_section_header("a", None), + events: vec![ + whitespace_event(" "), + name_event("k"), + Event::KeyValueSeparator, + value_event(""), + newline_event(), + ] + .into() + }) ); } @@ -373,34 +349,31 @@ mod section { d = "lol""#; assert_eq!( section(section_data, &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("hello", None), - events: vec![ - newline_event(), - whitespace_event(" "), - name_event("a"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_event("b"), - newline_event(), - whitespace_event(" "), - name_event("c"), - Event::KeyValueSeparator, - value_event(""), - newline_event(), - whitespace_event(" "), - name_event("d"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_event("\"lol\"") - ] - .into() - }, - 3 - )) + fully_consumed(Section { + header: parsed_section_header("hello", None), + events: vec![ + newline_event(), + whitespace_event(" "), + name_event("a"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_event("b"), + newline_event(), + whitespace_event(" "), + name_event("c"), + Event::KeyValueSeparator, + value_event(""), + newline_event(), + whitespace_event(" "), + name_event("d"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_event("\"lol\"") + ] + .into() + }) ); } @@ -409,32 +382,26 @@ mod section { let mut node = ParseNode::SectionHeader; assert_eq!( section(b"[hello] c", &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("hello", None), - events: vec![whitespace_event(" "), name_event("c"), value_event("")].into() - }, - 0 - )) + fully_consumed(Section { + header: parsed_section_header("hello", None), + events: vec![whitespace_event(" "), name_event("c"), value_event("")].into() + }) ); assert_eq!( section(b"[hello] c\nd", &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("hello", None), - events: vec![ - whitespace_event(" "), - name_event("c"), - value_event(""), - newline_event(), - name_event("d"), - value_event("") - ] - .into() - }, - 1 - )) + fully_consumed(Section { + header: parsed_section_header("hello", None), + events: vec![ + whitespace_event(" "), + name_event("c"), + value_event(""), + newline_event(), + name_event("d"), + value_event("") + ] + .into() + }) ); } @@ -448,39 +415,36 @@ mod section { c = d"#; assert_eq!( section(section_data, &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("hello", None), - events: vec![ - whitespace_event(" "), - comment_event(';', " commentA"), - newline_event(), - whitespace_event(" "), - name_event("a"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_event("b"), - whitespace_event(" "), - comment_event('#', " commentB"), - newline_event(), - whitespace_event(" "), - comment_event(';', " commentC"), - newline_event(), - whitespace_event(" "), - comment_event(';', " commentD"), - newline_event(), - whitespace_event(" "), - name_event("c"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_event("d"), - ] - .into() - }, - 4 - )) + fully_consumed(Section { + header: parsed_section_header("hello", None), + events: vec![ + whitespace_event(" "), + comment_event(';', " commentA"), + newline_event(), + whitespace_event(" "), + name_event("a"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_event("b"), + whitespace_event(" "), + comment_event('#', " commentB"), + newline_event(), + whitespace_event(" "), + comment_event(';', " commentC"), + newline_event(), + whitespace_event(" "), + comment_event(';', " commentD"), + newline_event(), + whitespace_event(" "), + name_event("c"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_event("d"), + ] + .into() + }) ); } @@ -490,27 +454,24 @@ mod section { // This test is absolute hell. Good luck if this fails. assert_eq!( section(b"[section] a = 1 \"\\\"\\\na ; e \"\\\"\\\nd # \"b\t ; c", &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("section", None), - events: vec![ - whitespace_event(" "), - name_event("a"), - whitespace_event(" "), - Event::KeyValueSeparator, - whitespace_event(" "), - value_not_done_event(r#"1 "\""#), - newline_event(), - value_not_done_event(r#"a ; e "\""#), - newline_event(), - value_done_event("d"), - whitespace_event(" "), - comment_event('#', " \"b\t ; c"), - ] - .into() - }, - 2 - )) + fully_consumed(Section { + header: parsed_section_header("section", None), + events: vec![ + whitespace_event(" "), + name_event("a"), + whitespace_event(" "), + Event::KeyValueSeparator, + whitespace_event(" "), + value_not_done_event(r#"1 "\""#), + newline_event(), + value_not_done_event(r#"a ; e "\""#), + newline_event(), + value_done_event("d"), + whitespace_event(" "), + comment_event('#', " \"b\t ; c"), + ] + .into() + }) ); } @@ -519,23 +480,20 @@ mod section { let mut node = ParseNode::SectionHeader; assert_eq!( section(b"[section \"a\"] b =\"\\\n;\";a", &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("section", (" ", "a")), - events: vec![ - whitespace_event(" "), - name_event("b"), - whitespace_event(" "), - Event::KeyValueSeparator, - value_not_done_event("\""), - newline_event(), - value_done_event(";\""), - comment_event(';', "a"), - ] - .into() - }, - 1 - )) + fully_consumed(Section { + header: parsed_section_header("section", (" ", "a")), + events: vec![ + whitespace_event(" "), + name_event("b"), + whitespace_event(" "), + Event::KeyValueSeparator, + value_not_done_event("\""), + newline_event(), + value_done_event(";\""), + comment_event(';', "a"), + ] + .into() + }) ); } @@ -544,19 +502,16 @@ mod section { let mut node = ParseNode::SectionHeader; assert_eq!( section(b"[s]hello #world", &mut node).unwrap(), - fully_consumed(( - Section { - header: parsed_section_header("s", None), - events: vec![ - name_event("hello"), - whitespace_event(" "), - value_event(""), - comment_event('#', "world"), - ] - .into() - }, - 0 - )) + fully_consumed(Section { + header: parsed_section_header("s", None), + events: vec![ + name_event("hello"), + whitespace_event(" "), + value_event(""), + comment_event('#', "world"), + ] + .into() + }) ); } }