From 64b147fc3610ce1dd11cffe319b16effa7644891 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20M=C3=BCller?= <deso@posteo.net>
Date: Fri, 26 Jul 2024 14:43:34 -0700
Subject: [PATCH] Improve Breakpad file parsing performance
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our breakpad parser supports various CFI directives and other
constructs, despite never actually using them. That is partly necessary:
we need to recognize them in order to ignore them. However, we don't
have to go all the way and parse them correctly. Rather, we can just
skip over them more quickly. Adjust the code to do just that. As a
result, we are a bit more lenient in what we accept (except for spurious
empty lines that we now no longer accept...), but nobody should be using
our parser to judge validity of a file, so being more accepting is fine.
As a result, we speed up parsing of breakpad files that contain quite
significantly:

  > main/symbolize_breakpad
  >   time:   [245.08 ms 245.62 ms 246.20 ms]
  >   change: [−12.182% −11.608% −11.143%] (p = 0.00 < 0.02)
  >   Performance has improved.

Signed-off-by: Daniel Müller <deso@posteo.net>
---
 CHANGELOG.md           |  1 +
 src/breakpad/parser.rs | 76 ++++--------------------------------------
 2 files changed, 8 insertions(+), 69 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7f27a325..4f9d0c61 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@ Unreleased
   read failure
 - Added support for file based symbolization support on the Windows operating
   system
+- Improved performance for parsing Breakpad files
 
 
 0.2.0-rc.0
diff --git a/src/breakpad/parser.rs b/src/breakpad/parser.rs
index 2707484b..4ee70f8d 100644
--- a/src/breakpad/parser.rs
+++ b/src/breakpad/parser.rs
@@ -37,11 +37,8 @@ use std::str;
 use nom::branch::alt;
 use nom::bytes::complete::tag;
 use nom::bytes::complete::take_while;
-use nom::character::complete::hex_digit1;
 use nom::character::complete::multispace0;
 use nom::character::complete::space1;
-use nom::character::is_digit;
-use nom::character::is_hex_digit;
 use nom::combinator::cut;
 use nom::combinator::map;
 use nom::combinator::map_res;
@@ -288,11 +285,6 @@ fn decimal_u32(input: &[u8]) -> IResult<&[u8], u32, VerboseError<&[u8]>> {
     Ok((remaining, res))
 }
 
-/// Take 0 or more non-space bytes.
-fn non_space(input: &[u8]) -> IResult<&[u8], &[u8], VerboseError<&[u8]>> {
-    take_while(|c: u8| c != b' ')(input)
-}
-
 /// Accept `\n` with an arbitrary number of preceding `\r` bytes.
 ///
 /// This is different from `line_ending` which doesn't accept `\r` if it isn't
@@ -309,37 +301,18 @@ fn not_my_eol(input: &[u8]) -> IResult<&[u8], &[u8], VerboseError<&[u8]>> {
     take_while(|b| b != b'\r' && b != b'\n')(input)
 }
 
-/// Parse a single byte if it matches the predicate.
-///
-/// nom has `satisfy`, which is similar. It differs in the argument type of the
-/// predicate: `satisfy`'s predicate takes `char`, whereas `single`'s predicate
-/// takes `u8`.
-fn single(predicate: fn(u8) -> bool) -> impl Fn(&[u8]) -> IResult<&[u8], u8, VerboseError<&[u8]>> {
-    move |i: &[u8]| match i.split_first() {
-        Some((b, rest)) if predicate(*b) => Ok((rest, *b)),
-        _ => Err(Err::Error(VerboseError::from_error_kind(
-            i,
-            ErrorKind::Satisfy,
-        ))),
-    }
-}
-
 /// Matches a MODULE record.
 fn module_line(input: &[u8]) -> IResult<&[u8], (), VerboseError<&[u8]>> {
     let (input, _) = terminated(tag("MODULE"), space1)(input)?;
-    let (input, _) = cut(tuple((
-        terminated(non_space, space1),  // os
-        terminated(non_space, space1),  // cpu
-        terminated(hex_digit1, space1), // debug id
-        terminated(not_my_eol, my_eol), // filename
-    )))(input)?;
+    let (input, _) = cut(terminated(not_my_eol, my_eol))(input)?;
+
     Ok((input, ()))
 }
 
 /// Matches an INFO URL record.
 fn info_url(input: &[u8]) -> IResult<&[u8], (), VerboseError<&[u8]>> {
     let (input, _) = terminated(tag("INFO URL"), space1)(input)?;
-    let (input, _url) = cut(terminated(map_res(not_my_eol, str::from_utf8), my_eol))(input)?;
+    let (input, _) = cut(terminated(not_my_eol, my_eol))(input)?;
     Ok((input, ()))
 }
 
@@ -468,34 +441,7 @@ fn inline_line(input: &[u8]) -> IResult<&[u8], impl Iterator<Item = Inlinee>, Ve
 /// Matches a STACK WIN record.
 fn stack_win_line(input: &[u8]) -> IResult<&[u8], (), VerboseError<&[u8]>> {
     let (input, _) = terminated(tag("STACK WIN"), space1)(input)?;
-    let (
-        input,
-        (
-            _ty,
-            _address,
-            _code_size,
-            _prologue_size,
-            _epilogue_size,
-            _parameter_size,
-            _saved_register_size,
-            _local_size,
-            _max_stack_size,
-            _has_program_string,
-            _rest,
-        ),
-    ) = cut(tuple((
-        terminated(single(is_hex_digit), space1), // ty
-        terminated(hex_str::<u64>, space1),       // addr
-        terminated(hex_str::<u32>, space1),       // code_size
-        terminated(hex_str::<u32>, space1),       // prologue_size
-        terminated(hex_str::<u32>, space1),       // epilogue_size
-        terminated(hex_str::<u32>, space1),       // parameter_size
-        terminated(hex_str::<u32>, space1),       // saved_register_size
-        terminated(hex_str::<u32>, space1),       // local_size
-        terminated(hex_str::<u32>, space1),       // max_stack_size
-        terminated(map(single(is_digit), |b| b == b'1'), space1), // has_program_string
-        terminated(map_res(not_my_eol, str::from_utf8), my_eol),
-    )))(input)?;
+    let (input, _) = cut(terminated(not_my_eol, my_eol))(input)?;
 
     Ok((input, ()))
 }
@@ -503,10 +449,7 @@ fn stack_win_line(input: &[u8]) -> IResult<&[u8], (), VerboseError<&[u8]>> {
 /// Matches a STACK CFI record.
 fn stack_cfi(input: &[u8]) -> IResult<&[u8], (), VerboseError<&[u8]>> {
     let (input, _) = terminated(tag("STACK CFI"), space1)(input)?;
-    let (input, (_address, _rules)) = cut(tuple((
-        terminated(hex_str::<u64>, space1),
-        terminated(map_res(not_my_eol, str::from_utf8), my_eol),
-    )))(input)?;
+    let (input, _) = cut(terminated(not_my_eol, my_eol))(input)?;
 
     Ok((input, ()))
 }
@@ -514,11 +457,7 @@ fn stack_cfi(input: &[u8]) -> IResult<&[u8], (), VerboseError<&[u8]>> {
 /// Matches a STACK CFI INIT record.
 fn stack_cfi_init(input: &[u8]) -> IResult<&[u8], (), VerboseError<&[u8]>> {
     let (input, _) = terminated(tag("STACK CFI INIT"), space1)(input)?;
-    let (input, (_address, _size, _rules)) = cut(tuple((
-        terminated(hex_str::<u64>, space1),
-        terminated(hex_str::<u32>, space1),
-        terminated(map_res(not_my_eol, str::from_utf8), my_eol),
-    )))(input)?;
+    let (input, _) = cut(terminated(not_my_eol, my_eol))(input)?;
 
     Ok((input, ()))
 }
@@ -1166,7 +1105,6 @@ STACK CFI INIT badf00d abc init rules
 STACK CFI deadf00d some rules
 STACK CFI deadbeef more rules
 STACK CFI INIT f00f f0 more init rules
-
 "[..];
         let sym = SymbolFile::from_bytes(bytes).unwrap();
         assert_eq!(sym.files.len(), 2);
@@ -1253,7 +1191,7 @@ FUNC 1001 10 10 some func overlap contained
 FILE 0 foo.c
 "[..]
         )
-        .is_err(),);
+        .is_ok(),);
 
         assert!(SymbolFile::from_bytes(
             &b"MODULE Linux x86 abcd1234 foo