From 682e87146571800f2e82e2f9f2b38e07fa29d527 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 6 Nov 2025 05:47:33 -0500 Subject: [PATCH 1/5] Add reader parser unit tests. --- docs/doxygen/mainpage.dox | 4 +- src/log_surgeon/Constants.hpp | 1 + tests/CMakeLists.txt | 1 + tests/test-reader-parser.cpp | 365 ++++++++++++++++++++++++++++++++++ 4 files changed, 370 insertions(+), 1 deletion(-) create mode 100644 tests/test-reader-parser.cpp diff --git a/docs/doxygen/mainpage.dox b/docs/doxygen/mainpage.dox index 329e4875..1072c436 100644 --- a/docs/doxygen/mainpage.dox +++ b/docs/doxygen/mainpage.dox @@ -2,7 +2,8 @@ * * # Use case examples of schema rules and parsing results: * - * - @ref test_buffer_parser_no_capture "Basic log parser" + * - @ref test_buffer_parser_no_capture "Basic log buffer parser" + * - @ref test_reader_parser_no_capture "Basic log file parser" * - @ref test_buffer_parser_capture "Captures" * - @ref test_buffer_parser_default_schema "Default CLP schema" * - @ref test_buffer_parser_delimited_variables "Backtracking on delimited variables" @@ -19,6 +20,7 @@ * - @ref unit_tests_prefix_tree "Prefix tree" * - @ref unit_tests_query "Query" * - @ref unit_tests_query_interpretation "Query Interpretation" + * - @ref unit_tests_reader_parser_wrap_around "Reader Parser" * - @ref unit_tests_regex_ast "Regex AST" * - @ref unit_tests_register_handler "Register handler" * - @ref unit_tests_schema "Schema" diff --git a/src/log_surgeon/Constants.hpp b/src/log_surgeon/Constants.hpp index 3517bf49..7006cdf2 100644 --- a/src/log_surgeon/Constants.hpp +++ b/src/log_surgeon/Constants.hpp @@ -45,6 +45,7 @@ constexpr char cTokenHex[] = "hex"; constexpr char cTokenFirstTimestamp[] = "firstTimestamp"; constexpr char cTokenNewlineTimestamp[] = "newLineTimestamp"; constexpr char cTokenNewline[] = "newLine"; +// Buffer size cannot be odd, so always use a multiple of 2 constexpr uint32_t cStaticByteBuffSize = 48'000; namespace utf8 { diff --git a/tests/CMakeLists.txt 
b/tests/CMakeLists.txt index 2121195d..9b6e1cbe 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -13,6 +13,7 @@ target_sources( test-prefix-tree.cpp test-query.cpp test-query-interpretation.cpp + test-reader-parser.cpp test-regex-ast.cpp test-register-handler.cpp test-schema.cpp diff --git a/tests/test-reader-parser.cpp b/tests/test-reader-parser.cpp new file mode 100644 index 00000000..47ad6392 --- /dev/null +++ b/tests/test-reader-parser.cpp @@ -0,0 +1,365 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +using log_surgeon::capture_id_t; +using log_surgeon::cStaticByteBuffSize; +using log_surgeon::ErrorCode; +using log_surgeon::finite_automata::PrefixTree; +using log_surgeon::Reader; +using log_surgeon::ReaderParser; +using log_surgeon::rule_id_t; +using log_surgeon::Schema; +using log_surgeon::SymbolId; +using log_surgeon::Token; +using std::map; +using std::string; +using std::string_view; +using std::unordered_map; +using std::vector; + +namespace { +struct CapturePositions { + vector m_start_positions; + vector m_end_positions; +}; + +struct ExpectedToken { + string_view m_raw_string; + string m_type; /* Empty means an uncaught string token is expected. */ + map m_captures; +}; + +struct ExpectedEvent { + string_view m_logtype; + string_view m_timestamp_raw; + vector m_tokens; +}; + +/** + * Parses the given input one event at a time and verifies that the logtype, timestamp, and + * tokens of each parsed event match the expected events, in order. + * + * If an expected token lists captures, verifies the start and end positions of each capture. + * + * @param reader_parser The reader parser to parse the input with. + * @param input The input to parse. + * @param expected_events The expected parsed events; the parser must be done after the last one. + */ +auto parse_and_validate( + ReaderParser& reader_parser, + string_view input, + vector const& expected_events +) -> void; + +/** + * @param map The map to serialize. + * @return The map entries concatenated as `id->symbol,` in the iteration order of the map. 
+ */ +[[nodiscard]] auto serialize_id_symbol_map(unordered_map const& map) -> string; + +auto parse_and_validate( + ReaderParser& reader_parser, + string_view input, + vector const& expected_events +) -> void { + size_t curr_pos{0}; + + Reader reader{[&](char* buffer, size_t const count, size_t& read_to) -> ErrorCode { + if (input.size() <= curr_pos) { + read_to = 0; + return ErrorCode::EndOfFile; + } + + read_to = input.size() - curr_pos; + if(read_to > count) { + read_to = count; + } + + std::memcpy(buffer, input.data() + curr_pos, read_to); + curr_pos += read_to; + return ErrorCode::Success; + }}; + + + reader_parser.reset_and_set_reader(reader); + + CAPTURE(serialize_id_symbol_map(reader_parser.get_log_parser().m_lexer.m_id_symbol)); + CAPTURE(input); + string input_str(input); + + size_t count{0}; + for (auto const& [expected_logtype, expected_timestamp_raw, expected_tokens] : expected_events) + { + CAPTURE(count); + count++; + + auto err{reader_parser.parse_next_event()}; + REQUIRE(ErrorCode::Success == err); + auto const& event{reader_parser.get_log_parser().get_log_event_view()}; + REQUIRE(expected_logtype == event.get_logtype()); + if (nullptr == event.get_timestamp()) { + REQUIRE(expected_timestamp_raw.empty()); + } else { + REQUIRE(expected_timestamp_raw == event.get_timestamp()->to_string()); + } + + uint32_t event_offset{0}; + if (nullptr == event.get_timestamp()) { + event_offset = 1; + } + + REQUIRE(expected_tokens.size() == event.get_log_output_buffer()->pos() - event_offset); + for (size_t i{0}; i < expected_tokens.size(); ++i) { + auto const& [expected_raw_string, expected_type, expected_captures]{expected_tokens[i]}; + auto token{event.get_log_output_buffer()->get_token(i + event_offset)}; + CAPTURE(i); + REQUIRE(expected_raw_string == token.to_string()); + + uint32_t expected_token_type; + if (expected_type.empty()) { + expected_token_type = static_cast(SymbolId::TokenUncaughtString); + } else { + CAPTURE(expected_type); 
+ REQUIRE(reader_parser.get_log_parser().get_symbol_id(expected_type).has_value()); + expected_token_type + = reader_parser.get_log_parser().get_symbol_id(expected_type).value(); + } + auto const token_type{token.m_type_ids_ptr->at(0)}; + REQUIRE(expected_token_type == token_type); + + if (false == expected_captures.empty()) { + auto const& lexer{reader_parser.get_log_parser().m_lexer}; + auto optional_capture_ids{lexer.get_capture_ids_from_rule_id(token_type)}; + REQUIRE(optional_capture_ids.has_value()); + + if (false == optional_capture_ids.has_value()) { + return; + } + + for (auto const capture_id : optional_capture_ids.value()) { + auto const capture_name{lexer.m_id_symbol.at(capture_id)}; + REQUIRE(expected_captures.contains(capture_name)); + auto optional_reg_ids{lexer.get_reg_ids_from_capture_id(capture_id)}; + REQUIRE(optional_reg_ids.has_value()); + if (false == optional_reg_ids.has_value()) { + return; + } + auto const [start_reg_id, end_reg_id]{optional_reg_ids.value()}; + auto const actual_start_positions{ + token.get_reversed_reg_positions(start_reg_id) + }; + auto const actual_end_positions{token.get_reversed_reg_positions(end_reg_id)}; + auto const [expected_start_positions, expected_end_positions]{ + expected_captures.at(capture_name) + }; + REQUIRE(expected_start_positions == actual_start_positions); + REQUIRE(expected_end_positions == actual_end_positions); + } + } + } + } + REQUIRE(reader_parser.done()); +} + +auto serialize_id_symbol_map(unordered_map const& map) -> string { + string serialized_map; + for (auto const& [id, symbol] : map) { + serialized_map += fmt::format("{}->{},", id, symbol); + } + return serialized_map; +} +} // namespace + +/** + * @defgroup test_reader_parser_no_capture Reader parser using variables without capture groups. + * @brief Tests covering variable matching without regex capture groups. 
+ */ + +/** + * @ingroup test_reader_parser_no_capture + * @brief Tests the reader parser behavior when parsing variables without capture groups. + * + * This test verifies that the reader parser correctly matches exact variable patterns when no + * capture groups are involved. It confirms the `ReaderParser`: + * - Recognizes a variable exactly matching the defined schema ("myVar:userID=123"). + * - Treats close but non-matching strings as uncaught tokens. + * - Correctly classifies tokens that don't match any variable schema as uncaught strings. + * + * ### Schema Definition + * @code + * delimiters: \n\r\[:, + * myVar:userID=123 + * @endcode + * + * ### Test Input + * @code + * "userID=123 userID=234 userID=123 123 userID=123" + * @endcode + * + * ### Expected Logtype + * @code + * " userID=234 123 " + * @endcode + * + * ### Expected Tokenization + * @code + * "userID=123" -> "myVar" + * " userID=234" -> uncaught string + * " userID=123" -> "myVar" + * " 123" -> uncaught string + * " userID=123" -> "myVar" + * @endcode + */ +TEST_CASE("single_line_without_capture_reader_parser", "[ReaderParser]") { + constexpr string_view cDelimitersSchema{R"(delimiters: \n\r\[:,)"}; + constexpr string_view cVarSchema{"myVar:userID=123"}; + constexpr string_view cInput{"userID=123 userID=234 userID=123 123 userID=123"}; + ExpectedEvent const expected_event{ + .m_logtype{R"( userID=234 123 )"}, + .m_timestamp_raw{""}, + .m_tokens{ + {{"userID=123", "myVar", {}}, + {" userID=234", "", {}}, /* Empty type: expect an uncaught string token. */ + {" userID=123", "myVar", {}}, + {" 123", "", {}}, + {" userID=123", "myVar", {}}} + } + }; + + Schema schema; + schema.add_delimiters(cDelimitersSchema); + schema.add_variable(cVarSchema, -1); + ReaderParser reader_parser(std::move(schema.release_schema_ast_ptr())); + + parse_and_validate(reader_parser, cInput, {expected_event}); +} + +/** + * @defgroup unit_tests_reader_parser_wrap_around `ReaderParser` unit tests. + * @brief Unit tests for `ReaderParser` wrap around handling. 
+ + * These unit tests contain the `ReaderParser` tag. + */ + +/** + * @ingroup unit_tests_reader_parser_wrap_around + * @brief Tests the reader parser behavior when a log event wraps around the end of the buffer. + * + * This test verifies that the reader parser correctly handles wrap-around when a log event + * begins or ends near the boundaries of the buffer: + * - Considers the case where the log ends right at the end of the buffer. + * - Considers the case where the log starts right after wrapping around. + * - Considers every case in between, which has the added benefit of testing every case for each + * tested variable as well (which includes a capture). + */ +TEST_CASE("reader_parser_wrap_around", "[ReaderParser]") { + REQUIRE(48000 == cStaticByteBuffSize); + + constexpr string_view cDelimitersSchema{R"(delimiters: \n\r\[:,)"}; + constexpr string_view cVarSchema1{"myVar:userID=123"}; + constexpr string_view cVarSchema2{"myCapture:userID=(?234)"}; + constexpr string_view cInput1{"userID=123 userID=234 userID=123 123 userID=123\n"}; + constexpr string_view cInput3{"userID=123 userID=234 userID=123 123 userID=123"}; + constexpr uint32_t cNumInput1{998}; + constexpr uint32_t cRemainingSpace{cStaticByteBuffSize - cInput1.size() * cNumInput1}; + + Schema schema; + schema.add_delimiters(cDelimitersSchema); + schema.add_variable(cVarSchema1, -1); + schema.add_variable(cVarSchema2, -1); + ReaderParser reader_parser(std::move(schema.release_schema_ast_ptr())); + + for (int32_t offset{cInput3.size()}; offset >= 0; --offset) { + CAPTURE(offset); + + string user_var{"userID=123"}; + string remaining_filler(cRemainingSpace - user_var.size() - offset - 2, 'a'); + string input2{user_var + " " + remaining_filler + "\n"}; + string logtype2{" " + remaining_filler + "\n"}; + + string cInput; + for (uint32_t i{0}; i < cNumInput1; i++) { + cInput += cInput1; + } + REQUIRE(cInput.size() == cStaticByteBuffSize - cRemainingSpace); + cInput += input2; + REQUIRE(cInput.size() == 
cStaticByteBuffSize - offset); + cInput += cInput3; + + ExpectedEvent expected_event1{ + .m_logtype{" userID= 123 \n"}, + .m_timestamp_raw{""}, + .m_tokens{ + {{"userID=123", "myVar", {}}, + {" userID=234", "myCapture", {{{"capture",{{18}, {21}}}}}}, + {" userID=123", "myVar", {}}, + {" 123", "", {}}, + {" userID=123", "myVar", {}}, + {"\n", "", {}}} + } + }; + + string_view logtype2_view{logtype2}; + string_view user_var_view{user_var}; + string remaining_filler_with_space{" " + remaining_filler}; + string_view remaining_filler_view{remaining_filler_with_space}; + ExpectedEvent expected_event2{ + .m_logtype{logtype2_view}, + .m_timestamp_raw{""}, + .m_tokens{ + {{user_var_view, "myVar", {}}, + {remaining_filler_view, "", {}}, + {"\n", "", {}}} + } + }; + + int32_t log_start_pos{static_cast(cStaticByteBuffSize) - offset}; + int32_t cap_begin{log_start_pos+18}; + if(cap_begin >= cStaticByteBuffSize) { + cap_begin -= cStaticByteBuffSize; + } + int32_t cap_end{log_start_pos+21}; + if(cap_end >= cStaticByteBuffSize) { + cap_end -= cStaticByteBuffSize; + } + ExpectedEvent expected_event3{ + .m_logtype{" userID= 123 "}, + .m_timestamp_raw{""}, + .m_tokens{ + {{"userID=123", "myVar", {}}, + {" userID=234", "myCapture", {{{"capture",{{cap_begin}, {cap_end}}}}}}, + {" userID=123", "myVar", {}}, + {" 123", "", {}}, + {" userID=123", "myVar", {}}} + } + }; + + vector expected_events; + for (uint32_t i{0}; i < cNumInput1; ++i) { + expected_events.push_back(expected_event1); + auto& capture{expected_event1.m_tokens.at(1).m_captures["capture"]}; + capture.m_start_positions.at(0) += cInput1.size(); + capture.m_end_positions.at(0) += cInput1.size(); + } + expected_events.push_back(expected_event2); + expected_events.push_back(expected_event3); + + parse_and_validate(reader_parser, cInput, expected_events); + } +} From e53eaaa73b000839dc4c046b00489713ec8a0f2c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 6 Nov 2025 05:57:56 -0500 Subject: [PATCH 2/5] Update token usage. 
--- tests/test-reader-parser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test-reader-parser.cpp b/tests/test-reader-parser.cpp index 47ad6392..56afb64e 100644 --- a/tests/test-reader-parser.cpp +++ b/tests/test-reader-parser.cpp @@ -140,7 +140,7 @@ auto parse_and_validate( expected_token_type = reader_parser.get_log_parser().get_symbol_id(expected_type).value(); } - auto const token_type{token.m_type_ids_ptr->at(0)}; + auto const token_type{token.get_type_ids()->at(0)}; REQUIRE(expected_token_type == token_type); if (false == expected_captures.empty()) { From 4855365ca915310c6eb826a9bce3a308fcf97e08 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 6 Nov 2025 06:13:05 -0500 Subject: [PATCH 3/5] Add missing headers. --- tests/test-reader-parser.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test-reader-parser.cpp b/tests/test-reader-parser.cpp index 56afb64e..fe90863a 100644 --- a/tests/test-reader-parser.cpp +++ b/tests/test-reader-parser.cpp @@ -1,3 +1,6 @@ +#include +#include +#include #include #include #include From cf20d7be312bcde27706a616e9e0be3d485c9663 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 6 Nov 2025 06:14:21 -0500 Subject: [PATCH 4/5] Fix if spacing. 
--- tests/test-reader-parser.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test-reader-parser.cpp b/tests/test-reader-parser.cpp index fe90863a..f637fbd2 100644 --- a/tests/test-reader-parser.cpp +++ b/tests/test-reader-parser.cpp @@ -90,7 +90,7 @@ auto parse_and_validate( } read_to = input.size() - curr_pos; - if(read_to > count) { + if (read_to > count) { read_to = count; } @@ -334,11 +334,11 @@ TEST_CASE("reader_parser_wrap_around", "[ReaderParser]") { int32_t log_start_pos{static_cast(cStaticByteBuffSize) - offset}; int32_t cap_begin{log_start_pos+18}; - if(cap_begin >= cStaticByteBuffSize) { + if (cap_begin >= cStaticByteBuffSize) { cap_begin -= cStaticByteBuffSize; } int32_t cap_end{log_start_pos+21}; - if(cap_end >= cStaticByteBuffSize) { + if (cap_end >= cStaticByteBuffSize) { cap_end -= cStaticByteBuffSize; } ExpectedEvent expected_event3{ From e1c7f814155dff4085255420af89c491b1032fec Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 6 Nov 2025 06:15:23 -0500 Subject: [PATCH 5/5] Remove extra space; Remove unused var. --- tests/test-reader-parser.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test-reader-parser.cpp b/tests/test-reader-parser.cpp index f637fbd2..168f63f0 100644 --- a/tests/test-reader-parser.cpp +++ b/tests/test-reader-parser.cpp @@ -99,12 +99,10 @@ auto parse_and_validate( return ErrorCode::Success; }}; - reader_parser.reset_and_set_reader(reader); CAPTURE(serialize_id_symbol_map(reader_parser.get_log_parser().m_lexer.m_id_symbol)); CAPTURE(input); - string input_str(input); size_t count{0}; for (auto const& [expected_logtype, expected_timestamp_raw, expected_tokens] : expected_events)