From aae9a7ec0783a9cc52855c48cd6e6d443decf1a9 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 8 Jan 2026 20:13:27 +0800 Subject: [PATCH 01/11] feat(clp_s): sanitize JSON buffers with unescaped control characters Add automatic sanitization of JSON buffers that contain unescaped control characters (0x00-0x1F) inside string values. When simdjson returns UNESCAPED_CHARS error, the buffer is sanitized by escaping control characters to \u00XX format, and parsing is retried. - Add StringUtils::sanitize_json_buffer() that escapes control chars inside JSON strings while respecting existing escape sequences - Integrate sanitization in JsonFileIterator::read_new_json() with automatic retry after sanitization - Track sanitization byte expansion for accurate byte counting - Log warnings with detailed counts of sanitized characters - Add comprehensive unit tests for sanitization logic --- components/core/CMakeLists.txt | 1 + .../core/src/clp_s/JsonFileIterator.cpp | 56 +++- .../core/src/clp_s/JsonFileIterator.hpp | 7 +- components/core/src/clp_s/Utils.cpp | 70 ++++ components/core/src/clp_s/Utils.hpp | 37 ++ .../core/tests/test-clp_s-StringUtils.cpp | 316 ++++++++++++++++++ 6 files changed, 482 insertions(+), 5 deletions(-) create mode 100644 components/core/tests/test-clp_s-StringUtils.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 9e9738f05d..f183f15e48 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -704,6 +704,7 @@ set(SOURCE_FILES_unitTest tests/test-clp_s-end_to_end.cpp tests/test-clp_s-range_index.cpp tests/test-clp_s-search.cpp + tests/test-clp_s-StringUtils.cpp tests/test-EncodedVariableInterpreter.cpp tests/test-encoding_methods.cpp tests/test-ffi_IrUnitHandlerReq.cpp diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index a0a003d9f7..a6d043e82e 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -3,8 +3,11 @@ #include #include +#include #include +#include "Utils.hpp" + namespace clp_s { JsonFileIterator::JsonFileIterator( clp::ReaderInterface& reader, @@ -64,6 +67,51 @@ bool JsonFileIterator::read_new_json() { ) .get(m_stream); + // If we encounter unescaped control characters in JSON strings, sanitize the buffer + // by escaping them to \u00XX format and retry parsing + if (simdjson::error_code::UNESCAPED_CHARS == error) { + size_t const old_buf_occupied = m_buf_occupied; + // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by + // reference if reallocation is needed. This keeps m_buf_size in sync with the + // actual allocated buffer size. + auto const result = StringUtils::sanitize_json_buffer( + m_buf, + m_buf_size, + m_buf_occupied, + simdjson::SIMDJSON_PADDING + ); + m_buf_occupied = result.new_buf_occupied; + m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; + + // Build log message with details of sanitized characters + size_t total_sanitized = 0; + std::string char_details; + for (auto const& [ch, count] : result.sanitized_char_counts) { + if (!char_details.empty()) { + char_details += ", "; + } + char_details += fmt::format("0x{:02x} ({})", static_cast(ch), count); + total_sanitized += count; + } + size_t bytes_added = m_buf_occupied - old_buf_occupied; + SPDLOG_WARN( + "Escaped {} control character(s) in JSON: [{}]. Buffer expanded by {} bytes ({} -> {}).", + total_sanitized, + char_details, + bytes_added, + old_buf_occupied, + m_buf_occupied + ); + + error = m_parser + .iterate_many( + m_buf, + /* length of data */ m_buf_occupied, + /* batch size of data to parse*/ m_buf_occupied + ) + .get(m_stream); + } + if (error) { m_error_code = error; return false; @@ -151,10 +199,12 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i size_t JsonFileIterator::get_num_bytes_consumed() { // If there are more documents left in the current buffer account for how much of the // buffer has been consumed, otherwise report the total number of bytes read so that we - // capture trailing whitespace. + // capture trailing whitespace. Include bytes added by sanitization since the sanitized + // content is what gets compressed. if (m_doc_it != m_stream.end()) { - return m_bytes_read - (m_buf_occupied - m_next_document_position); + return m_bytes_read + m_sanitization_bytes_added + - (m_buf_occupied - m_next_document_position); } - return m_bytes_read; + return m_bytes_read + m_sanitization_bytes_added; } } // namespace clp_s diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index 9102087637..a9732ad5c6 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -41,9 +41,11 @@ class JsonFileIterator { [[nodiscard]] size_t truncated_bytes() const { return m_truncated_bytes; } /** - * @return total number of bytes read from the file + * @return total number of bytes read from the file, plus any bytes added by sanitization */ - [[nodiscard]] size_t get_num_bytes_read() const { return m_bytes_read; } + [[nodiscard]] size_t get_num_bytes_read() const { + return m_bytes_read + m_sanitization_bytes_added; + } /** * Note: this method can not be const because checking if a simdjson iterator is at the end @@ -76,6 +78,7 @@ class JsonFileIterator { size_t m_truncated_bytes{0}; size_t m_next_document_position{0}; size_t m_bytes_read{0}; + size_t m_sanitization_bytes_added{0}; size_t m_buf_size{0}; size_t m_buf_occupied{0}; size_t m_max_document_size{0}; diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index 59b18a5bca..5efcb7d77e 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -163,6 +164,75 @@ bool UriUtils::get_last_uri_component(std::string_view const uri, std::string& n return true; } +JsonSanitizeResult StringUtils::sanitize_json_buffer( + char*& buf, + size_t& buf_size, + size_t buf_occupied, + size_t simdjson_padding +) { + // Build sanitized content in a temporary buffer, escaping control characters + // inside JSON strings. This is a fallback path only triggered on UNESCAPED_CHARS error, + // so we prioritize correctness over performance. + // + // Note: If the buffer contains unmatched quotes (e.g., truncated JSON), the string state + // tracking may be incorrect, potentially escaping control characters outside of actual + // JSON strings. This is acceptable since such malformed JSON will fail parsing anyway. + std::string sanitized; + + std::map sanitized_char_counts; + bool in_string = false; + bool escape_next = false; + + for (size_t i = 0; i < buf_occupied; ++i) { + char c = buf[i]; + + if (escape_next) { + escape_next = false; + sanitized.push_back(c); + continue; + } + + if (c == '\\' && in_string) { + escape_next = true; + sanitized.push_back(c); + continue; + } + + if (c == '"') { + in_string = !in_string; + sanitized.push_back(c); + continue; + } + + // Escape control characters (0x00-0x1F) inside strings to \u00XX format + if (in_string && static_cast(c) < 0x20) { + char_to_escaped_four_char_hex(sanitized, c); + ++sanitized_char_counts[c]; + } else { + sanitized.push_back(c); + } + } + + // If no changes were made, return early + if (sanitized.size() == buf_occupied) { + return {buf_occupied, {}}; + } + + // Grow buffer if needed to hold sanitized content + if (sanitized.size() > buf_size) { + // Allocate exactly the size needed since we know the exact size upfront + size_t new_buf_size = sanitized.size(); + char* new_buf = new char[new_buf_size + simdjson_padding]; + delete[] buf; + buf = new_buf; + buf_size = new_buf_size; + } + + // Copy sanitized content to buffer + std::memcpy(buf, sanitized.data(), sanitized.size()); + return {sanitized.size(), std::move(sanitized_char_counts)}; +} + void StringUtils::escape_json_string(std::string& destination, std::string_view const source) { // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path // when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index a711eb8ad6..df5d152444 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,14 @@ class UriUtils { static bool get_last_uri_component(std::string_view const uri, std::string& name); }; +/** + * Result of sanitizing a JSON buffer, including statistics about what was sanitized. + */ +struct JsonSanitizeResult { + size_t new_buf_occupied; + std::map sanitized_char_counts; +}; + class StringUtils { public: /** @@ -81,6 +90,34 @@ class StringUtils { */ static void escape_json_string(std::string& destination, std::string_view const source); + /** + * Sanitizes a JSON buffer by escaping unescaped control characters (0x00-0x1F) inside JSON + * strings. This is used to fix malformed JSON that contains raw control characters which + * would cause parsing errors. + * + * Only control characters inside JSON string values (between unescaped double quotes) are + * escaped. Control characters outside strings are left unchanged as they would cause + * structural JSON errors anyway. + * + * Characters are escaped using unicode escape sequences (e.g., \x00 becomes \u0000). + * + * @param buf Pointer to pointer to the buffer. May be reallocated if expansion is needed. + * The caller's pointer will be updated to point to the new buffer if reallocation + * occurs. The caller is responsible for deleting the buffer. + * @param buf_size Current allocated size of the buffer (excluding simdjson padding). + * This parameter is modified by reference and will be updated if the buffer + * is reallocated to reflect the new buffer size. + * @param buf_occupied Number of bytes currently used in the buffer. + * @param simdjson_padding Amount of padding required after buffer content. + * @return Result containing new buf_occupied and counts of each sanitized character. + */ + static JsonSanitizeResult sanitize_json_buffer( + char*& buf, + size_t& buf_size, + size_t buf_occupied, + size_t simdjson_padding + ); + private: /** * Converts a character into its two byte hexadecimal representation. diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp new file mode 100644 index 0000000000..9163281087 --- /dev/null +++ b/components/core/tests/test-clp_s-StringUtils.cpp @@ -0,0 +1,316 @@ +#include +#include +#include + +#include +#include + +#include "../src/clp_s/Utils.hpp" + +using clp_s::JsonSanitizeResult; +using clp_s::StringUtils; + +// We use C++14 string literals (the `s` suffix) to construct strings containing embedded null bytes +// and control characters. Without the `s` suffix, std::string's constructor from a C-string literal +// stops at the first null byte, truncating the test input. Additionally, we use string concatenation +// (e.g., "\x00" "end") to prevent hex escape sequences from being extended by following hex digits. +using namespace std::string_literals; + +namespace { +/** + * Helper to create a buffer with simdjson padding and run sanitization. + * @param input The input string to sanitize + * @return The sanitized string + * @note If sanitize_json_buffer reallocates the buffer, the new buffer pointer is returned + * and the original buffer is automatically deleted by sanitize_json_buffer. This helper + * correctly handles both cases (reallocation and no reallocation) by always deleting + * the final buffer pointer. + */ +auto sanitize_string(std::string_view input) -> std::string { + size_t buf_size = input.size(); + size_t buf_occupied = input.size(); + char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; + std::memcpy(buf, input.data(), input.size()); + + try { + auto const result = StringUtils::sanitize_json_buffer( + buf, + buf_size, + buf_occupied, + simdjson::SIMDJSON_PADDING + ); + std::string output(buf, result.new_buf_occupied); + // Delete the buffer (may be reallocated, but buf points to the current buffer) + delete[] buf; + return output; + } catch (...) { + // If sanitization failed, buf still points to the original buffer + delete[] buf; + throw; + } +} + +/** + * Helper to verify that a sanitized JSON string can be parsed by simdjson. + * @param json The JSON string to parse + * @return true if parsing succeeds, false otherwise + */ +auto can_parse_json(std::string_view json) -> bool { + simdjson::ondemand::parser parser; + simdjson::padded_string padded(json); + auto result = parser.iterate(padded); + return !result.error(); +} +} // namespace + +TEST_CASE("sanitize_json_buffer_no_changes", "[clp_s][StringUtils]") { + // Valid JSON without control characters should pass through unchanged + SECTION("simple object") { + std::string input = R"({"key": "value"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("object with escaped characters") { + std::string input = R"({"key": "line1\nline2\ttab"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("object with unicode escapes") { + std::string input = R"({"key": "null char: \u0000"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("multiple objects") { + std::string input = R"({"a": "1"}{"b": "2"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("control chars outside strings are unchanged") { + // Control chars outside strings aren't valid JSON anyway, + // but we don't touch them - let the JSON parser report the error + std::string input = "{\x01\"key\": \"value\"}"; + REQUIRE(sanitize_string(input) == input); + } +} + +TEST_CASE("sanitize_json_buffer_escapes_control_chars", "[clp_s][StringUtils]") { + SECTION("null byte in string value") { + auto input = "{\"key\": \"val\x00ue\"}"s; + std::string expected = R"({"key": "val\u0000ue"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("multiple null bytes") { + auto input = "{\"key\": \"\x00\x00\x00\"}"s; + std::string expected = R"({"key": "\u0000\u0000\u0000"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("various control characters") { + // Test 0x01 (SOH), 0x02 (STX), 0x1F (US) + auto input = "{\"key\": \"a\x01" "b\x02" "c\x1F" "d\"}"s; + std::string expected = R"({"key": "a\u0001b\u0002c\u001fd"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("control char in key") { + auto input = "{\"ke\x00y\": \"value\"}"s; + std::string expected = R"({"ke\u0000y": "value"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("mixed valid escapes and control chars") { + auto input = "{\"key\": \"line1\\nhas\x00null\"}"s; + std::string expected = R"({"key": "line1\nhas\u0000null"})"; + REQUIRE(sanitize_string(input) == expected); + } +} + +TEST_CASE("sanitize_json_buffer_handles_escapes_correctly", "[clp_s][StringUtils]") { + SECTION("escaped quote should not toggle string state") { + // The \" should not end the string + std::string input = R"({"key": "quote:\"value"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("escaped backslash before quote") { + // \\" means escaped backslash followed by end of string + std::string input = R"({"key": "backslash:\\"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("control char after escaped backslash") { + // \\\x00 means escaped backslash followed by a control char still inside the string + auto input = "{\"key\": \"slash:\\\\\x00" "end\"}"s; + std::string expected = R"({"key": "slash:\\\u0000end"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("control char after escaped quote") { + auto input = "{\"key\": \"quote:\\\"\x00" "after\"}"s; + std::string expected = R"({"key": "quote:\"\u0000after"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("control char as invalid escape sequence target") { + // \ is not a valid JSON escape sequence, but we leave it as-is + // since the sanitizer treats any char after backslash as an escape target. + // This JSON is invalid anyway (bad escape sequence) and will fail parsing. + auto input = "{\"key\": \"bad:\\\x00" "end\"}"s; + auto expected = "{\"key\": \"bad:\\\x00" "end\"}"s; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("triple backslash followed by control char") { + // \\\ followed by \x00: first \\ is escaped backslash, third \ starts escape sequence. + // The third backslash makes the null appear to be an escape target, so it's not escaped. + auto input = "{\"key\": \"\\\\\\\x00" "end\"}"s; + auto expected = "{\"key\": \"\\\\\\\x00" "end\"}"s; + REQUIRE(sanitize_string(input) == expected); + } +} + +TEST_CASE("sanitize_json_buffer_result_is_valid_json", "[clp_s][StringUtils]") { + SECTION("sanitized null byte produces valid JSON") { + auto input = "{\"key\": \"val\x00ue\"}"s; + std::string sanitized = sanitize_string(input); + REQUIRE(can_parse_json(sanitized)); + } + + SECTION("sanitized multiple control chars produces valid JSON") { + auto input = "{\"data\": \"\x01\x02\x03\x04\x05\"}"s; + std::string sanitized = sanitize_string(input); + REQUIRE(can_parse_json(sanitized)); + } +} + +TEST_CASE("sanitize_json_buffer_handles_buffer_growth", "[clp_s][StringUtils]") { + // Each control char expands from 1 byte to 6 bytes (\u00XX) + // Create input that will require buffer growth + SECTION("many control chars requiring expansion") { + // 100 null bytes -> 600 bytes after escaping + std::string value(100, '\x00'); + std::string input = "{\"key\": \"" + value + "\"}"; + + std::string sanitized = sanitize_string(input); + + // Verify all nulls were escaped + REQUIRE(sanitized.find('\x00') == std::string::npos); + // Verify correct number of escape sequences + size_t count = 0; + size_t pos = 0; + while ((pos = sanitized.find("\\u0000", pos)) != std::string::npos) { + ++count; + pos += 6; + } + REQUIRE(count == 100); + // Verify result is valid JSON + REQUIRE(can_parse_json(sanitized)); + } +} + +TEST_CASE("sanitize_json_buffer_jsonl", "[clp_s][StringUtils]") { + SECTION("multiple JSON objects with control chars") { + auto input = "{\"a\": \"x\x00y\"}\n{\"b\": \"p\x01q\"}"s; + std::string expected = "{\"a\": \"x\\u0000y\"}\n{\"b\": \"p\\u0001q\"}"; + REQUIRE(sanitize_string(input) == expected); + } +} + +TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") { + SECTION("truncated JSON with unmatched quote - control chars may be escaped incorrectly") { + // This tests the edge case documented in the code: if JSON is truncated with unmatched + // quotes, string state tracking may be incorrect, potentially escaping control chars + // outside of actual JSON strings. This is acceptable since malformed JSON will fail + // parsing anyway. + // + // Input: truncated JSON where the string is not closed, followed by control chars + // The sanitizer may incorrectly think it's still in a string and escape the control chars + auto input = "{\"key\": \"unclosed string\x00\x01" "after\"}"s; + // The sanitizer will escape the control chars because it thinks we're still in a string + // (the quote after "unclosed string" is escaped, so the string continues) + std::string sanitized = sanitize_string(input); + // The control chars should be escaped (behavior may vary based on quote matching) + // The important thing is that the function doesn't crash and handles it gracefully + REQUIRE((sanitized.find('\x00') == std::string::npos || sanitized.find("\\u0000") != std::string::npos)); + } + + SECTION("truncated JSON ending mid-string") { + // JSON truncated in the middle of a string with control chars (no closing quote) + // This simulates a real truncation scenario where the buffer ends mid-string + auto input = "{\"key\": \"value\x00\x01" "truncated"s; + std::string sanitized = sanitize_string(input); + // Should handle gracefully without crashing + // Control chars inside the (unclosed) string should be escaped + REQUIRE(sanitized.size() >= input.size()); + // Verify no raw control characters remain (they should be escaped) + REQUIRE(sanitized.find('\x00') == std::string::npos); + REQUIRE(sanitized.find('\x01') == std::string::npos); + } + + SECTION("truncated JSON with control chars after unmatched quote") { + // JSON with unmatched quote followed by control chars outside the string + // The sanitizer may incorrectly escape these if it thinks we're still in a string + auto input = "{\"key\": \"value\x00" "}\x01\x02"s; + std::string sanitized = sanitize_string(input); + // Function should not crash - behavior may vary but should be consistent + REQUIRE(sanitized.size() >= input.size()); + } +} + +TEST_CASE("sanitize_json_buffer_returns_correct_char_counts", "[clp_s][StringUtils]") { + SECTION("counts multiple different control characters") { + // Input with: 3x \x00, 2x \x01, 1x \x1f + auto input = "{\"a\": \"\x00\x00\x01\x00\x01\x1f\"}"s; + + size_t buf_size = input.size(); + size_t buf_occupied = input.size(); + char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; + std::memcpy(buf, input.data(), input.size()); + + try { + auto const result = StringUtils::sanitize_json_buffer( + buf, + buf_size, + buf_occupied, + simdjson::SIMDJSON_PADDING + ); + + REQUIRE(result.sanitized_char_counts.size() == 3); + REQUIRE(result.sanitized_char_counts.at('\x00') == 3); + REQUIRE(result.sanitized_char_counts.at('\x01') == 2); + REQUIRE(result.sanitized_char_counts.at('\x1f') == 1); + + delete[] buf; + } catch (...) { + delete[] buf; + throw; + } + } + + SECTION("returns empty counts when no sanitization needed") { + std::string input = R"({"key": "valid value"})"; + + size_t buf_size = input.size(); + size_t buf_occupied = input.size(); + char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; + std::memcpy(buf, input.data(), input.size()); + + try { + auto const result = StringUtils::sanitize_json_buffer( + buf, + buf_size, + buf_occupied, + simdjson::SIMDJSON_PADDING + ); + + REQUIRE(result.sanitized_char_counts.empty()); + REQUIRE(result.new_buf_occupied == input.size()); + + delete[] buf; + } catch (...) { + delete[] buf; + throw; + } + } +} From 0b825331250584dee70aa25fd1c038cfe7f59b73 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 8 Jan 2026 21:11:06 +0800 Subject: [PATCH 02/11] feat(clp_s): include file path in sanitization warning log --- components/core/src/clp_s/JsonFileIterator.cpp | 8 ++++++-- components/core/src/clp_s/JsonFileIterator.hpp | 5 +++++ components/core/src/clp_s/JsonParser.cpp | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index a6d043e82e..5b38e45bf7 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -12,12 +12,14 @@ namespace clp_s { JsonFileIterator::JsonFileIterator( clp::ReaderInterface& reader, size_t max_document_size, + std::string path, size_t buf_size ) : m_buf_size(buf_size), m_max_document_size(max_document_size), m_buf(new char[buf_size + simdjson::SIMDJSON_PADDING]), - m_reader(reader) { + m_reader(reader), + m_path(std::move(path)) { read_new_json(); } @@ -95,8 +97,10 @@ bool JsonFileIterator::read_new_json() { } size_t bytes_added = m_buf_occupied - old_buf_occupied; SPDLOG_WARN( - "Escaped {} control character(s) in JSON: [{}]. Buffer expanded by {} bytes ({} -> {}).", + "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} bytes " + "({} -> {}).", total_sanitized, + m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), char_details, bytes_added, old_buf_occupied, diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index a9732ad5c6..dcb4a44d94 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -1,6 +1,8 @@ #ifndef CLP_S_JSONFILEITERATOR_HPP #define CLP_S_JSONFILEITERATOR_HPP +#include + #include #include "../clp/ReaderInterface.hpp" @@ -19,11 +21,13 @@ class JsonFileIterator { * @param reader the input stream containing JSON * @param max_document_size the maximum allowed size of a single document + * @param path optional path to the file being read (used for logging) * @param buf_size the initial buffer size */ explicit JsonFileIterator( clp::ReaderInterface& reader, size_t max_document_size, + std::string path = {}, size_t buf_size = 1024 * 1024 /* 1 MiB default */ ); ~JsonFileIterator(); @@ -84,6 +88,7 @@ class JsonFileIterator { size_t m_max_document_size{0}; char* m_buf{nullptr}; clp::ReaderInterface& m_reader; + std::string m_path; simdjson::ondemand::parser m_parser; simdjson::ondemand::document_stream m_stream; bool m_eof{false}; diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index c0e2123e7f..ff36aac8cc 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -668,7 +668,7 @@ auto JsonParser::ingest_json( Path const& path, std::string const& archive_creator_id ) -> bool { - JsonFileIterator json_file_iterator(*reader, m_max_document_size); + JsonFileIterator json_file_iterator(*reader, m_max_document_size, path.path); if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) { SPDLOG_ERROR( "Encountered error - {} - while trying to parse {} after parsing 0 bytes", From c9fc698269cc0f32732e75ae962711d45be0fc8a Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Thu, 8 Jan 2026 22:28:33 +0800 Subject: [PATCH 03/11] style: apply clang-format to source files --- .../core/src/clp_s/JsonFileIterator.cpp | 5 +-- .../core/tests/test-clp_s-StringUtils.cpp | 42 +++++++++++++------ 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index 5b38e45bf7..07c2040944 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -107,12 +107,11 @@ bool JsonFileIterator::read_new_json() { m_buf_occupied ); - error = m_parser - .iterate_many( + error = m_parser.iterate_many( m_buf, /* length of data */ m_buf_occupied, /* batch size of data to parse*/ m_buf_occupied - ) + ) .get(m_stream); } diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp index 9163281087..ac9fafdd5c 100644 --- a/components/core/tests/test-clp_s-StringUtils.cpp +++ b/components/core/tests/test-clp_s-StringUtils.cpp @@ -12,8 +12,9 @@ using clp_s::StringUtils; // We use C++14 string literals (the `s` suffix) to construct strings containing embedded null bytes // and control characters. Without the `s` suffix, std::string's constructor from a C-string literal -// stops at the first null byte, truncating the test input. Additionally, we use string concatenation -// (e.g., "\x00" "end") to prevent hex escape sequences from being extended by following hex digits. +// stops at the first null byte, truncating the test input. Additionally, we use string +// concatenation (e.g., "\x00" "end") to prevent hex escape sequences from being extended by +// following hex digits. using namespace std::string_literals; namespace { @@ -108,7 +109,10 @@ TEST_CASE("sanitize_json_buffer_escapes_control_chars", "[clp_s][StringUtils]") SECTION("various control characters") { // Test 0x01 (SOH), 0x02 (STX), 0x1F (US) - auto input = "{\"key\": \"a\x01" "b\x02" "c\x1F" "d\"}"s; + auto input = "{\"key\": \"a\x01" + "b\x02" + "c\x1F" + "d\"}"s; std::string expected = R"({"key": "a\u0001b\u0002c\u001fd"})"; REQUIRE(sanitize_string(input) == expected); } @@ -141,13 +145,15 @@ TEST_CASE("sanitize_json_buffer_handles_escapes_correctly", "[clp_s][StringUtils SECTION("control char after escaped backslash") { // \\\x00 means escaped backslash followed by a control char still inside the string - auto input = "{\"key\": \"slash:\\\\\x00" "end\"}"s; + auto input = "{\"key\": \"slash:\\\\\x00" + "end\"}"s; std::string expected = R"({"key": "slash:\\\u0000end"})"; REQUIRE(sanitize_string(input) == expected); } SECTION("control char after escaped quote") { - auto input = "{\"key\": \"quote:\\\"\x00" "after\"}"s; + auto input = "{\"key\": \"quote:\\\"\x00" + "after\"}"s; std::string expected = R"({"key": "quote:\"\u0000after"})"; REQUIRE(sanitize_string(input) == expected); } @@ -156,16 +162,20 @@ TEST_CASE("sanitize_json_buffer_handles_escapes_correctly", "[clp_s][StringUtils // \ is not a valid JSON escape sequence, but we leave it as-is // since the sanitizer treats any char after backslash as an escape target. // This JSON is invalid anyway (bad escape sequence) and will fail parsing. - auto input = "{\"key\": \"bad:\\\x00" "end\"}"s; - auto expected = "{\"key\": \"bad:\\\x00" "end\"}"s; + auto input = "{\"key\": \"bad:\\\x00" + "end\"}"s; + auto expected = "{\"key\": \"bad:\\\x00" + "end\"}"s; REQUIRE(sanitize_string(input) == expected); } SECTION("triple backslash followed by control char") { // \\\ followed by \x00: first \\ is escaped backslash, third \ starts escape sequence. // The third backslash makes the null appear to be an escape target, so it's not escaped. - auto input = "{\"key\": \"\\\\\\\x00" "end\"}"s; - auto expected = "{\"key\": \"\\\\\\\x00" "end\"}"s; + auto input = "{\"key\": \"\\\\\\\x00" + "end\"}"s; + auto expected = "{\"key\": \"\\\\\\\x00" + "end\"}"s; REQUIRE(sanitize_string(input) == expected); } } @@ -226,19 +236,24 @@ TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") { // // Input: truncated JSON where the string is not closed, followed by control chars // The sanitizer may incorrectly think it's still in a string and escape the control chars - auto input = "{\"key\": \"unclosed string\x00\x01" "after\"}"s; + auto input = "{\"key\": \"unclosed string\x00\x01" + "after\"}"s; // The sanitizer will escape the control chars because it thinks we're still in a string // (the quote after "unclosed string" is escaped, so the string continues) std::string sanitized = sanitize_string(input); // The control chars should be escaped (behavior may vary based on quote matching) // The important thing is that the function doesn't crash and handles it gracefully - REQUIRE((sanitized.find('\x00') == std::string::npos || sanitized.find("\\u0000") != std::string::npos)); + REQUIRE( + (sanitized.find('\x00') == std::string::npos + || sanitized.find("\\u0000") != std::string::npos) + ); } SECTION("truncated JSON ending mid-string") { // JSON truncated in the middle of a string with control chars (no closing quote) // This simulates a real truncation scenario where the buffer ends mid-string - auto input = "{\"key\": \"value\x00\x01" "truncated"s; + auto input = "{\"key\": \"value\x00\x01" + "truncated"s; std::string sanitized = sanitize_string(input); // Should handle gracefully without crashing // Control chars inside the (unclosed) string should be escaped @@ -251,7 +266,8 @@ TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") { SECTION("truncated JSON with control chars after unmatched quote") { // JSON with unmatched quote followed by control chars outside the string // The sanitizer may incorrectly escape these if it thinks we're still in a string - auto input = "{\"key\": \"value\x00" "}\x01\x02"s; + auto input = "{\"key\": \"value\x00" + "}\x01\x02"s; std::string sanitized = sanitize_string(input); // Function should not crash - behavior may vary but should be consistent REQUIRE(sanitized.size() >= input.size()); From baf6366df970af3aba12636f7157103e00c03946 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Fri, 9 Jan 2026 02:48:23 +0800 Subject: [PATCH 04/11] feat(clp-s): add --sanitize-invalid-json flag for opt-in JSON sanitization Add a new CLI flag that enables sanitization of invalid JSON during ingestion. When enabled, the following issues are automatically fixed: - Invalid UTF-8 sequences: replaced with U+FFFD replacement character - Unescaped control characters (0x00-0x1F): escaped to \u00XX format - Invalid surrogate escapes: handled via simdjson's allow_replacement When disabled (default), parsing fails fast on any of these issues. Changes: - Add --sanitize-invalid-json flag to CommandLineArguments - Add sanitize_utf8_buffer() to Utils for UTF-8 validation/replacement - Update JsonFileIterator to conditionally sanitize on UTF8_ERROR and UNESCAPED_CHARS errors during both initial parsing and iteration - Update JsonParser to pass flag through and use conditional get_string()/unescaped_key() for surrogate escape handling --- .../core/src/clp_s/CommandLineArguments.cpp | 6 + .../core/src/clp_s/CommandLineArguments.hpp | 3 + .../core/src/clp_s/JsonFileIterator.cpp | 227 ++++++++++++++---- .../core/src/clp_s/JsonFileIterator.hpp | 3 + components/core/src/clp_s/JsonParser.cpp | 14 +- components/core/src/clp_s/JsonParser.hpp | 2 + components/core/src/clp_s/Utils.cpp | 142 +++++++++++ components/core/src/clp_s/Utils.hpp | 23 ++ components/core/src/clp_s/clp-s.cpp | 1 + 9 files changed, 370 insertions(+), 51 deletions(-) diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 72c0d50d80..fef29442f6 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -256,6 +256,12 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "structurize-arrays", po::bool_switch(&m_structurize_arrays), "Structurize arrays instead of compressing them as clp strings." + )( + "sanitize-invalid-json", + po::bool_switch(&m_sanitize_invalid_json), + "Sanitize invalid JSON by escaping unescaped control characters (0x00-0x1F)," + " replacing invalid UTF-8 sequences with U+FFFD, and handling invalid" + " surrogate escapes. When disabled (default), parsing fails on invalid JSON." )( "disable-log-order", po::bool_switch(&m_disable_log_order), diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index b35124981a..2e32b57a68 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -113,6 +113,8 @@ class CommandLineArguments { bool get_structurize_arrays() const { return m_structurize_arrays; } + bool get_sanitize_invalid_json() const { return m_sanitize_invalid_json; } + bool get_ordered_decompression() const { return m_ordered_decompression; } size_t get_target_ordered_chunk_size() const { return m_target_ordered_chunk_size; } @@ -202,6 +204,7 @@ class CommandLineArguments { bool m_no_retain_float_format{false}; bool m_single_file_archive{false}; bool m_structurize_arrays{false}; + bool m_sanitize_invalid_json{false}; bool m_ordered_decompression{false}; size_t m_target_ordered_chunk_size{}; bool m_print_ordered_chunk_stats{false}; diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index 07c2040944..bc9cc234e8 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -12,6 +12,7 @@ namespace clp_s { JsonFileIterator::JsonFileIterator( clp::ReaderInterface& reader, size_t max_document_size, + bool sanitize_invalid_json, std::string path, size_t buf_size ) @@ -19,7 +20,8 @@ JsonFileIterator::JsonFileIterator( m_max_document_size(max_document_size), m_buf(new char[buf_size + simdjson::SIMDJSON_PADDING]), m_reader(reader), - m_path(std::move(path)) { + m_path(std::move(path)), + m_sanitize_invalid_json(sanitize_invalid_json) { read_new_json(); } @@ -69,50 +71,91 @@ bool JsonFileIterator::read_new_json() { ) .get(m_stream); - // If we encounter unescaped control characters in JSON strings, sanitize the buffer - // by escaping them to \u00XX format and retry parsing - if (simdjson::error_code::UNESCAPED_CHARS == error) { - size_t const old_buf_occupied = m_buf_occupied; - // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by - // reference if reallocation is needed. This keeps m_buf_size in sync with the - // actual allocated buffer size. - auto const result = StringUtils::sanitize_json_buffer( - m_buf, - m_buf_size, - m_buf_occupied, - simdjson::SIMDJSON_PADDING - ); - m_buf_occupied = result.new_buf_occupied; - m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; - - // Build log message with details of sanitized characters - size_t total_sanitized = 0; - std::string char_details; - for (auto const& [ch, count] : result.sanitized_char_counts) { - if (!char_details.empty()) { - char_details += ", "; + // If sanitization is enabled and we encounter errors that can be fixed by sanitization, + // sanitize the buffer and retry parsing + if (m_sanitize_invalid_json) { + // Handle invalid UTF-8 sequences by replacing with U+FFFD + if (simdjson::error_code::UTF8_ERROR == error) { + size_t const old_buf_occupied = m_buf_occupied; + auto const result = StringUtils::sanitize_utf8_buffer( + m_buf, + m_buf_size, + m_buf_occupied, + simdjson::SIMDJSON_PADDING + ); + m_buf_occupied = result.new_buf_occupied; + m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; + + if (!result.sanitized_char_counts.empty()) { + size_t total_replaced = 0; + for (auto const& [ch, count] : result.sanitized_char_counts) { + total_replaced += count; + } + SPDLOG_WARN( + "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size " + "changed by {} bytes ({} -> {}).", + total_replaced, + m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), + static_cast(m_buf_occupied) + - static_cast(old_buf_occupied), + old_buf_occupied, + m_buf_occupied + ); + } + + error = m_parser.iterate_many( + m_buf, + /* length of data */ m_buf_occupied, + /* batch size of data to parse*/ m_buf_occupied + ) + .get(m_stream); + } + + // Handle unescaped control characters by escaping them to \u00XX format + if (simdjson::error_code::UNESCAPED_CHARS == error) { + size_t const old_buf_occupied = m_buf_occupied; + // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by + // reference if reallocation is needed. This keeps m_buf_size in sync with the + // actual allocated buffer size. + auto const result = StringUtils::sanitize_json_buffer( + m_buf, + m_buf_size, + m_buf_occupied, + simdjson::SIMDJSON_PADDING + ); + m_buf_occupied = result.new_buf_occupied; + m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; + + // Build log message with details of sanitized characters + size_t total_sanitized = 0; + std::string char_details; + for (auto const& [ch, count] : result.sanitized_char_counts) { + if (!char_details.empty()) { + char_details += ", "; + } + char_details + += fmt::format("0x{:02x} ({})", static_cast(ch), count); + total_sanitized += count; } - char_details += fmt::format("0x{:02x} ({})", static_cast(ch), count); - total_sanitized += count; + size_t bytes_added = m_buf_occupied - old_buf_occupied; + SPDLOG_WARN( + "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} " + "bytes ({} -> {}).", + total_sanitized, + m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), + char_details, + bytes_added, + old_buf_occupied, + m_buf_occupied + ); + + error = m_parser.iterate_many( + m_buf, + /* length of data */ m_buf_occupied, + /* batch size of data to parse*/ m_buf_occupied + ) + .get(m_stream); } - size_t bytes_added = m_buf_occupied - old_buf_occupied; - SPDLOG_WARN( - "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} bytes " - "({} -> {}).", - total_sanitized, - m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), - char_details, - bytes_added, - old_buf_occupied, - m_buf_occupied - ); - - error = m_parser.iterate_many( - m_buf, - /* length of data */ m_buf_occupied, - /* batch size of data to parse*/ m_buf_occupied - ) - .get(m_stream); } if (error) { @@ -169,6 +212,59 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i return true; } else if (m_doc_it.error() == simdjson::error_code::UTF8_ERROR) { maybe_utf8_edge_case = true; + } else if (m_sanitize_invalid_json + && m_doc_it.error() == simdjson::error_code::UNESCAPED_CHARS) + { + // Unescaped control characters detected during document iteration. Sanitize the + // buffer and re-setup the document stream to restart from the beginning. + size_t const old_buf_occupied = m_buf_occupied; + auto const result = StringUtils::sanitize_json_buffer( + m_buf, + m_buf_size, + m_buf_occupied, + simdjson::SIMDJSON_PADDING + ); + m_buf_occupied = result.new_buf_occupied; + m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; + + // Log sanitization details + if (!result.sanitized_char_counts.empty()) { + size_t total_sanitized = 0; + std::string char_details; + for (auto const& [ch, count] : result.sanitized_char_counts) { + if (!char_details.empty()) { + char_details += ", "; + } + char_details += fmt::format( + "0x{:02x} ({})", + static_cast(ch), + count + ); + total_sanitized += count; + } + SPDLOG_WARN( + "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by " + "{} bytes ({} -> {}).", + total_sanitized, + m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), + char_details, + m_buf_occupied - old_buf_occupied, + old_buf_occupied, + m_buf_occupied + ); + } + + // Re-setup the document stream and restart iteration + auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied) + .get(m_stream); + if (error) { + m_error_code = error; + return false; + } + m_doc_it = m_stream.begin(); + m_first_doc_in_buffer = true; + m_next_document_position = 0; + continue; } else { m_error_code = m_doc_it.error(); return false; @@ -188,8 +284,49 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i // If we hit a UTF-8 error and either we have reached eof or the buffer occupancy is // greater than the maximum document size we assume that the UTF-8 error must have been // in the middle of the stream. Note: it is possible that the UTF-8 error is at the end - // of the stream and that this is actualy a truncation error. Unfortunately the only way - // to check is to parse it ourselves, so we rely on this heuristic for now. + // of the stream and that this is actually a truncation error. Unfortunately the only + // way to check is to parse it ourselves, so we rely on this heuristic for now. + if (m_sanitize_invalid_json) { + // Sanitize invalid UTF-8 sequences and retry + size_t const old_buf_occupied = m_buf_occupied; + auto const result = StringUtils::sanitize_utf8_buffer( + m_buf, + m_buf_size, + m_buf_occupied, + simdjson::SIMDJSON_PADDING + ); + m_buf_occupied = result.new_buf_occupied; + m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; + + if (!result.sanitized_char_counts.empty()) { + size_t total_replaced = 0; + for (auto const& [ch, count] : result.sanitized_char_counts) { + total_replaced += count; + } + SPDLOG_WARN( + "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size " + "changed by {} bytes ({} -> {}).", + total_replaced, + m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), + static_cast(m_buf_occupied) + - static_cast(old_buf_occupied), + old_buf_occupied, + m_buf_occupied + ); + } + + // Re-setup the document stream and restart iteration + auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied) + .get(m_stream); + if (error) { + m_error_code = error; + return false; + } + m_doc_it = m_stream.begin(); + m_first_doc_in_buffer = true; + m_next_document_position = 0; + continue; + } m_error_code = simdjson::error_code::UTF8_ERROR; return false; } else if (maybe_utf8_edge_case) { diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index dcb4a44d94..8b9636bd14 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -21,12 +21,14 @@ class JsonFileIterator { * @param reader the input stream containing JSON * @param max_document_size the maximum allowed size of a single document + * @param sanitize_invalid_json whether to sanitize invalid JSON (control chars, invalid UTF-8) * @param path optional path to the file being read (used for logging) * @param buf_size the initial buffer size */ explicit JsonFileIterator( clp::ReaderInterface& reader, size_t max_document_size, + bool sanitize_invalid_json, std::string path = {}, size_t buf_size = 1024 * 1024 /* 1 MiB default */ ); @@ -89,6 +91,7 @@ class JsonFileIterator { char* m_buf{nullptr}; clp::ReaderInterface& m_reader; std::string m_path; + bool m_sanitize_invalid_json{false}; simdjson::ondemand::parser m_parser; simdjson::ondemand::document_stream m_stream; bool m_eof{false}; diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index ff36aac8cc..93687beab1 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -144,6 +144,7 @@ JsonParser::JsonParser(JsonParserOption const& option) m_structurize_arrays(option.structurize_arrays), m_record_log_order(option.record_log_order), m_retain_float_format(option.retain_float_format), + m_sanitize_invalid_json(option.sanitize_invalid_json), m_input_paths(option.input_paths), m_network_auth(option.network_auth) { if (false == m_timestamp_key.empty()) { @@ -223,7 +224,7 @@ void JsonParser::parse_obj_in_array(simdjson::ondemand::object line, int32_t par } cur_field = *object_it_stack.top(); - cur_key = cur_field.unescaped_key(true); + cur_key = cur_field.unescaped_key(m_sanitize_invalid_json); cur_value = cur_field.value(); switch (cur_value.type()) { @@ -302,7 +303,7 @@ void JsonParser::parse_obj_in_array(simdjson::ondemand::object line, int32_t par break; } case simdjson::ondemand::json_type::string: { - std::string_view value = cur_value.get_string(true); + std::string_view value = cur_value.get_string(m_sanitize_invalid_json); if (value.find(' ') != std::string::npos) { node_id = m_archive_writer ->add_node(node_id_stack.top(), NodeType::ClpString, cur_key); @@ -407,7 +408,7 @@ void JsonParser::parse_array(simdjson::ondemand::array array, int32_t parent_nod break; } case simdjson::ondemand::json_type::string: { - std::string_view value = cur_value.get_string(true); + std::string_view value = cur_value.get_string(m_sanitize_invalid_json); if (value.find(' ') != std::string::npos) { node_id = m_archive_writer->add_node(parent_node_id, NodeType::ClpString, ""); } else { @@ -452,7 +453,7 @@ void JsonParser::parse_line( do { if (false == object_stack.empty()) { cur_field = *object_it_stack.top(); - cur_key = cur_field.unescaped_key(true); + cur_key = cur_field.unescaped_key(m_sanitize_invalid_json); line = cur_field.value(); } @@ -555,7 +556,7 @@ void JsonParser::parse_line( break; } case simdjson::ondemand::json_type::string: { - std::string_view value = line.get_string(true); + std::string_view value = line.get_string(m_sanitize_invalid_json); auto const matches_timestamp = m_archive_writer->matches_timestamp(node_id_stack.top(), cur_key); if (matches_timestamp) { @@ -668,7 +669,8 @@ auto JsonParser::ingest_json( Path const& path, std::string const& archive_creator_id ) -> bool { - JsonFileIterator json_file_iterator(*reader, m_max_document_size, path.path); + JsonFileIterator + json_file_iterator(*reader, m_max_document_size, m_sanitize_invalid_json, path.path); if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) { SPDLOG_ERROR( "Encountered error - {} - while trying to parse {} after parsing 0 bytes", diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 82f8d04786..e763d5fe71 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -40,6 +40,7 @@ struct JsonParserOption { bool record_log_order{true}; bool retain_float_format{false}; bool single_file_archive{false}; + bool sanitize_invalid_json{false}; NetworkAuthOption network_auth{}; }; @@ -239,6 +240,7 @@ class JsonParser { bool m_structurize_arrays{false}; bool m_record_log_order{true}; bool m_retain_float_format{false}; + bool m_sanitize_invalid_json{false}; absl::flat_hash_map, std::pair> m_ir_node_to_archive_node_id_mapping; diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index 5efcb7d77e..9458c970dd 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -233,6 +233,148 @@ JsonSanitizeResult StringUtils::sanitize_json_buffer( return {sanitized.size(), std::move(sanitized_char_counts)}; } +namespace { +/** + * Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx pattern). + * @param byte The byte to check + * @return true if the byte is a valid continuation byte (0x80-0xBF) + */ +constexpr bool is_continuation_byte(unsigned char byte) { + return (byte & 0xC0) == 0x80; +} + +/** + * Validates a UTF-8 sequence starting at the given position and returns the sequence length. + * @param buf The buffer containing the UTF-8 data + * @param pos Current position in the buffer + * @param buf_occupied Total bytes in the buffer + * @return The length of the valid UTF-8 sequence (1-4), or 0 if invalid + */ +size_t validate_utf8_sequence(char const* buf, size_t pos, size_t buf_occupied) { + auto const byte = static_cast(buf[pos]); + size_t remaining = buf_occupied - pos; + + // ASCII (0x00-0x7F) + if (byte <= 0x7F) { + return 1; + } + + // Invalid: continuation byte without leader, or invalid lead bytes + if (byte < 0xC2 || byte > 0xF4) { + return 0; + } + + // 2-byte sequence (0xC2-0xDF) + if (byte <= 0xDF) { + if (remaining < 2 || !is_continuation_byte(static_cast(buf[pos + 1]))) { + return 0; + } + return 2; + } + + // 3-byte sequence (0xE0-0xEF) + if (byte <= 0xEF) { + if (remaining < 3) { + return 0; + } + auto const byte2 = static_cast(buf[pos + 1]); + auto const byte3 = static_cast(buf[pos + 2]); + + if (!is_continuation_byte(byte2) || !is_continuation_byte(byte3)) { + return 0; + } + + // Check for overlong encoding (E0 requires second byte >= 0xA0) + if (byte == 0xE0 && byte2 < 0xA0) { + return 0; + } + + // Check for surrogate code points (ED with second byte >= 0xA0 means 0xD800-0xDFFF) + if (byte == 0xED && byte2 >= 0xA0) { + return 0; + } + + return 3; + } + + // 4-byte sequence (0xF0-0xF4) + if (remaining < 4) { + return 0; + } + auto const byte2 = static_cast(buf[pos + 1]); + auto const byte3 = static_cast(buf[pos + 2]); + auto const byte4 = static_cast(buf[pos + 3]); + + if (!is_continuation_byte(byte2) || !is_continuation_byte(byte3) + || !is_continuation_byte(byte4)) + { + return 0; + } + + // Check for overlong encoding (F0 requires second byte >= 0x90) + if (byte == 0xF0 && byte2 < 0x90) { + return 0; + } + + // Check for code points > U+10FFFF (F4 requires second byte <= 0x8F) + if (byte == 0xF4 && byte2 > 0x8F) { + return 0; + } + + return 4; +} +} // namespace + +JsonSanitizeResult StringUtils::sanitize_utf8_buffer( + char*& buf, + size_t& buf_size, + size_t buf_occupied, + size_t simdjson_padding +) { + // Build sanitized content in a temporary buffer, replacing invalid UTF-8 sequences + // with the Unicode replacement character U+FFFD (0xEF 0xBF 0xBD). + std::string sanitized; + + // We use a special key to count invalid UTF-8 sequences + // Using 0xFF as the key since it's never a valid UTF-8 byte + std::map sanitized_char_counts; + constexpr char cInvalidUtf8Key = static_cast(0xFF); + + size_t i = 0; + while (i < buf_occupied) { + size_t seq_len = validate_utf8_sequence(buf, i, buf_occupied); + if (seq_len > 0) { + // Valid sequence - copy it + sanitized.append(buf + i, seq_len); + i += seq_len; + } else { + // Invalid sequence - replace with U+FFFD + sanitized.append("\xEF\xBF\xBD", 3); + ++sanitized_char_counts[cInvalidUtf8Key]; + // Skip one byte and continue (maximal subpart replacement strategy) + ++i; + } + } + + // If no changes were made, return early + if (sanitized.size() == buf_occupied) { + return {buf_occupied, {}}; + } + + // Grow buffer if needed to hold sanitized content + if (sanitized.size() > buf_size) { + size_t new_buf_size = sanitized.size(); + char* new_buf = new char[new_buf_size + simdjson_padding]; + delete[] buf; + buf = new_buf; + buf_size = new_buf_size; + } + + // Copy sanitized content to buffer + std::memcpy(buf, sanitized.data(), sanitized.size()); + return {sanitized.size(), std::move(sanitized_char_counts)}; +} + void StringUtils::escape_json_string(std::string& destination, std::string_view const source) { // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path // when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index df5d152444..54111d25a6 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -118,6 +118,29 @@ class StringUtils { size_t simdjson_padding ); + /** + * Sanitizes a buffer by replacing invalid UTF-8 sequences with the Unicode replacement + * character (U+FFFD). This is used to fix malformed data that contains invalid UTF-8 + * which would cause JSON parsing errors. + * + * @param buf Pointer to pointer to the buffer. May be reallocated if expansion is needed. + * The caller's pointer will be updated to point to the new buffer if reallocation + * occurs. The caller is responsible for deleting the buffer. + * @param buf_size Current allocated size of the buffer (excluding simdjson padding). + * This parameter is modified by reference and will be updated if the buffer + * is reallocated to reflect the new buffer size. + * @param buf_occupied Number of bytes currently used in the buffer. + * @param simdjson_padding Amount of padding required after buffer content. + * @return Result containing new buf_occupied and counts of each type of invalid sequence + * replaced. + */ + static JsonSanitizeResult sanitize_utf8_buffer( + char*& buf, + size_t& buf_size, + size_t buf_occupied, + size_t simdjson_padding + ); + private: /** * Converts a character into its two byte hexadecimal representation. diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index c2cf7b6abd..fd24634680 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -107,6 +107,7 @@ bool compress(CommandLineArguments const& command_line_arguments) { option.single_file_archive = command_line_arguments.get_single_file_archive(); option.structurize_arrays = command_line_arguments.get_structurize_arrays(); option.record_log_order = command_line_arguments.get_record_log_order(); + option.sanitize_invalid_json = command_line_arguments.get_sanitize_invalid_json(); clp_s::JsonParser parser(option); if (false == parser.ingest()) { From b0b8b0fc9700604ddcdc3d4972479be7e21585d2 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Fri, 9 Jan 2026 03:07:34 +0800 Subject: [PATCH 05/11] test(clp-s): add unit tests for sanitize_utf8_buffer function Add comprehensive tests for the new UTF-8 sanitization function covering: - Valid UTF-8 passthrough (ASCII, multibyte chars, JSON with UTF-8) - Invalid byte replacement (0xFF, 0xFE, continuation bytes without leader) - Truncated sequences (2/3/4-byte sequences cut short) - Overlong encodings (0xC0, 0xC1 lead bytes) - Surrogate code points (U+D800-U+DFFF) - Code points above U+10FFFF - Mixed valid/invalid UTF-8 - Correct replacement character counts - Buffer growth for many invalid bytes --- .../core/tests/test-clp_s-StringUtils.cpp | 251 ++++++++++++++++++ 1 file changed, 251 insertions(+) diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp index ac9fafdd5c..57bea49abf 100644 --- a/components/core/tests/test-clp_s-StringUtils.cpp +++ b/components/core/tests/test-clp_s-StringUtils.cpp @@ -274,6 +274,257 @@ TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") { } } +// ===================================================================================== +// UTF-8 Sanitization Tests +// ===================================================================================== + +namespace { +/** + * Helper to create a buffer with simdjson padding and run UTF-8 sanitization. + * @param input The input string to sanitize + * @return The sanitized string + */ +auto sanitize_utf8_string(std::string_view input) -> std::string { + size_t buf_size = input.size(); + size_t buf_occupied = input.size(); + char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; + std::memcpy(buf, input.data(), input.size()); + + try { + auto const result = StringUtils::sanitize_utf8_buffer( + buf, + buf_size, + buf_occupied, + simdjson::SIMDJSON_PADDING + ); + std::string output(buf, result.new_buf_occupied); + delete[] buf; + return output; + } catch (...) { + delete[] buf; + throw; + } +} + +/** + * Count occurrences of U+FFFD replacement character (0xEF 0xBF 0xBD) in a string. + */ +auto count_replacement_chars(std::string_view s) -> size_t { + size_t count = 0; + for (size_t i = 0; i + 2 < s.size(); ++i) { + if (static_cast(s[i]) == 0xEF && static_cast(s[i + 1]) == 0xBF + && static_cast(s[i + 2]) == 0xBD) + { + ++count; + i += 2; // Skip past the replacement char + } + } + return count; +} +} // namespace + +TEST_CASE("sanitize_utf8_buffer_no_changes", "[clp_s][StringUtils]") { + SECTION("valid ASCII") { + std::string input = "Hello, World!"; + REQUIRE(sanitize_utf8_string(input) == input); + } + + SECTION("valid UTF-8 multibyte characters") { + // 2-byte: é (U+00E9) = 0xC3 0xA9 + // 3-byte: € (U+20AC) = 0xE2 0x82 0xAC + // 4-byte: 𝄞 (U+1D11E) = 0xF0 0x9D 0x84 0x9E + std::string input = "café € 𝄞"; + REQUIRE(sanitize_utf8_string(input) == input); + } + + SECTION("valid JSON with UTF-8") { + std::string input = R"({"msg": "日本語テスト"})"; + REQUIRE(sanitize_utf8_string(input) == input); + } + + SECTION("empty string") { + std::string input = ""; + REQUIRE(sanitize_utf8_string(input) == input); + } +} + +TEST_CASE("sanitize_utf8_buffer_replaces_invalid_bytes", "[clp_s][StringUtils]") { + // U+FFFD is encoded as 0xEF 0xBF 0xBD in UTF-8 + constexpr char cReplacementChar[] = "\xEF\xBF\xBD"; + + SECTION("single invalid byte 0xFF") { + auto input = "hello\xFF world"s; + std::string expected = "hello" + std::string(cReplacementChar) + " world"; + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("single invalid byte 0xFE") { + auto input = "test\xFE"s; + std::string expected = "test" + std::string(cReplacementChar); + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("multiple invalid bytes") { + auto input = "\xFF\xFE\xFF"s; + std::string expected = std::string(cReplacementChar) + cReplacementChar + cReplacementChar; + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("continuation byte without leader (0x80-0xBF)") { + auto input = "test\x80 value"s; + std::string expected = "test" + std::string(cReplacementChar) + " value"; + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("truncated 2-byte sequence") { + // 0xC3 expects one continuation byte, but we end the string + auto input = "test\xC3"s; + std::string expected = "test" + std::string(cReplacementChar); + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("truncated 3-byte sequence") { + // 0xE2 expects two continuation bytes + auto input = "test\xE2\x82"s; + std::string expected + = "test" + std::string(cReplacementChar) + std::string(cReplacementChar); + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("truncated 4-byte sequence") { + // 0xF0 expects three continuation bytes + auto input = "test\xF0\x9D\x84"s; + std::string expected = "test" + std::string(cReplacementChar) + + std::string(cReplacementChar) + std::string(cReplacementChar); + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("overlong 2-byte encoding") { + // 0xC0 0x80 is overlong encoding of NUL (should be just 0x00) + // 0xC0 and 0xC1 are never valid UTF-8 lead bytes + auto input = "test\xC0\x80"s; + // Both bytes are invalid + std::string expected + = "test" + std::string(cReplacementChar) + std::string(cReplacementChar); + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("surrogate code points (U+D800-U+DFFF)") { + // 0xED 0xA0 0x80 encodes U+D800 (invalid surrogate) + auto input = "test\xED\xA0\x80 end"s; + // The sequence is invalid, bytes replaced individually + std::string expected = "test" + std::string(cReplacementChar) + + std::string(cReplacementChar) + std::string(cReplacementChar) + + " end"; + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("code point above U+10FFFF") { + // 0xF4 0x90 0x80 0x80 would encode U+110000 (above max) + auto input = "test\xF4\x90\x80\x80 end"s; + std::string sanitized = sanitize_utf8_string(input); + // Should contain replacement characters + REQUIRE(count_replacement_chars(sanitized) >= 1); + } +} + +TEST_CASE("sanitize_utf8_buffer_mixed_valid_invalid", "[clp_s][StringUtils]") { + constexpr char cReplacementChar[] = "\xEF\xBF\xBD"; + + SECTION("invalid byte between valid UTF-8") { + // café with invalid byte in middle + auto input = "caf\xC3\xA9\xFF test"s; // café + 0xFF + " test" + std::string expected = "caf\xC3\xA9" + std::string(cReplacementChar) + " test"; + REQUIRE(sanitize_utf8_string(input) == expected); + } + + SECTION("JSON with invalid UTF-8 in value") { + auto input = "{\"msg\": \"test\xFF value\"}"s; + std::string expected = "{\"msg\": \"test" + std::string(cReplacementChar) + " value\"}"; + REQUIRE(sanitize_utf8_string(input) == expected); + } +} + +TEST_CASE("sanitize_utf8_buffer_returns_correct_counts", "[clp_s][StringUtils]") { + SECTION("counts invalid sequences") { + auto input = "\xFF\xFE\xFF"s; // 3 invalid bytes + + size_t buf_size = input.size(); + size_t buf_occupied = input.size(); + char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; + std::memcpy(buf, input.data(), input.size()); + + try { + auto const result = StringUtils::sanitize_utf8_buffer( + buf, + buf_size, + buf_occupied, + simdjson::SIMDJSON_PADDING + ); + + // The count uses 0xFF as a special key for invalid UTF-8 sequences + REQUIRE_FALSE(result.sanitized_char_counts.empty()); + size_t total = 0; + for (auto const& [ch, count] : result.sanitized_char_counts) { + total += count; + } + REQUIRE(total == 3); + + delete[] buf; + } catch (...) { + delete[] buf; + throw; + } + } + + SECTION("returns empty counts when no sanitization needed") { + std::string input = "valid UTF-8 string"; + + size_t buf_size = input.size(); + size_t buf_occupied = input.size(); + char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; + std::memcpy(buf, input.data(), input.size()); + + try { + auto const result = StringUtils::sanitize_utf8_buffer( + buf, + buf_size, + buf_occupied, + simdjson::SIMDJSON_PADDING + ); + + REQUIRE(result.sanitized_char_counts.empty()); + REQUIRE(result.new_buf_occupied == input.size()); + + delete[] buf; + } catch (...) { + delete[] buf; + throw; + } + } +} + +TEST_CASE("sanitize_utf8_buffer_handles_buffer_growth", "[clp_s][StringUtils]") { + // Each invalid byte (1 byte) becomes U+FFFD (3 bytes), so buffer may need to grow + SECTION("many invalid bytes requiring expansion") { + // 100 invalid bytes -> 300 bytes after replacement with U+FFFD + std::string input(100, '\xFF'); + + std::string sanitized = sanitize_utf8_string(input); + + // Verify all bytes were replaced + REQUIRE(sanitized.find('\xFF') == std::string::npos); + // Verify correct number of replacement characters + REQUIRE(count_replacement_chars(sanitized) == 100); + // Verify size: 100 * 3 = 300 bytes + REQUIRE(sanitized.size() == 300); + } +} + +// ===================================================================================== +// JSON Sanitization Character Count Tests +// ===================================================================================== + TEST_CASE("sanitize_json_buffer_returns_correct_char_counts", "[clp_s][StringUtils]") { SECTION("counts multiple different control characters") { // Input with: 3x \x00, 2x \x01, 1x \x1f From ee13dd3da4e8d827575af7308ee259b850b3bea2 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Fri, 9 Jan 2026 03:41:51 +0800 Subject: [PATCH 06/11] fix(clp_s): prevent infinite loop if sanitization makes no changes Add guards in get_json() to exit with error if sanitization functions don't modify the buffer. This prevents infinite loops in edge cases where simdjson reports UNESCAPED_CHARS or UTF8_ERROR but the sanitization functions don't find anything to fix. --- components/core/src/clp_s/JsonFileIterator.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index bc9cc234e8..8152ad6d09 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -252,6 +252,11 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i old_buf_occupied, m_buf_occupied ); + } else { + // Sanitization made no changes - report the original error to avoid infinite + // loop + m_error_code = m_doc_it.error(); + return false; } // Re-setup the document stream and restart iteration @@ -313,6 +318,11 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i old_buf_occupied, m_buf_occupied ); + } else { + // Sanitization made no changes - report the original error to avoid infinite + // loop + m_error_code = simdjson::error_code::UTF8_ERROR; + return false; } // Re-setup the document stream and restart iteration From 9c2140ea70c2091b112dd353de0c03ffb697bc40 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Fri, 9 Jan 2026 03:48:12 +0800 Subject: [PATCH 07/11] style(clp_s): use 'false == expr' instead of '!expr' per coding standards --- components/core/tests/test-clp_s-StringUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp index 57bea49abf..48794f8d4f 100644 --- a/components/core/tests/test-clp_s-StringUtils.cpp +++ b/components/core/tests/test-clp_s-StringUtils.cpp @@ -60,7 +60,7 @@ auto can_parse_json(std::string_view json) -> bool { simdjson::ondemand::parser parser; simdjson::padded_string padded(json); auto result = parser.iterate(padded); - return !result.error(); + return false == result.error(); } } // namespace From 4dbf8f32ea974e36fe1d268604b8369053634968 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Fri, 9 Jan 2026 03:53:29 +0800 Subject: [PATCH 08/11] refactor(clp_s): extract sanitize_and_log helper methods Extract sanitize_and_log_utf8() and sanitize_and_log_json() helper methods to reduce code duplication. The sanitization-and-log pattern was repeated 4 times with slight variations. Both helpers return true if sanitization made changes, allowing callers to detect infinite loop conditions in get_json(). --- .../core/src/clp_s/JsonFileIterator.cpp | 217 +++++++----------- .../core/src/clp_s/JsonFileIterator.hpp | 14 ++ 2 files changed, 94 insertions(+), 137 deletions(-) diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index 8152ad6d09..afe014160c 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -76,85 +76,18 @@ bool JsonFileIterator::read_new_json() { if (m_sanitize_invalid_json) { // Handle invalid UTF-8 sequences by replacing with U+FFFD if (simdjson::error_code::UTF8_ERROR == error) { - size_t const old_buf_occupied = m_buf_occupied; - auto const result = StringUtils::sanitize_utf8_buffer( - m_buf, - m_buf_size, - m_buf_occupied, - simdjson::SIMDJSON_PADDING - ); - m_buf_occupied = result.new_buf_occupied; - m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; - - if (!result.sanitized_char_counts.empty()) { - size_t total_replaced = 0; - for (auto const& [ch, count] : result.sanitized_char_counts) { - total_replaced += count; - } - SPDLOG_WARN( - "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size " - "changed by {} bytes ({} -> {}).", - total_replaced, - m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), - static_cast(m_buf_occupied) - - static_cast(old_buf_occupied), - old_buf_occupied, - m_buf_occupied - ); - } - - error = m_parser.iterate_many( - m_buf, - /* length of data */ m_buf_occupied, - /* batch size of data to parse*/ m_buf_occupied - ) - .get(m_stream); + // Return value intentionally ignored - in read_new_json we always retry after + // sanitization regardless of whether changes were made + static_cast(sanitize_and_log_utf8()); + error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream); } // Handle unescaped control characters by escaping them to \u00XX format if (simdjson::error_code::UNESCAPED_CHARS == error) { - size_t const old_buf_occupied = m_buf_occupied; - // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by - // reference if reallocation is needed. This keeps m_buf_size in sync with the - // actual allocated buffer size. - auto const result = StringUtils::sanitize_json_buffer( - m_buf, - m_buf_size, - m_buf_occupied, - simdjson::SIMDJSON_PADDING - ); - m_buf_occupied = result.new_buf_occupied; - m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; - - // Build log message with details of sanitized characters - size_t total_sanitized = 0; - std::string char_details; - for (auto const& [ch, count] : result.sanitized_char_counts) { - if (!char_details.empty()) { - char_details += ", "; - } - char_details - += fmt::format("0x{:02x} ({})", static_cast(ch), count); - total_sanitized += count; - } - size_t bytes_added = m_buf_occupied - old_buf_occupied; - SPDLOG_WARN( - "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} " - "bytes ({} -> {}).", - total_sanitized, - m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), - char_details, - bytes_added, - old_buf_occupied, - m_buf_occupied - ); - - error = m_parser.iterate_many( - m_buf, - /* length of data */ m_buf_occupied, - /* batch size of data to parse*/ m_buf_occupied - ) - .get(m_stream); + // Return value intentionally ignored - in read_new_json we always retry after + // sanitization regardless of whether changes were made + static_cast(sanitize_and_log_json()); + error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream); } } @@ -189,6 +122,76 @@ size_t JsonFileIterator::skip_whitespace_and_get_truncated_bytes() { return m_buf_occupied - m_next_document_position; } +bool JsonFileIterator::sanitize_and_log_utf8() { + size_t const old_buf_occupied = m_buf_occupied; + auto const result = StringUtils::sanitize_utf8_buffer( + m_buf, + m_buf_size, + m_buf_occupied, + simdjson::SIMDJSON_PADDING + ); + m_buf_occupied = result.new_buf_occupied; + m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; + + if (result.sanitized_char_counts.empty()) { + return false; + } + + size_t total_replaced = 0; + for (auto const& [ch, count] : result.sanitized_char_counts) { + total_replaced += count; + } + SPDLOG_WARN( + "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size changed by {} bytes " + "({} -> {}).", + total_replaced, + m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), + static_cast(m_buf_occupied) - static_cast(old_buf_occupied), + old_buf_occupied, + m_buf_occupied + ); + return true; +} + +bool JsonFileIterator::sanitize_and_log_json() { + size_t const old_buf_occupied = m_buf_occupied; + // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by reference if + // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size. + auto const result = StringUtils::sanitize_json_buffer( + m_buf, + m_buf_size, + m_buf_occupied, + simdjson::SIMDJSON_PADDING + ); + m_buf_occupied = result.new_buf_occupied; + m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; + + if (result.sanitized_char_counts.empty()) { + return false; + } + + size_t total_sanitized = 0; + std::string char_details; + for (auto const& [ch, count] : result.sanitized_char_counts) { + if (false == char_details.empty()) { + char_details += ", "; + } + char_details += fmt::format("0x{:02x} ({})", static_cast(ch), count); + total_sanitized += count; + } + SPDLOG_WARN( + "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} bytes " + "({} -> {}).", + total_sanitized, + m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), + char_details, + m_buf_occupied - old_buf_occupied, + old_buf_occupied, + m_buf_occupied + ); + return true; +} + bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& it) { if (false == m_first_read) { ++m_doc_it; @@ -217,42 +220,7 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i { // Unescaped control characters detected during document iteration. Sanitize the // buffer and re-setup the document stream to restart from the beginning. - size_t const old_buf_occupied = m_buf_occupied; - auto const result = StringUtils::sanitize_json_buffer( - m_buf, - m_buf_size, - m_buf_occupied, - simdjson::SIMDJSON_PADDING - ); - m_buf_occupied = result.new_buf_occupied; - m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; - - // Log sanitization details - if (!result.sanitized_char_counts.empty()) { - size_t total_sanitized = 0; - std::string char_details; - for (auto const& [ch, count] : result.sanitized_char_counts) { - if (!char_details.empty()) { - char_details += ", "; - } - char_details += fmt::format( - "0x{:02x} ({})", - static_cast(ch), - count - ); - total_sanitized += count; - } - SPDLOG_WARN( - "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by " - "{} bytes ({} -> {}).", - total_sanitized, - m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), - char_details, - m_buf_occupied - old_buf_occupied, - old_buf_occupied, - m_buf_occupied - ); - } else { + if (false == sanitize_and_log_json()) { // Sanitization made no changes - report the original error to avoid infinite // loop m_error_code = m_doc_it.error(); @@ -293,32 +261,7 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i // way to check is to parse it ourselves, so we rely on this heuristic for now. if (m_sanitize_invalid_json) { // Sanitize invalid UTF-8 sequences and retry - size_t const old_buf_occupied = m_buf_occupied; - auto const result = StringUtils::sanitize_utf8_buffer( - m_buf, - m_buf_size, - m_buf_occupied, - simdjson::SIMDJSON_PADDING - ); - m_buf_occupied = result.new_buf_occupied; - m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; - - if (!result.sanitized_char_counts.empty()) { - size_t total_replaced = 0; - for (auto const& [ch, count] : result.sanitized_char_counts) { - total_replaced += count; - } - SPDLOG_WARN( - "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size " - "changed by {} bytes ({} -> {}).", - total_replaced, - m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), - static_cast(m_buf_occupied) - - static_cast(old_buf_occupied), - old_buf_occupied, - m_buf_occupied - ); - } else { + if (false == sanitize_and_log_utf8()) { // Sanitization made no changes - report the original error to avoid infinite // loop m_error_code = simdjson::error_code::UTF8_ERROR; diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index 8b9636bd14..0ef58d6fa3 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -81,6 +81,20 @@ class JsonFileIterator { */ [[nodiscard]] size_t skip_whitespace_and_get_truncated_bytes(); + /** + * Sanitizes invalid UTF-8 sequences in the buffer by replacing them with U+FFFD, + * updates buffer tracking variables, and logs a warning if changes were made. + * @return true if sanitization made changes, false otherwise + */ + [[nodiscard]] bool sanitize_and_log_utf8(); + + /** + * Sanitizes unescaped control characters in the buffer by escaping them to \\u00XX format, + * updates buffer tracking variables, and logs a warning if changes were made. + * @return true if sanitization made changes, false otherwise + */ + [[nodiscard]] bool sanitize_and_log_json(); + size_t m_truncated_bytes{0}; size_t m_next_document_position{0}; size_t m_bytes_read{0}; From c68c60701776b7830bef46e25950b537945853c3 Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Fri, 9 Jan 2026 03:58:03 +0800 Subject: [PATCH 09/11] refactor(clp_s): extract test helpers to reduce duplication - Add SanitizeResultWithOutput struct for returning both result and output - Add sanitize_json_with_result() and sanitize_utf8_with_result() helpers - Simplify sanitize_string() and sanitize_utf8_string() to use helpers - Replace manual buffer management in count tests with helper calls This reduces code duplication in test files and improves maintainability. --- .../core/tests/test-clp_s-StringUtils.cpp | 211 +++++++----------- 1 file changed, 82 insertions(+), 129 deletions(-) diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp index 48794f8d4f..f7969ecbee 100644 --- a/components/core/tests/test-clp_s-StringUtils.cpp +++ b/components/core/tests/test-clp_s-StringUtils.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -19,36 +20,47 @@ using namespace std::string_literals; namespace { /** - * Helper to create a buffer with simdjson padding and run sanitization. + * Result structure containing both the sanitization result and the output string. + */ +struct SanitizeResultWithOutput { + JsonSanitizeResult result; + std::string output; +}; + +/** + * Helper to create a buffer with simdjson padding and run JSON sanitization. + * Returns both the result (with character counts) and the sanitized output string. * @param input The input string to sanitize - * @return The sanitized string - * @note If sanitize_json_buffer reallocates the buffer, the new buffer pointer is returned - * and the original buffer is automatically deleted by sanitize_json_buffer. This helper - * correctly handles both cases (reallocation and no reallocation) by always deleting - * the final buffer pointer. + * @return SanitizeResultWithOutput containing result metadata and sanitized string + * @note sanitize_json_buffer may reallocate the buffer (passed by reference). This helper + * handles both reallocation and no-reallocation cases correctly. */ -auto sanitize_string(std::string_view input) -> std::string { +auto sanitize_json_with_result(std::string_view input) -> SanitizeResultWithOutput { size_t buf_size = input.size(); size_t buf_occupied = input.size(); + // Use raw pointer since sanitize_json_buffer may delete and reallocate char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; std::memcpy(buf, input.data(), input.size()); - try { - auto const result = StringUtils::sanitize_json_buffer( - buf, - buf_size, - buf_occupied, - simdjson::SIMDJSON_PADDING - ); - std::string output(buf, result.new_buf_occupied); - // Delete the buffer (may be reallocated, but buf points to the current buffer) - delete[] buf; - return output; - } catch (...) { - // If sanitization failed, buf still points to the original buffer - delete[] buf; - throw; - } + auto result = StringUtils::sanitize_json_buffer( + buf, + buf_size, + buf_occupied, + simdjson::SIMDJSON_PADDING + ); + std::string output(buf, result.new_buf_occupied); + // buf now points to the final buffer (may have been reallocated) + delete[] buf; + return {std::move(result), std::move(output)}; +} + +/** + * Helper to create a buffer with simdjson padding and run JSON sanitization. + * @param input The input string to sanitize + * @return The sanitized string + */ +auto sanitize_string(std::string_view input) -> std::string { + return sanitize_json_with_result(input).output; } /** @@ -281,29 +293,38 @@ TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") { namespace { /** * Helper to create a buffer with simdjson padding and run UTF-8 sanitization. + * Returns both the result (with character counts) and the sanitized output string. * @param input The input string to sanitize - * @return The sanitized string + * @return SanitizeResultWithOutput containing result metadata and sanitized string + * @note sanitize_utf8_buffer may reallocate the buffer (passed by reference). This helper + * handles both reallocation and no-reallocation cases correctly. */ -auto sanitize_utf8_string(std::string_view input) -> std::string { +auto sanitize_utf8_with_result(std::string_view input) -> SanitizeResultWithOutput { size_t buf_size = input.size(); size_t buf_occupied = input.size(); + // Use raw pointer since sanitize_utf8_buffer may delete and reallocate char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; std::memcpy(buf, input.data(), input.size()); - try { - auto const result = StringUtils::sanitize_utf8_buffer( - buf, - buf_size, - buf_occupied, - simdjson::SIMDJSON_PADDING - ); - std::string output(buf, result.new_buf_occupied); - delete[] buf; - return output; - } catch (...) { - delete[] buf; - throw; - } + auto result = StringUtils::sanitize_utf8_buffer( + buf, + buf_size, + buf_occupied, + simdjson::SIMDJSON_PADDING + ); + std::string output(buf, result.new_buf_occupied); + // buf now points to the final buffer (may have been reallocated) + delete[] buf; + return {std::move(result), std::move(output)}; +} + +/** + * Helper to create a buffer with simdjson padding and run UTF-8 sanitization. + * @param input The input string to sanitize + * @return The sanitized string + */ +auto sanitize_utf8_string(std::string_view input) -> std::string { + return sanitize_utf8_with_result(input).output; } /** @@ -449,58 +470,24 @@ TEST_CASE("sanitize_utf8_buffer_returns_correct_counts", "[clp_s][StringUtils]") SECTION("counts invalid sequences") { auto input = "\xFF\xFE\xFF"s; // 3 invalid bytes - size_t buf_size = input.size(); - size_t buf_occupied = input.size(); - char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; - std::memcpy(buf, input.data(), input.size()); - - try { - auto const result = StringUtils::sanitize_utf8_buffer( - buf, - buf_size, - buf_occupied, - simdjson::SIMDJSON_PADDING - ); - - // The count uses 0xFF as a special key for invalid UTF-8 sequences - REQUIRE_FALSE(result.sanitized_char_counts.empty()); - size_t total = 0; - for (auto const& [ch, count] : result.sanitized_char_counts) { - total += count; - } - REQUIRE(total == 3); - - delete[] buf; - } catch (...) { - delete[] buf; - throw; + auto [result, output] = sanitize_utf8_with_result(input); + + // The count uses 0xFF as a special key for invalid UTF-8 sequences + REQUIRE_FALSE(result.sanitized_char_counts.empty()); + size_t total = 0; + for (auto const& [ch, count] : result.sanitized_char_counts) { + total += count; } + REQUIRE(total == 3); } SECTION("returns empty counts when no sanitization needed") { std::string input = "valid UTF-8 string"; - size_t buf_size = input.size(); - size_t buf_occupied = input.size(); - char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; - std::memcpy(buf, input.data(), input.size()); - - try { - auto const result = StringUtils::sanitize_utf8_buffer( - buf, - buf_size, - buf_occupied, - simdjson::SIMDJSON_PADDING - ); - - REQUIRE(result.sanitized_char_counts.empty()); - REQUIRE(result.new_buf_occupied == input.size()); - - delete[] buf; - } catch (...) { - delete[] buf; - throw; - } + auto [result, output] = sanitize_utf8_with_result(input); + + REQUIRE(result.sanitized_char_counts.empty()); + REQUIRE(result.new_buf_occupied == input.size()); } } @@ -530,54 +517,20 @@ TEST_CASE("sanitize_json_buffer_returns_correct_char_counts", "[clp_s][StringUti // Input with: 3x \x00, 2x \x01, 1x \x1f auto input = "{\"a\": \"\x00\x00\x01\x00\x01\x1f\"}"s; - size_t buf_size = input.size(); - size_t buf_occupied = input.size(); - char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; - std::memcpy(buf, input.data(), input.size()); - - try { - auto const result = StringUtils::sanitize_json_buffer( - buf, - buf_size, - buf_occupied, - simdjson::SIMDJSON_PADDING - ); - - REQUIRE(result.sanitized_char_counts.size() == 3); - REQUIRE(result.sanitized_char_counts.at('\x00') == 3); - REQUIRE(result.sanitized_char_counts.at('\x01') == 2); - REQUIRE(result.sanitized_char_counts.at('\x1f') == 1); - - delete[] buf; - } catch (...) { - delete[] buf; - throw; - } + auto [result, output] = sanitize_json_with_result(input); + + REQUIRE(result.sanitized_char_counts.size() == 3); + REQUIRE(result.sanitized_char_counts.at('\x00') == 3); + REQUIRE(result.sanitized_char_counts.at('\x01') == 2); + REQUIRE(result.sanitized_char_counts.at('\x1f') == 1); } SECTION("returns empty counts when no sanitization needed") { std::string input = R"({"key": "valid value"})"; - size_t buf_size = input.size(); - size_t buf_occupied = input.size(); - char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; - std::memcpy(buf, input.data(), input.size()); - - try { - auto const result = StringUtils::sanitize_json_buffer( - buf, - buf_size, - buf_occupied, - simdjson::SIMDJSON_PADDING - ); - - REQUIRE(result.sanitized_char_counts.empty()); - REQUIRE(result.new_buf_occupied == input.size()); - - delete[] buf; - } catch (...) { - delete[] buf; - throw; - } + auto [result, output] = sanitize_json_with_result(input); + + REQUIRE(result.sanitized_char_counts.empty()); + REQUIRE(result.new_buf_occupied == input.size()); } } From cce34fbeb99f822bfa4fb0745dfa627c5ef80fce Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Fri, 9 Jan 2026 04:25:07 +0800 Subject: [PATCH 10/11] refactor(clp_s): improve sanitization code quality and efficiency Code quality improvements: - Rename JsonSanitizeResult to BufferSanitizeResult for clarity - Rename helper methods: sanitize_and_log_utf8 -> sanitize_invalid_utf8_and_log, sanitize_and_log_json -> sanitize_control_chars_and_log - Fix typo: "delimeters" -> "delimiters" - Fix docs: "Pointer to pointer" -> "Reference to pointer" - Add @note about buffer reallocation to both sanitization helpers - Add null pointer checks to sanitization functions - Extract reinitialize_document_stream() helper to reduce duplication - Add named constant cUtf8ReplacementChar for U+FFFD magic string - Remove unused variable in char_to_hex() - Use consistent signed arithmetic in log messages Efficiency improvements: - Use lazy-copy approach: only allocate when first issue is found - Copy valid ranges in bulk using append(ptr, len) instead of byte-by-byte - Zero allocation in the common case when no sanitization is needed --- .../core/src/clp_s/JsonFileIterator.cpp | 48 +++++----- .../core/src/clp_s/JsonFileIterator.hpp | 15 +++- components/core/src/clp_s/Utils.cpp | 88 ++++++++++++------- components/core/src/clp_s/Utils.hpp | 19 ++-- .../core/tests/test-clp_s-StringUtils.cpp | 4 +- 5 files changed, 105 insertions(+), 69 deletions(-) diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index afe014160c..447e73617d 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -78,7 +78,7 @@ bool JsonFileIterator::read_new_json() { if (simdjson::error_code::UTF8_ERROR == error) { // Return value intentionally ignored - in read_new_json we always retry after // sanitization regardless of whether changes were made - static_cast(sanitize_and_log_utf8()); + static_cast(sanitize_invalid_utf8_and_log()); error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream); } @@ -86,7 +86,7 @@ bool JsonFileIterator::read_new_json() { if (simdjson::error_code::UNESCAPED_CHARS == error) { // Return value intentionally ignored - in read_new_json we always retry after // sanitization regardless of whether changes were made - static_cast(sanitize_and_log_json()); + static_cast(sanitize_control_chars_and_log()); error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream); } } @@ -122,8 +122,10 @@ size_t JsonFileIterator::skip_whitespace_and_get_truncated_bytes() { return m_buf_occupied - m_next_document_position; } -bool JsonFileIterator::sanitize_and_log_utf8() { +bool JsonFileIterator::sanitize_invalid_utf8_and_log() { size_t const old_buf_occupied = m_buf_occupied; + // Note: sanitize_utf8_buffer may reallocate m_buf and will update m_buf_size by reference if + // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size. auto const result = StringUtils::sanitize_utf8_buffer( m_buf, m_buf_size, @@ -142,7 +144,7 @@ bool JsonFileIterator::sanitize_and_log_utf8() { total_replaced += count; } SPDLOG_WARN( - "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size changed by {} bytes " + "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer expanded by {} bytes " "({} -> {}).", total_replaced, m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), @@ -153,7 +155,7 @@ bool JsonFileIterator::sanitize_and_log_utf8() { return true; } -bool JsonFileIterator::sanitize_and_log_json() { +bool JsonFileIterator::sanitize_control_chars_and_log() { size_t const old_buf_occupied = m_buf_occupied; // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by reference if // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size. @@ -185,13 +187,25 @@ bool JsonFileIterator::sanitize_and_log_json() { total_sanitized, m_path.empty() ? "" : fmt::format(" in file '{}'", m_path), char_details, - m_buf_occupied - old_buf_occupied, + static_cast(m_buf_occupied) - static_cast(old_buf_occupied), old_buf_occupied, m_buf_occupied ); return true; } +bool JsonFileIterator::reinitialize_document_stream() { + auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream); + if (error) { + m_error_code = error; + return false; + } + m_doc_it = m_stream.begin(); + m_first_doc_in_buffer = true; + m_next_document_position = 0; + return true; +} + bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& it) { if (false == m_first_read) { ++m_doc_it; @@ -220,23 +234,16 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i { // Unescaped control characters detected during document iteration. Sanitize the // buffer and re-setup the document stream to restart from the beginning. - if (false == sanitize_and_log_json()) { + if (false == sanitize_control_chars_and_log()) { // Sanitization made no changes - report the original error to avoid infinite // loop m_error_code = m_doc_it.error(); return false; } - // Re-setup the document stream and restart iteration - auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied) - .get(m_stream); - if (error) { - m_error_code = error; + if (false == reinitialize_document_stream()) { return false; } - m_doc_it = m_stream.begin(); - m_first_doc_in_buffer = true; - m_next_document_position = 0; continue; } else { m_error_code = m_doc_it.error(); @@ -261,23 +268,16 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i // way to check is to parse it ourselves, so we rely on this heuristic for now. if (m_sanitize_invalid_json) { // Sanitize invalid UTF-8 sequences and retry - if (false == sanitize_and_log_utf8()) { + if (false == sanitize_invalid_utf8_and_log()) { // Sanitization made no changes - report the original error to avoid infinite // loop m_error_code = simdjson::error_code::UTF8_ERROR; return false; } - // Re-setup the document stream and restart iteration - auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied) - .get(m_stream); - if (error) { - m_error_code = error; + if (false == reinitialize_document_stream()) { return false; } - m_doc_it = m_stream.begin(); - m_first_doc_in_buffer = true; - m_next_document_position = 0; continue; } m_error_code = simdjson::error_code::UTF8_ERROR; diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index 0ef58d6fa3..2b8be94a4c 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -13,7 +13,7 @@ class JsonFileIterator { /** * An iterator over an input stream containing json objects. JSON is parsed * using simdjson::parse_many. This allows simdjson to efficiently find - * delimeters between JSON objects, and if enabled parse JSON ahead of time + * delimiters between JSON objects, and if enabled parse JSON ahead of time * in another thread while the JSON is being iterated over. * * The buffer grows automatically if there are JSON objects larger than the buffer size. @@ -85,15 +85,24 @@ class JsonFileIterator { * Sanitizes invalid UTF-8 sequences in the buffer by replacing them with U+FFFD, * updates buffer tracking variables, and logs a warning if changes were made. * @return true if sanitization made changes, false otherwise + * @note May reallocate m_buf if buffer expansion is needed. m_buf_size is updated accordingly. */ - [[nodiscard]] bool sanitize_and_log_utf8(); + [[nodiscard]] bool sanitize_invalid_utf8_and_log(); /** * Sanitizes unescaped control characters in the buffer by escaping them to \\u00XX format, * updates buffer tracking variables, and logs a warning if changes were made. * @return true if sanitization made changes, false otherwise + * @note May reallocate m_buf if buffer expansion is needed. m_buf_size is updated accordingly. */ - [[nodiscard]] bool sanitize_and_log_json(); + [[nodiscard]] bool sanitize_control_chars_and_log(); + + /** + * Reinitializes the document stream after buffer sanitization. + * Resets iteration state to start from the beginning of the buffer. + * @return true on success, false if iteration setup fails (m_error_code is set on failure) + */ + [[nodiscard]] bool reinitialize_document_stream(); size_t m_truncated_bytes{0}; size_t m_next_document_position{0}; diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index 9458c970dd..321032788f 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -164,64 +164,73 @@ bool UriUtils::get_last_uri_component(std::string_view const uri, std::string& n return true; } -JsonSanitizeResult StringUtils::sanitize_json_buffer( +BufferSanitizeResult StringUtils::sanitize_json_buffer( char*& buf, size_t& buf_size, size_t buf_occupied, size_t simdjson_padding ) { - // Build sanitized content in a temporary buffer, escaping control characters - // inside JSON strings. This is a fallback path only triggered on UNESCAPED_CHARS error, - // so we prioritize correctness over performance. + // Early return for empty or null buffer + if (nullptr == buf || 0 == buf_occupied) { + return {buf_occupied, {}}; + } + + // Sanitize by escaping control characters inside JSON strings. Uses a lazy-copy approach: + // only allocate the output string when the first control character is found, and copy + // valid ranges in bulk rather than byte-by-byte for efficiency. // // Note: If the buffer contains unmatched quotes (e.g., truncated JSON), the string state // tracking may be incorrect, potentially escaping control characters outside of actual // JSON strings. This is acceptable since such malformed JSON will fail parsing anyway. std::string sanitized; - std::map sanitized_char_counts; bool in_string = false; bool escape_next = false; + size_t copy_start = 0; for (size_t i = 0; i < buf_occupied; ++i) { - char c = buf[i]; + char const c = buf[i]; if (escape_next) { escape_next = false; - sanitized.push_back(c); continue; } - if (c == '\\' && in_string) { + if ('\\' == c && in_string) { escape_next = true; - sanitized.push_back(c); continue; } - if (c == '"') { + if ('"' == c) { in_string = !in_string; - sanitized.push_back(c); continue; } // Escape control characters (0x00-0x1F) inside strings to \u00XX format if (in_string && static_cast(c) < 0x20) { + if (sanitized.empty()) { + // First control char found - allocate with extra space for escapes + sanitized.reserve(buf_occupied + 64); + } + // Copy the valid range before this control character + sanitized.append(buf + copy_start, i - copy_start); char_to_escaped_four_char_hex(sanitized, c); ++sanitized_char_counts[c]; - } else { - sanitized.push_back(c); + copy_start = i + 1; } } - // If no changes were made, return early - if (sanitized.size() == buf_occupied) { + // If no sanitization was needed, return early without any allocation + if (sanitized.empty()) { return {buf_occupied, {}}; } + // Copy any remaining content after the last control character + sanitized.append(buf + copy_start, buf_occupied - copy_start); + // Grow buffer if needed to hold sanitized content if (sanitized.size() > buf_size) { - // Allocate exactly the size needed since we know the exact size upfront - size_t new_buf_size = sanitized.size(); + size_t const new_buf_size = sanitized.size(); char* new_buf = new char[new_buf_size + simdjson_padding]; delete[] buf; buf = new_buf; @@ -325,45 +334,60 @@ size_t validate_utf8_sequence(char const* buf, size_t pos, size_t buf_occupied) } } // namespace -JsonSanitizeResult StringUtils::sanitize_utf8_buffer( +BufferSanitizeResult StringUtils::sanitize_utf8_buffer( char*& buf, size_t& buf_size, size_t buf_occupied, size_t simdjson_padding ) { - // Build sanitized content in a temporary buffer, replacing invalid UTF-8 sequences - // with the Unicode replacement character U+FFFD (0xEF 0xBF 0xBD). - std::string sanitized; + // Early return for empty or null buffer + if (nullptr == buf || 0 == buf_occupied) { + return {buf_occupied, {}}; + } - // We use a special key to count invalid UTF-8 sequences - // Using 0xFF as the key since it's never a valid UTF-8 byte - std::map sanitized_char_counts; + // Sanitize by replacing invalid UTF-8 sequences with U+FFFD. Uses a lazy-copy approach: + // only allocate the output string when the first invalid sequence is found, and copy + // valid ranges in bulk rather than byte-by-byte for efficiency. + constexpr std::string_view cUtf8ReplacementChar{"\xEF\xBF\xBD", 3}; constexpr char cInvalidUtf8Key = static_cast(0xFF); + std::string sanitized; + std::map sanitized_char_counts; + size_t copy_start = 0; + size_t i = 0; while (i < buf_occupied) { - size_t seq_len = validate_utf8_sequence(buf, i, buf_occupied); + size_t const seq_len = validate_utf8_sequence(buf, i, buf_occupied); if (seq_len > 0) { - // Valid sequence - copy it - sanitized.append(buf + i, seq_len); + // Valid sequence - advance position (will be copied in bulk later) i += seq_len; } else { - // Invalid sequence - replace with U+FFFD - sanitized.append("\xEF\xBF\xBD", 3); + // Invalid sequence - need to sanitize + if (sanitized.empty()) { + // First invalid sequence found - allocate with extra space for replacements + sanitized.reserve(buf_occupied + 64); + } + // Copy the valid range before this invalid byte + sanitized.append(buf + copy_start, i - copy_start); + sanitized.append(cUtf8ReplacementChar); ++sanitized_char_counts[cInvalidUtf8Key]; // Skip one byte and continue (maximal subpart replacement strategy) ++i; + copy_start = i; } } - // If no changes were made, return early - if (sanitized.size() == buf_occupied) { + // If no sanitization was needed, return early without any allocation + if (sanitized.empty()) { return {buf_occupied, {}}; } + // Copy any remaining valid content after the last invalid byte + sanitized.append(buf + copy_start, buf_occupied - copy_start); + // Grow buffer if needed to hold sanitized content if (sanitized.size() > buf_size) { - size_t new_buf_size = sanitized.size(); + size_t const new_buf_size = sanitized.size(); char* new_buf = new char[new_buf_size + simdjson_padding]; delete[] buf; buf = new_buf; diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index 54111d25a6..95033e18c3 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -65,10 +65,14 @@ class UriUtils { }; /** - * Result of sanitizing a JSON buffer, including statistics about what was sanitized. + * Result of sanitizing a buffer, including statistics about what was sanitized. */ -struct JsonSanitizeResult { +struct BufferSanitizeResult { + /// New number of bytes occupied in the buffer after sanitization. size_t new_buf_occupied; + /// Map of sanitized characters to their occurrence counts. + /// For JSON control char sanitization: key is the actual control character (0x00-0x1F). + /// For UTF-8 sanitization: key is 0xFF (sentinel value for all invalid sequences). std::map sanitized_char_counts; }; @@ -101,7 +105,7 @@ class StringUtils { * * Characters are escaped using unicode escape sequences (e.g., \x00 becomes \u0000). * - * @param buf Pointer to pointer to the buffer. May be reallocated if expansion is needed. + * @param buf Reference to pointer to the buffer. May be reallocated if expansion is needed. * The caller's pointer will be updated to point to the new buffer if reallocation * occurs. The caller is responsible for deleting the buffer. * @param buf_size Current allocated size of the buffer (excluding simdjson padding). @@ -111,7 +115,7 @@ class StringUtils { * @param simdjson_padding Amount of padding required after buffer content. * @return Result containing new buf_occupied and counts of each sanitized character. */ - static JsonSanitizeResult sanitize_json_buffer( + static BufferSanitizeResult sanitize_json_buffer( char*& buf, size_t& buf_size, size_t buf_occupied, @@ -123,7 +127,7 @@ class StringUtils { * character (U+FFFD). This is used to fix malformed data that contains invalid UTF-8 * which would cause JSON parsing errors. * - * @param buf Pointer to pointer to the buffer. May be reallocated if expansion is needed. + * @param buf Reference to pointer to the buffer. May be reallocated if expansion is needed. * The caller's pointer will be updated to point to the new buffer if reallocation * occurs. The caller is responsible for deleting the buffer. * @param buf_size Current allocated size of the buffer (excluding simdjson padding). @@ -134,7 +138,7 @@ class StringUtils { * @return Result containing new buf_occupied and counts of each type of invalid sequence * replaced. */ - static JsonSanitizeResult sanitize_utf8_buffer( + static BufferSanitizeResult sanitize_utf8_buffer( char*& buf, size_t& buf_size, size_t buf_occupied, @@ -148,7 +152,6 @@ class StringUtils { * @return the two byte hexadecimal representation of c as an array of two characters. */ static std::array char_to_hex(char c) { - std::array ret; auto nibble_to_hex = [](char nibble) -> char { if ('\x00' <= nibble && nibble <= '\x09') { return '0' + (nibble - '\x00'); @@ -157,7 +160,7 @@ class StringUtils { } }; - return std::array{nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)}; + return {nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)}; } /** diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp index f7969ecbee..898d0a7be4 100644 --- a/components/core/tests/test-clp_s-StringUtils.cpp +++ b/components/core/tests/test-clp_s-StringUtils.cpp @@ -8,7 +8,7 @@ #include "../src/clp_s/Utils.hpp" -using clp_s::JsonSanitizeResult; +using clp_s::BufferSanitizeResult; using clp_s::StringUtils; // We use C++14 string literals (the `s` suffix) to construct strings containing embedded null bytes @@ -23,7 +23,7 @@ namespace { * Result structure containing both the sanitization result and the output string. */ struct SanitizeResultWithOutput { - JsonSanitizeResult result; + BufferSanitizeResult result; std::string output; }; From 775f513350958e4578f7a3a474db4a98306b17ed Mon Sep 17 00:00:00 2001 From: Jack Luo Date: Fri, 9 Jan 2026 04:29:17 +0800 Subject: [PATCH 11/11] style(test): use explicit while loop for clearer skip logic in count_replacement_chars --- components/core/tests/test-clp_s-StringUtils.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp index 898d0a7be4..e3ef434ecb 100644 --- a/components/core/tests/test-clp_s-StringUtils.cpp +++ b/components/core/tests/test-clp_s-StringUtils.cpp @@ -332,12 +332,15 @@ auto sanitize_utf8_string(std::string_view input) -> std::string { */ auto count_replacement_chars(std::string_view s) -> size_t { size_t count = 0; - for (size_t i = 0; i + 2 < s.size(); ++i) { + size_t i = 0; + while (i + 2 < s.size()) { if (static_cast(s[i]) == 0xEF && static_cast(s[i + 1]) == 0xBF && static_cast(s[i + 2]) == 0xBD) { ++count; - i += 2; // Skip past the replacement char + i += 3; // Skip past the replacement char (3 bytes for U+FFFD) + } else { + ++i; } } return count;