From aae9a7ec0783a9cc52855c48cd6e6d443decf1a9 Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Thu, 8 Jan 2026 20:13:27 +0800
Subject: [PATCH 01/11] feat(clp_s): sanitize JSON buffers with unescaped
 control characters

Add automatic sanitization of JSON buffers that contain unescaped
control characters (0x00-0x1F) inside string values. When simdjson
returns UNESCAPED_CHARS error, the buffer is sanitized by escaping
control characters to \u00XX format, and parsing is retried.

- Add StringUtils::sanitize_json_buffer() that escapes control chars
  inside JSON strings while respecting existing escape sequences
- Integrate sanitization in JsonFileIterator::read_new_json() with
  automatic retry after sanitization
- Track sanitization byte expansion for accurate byte counting
- Log warnings with detailed counts of sanitized characters
- Add comprehensive unit tests for sanitization logic
---
 components/core/CMakeLists.txt                |   1 +
 .../core/src/clp_s/JsonFileIterator.cpp       |  56 +++-
 .../core/src/clp_s/JsonFileIterator.hpp       |   7 +-
 components/core/src/clp_s/Utils.cpp           |  70 ++++
 components/core/src/clp_s/Utils.hpp           |  37 ++
 .../core/tests/test-clp_s-StringUtils.cpp     | 316 ++++++++++++++++++
 6 files changed, 482 insertions(+), 5 deletions(-)
 create mode 100644 components/core/tests/test-clp_s-StringUtils.cpp
diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index 9e9738f05d..f183f15e48 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -704,6 +704,7 @@ set(SOURCE_FILES_unitTest
         tests/test-clp_s-end_to_end.cpp
         tests/test-clp_s-range_index.cpp
         tests/test-clp_s-search.cpp
+        tests/test-clp_s-StringUtils.cpp
         tests/test-EncodedVariableInterpreter.cpp
         tests/test-encoding_methods.cpp
         tests/test-ffi_IrUnitHandlerReq.cpp
diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp
index a0a003d9f7..a6d043e82e 100644
--- a/components/core/src/clp_s/JsonFileIterator.cpp
+++ b/components/core/src/clp_s/JsonFileIterator.cpp
@@ -3,8 +3,11 @@
 #include <cctype>
 #include <cstring>
 
+#include <fmt/format.h>
 #include <spdlog/spdlog.h>
 
+#include "Utils.hpp"
+
 namespace clp_s {
 JsonFileIterator::JsonFileIterator(
         clp::ReaderInterface& reader,
@@ -64,6 +67,51 @@ bool JsonFileIterator::read_new_json() {
         )
                              .get(m_stream);
 
+        // If we encounter unescaped control characters in JSON strings, sanitize the buffer
+        // by escaping them to \u00XX format and retry parsing
+        if (simdjson::error_code::UNESCAPED_CHARS == error) {
+            size_t const old_buf_occupied = m_buf_occupied;
+            // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by
+            // reference if reallocation is needed. This keeps m_buf_size in sync with the
+            // actual allocated buffer size.
+            auto const result = StringUtils::sanitize_json_buffer(
+                    m_buf,
+                    m_buf_size,
+                    m_buf_occupied,
+                    simdjson::SIMDJSON_PADDING
+            );
+            m_buf_occupied = result.new_buf_occupied;
+            m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
+
+            // Build log message with details of sanitized characters
+            size_t total_sanitized = 0;
+            std::string char_details;
+            for (auto const& [ch, count] : result.sanitized_char_counts) {
+                if (!char_details.empty()) {
+                    char_details += ", ";
+                }
+                char_details += fmt::format("0x{:02x} ({})", static_cast<unsigned char>(ch), count);
+                total_sanitized += count;
+            }
+            size_t bytes_added = m_buf_occupied - old_buf_occupied;
+            SPDLOG_WARN(
+                    "Escaped {} control character(s) in JSON: [{}]. Buffer expanded by {} bytes ({} -> {}).",
+                    total_sanitized,
+                    char_details,
+                    bytes_added,
+                    old_buf_occupied,
+                    m_buf_occupied
+            );
+
+            error = m_parser
+                            .iterate_many(
+                                    m_buf,
+                                    /* length of data */ m_buf_occupied,
+                                    /* batch size of data to parse*/ m_buf_occupied
+                            )
+                            .get(m_stream);
+        }
+
         if (error) {
             m_error_code = error;
             return false;
@@ -151,10 +199,12 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
 size_t JsonFileIterator::get_num_bytes_consumed() {
     // If there are more documents left in the current buffer account for how much of the
     // buffer has been consumed, otherwise report the total number of bytes read so that we
-    // capture trailing whitespace.
+    // capture trailing whitespace. Include bytes added by sanitization since the sanitized
+    // content is what gets compressed.
     if (m_doc_it != m_stream.end()) {
-        return m_bytes_read - (m_buf_occupied - m_next_document_position);
+        return m_bytes_read + m_sanitization_bytes_added
+               - (m_buf_occupied - m_next_document_position);
     }
-    return m_bytes_read;
+    return m_bytes_read + m_sanitization_bytes_added;
 }
 }  // namespace clp_s
diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp
index 9102087637..a9732ad5c6 100644
--- a/components/core/src/clp_s/JsonFileIterator.hpp
+++ b/components/core/src/clp_s/JsonFileIterator.hpp
@@ -41,9 +41,11 @@ class JsonFileIterator {
     [[nodiscard]] size_t truncated_bytes() const { return m_truncated_bytes; }
 
     /**
-     * @return total number of bytes read from the file
+     * @return total number of bytes read from the file, plus any bytes added by sanitization
      */
-    [[nodiscard]] size_t get_num_bytes_read() const { return m_bytes_read; }
+    [[nodiscard]] size_t get_num_bytes_read() const {
+        return m_bytes_read + m_sanitization_bytes_added;
+    }
 
     /**
      * Note: this method can not be const because checking if a simdjson iterator is at the end
@@ -76,6 +78,7 @@ class JsonFileIterator {
     size_t m_truncated_bytes{0};
     size_t m_next_document_position{0};
     size_t m_bytes_read{0};
+    size_t m_sanitization_bytes_added{0};
     size_t m_buf_size{0};
     size_t m_buf_occupied{0};
     size_t m_max_document_size{0};
diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp
index 59b18a5bca..5efcb7d77e 100644
--- a/components/core/src/clp_s/Utils.cpp
+++ b/components/core/src/clp_s/Utils.cpp
@@ -2,6 +2,7 @@
 
 #include <charconv>
 #include <cstdint>
+#include <cstring>
 #include <exception>
 #include <filesystem>
 #include <set>
@@ -163,6 +164,75 @@ bool UriUtils::get_last_uri_component(std::string_view const uri, std::string& n
     return true;
 }
 
+JsonSanitizeResult StringUtils::sanitize_json_buffer(
+        char*& buf,
+        size_t& buf_size,
+        size_t buf_occupied,
+        size_t simdjson_padding
+) {
+    // Build sanitized content in a temporary buffer, escaping control characters
+    // inside JSON strings. This is a fallback path only triggered on UNESCAPED_CHARS error,
+    // so we prioritize correctness over performance.
+    //
+    // Note: If the buffer contains unmatched quotes (e.g., truncated JSON), the string state
+    // tracking may be incorrect, potentially escaping control characters outside of actual
+    // JSON strings. This is acceptable since such malformed JSON will fail parsing anyway.
+    std::string sanitized;
+
+    std::map<char, size_t> sanitized_char_counts;
+    bool in_string = false;
+    bool escape_next = false;
+
+    for (size_t i = 0; i < buf_occupied; ++i) {
+        char c = buf[i];
+
+        if (escape_next) {
+            escape_next = false;
+            sanitized.push_back(c);
+            continue;
+        }
+
+        if (c == '\\' && in_string) {
+            escape_next = true;
+            sanitized.push_back(c);
+            continue;
+        }
+
+        if (c == '"') {
+            in_string = !in_string;
+            sanitized.push_back(c);
+            continue;
+        }
+
+        // Escape control characters (0x00-0x1F) inside strings to \u00XX format
+        if (in_string && static_cast<unsigned char>(c) < 0x20) {
+            char_to_escaped_four_char_hex(sanitized, c);
+            ++sanitized_char_counts[c];
+        } else {
+            sanitized.push_back(c);
+        }
+    }
+
+    // If no changes were made, return early
+    if (sanitized.size() == buf_occupied) {
+        return {buf_occupied, {}};
+    }
+
+    // Grow buffer if needed to hold sanitized content
+    if (sanitized.size() > buf_size) {
+        // Allocate exactly the size needed since we know the exact size upfront
+        size_t new_buf_size = sanitized.size();
+        char* new_buf = new char[new_buf_size + simdjson_padding];
+        delete[] buf;
+        buf = new_buf;
+        buf_size = new_buf_size;
+    }
+
+    // Copy sanitized content to buffer
+    std::memcpy(buf, sanitized.data(), sanitized.size());
+    return {sanitized.size(), std::move(sanitized_char_counts)};
+}
+
 void StringUtils::escape_json_string(std::string& destination, std::string_view const source) {
     // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path
     // when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers
diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp
index a711eb8ad6..df5d152444 100644
--- a/components/core/src/clp_s/Utils.hpp
+++ b/components/core/src/clp_s/Utils.hpp
@@ -5,6 +5,7 @@
 #include <charconv>
 #include <cstdint>
 #include <cstring>
+#include <map>
 #include <sstream>
 #include <string>
 #include <string_view>
@@ -63,6 +64,14 @@ class UriUtils {
     static bool get_last_uri_component(std::string_view const uri, std::string& name);
 };
 
+/**
+ * Result of sanitizing a JSON buffer, including statistics about what was sanitized.
+ */
+struct JsonSanitizeResult {
+    size_t new_buf_occupied;
+    std::map<char, size_t> sanitized_char_counts;
+};
+
 class StringUtils {
 public:
     /**
@@ -81,6 +90,34 @@ class StringUtils {
      */
     static void escape_json_string(std::string& destination, std::string_view const source);
 
+    /**
+     * Sanitizes a JSON buffer by escaping unescaped control characters (0x00-0x1F) inside JSON
+     * strings. This is used to fix malformed JSON that contains raw control characters which
+     * would cause parsing errors.
+     *
+     * Only control characters inside JSON string values (between unescaped double quotes) are
+     * escaped. Control characters outside strings are left unchanged as they would cause
+     * structural JSON errors anyway.
+     *
+     * Characters are escaped using unicode escape sequences (e.g., \x00 becomes \u0000).
+     *
+     * @param buf Pointer to pointer to the buffer. May be reallocated if expansion is needed.
+     *             The caller's pointer will be updated to point to the new buffer if reallocation
+     *             occurs. The caller is responsible for deleting the buffer.
+     * @param buf_size Current allocated size of the buffer (excluding simdjson padding).
+     *                 This parameter is modified by reference and will be updated if the buffer
+     *                 is reallocated to reflect the new buffer size.
+     * @param buf_occupied Number of bytes currently used in the buffer.
+     * @param simdjson_padding Amount of padding required after buffer content.
+     * @return Result containing new buf_occupied and counts of each sanitized character.
+     */
+    static JsonSanitizeResult sanitize_json_buffer(
+            char*& buf,
+            size_t& buf_size,
+            size_t buf_occupied,
+            size_t simdjson_padding
+    );
+
 private:
     /**
      * Converts a character into its two byte hexadecimal representation.
diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp
new file mode 100644
index 0000000000..9163281087
--- /dev/null
+++ b/components/core/tests/test-clp_s-StringUtils.cpp
@@ -0,0 +1,316 @@
+#include <cstring>
+#include <string>
+#include <string_view>
+
+#include <catch2/catch_test_macros.hpp>
+#include <simdjson.h>
+
+#include "../src/clp_s/Utils.hpp"
+
+using clp_s::JsonSanitizeResult;
+using clp_s::StringUtils;
+
+// We use C++14 string literals (the `s` suffix) to construct strings containing embedded null bytes
+// and control characters. Without the `s` suffix, std::string's constructor from a C-string literal
+// stops at the first null byte, truncating the test input. Additionally, we use string concatenation
+// (e.g., "\x00" "end") to prevent hex escape sequences from being extended by following hex digits.
+using namespace std::string_literals;
+
+namespace {
+/**
+ * Helper to create a buffer with simdjson padding and run sanitization.
+ * @param input The input string to sanitize
+ * @return The sanitized string
+ * @note If sanitize_json_buffer reallocates the buffer, the new buffer pointer is returned
+ *       and the original buffer is automatically deleted by sanitize_json_buffer. This helper
+ *       correctly handles both cases (reallocation and no reallocation) by always deleting
+ *       the final buffer pointer.
+ */
+auto sanitize_string(std::string_view input) -> std::string {
+    size_t buf_size = input.size();
+    size_t buf_occupied = input.size();
+    char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+    std::memcpy(buf, input.data(), input.size());
+
+    try {
+        auto const result = StringUtils::sanitize_json_buffer(
+                buf,
+                buf_size,
+                buf_occupied,
+                simdjson::SIMDJSON_PADDING
+        );
+        std::string output(buf, result.new_buf_occupied);
+        // Delete the buffer (may be reallocated, but buf points to the current buffer)
+        delete[] buf;
+        return output;
+    } catch (...) {
+        // If sanitization failed, buf still points to the original buffer
+        delete[] buf;
+        throw;
+    }
+}
+
+/**
+ * Helper to verify that a sanitized JSON string can be parsed by simdjson.
+ * @param json The JSON string to parse
+ * @return true if parsing succeeds, false otherwise
+ */
+auto can_parse_json(std::string_view json) -> bool {
+    simdjson::ondemand::parser parser;
+    simdjson::padded_string padded(json);
+    auto result = parser.iterate(padded);
+    return !result.error();
+}
+}  // namespace
+
+TEST_CASE("sanitize_json_buffer_no_changes", "[clp_s][StringUtils]") {
+    // Valid JSON without control characters should pass through unchanged
+    SECTION("simple object") {
+        std::string input = R"({"key": "value"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("object with escaped characters") {
+        std::string input = R"({"key": "line1\nline2\ttab"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("object with unicode escapes") {
+        std::string input = R"({"key": "null char: \u0000"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("multiple objects") {
+        std::string input = R"({"a": "1"}{"b": "2"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("control chars outside strings are unchanged") {
+        // Control chars outside strings aren't valid JSON anyway,
+        // but we don't touch them - let the JSON parser report the error
+        std::string input = "{\x01\"key\": \"value\"}";
+        REQUIRE(sanitize_string(input) == input);
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_escapes_control_chars", "[clp_s][StringUtils]") {
+    SECTION("null byte in string value") {
+        auto input = "{\"key\": \"val\x00ue\"}"s;
+        std::string expected = R"({"key": "val\u0000ue"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("multiple null bytes") {
+        auto input = "{\"key\": \"\x00\x00\x00\"}"s;
+        std::string expected = R"({"key": "\u0000\u0000\u0000"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("various control characters") {
+        // Test 0x01 (SOH), 0x02 (STX), 0x1F (US)
+        auto input = "{\"key\": \"a\x01" "b\x02" "c\x1F" "d\"}"s;
+        std::string expected = R"({"key": "a\u0001b\u0002c\u001fd"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("control char in key") {
+        auto input = "{\"ke\x00y\": \"value\"}"s;
+        std::string expected = R"({"ke\u0000y": "value"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("mixed valid escapes and control chars") {
+        auto input = "{\"key\": \"line1\\nhas\x00null\"}"s;
+        std::string expected = R"({"key": "line1\nhas\u0000null"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_handles_escapes_correctly", "[clp_s][StringUtils]") {
+    SECTION("escaped quote should not toggle string state") {
+        // The \" should not end the string
+        std::string input = R"({"key": "quote:\"value"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("escaped backslash before quote") {
+        // \\" means escaped backslash followed by end of string
+        std::string input = R"({"key": "backslash:\\"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("control char after escaped backslash") {
+        // \\\x00 means escaped backslash followed by a control char still inside the string
+        auto input = "{\"key\": \"slash:\\\\\x00" "end\"}"s;
+        std::string expected = R"({"key": "slash:\\\u0000end"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("control char after escaped quote") {
+        auto input = "{\"key\": \"quote:\\\"\x00" "after\"}"s;
+        std::string expected = R"({"key": "quote:\"\u0000after"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("control char as invalid escape sequence target") {
+        // \<control-char> is not a valid JSON escape sequence, but we leave it as-is
+        // since the sanitizer treats any char after backslash as an escape target.
+        // This JSON is invalid anyway (bad escape sequence) and will fail parsing.
+        auto input = "{\"key\": \"bad:\\\x00" "end\"}"s;
+        auto expected = "{\"key\": \"bad:\\\x00" "end\"}"s;
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("triple backslash followed by control char") {
+        // \\\ followed by \x00: first \\ is escaped backslash, third \ starts escape sequence.
+        // The third backslash makes the null appear to be an escape target, so it's not escaped.
+        auto input = "{\"key\": \"\\\\\\\x00" "end\"}"s;
+        auto expected = "{\"key\": \"\\\\\\\x00" "end\"}"s;
+        REQUIRE(sanitize_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_result_is_valid_json", "[clp_s][StringUtils]") {
+    SECTION("sanitized null byte produces valid JSON") {
+        auto input = "{\"key\": \"val\x00ue\"}"s;
+        std::string sanitized = sanitize_string(input);
+        REQUIRE(can_parse_json(sanitized));
+    }
+
+    SECTION("sanitized multiple control chars produces valid JSON") {
+        auto input = "{\"data\": \"\x01\x02\x03\x04\x05\"}"s;
+        std::string sanitized = sanitize_string(input);
+        REQUIRE(can_parse_json(sanitized));
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_handles_buffer_growth", "[clp_s][StringUtils]") {
+    // Each control char expands from 1 byte to 6 bytes (\u00XX)
+    // Create input that will require buffer growth
+    SECTION("many control chars requiring expansion") {
+        // 100 null bytes -> 600 bytes after escaping
+        std::string value(100, '\x00');
+        std::string input = "{\"key\": \"" + value + "\"}";
+
+        std::string sanitized = sanitize_string(input);
+
+        // Verify all nulls were escaped
+        REQUIRE(sanitized.find('\x00') == std::string::npos);
+        // Verify correct number of escape sequences
+        size_t count = 0;
+        size_t pos = 0;
+        while ((pos = sanitized.find("\\u0000", pos)) != std::string::npos) {
+            ++count;
+            pos += 6;
+        }
+        REQUIRE(count == 100);
+        // Verify result is valid JSON
+        REQUIRE(can_parse_json(sanitized));
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_jsonl", "[clp_s][StringUtils]") {
+    SECTION("multiple JSON objects with control chars") {
+        auto input = "{\"a\": \"x\x00y\"}\n{\"b\": \"p\x01q\"}"s;
+        std::string expected = "{\"a\": \"x\\u0000y\"}\n{\"b\": \"p\\u0001q\"}";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") {
+    SECTION("truncated JSON with unmatched quote - control chars may be escaped incorrectly") {
+        // This tests the edge case documented in the code: if JSON is truncated with unmatched
+        // quotes, string state tracking may be incorrect, potentially escaping control chars
+        // outside of actual JSON strings. This is acceptable since malformed JSON will fail
+        // parsing anyway.
+        //
+        // Input: truncated JSON where the string is not closed, followed by control chars
+        // The sanitizer may incorrectly think it's still in a string and escape the control chars
+        auto input = "{\"key\": \"unclosed string\x00\x01" "after\"}"s;
+        // The sanitizer will escape the control chars because it thinks we're still in a string
+        // (the quote after "unclosed string" is escaped, so the string continues)
+        std::string sanitized = sanitize_string(input);
+        // The control chars should be escaped (behavior may vary based on quote matching)
+        // The important thing is that the function doesn't crash and handles it gracefully
+        REQUIRE((sanitized.find('\x00') == std::string::npos || sanitized.find("\\u0000") != std::string::npos));
+    }
+
+    SECTION("truncated JSON ending mid-string") {
+        // JSON truncated in the middle of a string with control chars (no closing quote)
+        // This simulates a real truncation scenario where the buffer ends mid-string
+        auto input = "{\"key\": \"value\x00\x01" "truncated"s;
+        std::string sanitized = sanitize_string(input);
+        // Should handle gracefully without crashing
+        // Control chars inside the (unclosed) string should be escaped
+        REQUIRE(sanitized.size() >= input.size());
+        // Verify no raw control characters remain (they should be escaped)
+        REQUIRE(sanitized.find('\x00') == std::string::npos);
+        REQUIRE(sanitized.find('\x01') == std::string::npos);
+    }
+
+    SECTION("truncated JSON with control chars after unmatched quote") {
+        // JSON with unmatched quote followed by control chars outside the string
+        // The sanitizer may incorrectly escape these if it thinks we're still in a string
+        auto input = "{\"key\": \"value\x00" "}\x01\x02"s;
+        std::string sanitized = sanitize_string(input);
+        // Function should not crash - behavior may vary but should be consistent
+        REQUIRE(sanitized.size() >= input.size());
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_returns_correct_char_counts", "[clp_s][StringUtils]") {
+    SECTION("counts multiple different control characters") {
+        // Input with: 3x \x00, 2x \x01, 1x \x1f
+        auto input = "{\"a\": \"\x00\x00\x01\x00\x01\x1f\"}"s;
+
+        size_t buf_size = input.size();
+        size_t buf_occupied = input.size();
+        char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+        std::memcpy(buf, input.data(), input.size());
+
+        try {
+            auto const result = StringUtils::sanitize_json_buffer(
+                    buf,
+                    buf_size,
+                    buf_occupied,
+                    simdjson::SIMDJSON_PADDING
+            );
+
+            REQUIRE(result.sanitized_char_counts.size() == 3);
+            REQUIRE(result.sanitized_char_counts.at('\x00') == 3);
+            REQUIRE(result.sanitized_char_counts.at('\x01') == 2);
+            REQUIRE(result.sanitized_char_counts.at('\x1f') == 1);
+
+            delete[] buf;
+        } catch (...) {
+            delete[] buf;
+            throw;
+        }
+    }
+
+    SECTION("returns empty counts when no sanitization needed") {
+        std::string input = R"({"key": "valid value"})";
+
+        size_t buf_size = input.size();
+        size_t buf_occupied = input.size();
+        char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+        std::memcpy(buf, input.data(), input.size());
+
+        try {
+            auto const result = StringUtils::sanitize_json_buffer(
+                    buf,
+                    buf_size,
+                    buf_occupied,
+                    simdjson::SIMDJSON_PADDING
+            );
+
+            REQUIRE(result.sanitized_char_counts.empty());
+            REQUIRE(result.new_buf_occupied == input.size());
+
+            delete[] buf;
+        } catch (...) {
+            delete[] buf;
+            throw;
+        }
+    }
+}

From 0b825331250584dee70aa25fd1c038cfe7f59b73 Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Thu, 8 Jan 2026 21:11:06 +0800
Subject: [PATCH 02/11] feat(clp_s): include file path in sanitization warning
 log

---
 components/core/src/clp_s/JsonFileIterator.cpp | 8 ++++++--
 components/core/src/clp_s/JsonFileIterator.hpp | 5 +++++
 components/core/src/clp_s/JsonParser.cpp       | 2 +-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp
index a6d043e82e..5b38e45bf7 100644
--- a/components/core/src/clp_s/JsonFileIterator.cpp
+++ b/components/core/src/clp_s/JsonFileIterator.cpp
@@ -12,12 +12,14 @@ namespace clp_s {
 JsonFileIterator::JsonFileIterator(
         clp::ReaderInterface& reader,
         size_t max_document_size,
+        std::string path,
         size_t buf_size
 )
         : m_buf_size(buf_size),
           m_max_document_size(max_document_size),
           m_buf(new char[buf_size + simdjson::SIMDJSON_PADDING]),
-          m_reader(reader) {
+          m_reader(reader),
+          m_path(std::move(path)) {
     read_new_json();
 }
 
@@ -95,8 +97,10 @@ bool JsonFileIterator::read_new_json() {
             }
             size_t bytes_added = m_buf_occupied - old_buf_occupied;
             SPDLOG_WARN(
-                    "Escaped {} control character(s) in JSON: [{}]. Buffer expanded by {} bytes ({} -> {}).",
+                    "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} bytes "
+                    "({} -> {}).",
                     total_sanitized,
+                    m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
                     char_details,
                     bytes_added,
                     old_buf_occupied,
diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp
index a9732ad5c6..dcb4a44d94 100644
--- a/components/core/src/clp_s/JsonFileIterator.hpp
+++ b/components/core/src/clp_s/JsonFileIterator.hpp
@@ -1,6 +1,8 @@
 #ifndef CLP_S_JSONFILEITERATOR_HPP
 #define CLP_S_JSONFILEITERATOR_HPP
 
+#include <string>
+
 #include <simdjson.h>
 
 #include "../clp/ReaderInterface.hpp"
@@ -19,11 +21,13 @@ class JsonFileIterator {
 
      * @param reader the input stream containing JSON
      * @param max_document_size the maximum allowed size of a single document
+     * @param path optional path to the file being read (used for logging)
      * @param buf_size the initial buffer size
      */
     explicit JsonFileIterator(
             clp::ReaderInterface& reader,
             size_t max_document_size,
+            std::string path = {},
             size_t buf_size = 1024 * 1024 /* 1 MiB default */
     );
     ~JsonFileIterator();
@@ -84,6 +88,7 @@ class JsonFileIterator {
     size_t m_max_document_size{0};
     char* m_buf{nullptr};
     clp::ReaderInterface& m_reader;
+    std::string m_path;
     simdjson::ondemand::parser m_parser;
     simdjson::ondemand::document_stream m_stream;
     bool m_eof{false};
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index c0e2123e7f..ff36aac8cc 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -668,7 +668,7 @@ auto JsonParser::ingest_json(
         Path const& path,
         std::string const& archive_creator_id
 ) -> bool {
-    JsonFileIterator json_file_iterator(*reader, m_max_document_size);
+    JsonFileIterator json_file_iterator(*reader, m_max_document_size, path.path);
     if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) {
         SPDLOG_ERROR(
                 "Encountered error - {} - while trying to parse {} after parsing 0 bytes",

From c9fc698269cc0f32732e75ae962711d45be0fc8a Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Thu, 8 Jan 2026 22:28:33 +0800
Subject: [PATCH 03/11] style: apply clang-format to source files

---
 .../core/src/clp_s/JsonFileIterator.cpp       |  5 +--
 .../core/tests/test-clp_s-StringUtils.cpp     | 42 +++++++++++++------
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp
index 5b38e45bf7..07c2040944 100644
--- a/components/core/src/clp_s/JsonFileIterator.cpp
+++ b/components/core/src/clp_s/JsonFileIterator.cpp
@@ -107,12 +107,11 @@ bool JsonFileIterator::read_new_json() {
                     m_buf_occupied
             );
 
-            error = m_parser
-                            .iterate_many(
+            error = m_parser.iterate_many(
                                     m_buf,
                                     /* length of data */ m_buf_occupied,
                                     /* batch size of data to parse*/ m_buf_occupied
-                            )
+            )
                             .get(m_stream);
         }
 
diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp
index 9163281087..ac9fafdd5c 100644
--- a/components/core/tests/test-clp_s-StringUtils.cpp
+++ b/components/core/tests/test-clp_s-StringUtils.cpp
@@ -12,8 +12,9 @@ using clp_s::StringUtils;
 
 // We use C++14 string literals (the `s` suffix) to construct strings containing embedded null bytes
 // and control characters. Without the `s` suffix, std::string's constructor from a C-string literal
-// stops at the first null byte, truncating the test input. Additionally, we use string concatenation
-// (e.g., "\x00" "end") to prevent hex escape sequences from being extended by following hex digits.
+// stops at the first null byte, truncating the test input. Additionally, we use string
+// concatenation (e.g., "\x00" "end") to prevent hex escape sequences from being extended by
+// following hex digits.
 using namespace std::string_literals;
 
 namespace {
@@ -108,7 +109,10 @@ TEST_CASE("sanitize_json_buffer_escapes_control_chars", "[clp_s][StringUtils]")
 
     SECTION("various control characters") {
         // Test 0x01 (SOH), 0x02 (STX), 0x1F (US)
-        auto input = "{\"key\": \"a\x01" "b\x02" "c\x1F" "d\"}"s;
+        auto input = "{\"key\": \"a\x01"
+                     "b\x02"
+                     "c\x1F"
+                     "d\"}"s;
         std::string expected = R"({"key": "a\u0001b\u0002c\u001fd"})";
         REQUIRE(sanitize_string(input) == expected);
     }
@@ -141,13 +145,15 @@ TEST_CASE("sanitize_json_buffer_handles_escapes_correctly", "[clp_s][StringUtils
 
     SECTION("control char after escaped backslash") {
         // \\\x00 means escaped backslash followed by a control char still inside the string
-        auto input = "{\"key\": \"slash:\\\\\x00" "end\"}"s;
+        auto input = "{\"key\": \"slash:\\\\\x00"
+                     "end\"}"s;
         std::string expected = R"({"key": "slash:\\\u0000end"})";
         REQUIRE(sanitize_string(input) == expected);
     }
 
     SECTION("control char after escaped quote") {
-        auto input = "{\"key\": \"quote:\\\"\x00" "after\"}"s;
+        auto input = "{\"key\": \"quote:\\\"\x00"
+                     "after\"}"s;
         std::string expected = R"({"key": "quote:\"\u0000after"})";
         REQUIRE(sanitize_string(input) == expected);
     }
@@ -156,16 +162,20 @@ TEST_CASE("sanitize_json_buffer_handles_escapes_correctly", "[clp_s][StringUtils
         // \<control-char> is not a valid JSON escape sequence, but we leave it as-is
         // since the sanitizer treats any char after backslash as an escape target.
         // This JSON is invalid anyway (bad escape sequence) and will fail parsing.
-        auto input = "{\"key\": \"bad:\\\x00" "end\"}"s;
-        auto expected = "{\"key\": \"bad:\\\x00" "end\"}"s;
+        auto input = "{\"key\": \"bad:\\\x00"
+                     "end\"}"s;
+        auto expected = "{\"key\": \"bad:\\\x00"
+                        "end\"}"s;
         REQUIRE(sanitize_string(input) == expected);
     }
 
     SECTION("triple backslash followed by control char") {
         // \\\ followed by \x00: first \\ is escaped backslash, third \ starts escape sequence.
         // The third backslash makes the null appear to be an escape target, so it's not escaped.
-        auto input = "{\"key\": \"\\\\\\\x00" "end\"}"s;
-        auto expected = "{\"key\": \"\\\\\\\x00" "end\"}"s;
+        auto input = "{\"key\": \"\\\\\\\x00"
+                     "end\"}"s;
+        auto expected = "{\"key\": \"\\\\\\\x00"
+                        "end\"}"s;
         REQUIRE(sanitize_string(input) == expected);
     }
 }
@@ -226,19 +236,24 @@ TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") {
         //
         // Input: truncated JSON where the string is not closed, followed by control chars
         // The sanitizer may incorrectly think it's still in a string and escape the control chars
-        auto input = "{\"key\": \"unclosed string\x00\x01" "after\"}"s;
+        auto input = "{\"key\": \"unclosed string\x00\x01"
+                     "after\"}"s;
         // The sanitizer will escape the control chars because it thinks we're still in a string
         // (the quote after "unclosed string" is escaped, so the string continues)
         std::string sanitized = sanitize_string(input);
         // The control chars should be escaped (behavior may vary based on quote matching)
         // The important thing is that the function doesn't crash and handles it gracefully
-        REQUIRE((sanitized.find('\x00') == std::string::npos || sanitized.find("\\u0000") != std::string::npos));
+        REQUIRE(
+                (sanitized.find('\x00') == std::string::npos
+                 || sanitized.find("\\u0000") != std::string::npos)
+        );
     }
 
     SECTION("truncated JSON ending mid-string") {
         // JSON truncated in the middle of a string with control chars (no closing quote)
         // This simulates a real truncation scenario where the buffer ends mid-string
-        auto input = "{\"key\": \"value\x00\x01" "truncated"s;
+        auto input = "{\"key\": \"value\x00\x01"
+                     "truncated"s;
         std::string sanitized = sanitize_string(input);
         // Should handle gracefully without crashing
         // Control chars inside the (unclosed) string should be escaped
@@ -251,7 +266,8 @@ TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") {
     SECTION("truncated JSON with control chars after unmatched quote") {
         // JSON with unmatched quote followed by control chars outside the string
         // The sanitizer may incorrectly escape these if it thinks we're still in a string
-        auto input = "{\"key\": \"value\x00" "}\x01\x02"s;
+        auto input = "{\"key\": \"value\x00"
+                     "}\x01\x02"s;
         std::string sanitized = sanitize_string(input);
         // Function should not crash - behavior may vary but should be consistent
         REQUIRE(sanitized.size() >= input.size());

From baf6366df970af3aba12636f7157103e00c03946 Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Fri, 9 Jan 2026 02:48:23 +0800
Subject: [PATCH 04/11] feat(clp-s): add --sanitize-invalid-json flag for
 opt-in JSON sanitization

Add a new CLI flag that enables sanitization of invalid JSON during
ingestion. When enabled, the following issues are automatically fixed:

- Invalid UTF-8 sequences: replaced with U+FFFD replacement character
- Unescaped control characters (0x00-0x1F): escaped to \u00XX format
- Invalid surrogate escapes: handled via simdjson's allow_replacement

When disabled (default), parsing fails fast on any of these issues.

Changes:
- Add --sanitize-invalid-json flag to CommandLineArguments
- Add sanitize_utf8_buffer() to Utils for UTF-8 validation/replacement
- Update JsonFileIterator to conditionally sanitize on UTF8_ERROR and
  UNESCAPED_CHARS errors during both initial parsing and iteration
- Update JsonParser to pass flag through and use conditional
  get_string()/unescaped_key() for surrogate escape handling
---
 .../core/src/clp_s/CommandLineArguments.cpp   |   6 +
 .../core/src/clp_s/CommandLineArguments.hpp   |   3 +
 .../core/src/clp_s/JsonFileIterator.cpp       | 227 ++++++++++++++----
 .../core/src/clp_s/JsonFileIterator.hpp       |   3 +
 components/core/src/clp_s/JsonParser.cpp      |  14 +-
 components/core/src/clp_s/JsonParser.hpp      |   2 +
 components/core/src/clp_s/Utils.cpp           | 142 +++++++++++
 components/core/src/clp_s/Utils.hpp           |  23 ++
 components/core/src/clp_s/clp-s.cpp           |   1 +
 9 files changed, 370 insertions(+), 51 deletions(-)

diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp
index 72c0d50d80..fef29442f6 100644
--- a/components/core/src/clp_s/CommandLineArguments.cpp
+++ b/components/core/src/clp_s/CommandLineArguments.cpp
@@ -256,6 +256,12 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
                     "structurize-arrays",
                     po::bool_switch(&m_structurize_arrays),
                     "Structurize arrays instead of compressing them as clp strings."
+            )(
+                    "sanitize-invalid-json",
+                    po::bool_switch(&m_sanitize_invalid_json),
+                    "Sanitize invalid JSON by escaping unescaped control characters (0x00-0x1F),"
+                    " replacing invalid UTF-8 sequences with U+FFFD, and handling invalid"
+                    " surrogate escapes. When disabled (default), parsing fails on invalid JSON."
             )(
                     "disable-log-order",
                     po::bool_switch(&m_disable_log_order),
diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp
index b35124981a..2e32b57a68 100644
--- a/components/core/src/clp_s/CommandLineArguments.hpp
+++ b/components/core/src/clp_s/CommandLineArguments.hpp
@@ -113,6 +113,8 @@ class CommandLineArguments {
 
     bool get_structurize_arrays() const { return m_structurize_arrays; }
 
+    bool get_sanitize_invalid_json() const { return m_sanitize_invalid_json; }
+
     bool get_ordered_decompression() const { return m_ordered_decompression; }
 
     size_t get_target_ordered_chunk_size() const { return m_target_ordered_chunk_size; }
@@ -202,6 +204,7 @@ class CommandLineArguments {
     bool m_no_retain_float_format{false};
     bool m_single_file_archive{false};
     bool m_structurize_arrays{false};
+    bool m_sanitize_invalid_json{false};
     bool m_ordered_decompression{false};
     size_t m_target_ordered_chunk_size{};
     bool m_print_ordered_chunk_stats{false};
diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp
index 07c2040944..bc9cc234e8 100644
--- a/components/core/src/clp_s/JsonFileIterator.cpp
+++ b/components/core/src/clp_s/JsonFileIterator.cpp
@@ -12,6 +12,7 @@ namespace clp_s {
 JsonFileIterator::JsonFileIterator(
         clp::ReaderInterface& reader,
         size_t max_document_size,
+        bool sanitize_invalid_json,
         std::string path,
         size_t buf_size
 )
@@ -19,7 +20,8 @@ JsonFileIterator::JsonFileIterator(
           m_max_document_size(max_document_size),
           m_buf(new char[buf_size + simdjson::SIMDJSON_PADDING]),
           m_reader(reader),
-          m_path(std::move(path)) {
+          m_path(std::move(path)),
+          m_sanitize_invalid_json(sanitize_invalid_json) {
     read_new_json();
 }
 
@@ -69,50 +71,91 @@ bool JsonFileIterator::read_new_json() {
         )
                              .get(m_stream);
 
-        // If we encounter unescaped control characters in JSON strings, sanitize the buffer
-        // by escaping them to \u00XX format and retry parsing
-        if (simdjson::error_code::UNESCAPED_CHARS == error) {
-            size_t const old_buf_occupied = m_buf_occupied;
-            // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by
-            // reference if reallocation is needed. This keeps m_buf_size in sync with the
-            // actual allocated buffer size.
-            auto const result = StringUtils::sanitize_json_buffer(
-                    m_buf,
-                    m_buf_size,
-                    m_buf_occupied,
-                    simdjson::SIMDJSON_PADDING
-            );
-            m_buf_occupied = result.new_buf_occupied;
-            m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
-
-            // Build log message with details of sanitized characters
-            size_t total_sanitized = 0;
-            std::string char_details;
-            for (auto const& [ch, count] : result.sanitized_char_counts) {
-                if (!char_details.empty()) {
-                    char_details += ", ";
+        // If sanitization is enabled and we encounter errors that can be fixed by sanitization,
+        // sanitize the buffer and retry parsing
+        if (m_sanitize_invalid_json) {
+            // Handle invalid UTF-8 sequences by replacing with U+FFFD
+            if (simdjson::error_code::UTF8_ERROR == error) {
+                size_t const old_buf_occupied = m_buf_occupied;
+                auto const result = StringUtils::sanitize_utf8_buffer(
+                        m_buf,
+                        m_buf_size,
+                        m_buf_occupied,
+                        simdjson::SIMDJSON_PADDING
+                );
+                m_buf_occupied = result.new_buf_occupied;
+                m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
+
+                if (!result.sanitized_char_counts.empty()) {
+                    size_t total_replaced = 0;
+                    for (auto const& [ch, count] : result.sanitized_char_counts) {
+                        total_replaced += count;
+                    }
+                    SPDLOG_WARN(
+                            "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size "
+                            "changed by {} bytes ({} -> {}).",
+                            total_replaced,
+                            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
+                            static_cast<int64_t>(m_buf_occupied)
+                                    - static_cast<int64_t>(old_buf_occupied),
+                            old_buf_occupied,
+                            m_buf_occupied
+                    );
+                }
+
+                error = m_parser.iterate_many(
+                                        m_buf,
+                                        /* length of data */ m_buf_occupied,
+                                        /* batch size of data to parse*/ m_buf_occupied
+                )
+                                .get(m_stream);
+            }
+
+            // Handle unescaped control characters by escaping them to \u00XX format
+            if (simdjson::error_code::UNESCAPED_CHARS == error) {
+                size_t const old_buf_occupied = m_buf_occupied;
+                // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by
+                // reference if reallocation is needed. This keeps m_buf_size in sync with the
+                // actual allocated buffer size.
+                auto const result = StringUtils::sanitize_json_buffer(
+                        m_buf,
+                        m_buf_size,
+                        m_buf_occupied,
+                        simdjson::SIMDJSON_PADDING
+                );
+                m_buf_occupied = result.new_buf_occupied;
+                m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
+
+                // Build log message with details of sanitized characters
+                size_t total_sanitized = 0;
+                std::string char_details;
+                for (auto const& [ch, count] : result.sanitized_char_counts) {
+                    if (!char_details.empty()) {
+                        char_details += ", ";
+                    }
+                    char_details
+                            += fmt::format("0x{:02x} ({})", static_cast<unsigned char>(ch), count);
+                    total_sanitized += count;
                 }
-                char_details += fmt::format("0x{:02x} ({})", static_cast<unsigned char>(ch), count);
-                total_sanitized += count;
+                size_t bytes_added = m_buf_occupied - old_buf_occupied;
+                SPDLOG_WARN(
+                        "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} "
+                        "bytes ({} -> {}).",
+                        total_sanitized,
+                        m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
+                        char_details,
+                        bytes_added,
+                        old_buf_occupied,
+                        m_buf_occupied
+                );
+
+                error = m_parser.iterate_many(
+                                        m_buf,
+                                        /* length of data */ m_buf_occupied,
+                                        /* batch size of data to parse*/ m_buf_occupied
+                )
+                                .get(m_stream);
             }
-            size_t bytes_added = m_buf_occupied - old_buf_occupied;
-            SPDLOG_WARN(
-                    "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} bytes "
-                    "({} -> {}).",
-                    total_sanitized,
-                    m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
-                    char_details,
-                    bytes_added,
-                    old_buf_occupied,
-                    m_buf_occupied
-            );
-
-            error = m_parser.iterate_many(
-                                    m_buf,
-                                    /* length of data */ m_buf_occupied,
-                                    /* batch size of data to parse*/ m_buf_occupied
-            )
-                            .get(m_stream);
         }
 
         if (error) {
@@ -169,6 +212,59 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
                 return true;
             } else if (m_doc_it.error() == simdjson::error_code::UTF8_ERROR) {
                 maybe_utf8_edge_case = true;
+            } else if (m_sanitize_invalid_json
+                       && m_doc_it.error() == simdjson::error_code::UNESCAPED_CHARS)
+            {
+                // Unescaped control characters detected during document iteration. Sanitize the
+                // buffer and re-setup the document stream to restart from the beginning.
+                size_t const old_buf_occupied = m_buf_occupied;
+                auto const result = StringUtils::sanitize_json_buffer(
+                        m_buf,
+                        m_buf_size,
+                        m_buf_occupied,
+                        simdjson::SIMDJSON_PADDING
+                );
+                m_buf_occupied = result.new_buf_occupied;
+                m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
+
+                // Log sanitization details
+                if (!result.sanitized_char_counts.empty()) {
+                    size_t total_sanitized = 0;
+                    std::string char_details;
+                    for (auto const& [ch, count] : result.sanitized_char_counts) {
+                        if (!char_details.empty()) {
+                            char_details += ", ";
+                        }
+                        char_details += fmt::format(
+                                "0x{:02x} ({})",
+                                static_cast<unsigned char>(ch),
+                                count
+                        );
+                        total_sanitized += count;
+                    }
+                    SPDLOG_WARN(
+                            "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by "
+                            "{} bytes ({} -> {}).",
+                            total_sanitized,
+                            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
+                            char_details,
+                            m_buf_occupied - old_buf_occupied,
+                            old_buf_occupied,
+                            m_buf_occupied
+                    );
+                }
+
+                // Re-setup the document stream and restart iteration
+                auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied)
+                                     .get(m_stream);
+                if (error) {
+                    m_error_code = error;
+                    return false;
+                }
+                m_doc_it = m_stream.begin();
+                m_first_doc_in_buffer = true;
+                m_next_document_position = 0;
+                continue;
             } else {
                 m_error_code = m_doc_it.error();
                 return false;
@@ -188,8 +284,49 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
             // If we hit a UTF-8 error and either we have reached eof or the buffer occupancy is
             // greater than the maximum document size we assume that the UTF-8 error must have been
             // in the middle of the stream. Note: it is possible that the UTF-8 error is at the end
-            // of the stream and that this is actualy a truncation error. Unfortunately the only way
-            // to check is to parse it ourselves, so we rely on this heuristic for now.
+            // of the stream and that this is actually a truncation error. Unfortunately the only
+            // way to check is to parse it ourselves, so we rely on this heuristic for now.
+            if (m_sanitize_invalid_json) {
+                // Sanitize invalid UTF-8 sequences and retry
+                size_t const old_buf_occupied = m_buf_occupied;
+                auto const result = StringUtils::sanitize_utf8_buffer(
+                        m_buf,
+                        m_buf_size,
+                        m_buf_occupied,
+                        simdjson::SIMDJSON_PADDING
+                );
+                m_buf_occupied = result.new_buf_occupied;
+                m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
+
+                if (!result.sanitized_char_counts.empty()) {
+                    size_t total_replaced = 0;
+                    for (auto const& [ch, count] : result.sanitized_char_counts) {
+                        total_replaced += count;
+                    }
+                    SPDLOG_WARN(
+                            "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size "
+                            "changed by {} bytes ({} -> {}).",
+                            total_replaced,
+                            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
+                            static_cast<int64_t>(m_buf_occupied)
+                                    - static_cast<int64_t>(old_buf_occupied),
+                            old_buf_occupied,
+                            m_buf_occupied
+                    );
+                }
+
+                // Re-setup the document stream and restart iteration
+                auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied)
+                                     .get(m_stream);
+                if (error) {
+                    m_error_code = error;
+                    return false;
+                }
+                m_doc_it = m_stream.begin();
+                m_first_doc_in_buffer = true;
+                m_next_document_position = 0;
+                continue;
+            }
             m_error_code = simdjson::error_code::UTF8_ERROR;
             return false;
         } else if (maybe_utf8_edge_case) {
diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp
index dcb4a44d94..8b9636bd14 100644
--- a/components/core/src/clp_s/JsonFileIterator.hpp
+++ b/components/core/src/clp_s/JsonFileIterator.hpp
@@ -21,12 +21,14 @@ class JsonFileIterator {
 
      * @param reader the input stream containing JSON
      * @param max_document_size the maximum allowed size of a single document
+     * @param sanitize_invalid_json whether to sanitize invalid JSON (control chars, invalid UTF-8)
      * @param path optional path to the file being read (used for logging)
      * @param buf_size the initial buffer size
      */
     explicit JsonFileIterator(
             clp::ReaderInterface& reader,
             size_t max_document_size,
+            bool sanitize_invalid_json,
             std::string path = {},
             size_t buf_size = 1024 * 1024 /* 1 MiB default */
     );
@@ -89,6 +91,7 @@ class JsonFileIterator {
     char* m_buf{nullptr};
     clp::ReaderInterface& m_reader;
     std::string m_path;
+    bool m_sanitize_invalid_json{false};
     simdjson::ondemand::parser m_parser;
     simdjson::ondemand::document_stream m_stream;
     bool m_eof{false};
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index ff36aac8cc..93687beab1 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -144,6 +144,7 @@ JsonParser::JsonParser(JsonParserOption const& option)
           m_structurize_arrays(option.structurize_arrays),
           m_record_log_order(option.record_log_order),
           m_retain_float_format(option.retain_float_format),
+          m_sanitize_invalid_json(option.sanitize_invalid_json),
           m_input_paths(option.input_paths),
           m_network_auth(option.network_auth) {
     if (false == m_timestamp_key.empty()) {
@@ -223,7 +224,7 @@ void JsonParser::parse_obj_in_array(simdjson::ondemand::object line, int32_t par
         }
 
         cur_field = *object_it_stack.top();
-        cur_key = cur_field.unescaped_key(true);
+        cur_key = cur_field.unescaped_key(m_sanitize_invalid_json);
         cur_value = cur_field.value();
 
         switch (cur_value.type()) {
@@ -302,7 +303,7 @@ void JsonParser::parse_obj_in_array(simdjson::ondemand::object line, int32_t par
                 break;
             }
             case simdjson::ondemand::json_type::string: {
-                std::string_view value = cur_value.get_string(true);
+                std::string_view value = cur_value.get_string(m_sanitize_invalid_json);
                 if (value.find(' ') != std::string::npos) {
                     node_id = m_archive_writer
                                       ->add_node(node_id_stack.top(), NodeType::ClpString, cur_key);
@@ -407,7 +408,7 @@ void JsonParser::parse_array(simdjson::ondemand::array array, int32_t parent_nod
                 break;
             }
             case simdjson::ondemand::json_type::string: {
-                std::string_view value = cur_value.get_string(true);
+                std::string_view value = cur_value.get_string(m_sanitize_invalid_json);
                 if (value.find(' ') != std::string::npos) {
                     node_id = m_archive_writer->add_node(parent_node_id, NodeType::ClpString, "");
                 } else {
@@ -452,7 +453,7 @@ void JsonParser::parse_line(
     do {
         if (false == object_stack.empty()) {
             cur_field = *object_it_stack.top();
-            cur_key = cur_field.unescaped_key(true);
+            cur_key = cur_field.unescaped_key(m_sanitize_invalid_json);
             line = cur_field.value();
         }
 
@@ -555,7 +556,7 @@ void JsonParser::parse_line(
                 break;
             }
             case simdjson::ondemand::json_type::string: {
-                std::string_view value = line.get_string(true);
+                std::string_view value = line.get_string(m_sanitize_invalid_json);
                 auto const matches_timestamp
                         = m_archive_writer->matches_timestamp(node_id_stack.top(), cur_key);
                 if (matches_timestamp) {
@@ -668,7 +669,8 @@ auto JsonParser::ingest_json(
         Path const& path,
         std::string const& archive_creator_id
 ) -> bool {
-    JsonFileIterator json_file_iterator(*reader, m_max_document_size, path.path);
+    JsonFileIterator
+            json_file_iterator(*reader, m_max_document_size, m_sanitize_invalid_json, path.path);
     if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) {
         SPDLOG_ERROR(
                 "Encountered error - {} - while trying to parse {} after parsing 0 bytes",
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 82f8d04786..e763d5fe71 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -40,6 +40,7 @@ struct JsonParserOption {
     bool record_log_order{true};
     bool retain_float_format{false};
     bool single_file_archive{false};
+    bool sanitize_invalid_json{false};
     NetworkAuthOption network_auth{};
 };
 
@@ -239,6 +240,7 @@ class JsonParser {
     bool m_structurize_arrays{false};
     bool m_record_log_order{true};
     bool m_retain_float_format{false};
+    bool m_sanitize_invalid_json{false};
 
     absl::flat_hash_map<std::pair<uint32_t, NodeType>, std::pair<int32_t, bool>>
             m_ir_node_to_archive_node_id_mapping;
diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp
index 5efcb7d77e..9458c970dd 100644
--- a/components/core/src/clp_s/Utils.cpp
+++ b/components/core/src/clp_s/Utils.cpp
@@ -233,6 +233,148 @@ JsonSanitizeResult StringUtils::sanitize_json_buffer(
     return {sanitized.size(), std::move(sanitized_char_counts)};
 }
 
+namespace {
+/**
+ * Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx pattern).
+ * @param byte The byte to check
+ * @return true if the byte is a valid continuation byte (0x80-0xBF)
+ */
+constexpr bool is_continuation_byte(unsigned char byte) {
+    return (byte & 0xC0) == 0x80;
+}
+
+/**
+ * Validates a UTF-8 sequence starting at the given position and returns the sequence length.
+ * @param buf The buffer containing the UTF-8 data
+ * @param pos Current position in the buffer
+ * @param buf_occupied Total bytes in the buffer
+ * @return The length of the valid UTF-8 sequence (1-4), or 0 if invalid
+ */
+size_t validate_utf8_sequence(char const* buf, size_t pos, size_t buf_occupied) {
+    auto const byte = static_cast<unsigned char>(buf[pos]);
+    size_t remaining = buf_occupied - pos;
+
+    // ASCII (0x00-0x7F)
+    if (byte <= 0x7F) {
+        return 1;
+    }
+
+    // Invalid: continuation byte without leader, or invalid lead bytes
+    if (byte < 0xC2 || byte > 0xF4) {
+        return 0;
+    }
+
+    // 2-byte sequence (0xC2-0xDF)
+    if (byte <= 0xDF) {
+        if (remaining < 2 || !is_continuation_byte(static_cast<unsigned char>(buf[pos + 1]))) {
+            return 0;
+        }
+        return 2;
+    }
+
+    // 3-byte sequence (0xE0-0xEF)
+    if (byte <= 0xEF) {
+        if (remaining < 3) {
+            return 0;
+        }
+        auto const byte2 = static_cast<unsigned char>(buf[pos + 1]);
+        auto const byte3 = static_cast<unsigned char>(buf[pos + 2]);
+
+        if (!is_continuation_byte(byte2) || !is_continuation_byte(byte3)) {
+            return 0;
+        }
+
+        // Check for overlong encoding (E0 requires second byte >= 0xA0)
+        if (byte == 0xE0 && byte2 < 0xA0) {
+            return 0;
+        }
+
+        // Check for surrogate code points (ED with second byte >= 0xA0 means 0xD800-0xDFFF)
+        if (byte == 0xED && byte2 >= 0xA0) {
+            return 0;
+        }
+
+        return 3;
+    }
+
+    // 4-byte sequence (0xF0-0xF4)
+    if (remaining < 4) {
+        return 0;
+    }
+    auto const byte2 = static_cast<unsigned char>(buf[pos + 1]);
+    auto const byte3 = static_cast<unsigned char>(buf[pos + 2]);
+    auto const byte4 = static_cast<unsigned char>(buf[pos + 3]);
+
+    if (!is_continuation_byte(byte2) || !is_continuation_byte(byte3)
+        || !is_continuation_byte(byte4))
+    {
+        return 0;
+    }
+
+    // Check for overlong encoding (F0 requires second byte >= 0x90)
+    if (byte == 0xF0 && byte2 < 0x90) {
+        return 0;
+    }
+
+    // Check for code points > U+10FFFF (F4 requires second byte <= 0x8F)
+    if (byte == 0xF4 && byte2 > 0x8F) {
+        return 0;
+    }
+
+    return 4;
+}
+}  // namespace
+
+JsonSanitizeResult StringUtils::sanitize_utf8_buffer(
+        char*& buf,
+        size_t& buf_size,
+        size_t buf_occupied,
+        size_t simdjson_padding
+) {
+    // Build sanitized content in a temporary buffer, replacing invalid UTF-8 sequences
+    // with the Unicode replacement character U+FFFD (0xEF 0xBF 0xBD).
+    std::string sanitized;
+
+    // We use a special key to count invalid UTF-8 sequences
+    // Using 0xFF as the key since it's never a valid UTF-8 byte
+    std::map<char, size_t> sanitized_char_counts;
+    constexpr char cInvalidUtf8Key = static_cast<char>(0xFF);
+
+    size_t i = 0;
+    while (i < buf_occupied) {
+        size_t seq_len = validate_utf8_sequence(buf, i, buf_occupied);
+        if (seq_len > 0) {
+            // Valid sequence - copy it
+            sanitized.append(buf + i, seq_len);
+            i += seq_len;
+        } else {
+            // Invalid sequence - replace with U+FFFD
+            sanitized.append("\xEF\xBF\xBD", 3);
+            ++sanitized_char_counts[cInvalidUtf8Key];
+            // Skip one byte and continue (maximal subpart replacement strategy)
+            ++i;
+        }
+    }
+
+    // If no changes were made, return early
+    if (sanitized.size() == buf_occupied) {
+        return {buf_occupied, {}};
+    }
+
+    // Grow buffer if needed to hold sanitized content
+    if (sanitized.size() > buf_size) {
+        size_t new_buf_size = sanitized.size();
+        char* new_buf = new char[new_buf_size + simdjson_padding];
+        delete[] buf;
+        buf = new_buf;
+        buf_size = new_buf_size;
+    }
+
+    // Copy sanitized content to buffer
+    std::memcpy(buf, sanitized.data(), sanitized.size());
+    return {sanitized.size(), std::move(sanitized_char_counts)};
+}
+
 void StringUtils::escape_json_string(std::string& destination, std::string_view const source) {
     // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path
     // when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers
diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp
index df5d152444..54111d25a6 100644
--- a/components/core/src/clp_s/Utils.hpp
+++ b/components/core/src/clp_s/Utils.hpp
@@ -118,6 +118,29 @@ class StringUtils {
             size_t simdjson_padding
     );
 
+    /**
+     * Sanitizes a buffer by replacing invalid UTF-8 sequences with the Unicode replacement
+     * character (U+FFFD). This is used to fix malformed data that contains invalid UTF-8
+     * which would cause JSON parsing errors.
+     *
+     * @param buf Pointer to pointer to the buffer. May be reallocated if expansion is needed.
+     *            The caller's pointer will be updated to point to the new buffer if reallocation
+     *            occurs. The caller is responsible for deleting the buffer.
+     * @param buf_size Current allocated size of the buffer (excluding simdjson padding).
+     *                 This parameter is modified by reference and will be updated if the buffer
+     *                 is reallocated to reflect the new buffer size.
+     * @param buf_occupied Number of bytes currently used in the buffer.
+     * @param simdjson_padding Amount of padding required after buffer content.
+     * @return Result containing new buf_occupied and counts of each type of invalid sequence
+     *         replaced.
+     */
+    static JsonSanitizeResult sanitize_utf8_buffer(
+            char*& buf,
+            size_t& buf_size,
+            size_t buf_occupied,
+            size_t simdjson_padding
+    );
+
 private:
     /**
      * Converts a character into its two byte hexadecimal representation.
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index c2cf7b6abd..fd24634680 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -107,6 +107,7 @@ bool compress(CommandLineArguments const& command_line_arguments) {
     option.single_file_archive = command_line_arguments.get_single_file_archive();
     option.structurize_arrays = command_line_arguments.get_structurize_arrays();
     option.record_log_order = command_line_arguments.get_record_log_order();
+    option.sanitize_invalid_json = command_line_arguments.get_sanitize_invalid_json();
 
     clp_s::JsonParser parser(option);
     if (false == parser.ingest()) {

From b0b8b0fc9700604ddcdc3d4972479be7e21585d2 Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Fri, 9 Jan 2026 03:07:34 +0800
Subject: [PATCH 05/11] test(clp-s): add unit tests for sanitize_utf8_buffer
 function

Add comprehensive tests for the new UTF-8 sanitization function covering:
- Valid UTF-8 passthrough (ASCII, multibyte chars, JSON with UTF-8)
- Invalid byte replacement (0xFF, 0xFE, continuation bytes without leader)
- Truncated sequences (2/3/4-byte sequences cut short)
- Overlong encodings (0xC0, 0xC1 lead bytes)
- Surrogate code points (U+D800-U+DFFF)
- Code points above U+10FFFF
- Mixed valid/invalid UTF-8
- Correct replacement character counts
- Buffer growth for many invalid bytes
---
 .../core/tests/test-clp_s-StringUtils.cpp     | 251 ++++++++++++++++++
 1 file changed, 251 insertions(+)

diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp
index ac9fafdd5c..57bea49abf 100644
--- a/components/core/tests/test-clp_s-StringUtils.cpp
+++ b/components/core/tests/test-clp_s-StringUtils.cpp
@@ -274,6 +274,257 @@ TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") {
     }
 }
 
+// =====================================================================================
+// UTF-8 Sanitization Tests
+// =====================================================================================
+
+namespace {
+/**
+ * Helper to create a buffer with simdjson padding and run UTF-8 sanitization.
+ * @param input The input string to sanitize
+ * @return The sanitized string
+ */
+auto sanitize_utf8_string(std::string_view input) -> std::string {
+    size_t buf_size = input.size();
+    size_t buf_occupied = input.size();
+    char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+    std::memcpy(buf, input.data(), input.size());
+
+    try {
+        auto const result = StringUtils::sanitize_utf8_buffer(
+                buf,
+                buf_size,
+                buf_occupied,
+                simdjson::SIMDJSON_PADDING
+        );
+        std::string output(buf, result.new_buf_occupied);
+        delete[] buf;
+        return output;
+    } catch (...) {
+        delete[] buf;
+        throw;
+    }
+}
+
+/**
+ * Count occurrences of U+FFFD replacement character (0xEF 0xBF 0xBD) in a string.
+ */
+auto count_replacement_chars(std::string_view s) -> size_t {
+    size_t count = 0;
+    for (size_t i = 0; i + 2 < s.size(); ++i) {
+        if (static_cast<unsigned char>(s[i]) == 0xEF && static_cast<unsigned char>(s[i + 1]) == 0xBF
+            && static_cast<unsigned char>(s[i + 2]) == 0xBD)
+        {
+            ++count;
+            i += 2;  // Skip past the replacement char
+        }
+    }
+    return count;
+}
+}  // namespace
+
+TEST_CASE("sanitize_utf8_buffer_no_changes", "[clp_s][StringUtils]") {
+    SECTION("valid ASCII") {
+        std::string input = "Hello, World!";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+
+    SECTION("valid UTF-8 multibyte characters") {
+        // 2-byte: é (U+00E9) = 0xC3 0xA9
+        // 3-byte: € (U+20AC) = 0xE2 0x82 0xAC
+        // 4-byte: 𝄞 (U+1D11E) = 0xF0 0x9D 0x84 0x9E
+        std::string input = "café € 𝄞";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+
+    SECTION("valid JSON with UTF-8") {
+        std::string input = R"({"msg": "日本語テスト"})";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+
+    SECTION("empty string") {
+        std::string input = "";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_replaces_invalid_bytes", "[clp_s][StringUtils]") {
+    // U+FFFD is encoded as 0xEF 0xBF 0xBD in UTF-8
+    constexpr char cReplacementChar[] = "\xEF\xBF\xBD";
+
+    SECTION("single invalid byte 0xFF") {
+        auto input = "hello\xFF world"s;
+        std::string expected = "hello" + std::string(cReplacementChar) + " world";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("single invalid byte 0xFE") {
+        auto input = "test\xFE"s;
+        std::string expected = "test" + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("multiple invalid bytes") {
+        auto input = "\xFF\xFE\xFF"s;
+        std::string expected = std::string(cReplacementChar) + cReplacementChar + cReplacementChar;
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("continuation byte without leader (0x80-0xBF)") {
+        auto input = "test\x80 value"s;
+        std::string expected = "test" + std::string(cReplacementChar) + " value";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("truncated 2-byte sequence") {
+        // 0xC3 expects one continuation byte, but we end the string
+        auto input = "test\xC3"s;
+        std::string expected = "test" + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("truncated 3-byte sequence") {
+        // 0xE2 expects two continuation bytes
+        auto input = "test\xE2\x82"s;
+        std::string expected
+                = "test" + std::string(cReplacementChar) + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("truncated 4-byte sequence") {
+        // 0xF0 expects three continuation bytes
+        auto input = "test\xF0\x9D\x84"s;
+        std::string expected = "test" + std::string(cReplacementChar)
+                               + std::string(cReplacementChar) + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("overlong 2-byte encoding") {
+        // 0xC0 0x80 is overlong encoding of NUL (should be just 0x00)
+        // 0xC0 and 0xC1 are never valid UTF-8 lead bytes
+        auto input = "test\xC0\x80"s;
+        // Both bytes are invalid
+        std::string expected
+                = "test" + std::string(cReplacementChar) + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("surrogate code points (U+D800-U+DFFF)") {
+        // 0xED 0xA0 0x80 encodes U+D800 (invalid surrogate)
+        auto input = "test\xED\xA0\x80 end"s;
+        // The sequence is invalid, bytes replaced individually
+        std::string expected = "test" + std::string(cReplacementChar)
+                               + std::string(cReplacementChar) + std::string(cReplacementChar)
+                               + " end";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("code point above U+10FFFF") {
+        // 0xF4 0x90 0x80 0x80 would encode U+110000 (above max)
+        auto input = "test\xF4\x90\x80\x80 end"s;
+        std::string sanitized = sanitize_utf8_string(input);
+        // Should contain replacement characters
+        REQUIRE(count_replacement_chars(sanitized) >= 1);
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_mixed_valid_invalid", "[clp_s][StringUtils]") {
+    constexpr char cReplacementChar[] = "\xEF\xBF\xBD";
+
+    SECTION("invalid byte between valid UTF-8") {
+        // café with invalid byte in middle
+        auto input = "caf\xC3\xA9\xFF test"s;  // café + 0xFF + " test"
+        std::string expected = "caf\xC3\xA9" + std::string(cReplacementChar) + " test";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("JSON with invalid UTF-8 in value") {
+        auto input = "{\"msg\": \"test\xFF value\"}"s;
+        std::string expected = "{\"msg\": \"test" + std::string(cReplacementChar) + " value\"}";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_returns_correct_counts", "[clp_s][StringUtils]") {
+    SECTION("counts invalid sequences") {
+        auto input = "\xFF\xFE\xFF"s;  // 3 invalid bytes
+
+        size_t buf_size = input.size();
+        size_t buf_occupied = input.size();
+        char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+        std::memcpy(buf, input.data(), input.size());
+
+        try {
+            auto const result = StringUtils::sanitize_utf8_buffer(
+                    buf,
+                    buf_size,
+                    buf_occupied,
+                    simdjson::SIMDJSON_PADDING
+            );
+
+            // The count uses 0xFF as a special key for invalid UTF-8 sequences
+            REQUIRE_FALSE(result.sanitized_char_counts.empty());
+            size_t total = 0;
+            for (auto const& [ch, count] : result.sanitized_char_counts) {
+                total += count;
+            }
+            REQUIRE(total == 3);
+
+            delete[] buf;
+        } catch (...) {
+            delete[] buf;
+            throw;
+        }
+    }
+
+    SECTION("returns empty counts when no sanitization needed") {
+        std::string input = "valid UTF-8 string";
+
+        size_t buf_size = input.size();
+        size_t buf_occupied = input.size();
+        char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+        std::memcpy(buf, input.data(), input.size());
+
+        try {
+            auto const result = StringUtils::sanitize_utf8_buffer(
+                    buf,
+                    buf_size,
+                    buf_occupied,
+                    simdjson::SIMDJSON_PADDING
+            );
+
+            REQUIRE(result.sanitized_char_counts.empty());
+            REQUIRE(result.new_buf_occupied == input.size());
+
+            delete[] buf;
+        } catch (...) {
+            delete[] buf;
+            throw;
+        }
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_handles_buffer_growth", "[clp_s][StringUtils]") {
+    // Each invalid byte (1 byte) becomes U+FFFD (3 bytes), so buffer may need to grow
+    SECTION("many invalid bytes requiring expansion") {
+        // 100 invalid bytes -> 300 bytes after replacement with U+FFFD
+        std::string input(100, '\xFF');
+
+        std::string sanitized = sanitize_utf8_string(input);
+
+        // Verify all bytes were replaced
+        REQUIRE(sanitized.find('\xFF') == std::string::npos);
+        // Verify correct number of replacement characters
+        REQUIRE(count_replacement_chars(sanitized) == 100);
+        // Verify size: 100 * 3 = 300 bytes
+        REQUIRE(sanitized.size() == 300);
+    }
+}
+
+// =====================================================================================
+// JSON Sanitization Character Count Tests
+// =====================================================================================
+
 TEST_CASE("sanitize_json_buffer_returns_correct_char_counts", "[clp_s][StringUtils]") {
     SECTION("counts multiple different control characters") {
         // Input with: 3x \x00, 2x \x01, 1x \x1f

From ee13dd3da4e8d827575af7308ee259b850b3bea2 Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Fri, 9 Jan 2026 03:41:51 +0800
Subject: [PATCH 06/11] fix(clp_s): prevent infinite loop if sanitization makes
 no changes

Add guards in get_json() to exit with error if sanitization functions
don't modify the buffer. This prevents infinite loops in edge cases
where simdjson reports UNESCAPED_CHARS or UTF8_ERROR but the
sanitization functions don't find anything to fix.
---
 components/core/src/clp_s/JsonFileIterator.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp
index bc9cc234e8..8152ad6d09 100644
--- a/components/core/src/clp_s/JsonFileIterator.cpp
+++ b/components/core/src/clp_s/JsonFileIterator.cpp
@@ -252,6 +252,11 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
                             old_buf_occupied,
                             m_buf_occupied
                     );
+                } else {
+                    // Sanitization made no changes - report the original error to avoid infinite
+                    // loop
+                    m_error_code = m_doc_it.error();
+                    return false;
                 }
 
                 // Re-setup the document stream and restart iteration
@@ -313,6 +318,11 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
                             old_buf_occupied,
                             m_buf_occupied
                     );
+                } else {
+                    // Sanitization made no changes - report the original error to avoid infinite
+                    // loop
+                    m_error_code = simdjson::error_code::UTF8_ERROR;
+                    return false;
                 }
 
                 // Re-setup the document stream and restart iteration

From 9c2140ea70c2091b112dd353de0c03ffb697bc40 Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Fri, 9 Jan 2026 03:48:12 +0800
Subject: [PATCH 07/11] style(clp_s): use 'false == expr' instead of '!expr'
 per coding standards

---
 components/core/tests/test-clp_s-StringUtils.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp
index 57bea49abf..48794f8d4f 100644
--- a/components/core/tests/test-clp_s-StringUtils.cpp
+++ b/components/core/tests/test-clp_s-StringUtils.cpp
@@ -60,7 +60,7 @@ auto can_parse_json(std::string_view json) -> bool {
     simdjson::ondemand::parser parser;
     simdjson::padded_string padded(json);
     auto result = parser.iterate(padded);
-    return !result.error();
+    return false == result.error();
 }
 }  // namespace
 

From 4dbf8f32ea974e36fe1d268604b8369053634968 Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Fri, 9 Jan 2026 03:53:29 +0800
Subject: [PATCH 08/11] refactor(clp_s): extract sanitize_and_log helper
 methods

Extract sanitize_and_log_utf8() and sanitize_and_log_json() helper
methods to reduce code duplication. The sanitization-and-log pattern
was repeated 4 times with slight variations.

Both helpers return true if sanitization made changes, allowing
callers to detect infinite loop conditions in get_json().
---
 .../core/src/clp_s/JsonFileIterator.cpp       | 217 +++++++-----------
 .../core/src/clp_s/JsonFileIterator.hpp       |  14 ++
 2 files changed, 94 insertions(+), 137 deletions(-)

diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp
index 8152ad6d09..afe014160c 100644
--- a/components/core/src/clp_s/JsonFileIterator.cpp
+++ b/components/core/src/clp_s/JsonFileIterator.cpp
@@ -76,85 +76,18 @@ bool JsonFileIterator::read_new_json() {
         if (m_sanitize_invalid_json) {
             // Handle invalid UTF-8 sequences by replacing with U+FFFD
             if (simdjson::error_code::UTF8_ERROR == error) {
-                size_t const old_buf_occupied = m_buf_occupied;
-                auto const result = StringUtils::sanitize_utf8_buffer(
-                        m_buf,
-                        m_buf_size,
-                        m_buf_occupied,
-                        simdjson::SIMDJSON_PADDING
-                );
-                m_buf_occupied = result.new_buf_occupied;
-                m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
-
-                if (!result.sanitized_char_counts.empty()) {
-                    size_t total_replaced = 0;
-                    for (auto const& [ch, count] : result.sanitized_char_counts) {
-                        total_replaced += count;
-                    }
-                    SPDLOG_WARN(
-                            "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size "
-                            "changed by {} bytes ({} -> {}).",
-                            total_replaced,
-                            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
-                            static_cast<int64_t>(m_buf_occupied)
-                                    - static_cast<int64_t>(old_buf_occupied),
-                            old_buf_occupied,
-                            m_buf_occupied
-                    );
-                }
-
-                error = m_parser.iterate_many(
-                                        m_buf,
-                                        /* length of data */ m_buf_occupied,
-                                        /* batch size of data to parse*/ m_buf_occupied
-                )
-                                .get(m_stream);
+                // Return value intentionally ignored - in read_new_json we always retry after
+                // sanitization regardless of whether changes were made
+                static_cast<void>(sanitize_and_log_utf8());
+                error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream);
             }
 
             // Handle unescaped control characters by escaping them to \u00XX format
             if (simdjson::error_code::UNESCAPED_CHARS == error) {
-                size_t const old_buf_occupied = m_buf_occupied;
-                // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by
-                // reference if reallocation is needed. This keeps m_buf_size in sync with the
-                // actual allocated buffer size.
-                auto const result = StringUtils::sanitize_json_buffer(
-                        m_buf,
-                        m_buf_size,
-                        m_buf_occupied,
-                        simdjson::SIMDJSON_PADDING
-                );
-                m_buf_occupied = result.new_buf_occupied;
-                m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
-
-                // Build log message with details of sanitized characters
-                size_t total_sanitized = 0;
-                std::string char_details;
-                for (auto const& [ch, count] : result.sanitized_char_counts) {
-                    if (!char_details.empty()) {
-                        char_details += ", ";
-                    }
-                    char_details
-                            += fmt::format("0x{:02x} ({})", static_cast<unsigned char>(ch), count);
-                    total_sanitized += count;
-                }
-                size_t bytes_added = m_buf_occupied - old_buf_occupied;
-                SPDLOG_WARN(
-                        "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} "
-                        "bytes ({} -> {}).",
-                        total_sanitized,
-                        m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
-                        char_details,
-                        bytes_added,
-                        old_buf_occupied,
-                        m_buf_occupied
-                );
-
-                error = m_parser.iterate_many(
-                                        m_buf,
-                                        /* length of data */ m_buf_occupied,
-                                        /* batch size of data to parse*/ m_buf_occupied
-                )
-                                .get(m_stream);
+                // Return value intentionally ignored - in read_new_json we always retry after
+                // sanitization regardless of whether changes were made
+                static_cast<void>(sanitize_and_log_json());
+                error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream);
             }
         }
 
@@ -189,6 +122,76 @@ size_t JsonFileIterator::skip_whitespace_and_get_truncated_bytes() {
     return m_buf_occupied - m_next_document_position;
 }
 
+bool JsonFileIterator::sanitize_and_log_utf8() {
+    size_t const old_buf_occupied = m_buf_occupied;
+    auto const result = StringUtils::sanitize_utf8_buffer(
+            m_buf,
+            m_buf_size,
+            m_buf_occupied,
+            simdjson::SIMDJSON_PADDING
+    );
+    m_buf_occupied = result.new_buf_occupied;
+    m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
+
+    if (result.sanitized_char_counts.empty()) {
+        return false;
+    }
+
+    size_t total_replaced = 0;
+    for (auto const& [ch, count] : result.sanitized_char_counts) {
+        total_replaced += count;
+    }
+    SPDLOG_WARN(
+            "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size changed by {} bytes "
+            "({} -> {}).",
+            total_replaced,
+            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
+            static_cast<int64_t>(m_buf_occupied) - static_cast<int64_t>(old_buf_occupied),
+            old_buf_occupied,
+            m_buf_occupied
+    );
+    return true;
+}
+
+bool JsonFileIterator::sanitize_and_log_json() {
+    size_t const old_buf_occupied = m_buf_occupied;
+    // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by reference if
+    // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size.
+    auto const result = StringUtils::sanitize_json_buffer(
+            m_buf,
+            m_buf_size,
+            m_buf_occupied,
+            simdjson::SIMDJSON_PADDING
+    );
+    m_buf_occupied = result.new_buf_occupied;
+    m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
+
+    if (result.sanitized_char_counts.empty()) {
+        return false;
+    }
+
+    size_t total_sanitized = 0;
+    std::string char_details;
+    for (auto const& [ch, count] : result.sanitized_char_counts) {
+        if (false == char_details.empty()) {
+            char_details += ", ";
+        }
+        char_details += fmt::format("0x{:02x} ({})", static_cast<unsigned char>(ch), count);
+        total_sanitized += count;
+    }
+    SPDLOG_WARN(
+            "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} bytes "
+            "({} -> {}).",
+            total_sanitized,
+            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
+            char_details,
+            m_buf_occupied - old_buf_occupied,
+            old_buf_occupied,
+            m_buf_occupied
+    );
+    return true;
+}
+
 bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& it) {
     if (false == m_first_read) {
         ++m_doc_it;
@@ -217,42 +220,7 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
             {
                 // Unescaped control characters detected during document iteration. Sanitize the
                 // buffer and re-setup the document stream to restart from the beginning.
-                size_t const old_buf_occupied = m_buf_occupied;
-                auto const result = StringUtils::sanitize_json_buffer(
-                        m_buf,
-                        m_buf_size,
-                        m_buf_occupied,
-                        simdjson::SIMDJSON_PADDING
-                );
-                m_buf_occupied = result.new_buf_occupied;
-                m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
-
-                // Log sanitization details
-                if (!result.sanitized_char_counts.empty()) {
-                    size_t total_sanitized = 0;
-                    std::string char_details;
-                    for (auto const& [ch, count] : result.sanitized_char_counts) {
-                        if (!char_details.empty()) {
-                            char_details += ", ";
-                        }
-                        char_details += fmt::format(
-                                "0x{:02x} ({})",
-                                static_cast<unsigned char>(ch),
-                                count
-                        );
-                        total_sanitized += count;
-                    }
-                    SPDLOG_WARN(
-                            "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by "
-                            "{} bytes ({} -> {}).",
-                            total_sanitized,
-                            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
-                            char_details,
-                            m_buf_occupied - old_buf_occupied,
-                            old_buf_occupied,
-                            m_buf_occupied
-                    );
-                } else {
+                if (false == sanitize_and_log_json()) {
                     // Sanitization made no changes - report the original error to avoid infinite
                     // loop
                     m_error_code = m_doc_it.error();
@@ -293,32 +261,7 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
             // way to check is to parse it ourselves, so we rely on this heuristic for now.
             if (m_sanitize_invalid_json) {
                 // Sanitize invalid UTF-8 sequences and retry
-                size_t const old_buf_occupied = m_buf_occupied;
-                auto const result = StringUtils::sanitize_utf8_buffer(
-                        m_buf,
-                        m_buf_size,
-                        m_buf_occupied,
-                        simdjson::SIMDJSON_PADDING
-                );
-                m_buf_occupied = result.new_buf_occupied;
-                m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
-
-                if (!result.sanitized_char_counts.empty()) {
-                    size_t total_replaced = 0;
-                    for (auto const& [ch, count] : result.sanitized_char_counts) {
-                        total_replaced += count;
-                    }
-                    SPDLOG_WARN(
-                            "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size "
-                            "changed by {} bytes ({} -> {}).",
-                            total_replaced,
-                            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
-                            static_cast<int64_t>(m_buf_occupied)
-                                    - static_cast<int64_t>(old_buf_occupied),
-                            old_buf_occupied,
-                            m_buf_occupied
-                    );
-                } else {
+                if (false == sanitize_and_log_utf8()) {
                     // Sanitization made no changes - report the original error to avoid infinite
                     // loop
                     m_error_code = simdjson::error_code::UTF8_ERROR;
diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp
index 8b9636bd14..0ef58d6fa3 100644
--- a/components/core/src/clp_s/JsonFileIterator.hpp
+++ b/components/core/src/clp_s/JsonFileIterator.hpp
@@ -81,6 +81,20 @@ class JsonFileIterator {
      */
     [[nodiscard]] size_t skip_whitespace_and_get_truncated_bytes();
 
+    /**
+     * Sanitizes invalid UTF-8 sequences in the buffer by replacing them with U+FFFD,
+     * updates buffer tracking variables, and logs a warning if changes were made.
+     * @return true if sanitization made changes, false otherwise
+     */
+    [[nodiscard]] bool sanitize_and_log_utf8();
+
+    /**
+     * Sanitizes unescaped control characters in the buffer by escaping them to \\u00XX format,
+     * updates buffer tracking variables, and logs a warning if changes were made.
+     * @return true if sanitization made changes, false otherwise
+     */
+    [[nodiscard]] bool sanitize_and_log_json();
+
     size_t m_truncated_bytes{0};
     size_t m_next_document_position{0};
     size_t m_bytes_read{0};

From c68c60701776b7830bef46e25950b537945853c3 Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Fri, 9 Jan 2026 03:58:03 +0800
Subject: [PATCH 09/11] refactor(clp_s): extract test helpers to reduce
 duplication

- Add SanitizeResultWithOutput struct for returning both result and output
- Add sanitize_json_with_result() and sanitize_utf8_with_result() helpers
- Simplify sanitize_string() and sanitize_utf8_string() to use helpers
- Replace manual buffer management in count tests with helper calls

This reduces code duplication in test files and improves maintainability.
---
 .../core/tests/test-clp_s-StringUtils.cpp     | 211 +++++++-----------
 1 file changed, 82 insertions(+), 129 deletions(-)

diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp
index 48794f8d4f..f7969ecbee 100644
--- a/components/core/tests/test-clp_s-StringUtils.cpp
+++ b/components/core/tests/test-clp_s-StringUtils.cpp
@@ -1,4 +1,5 @@
 #include <cstring>
+#include <memory>
 #include <string>
 #include <string_view>
 
@@ -19,36 +20,47 @@ using namespace std::string_literals;
 
 namespace {
 /**
- * Helper to create a buffer with simdjson padding and run sanitization.
+ * Result structure containing both the sanitization result and the output string.
+ */
+struct SanitizeResultWithOutput {
+    JsonSanitizeResult result;
+    std::string output;
+};
+
+/**
+ * Helper to create a buffer with simdjson padding and run JSON sanitization.
+ * Returns both the result (with character counts) and the sanitized output string.
  * @param input The input string to sanitize
- * @return The sanitized string
- * @note If sanitize_json_buffer reallocates the buffer, the new buffer pointer is returned
- *       and the original buffer is automatically deleted by sanitize_json_buffer. This helper
- *       correctly handles both cases (reallocation and no reallocation) by always deleting
- *       the final buffer pointer.
+ * @return SanitizeResultWithOutput containing result metadata and sanitized string
+ * @note sanitize_json_buffer may reallocate the buffer (passed by reference). This helper
+ *       handles both reallocation and no-reallocation cases correctly.
  */
-auto sanitize_string(std::string_view input) -> std::string {
+auto sanitize_json_with_result(std::string_view input) -> SanitizeResultWithOutput {
     size_t buf_size = input.size();
     size_t buf_occupied = input.size();
+    // Use raw pointer since sanitize_json_buffer may delete and reallocate
     char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
     std::memcpy(buf, input.data(), input.size());
 
-    try {
-        auto const result = StringUtils::sanitize_json_buffer(
-                buf,
-                buf_size,
-                buf_occupied,
-                simdjson::SIMDJSON_PADDING
-        );
-        std::string output(buf, result.new_buf_occupied);
-        // Delete the buffer (may be reallocated, but buf points to the current buffer)
-        delete[] buf;
-        return output;
-    } catch (...) {
-        // If sanitization failed, buf still points to the original buffer
-        delete[] buf;
-        throw;
-    }
+    auto result = StringUtils::sanitize_json_buffer(
+            buf,
+            buf_size,
+            buf_occupied,
+            simdjson::SIMDJSON_PADDING
+    );
+    std::string output(buf, result.new_buf_occupied);
+    // buf now points to the final buffer (may have been reallocated)
+    delete[] buf;
+    return {std::move(result), std::move(output)};
+}
+
+/**
+ * Helper to create a buffer with simdjson padding and run JSON sanitization.
+ * @param input The input string to sanitize
+ * @return The sanitized string
+ */
+auto sanitize_string(std::string_view input) -> std::string {
+    return sanitize_json_with_result(input).output;
 }
 
 /**
@@ -281,29 +293,38 @@ TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") {
 namespace {
 /**
  * Helper to create a buffer with simdjson padding and run UTF-8 sanitization.
+ * Returns both the result (with character counts) and the sanitized output string.
  * @param input The input string to sanitize
- * @return The sanitized string
+ * @return SanitizeResultWithOutput containing result metadata and sanitized string
+ * @note sanitize_utf8_buffer may reallocate the buffer (passed by reference). This helper
+ *       handles both reallocation and no-reallocation cases correctly.
  */
-auto sanitize_utf8_string(std::string_view input) -> std::string {
+auto sanitize_utf8_with_result(std::string_view input) -> SanitizeResultWithOutput {
     size_t buf_size = input.size();
     size_t buf_occupied = input.size();
+    // Use raw pointer since sanitize_utf8_buffer may delete and reallocate
     char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
     std::memcpy(buf, input.data(), input.size());
 
-    try {
-        auto const result = StringUtils::sanitize_utf8_buffer(
-                buf,
-                buf_size,
-                buf_occupied,
-                simdjson::SIMDJSON_PADDING
-        );
-        std::string output(buf, result.new_buf_occupied);
-        delete[] buf;
-        return output;
-    } catch (...) {
-        delete[] buf;
-        throw;
-    }
+    auto result = StringUtils::sanitize_utf8_buffer(
+            buf,
+            buf_size,
+            buf_occupied,
+            simdjson::SIMDJSON_PADDING
+    );
+    std::string output(buf, result.new_buf_occupied);
+    // buf now points to the final buffer (may have been reallocated)
+    delete[] buf;
+    return {std::move(result), std::move(output)};
+}
+
+/**
+ * Helper to create a buffer with simdjson padding and run UTF-8 sanitization.
+ * @param input The input string to sanitize
+ * @return The sanitized string
+ */
+auto sanitize_utf8_string(std::string_view input) -> std::string {
+    return sanitize_utf8_with_result(input).output;
 }
 
 /**
@@ -449,58 +470,24 @@ TEST_CASE("sanitize_utf8_buffer_returns_correct_counts", "[clp_s][StringUtils]")
     SECTION("counts invalid sequences") {
         auto input = "\xFF\xFE\xFF"s;  // 3 invalid bytes
 
-        size_t buf_size = input.size();
-        size_t buf_occupied = input.size();
-        char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
-        std::memcpy(buf, input.data(), input.size());
-
-        try {
-            auto const result = StringUtils::sanitize_utf8_buffer(
-                    buf,
-                    buf_size,
-                    buf_occupied,
-                    simdjson::SIMDJSON_PADDING
-            );
-
-            // The count uses 0xFF as a special key for invalid UTF-8 sequences
-            REQUIRE_FALSE(result.sanitized_char_counts.empty());
-            size_t total = 0;
-            for (auto const& [ch, count] : result.sanitized_char_counts) {
-                total += count;
-            }
-            REQUIRE(total == 3);
-
-            delete[] buf;
-        } catch (...) {
-            delete[] buf;
-            throw;
+        auto [result, output] = sanitize_utf8_with_result(input);
+
+        // The count uses 0xFF as a special key for invalid UTF-8 sequences
+        REQUIRE_FALSE(result.sanitized_char_counts.empty());
+        size_t total = 0;
+        for (auto const& [ch, count] : result.sanitized_char_counts) {
+            total += count;
         }
+        REQUIRE(total == 3);
     }
 
     SECTION("returns empty counts when no sanitization needed") {
         std::string input = "valid UTF-8 string";
 
-        size_t buf_size = input.size();
-        size_t buf_occupied = input.size();
-        char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
-        std::memcpy(buf, input.data(), input.size());
-
-        try {
-            auto const result = StringUtils::sanitize_utf8_buffer(
-                    buf,
-                    buf_size,
-                    buf_occupied,
-                    simdjson::SIMDJSON_PADDING
-            );
-
-            REQUIRE(result.sanitized_char_counts.empty());
-            REQUIRE(result.new_buf_occupied == input.size());
-
-            delete[] buf;
-        } catch (...) {
-            delete[] buf;
-            throw;
-        }
+        auto [result, output] = sanitize_utf8_with_result(input);
+
+        REQUIRE(result.sanitized_char_counts.empty());
+        REQUIRE(result.new_buf_occupied == input.size());
     }
 }
 
@@ -530,54 +517,20 @@ TEST_CASE("sanitize_json_buffer_returns_correct_char_counts", "[clp_s][StringUti
         // Input with: 3x \x00, 2x \x01, 1x \x1f
         auto input = "{\"a\": \"\x00\x00\x01\x00\x01\x1f\"}"s;
 
-        size_t buf_size = input.size();
-        size_t buf_occupied = input.size();
-        char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
-        std::memcpy(buf, input.data(), input.size());
-
-        try {
-            auto const result = StringUtils::sanitize_json_buffer(
-                    buf,
-                    buf_size,
-                    buf_occupied,
-                    simdjson::SIMDJSON_PADDING
-            );
-
-            REQUIRE(result.sanitized_char_counts.size() == 3);
-            REQUIRE(result.sanitized_char_counts.at('\x00') == 3);
-            REQUIRE(result.sanitized_char_counts.at('\x01') == 2);
-            REQUIRE(result.sanitized_char_counts.at('\x1f') == 1);
-
-            delete[] buf;
-        } catch (...) {
-            delete[] buf;
-            throw;
-        }
+        auto [result, output] = sanitize_json_with_result(input);
+
+        REQUIRE(result.sanitized_char_counts.size() == 3);
+        REQUIRE(result.sanitized_char_counts.at('\x00') == 3);
+        REQUIRE(result.sanitized_char_counts.at('\x01') == 2);
+        REQUIRE(result.sanitized_char_counts.at('\x1f') == 1);
     }
 
     SECTION("returns empty counts when no sanitization needed") {
         std::string input = R"({"key": "valid value"})";
 
-        size_t buf_size = input.size();
-        size_t buf_occupied = input.size();
-        char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
-        std::memcpy(buf, input.data(), input.size());
-
-        try {
-            auto const result = StringUtils::sanitize_json_buffer(
-                    buf,
-                    buf_size,
-                    buf_occupied,
-                    simdjson::SIMDJSON_PADDING
-            );
-
-            REQUIRE(result.sanitized_char_counts.empty());
-            REQUIRE(result.new_buf_occupied == input.size());
-
-            delete[] buf;
-        } catch (...) {
-            delete[] buf;
-            throw;
-        }
+        auto [result, output] = sanitize_json_with_result(input);
+
+        REQUIRE(result.sanitized_char_counts.empty());
+        REQUIRE(result.new_buf_occupied == input.size());
     }
 }

From cce34fbeb99f822bfa4fb0745dfa627c5ef80fce Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Fri, 9 Jan 2026 04:25:07 +0800
Subject: [PATCH 10/11] refactor(clp_s): improve sanitization code quality and
 efficiency

Code quality improvements:
- Rename JsonSanitizeResult to BufferSanitizeResult for clarity
- Rename helper methods: sanitize_and_log_utf8 -> sanitize_invalid_utf8_and_log,
  sanitize_and_log_json -> sanitize_control_chars_and_log
- Fix typo: "delimeters" -> "delimiters"
- Fix docs: "Pointer to pointer" -> "Reference to pointer"
- Add @note about buffer reallocation to both sanitization helpers
- Add null pointer checks to sanitization functions
- Extract reinitialize_document_stream() helper to reduce duplication
- Add named constant cUtf8ReplacementChar for U+FFFD magic string
- Remove unused variable in char_to_hex()
- Use consistent signed arithmetic in log messages

Efficiency improvements:
- Use lazy-copy approach: only allocate when first issue is found
- Copy valid ranges in bulk using append(ptr, len) instead of byte-by-byte
- Zero allocation in the common case when no sanitization is needed
---
 .../core/src/clp_s/JsonFileIterator.cpp       | 48 +++++-----
 .../core/src/clp_s/JsonFileIterator.hpp       | 15 +++-
 components/core/src/clp_s/Utils.cpp           | 88 ++++++++++++-------
 components/core/src/clp_s/Utils.hpp           | 19 ++--
 .../core/tests/test-clp_s-StringUtils.cpp     |  4 +-
 5 files changed, 105 insertions(+), 69 deletions(-)

diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp
index afe014160c..447e73617d 100644
--- a/components/core/src/clp_s/JsonFileIterator.cpp
+++ b/components/core/src/clp_s/JsonFileIterator.cpp
@@ -78,7 +78,7 @@ bool JsonFileIterator::read_new_json() {
             if (simdjson::error_code::UTF8_ERROR == error) {
                 // Return value intentionally ignored - in read_new_json we always retry after
                 // sanitization regardless of whether changes were made
-                static_cast<void>(sanitize_and_log_utf8());
+                static_cast<void>(sanitize_invalid_utf8_and_log());
                 error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream);
             }
 
@@ -86,7 +86,7 @@ bool JsonFileIterator::read_new_json() {
             if (simdjson::error_code::UNESCAPED_CHARS == error) {
                 // Return value intentionally ignored - in read_new_json we always retry after
                 // sanitization regardless of whether changes were made
-                static_cast<void>(sanitize_and_log_json());
+                static_cast<void>(sanitize_control_chars_and_log());
                 error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream);
             }
         }
@@ -122,8 +122,10 @@ size_t JsonFileIterator::skip_whitespace_and_get_truncated_bytes() {
     return m_buf_occupied - m_next_document_position;
 }
 
-bool JsonFileIterator::sanitize_and_log_utf8() {
+bool JsonFileIterator::sanitize_invalid_utf8_and_log() {
     size_t const old_buf_occupied = m_buf_occupied;
+    // Note: sanitize_utf8_buffer may reallocate m_buf and will update m_buf_size by reference if
+    // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size.
     auto const result = StringUtils::sanitize_utf8_buffer(
             m_buf,
             m_buf_size,
@@ -142,7 +144,7 @@ bool JsonFileIterator::sanitize_and_log_utf8() {
         total_replaced += count;
     }
     SPDLOG_WARN(
-            "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer size changed by {} bytes "
+            "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer expanded by {} bytes "
             "({} -> {}).",
             total_replaced,
             m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
@@ -153,7 +155,7 @@ bool JsonFileIterator::sanitize_and_log_utf8() {
     return true;
 }
 
-bool JsonFileIterator::sanitize_and_log_json() {
+bool JsonFileIterator::sanitize_control_chars_and_log() {
     size_t const old_buf_occupied = m_buf_occupied;
     // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by reference if
     // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size.
@@ -185,13 +187,25 @@ bool JsonFileIterator::sanitize_and_log_json() {
             total_sanitized,
             m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
             char_details,
-            m_buf_occupied - old_buf_occupied,
+            static_cast<int64_t>(m_buf_occupied) - static_cast<int64_t>(old_buf_occupied),
             old_buf_occupied,
             m_buf_occupied
     );
     return true;
 }
 
+bool JsonFileIterator::reinitialize_document_stream() {
+    auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream);
+    if (error) {
+        m_error_code = error;
+        return false;
+    }
+    m_doc_it = m_stream.begin();
+    m_first_doc_in_buffer = true;
+    m_next_document_position = 0;
+    return true;
+}
+
 bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& it) {
     if (false == m_first_read) {
         ++m_doc_it;
@@ -220,23 +234,16 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
             {
                 // Unescaped control characters detected during document iteration. Sanitize the
                 // buffer and re-setup the document stream to restart from the beginning.
-                if (false == sanitize_and_log_json()) {
+                if (false == sanitize_control_chars_and_log()) {
                     // Sanitization made no changes - report the original error to avoid infinite
                     // loop
                     m_error_code = m_doc_it.error();
                     return false;
                 }
 
-                // Re-setup the document stream and restart iteration
-                auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied)
-                                     .get(m_stream);
-                if (error) {
-                    m_error_code = error;
+                if (false == reinitialize_document_stream()) {
                     return false;
                 }
-                m_doc_it = m_stream.begin();
-                m_first_doc_in_buffer = true;
-                m_next_document_position = 0;
                 continue;
             } else {
                 m_error_code = m_doc_it.error();
@@ -261,23 +268,16 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
             // way to check is to parse it ourselves, so we rely on this heuristic for now.
             if (m_sanitize_invalid_json) {
                 // Sanitize invalid UTF-8 sequences and retry
-                if (false == sanitize_and_log_utf8()) {
+                if (false == sanitize_invalid_utf8_and_log()) {
                     // Sanitization made no changes - report the original error to avoid infinite
                     // loop
                     m_error_code = simdjson::error_code::UTF8_ERROR;
                     return false;
                 }
 
-                // Re-setup the document stream and restart iteration
-                auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied)
-                                     .get(m_stream);
-                if (error) {
-                    m_error_code = error;
+                if (false == reinitialize_document_stream()) {
                     return false;
                 }
-                m_doc_it = m_stream.begin();
-                m_first_doc_in_buffer = true;
-                m_next_document_position = 0;
                 continue;
             }
             m_error_code = simdjson::error_code::UTF8_ERROR;
diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp
index 0ef58d6fa3..2b8be94a4c 100644
--- a/components/core/src/clp_s/JsonFileIterator.hpp
+++ b/components/core/src/clp_s/JsonFileIterator.hpp
@@ -13,7 +13,7 @@ class JsonFileIterator {
     /**
      * An iterator over an input stream containing json objects. JSON is parsed
      * using simdjson::parse_many. This allows simdjson to efficiently find
-     * delimeters between JSON objects, and if enabled parse JSON ahead of time
+     * delimiters between JSON objects, and if enabled parse JSON ahead of time
      * in another thread while the JSON is being iterated over.
      *
      * The buffer grows automatically if there are JSON objects larger than the buffer size.
@@ -85,15 +85,24 @@ class JsonFileIterator {
      * Sanitizes invalid UTF-8 sequences in the buffer by replacing them with U+FFFD,
      * updates buffer tracking variables, and logs a warning if changes were made.
      * @return true if sanitization made changes, false otherwise
+     * @note May reallocate m_buf if buffer expansion is needed. m_buf_size is updated accordingly.
      */
-    [[nodiscard]] bool sanitize_and_log_utf8();
+    [[nodiscard]] bool sanitize_invalid_utf8_and_log();
 
     /**
      * Sanitizes unescaped control characters in the buffer by escaping them to \\u00XX format,
      * updates buffer tracking variables, and logs a warning if changes were made.
      * @return true if sanitization made changes, false otherwise
+     * @note May reallocate m_buf if buffer expansion is needed. m_buf_size is updated accordingly.
      */
-    [[nodiscard]] bool sanitize_and_log_json();
+    [[nodiscard]] bool sanitize_control_chars_and_log();
+
+    /**
+     * Reinitializes the document stream after buffer sanitization.
+     * Resets iteration state to start from the beginning of the buffer.
+     * @return true on success, false if iteration setup fails (m_error_code is set on failure)
+     */
+    [[nodiscard]] bool reinitialize_document_stream();
 
     size_t m_truncated_bytes{0};
     size_t m_next_document_position{0};
diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp
index 9458c970dd..321032788f 100644
--- a/components/core/src/clp_s/Utils.cpp
+++ b/components/core/src/clp_s/Utils.cpp
@@ -164,64 +164,73 @@ bool UriUtils::get_last_uri_component(std::string_view const uri, std::string& n
     return true;
 }
 
-JsonSanitizeResult StringUtils::sanitize_json_buffer(
+BufferSanitizeResult StringUtils::sanitize_json_buffer(
         char*& buf,
         size_t& buf_size,
         size_t buf_occupied,
         size_t simdjson_padding
 ) {
-    // Build sanitized content in a temporary buffer, escaping control characters
-    // inside JSON strings. This is a fallback path only triggered on UNESCAPED_CHARS error,
-    // so we prioritize correctness over performance.
+    // Early return for empty or null buffer
+    if (nullptr == buf || 0 == buf_occupied) {
+        return {buf_occupied, {}};
+    }
+
+    // Sanitize by escaping control characters inside JSON strings. Uses a lazy-copy approach:
+    // only allocate the output string when the first control character is found, and copy
+    // valid ranges in bulk rather than byte-by-byte for efficiency.
     //
     // Note: If the buffer contains unmatched quotes (e.g., truncated JSON), the string state
     // tracking may be incorrect, potentially escaping control characters outside of actual
     // JSON strings. This is acceptable since such malformed JSON will fail parsing anyway.
     std::string sanitized;
-
     std::map<char, size_t> sanitized_char_counts;
     bool in_string = false;
     bool escape_next = false;
+    size_t copy_start = 0;
 
     for (size_t i = 0; i < buf_occupied; ++i) {
-        char c = buf[i];
+        char const c = buf[i];
 
         if (escape_next) {
             escape_next = false;
-            sanitized.push_back(c);
             continue;
         }
 
-        if (c == '\\' && in_string) {
+        if ('\\' == c && in_string) {
             escape_next = true;
-            sanitized.push_back(c);
             continue;
         }
 
-        if (c == '"') {
+        if ('"' == c) {
             in_string = !in_string;
-            sanitized.push_back(c);
             continue;
         }
 
         // Escape control characters (0x00-0x1F) inside strings to \u00XX format
         if (in_string && static_cast<unsigned char>(c) < 0x20) {
+            if (sanitized.empty()) {
+                // First control char found - allocate with extra space for escapes
+                sanitized.reserve(buf_occupied + 64);
+            }
+            // Copy the valid range before this control character
+            sanitized.append(buf + copy_start, i - copy_start);
             char_to_escaped_four_char_hex(sanitized, c);
             ++sanitized_char_counts[c];
-        } else {
-            sanitized.push_back(c);
+            copy_start = i + 1;
         }
     }
 
-    // If no changes were made, return early
-    if (sanitized.size() == buf_occupied) {
+    // If no sanitization was needed, return early without any allocation
+    if (sanitized.empty()) {
         return {buf_occupied, {}};
     }
 
+    // Copy any remaining content after the last control character
+    sanitized.append(buf + copy_start, buf_occupied - copy_start);
+
     // Grow buffer if needed to hold sanitized content
     if (sanitized.size() > buf_size) {
-        // Allocate exactly the size needed since we know the exact size upfront
-        size_t new_buf_size = sanitized.size();
+        size_t const new_buf_size = sanitized.size();
         char* new_buf = new char[new_buf_size + simdjson_padding];
         delete[] buf;
         buf = new_buf;
@@ -325,45 +334,60 @@ size_t validate_utf8_sequence(char const* buf, size_t pos, size_t buf_occupied)
 }
 }  // namespace
 
-JsonSanitizeResult StringUtils::sanitize_utf8_buffer(
+BufferSanitizeResult StringUtils::sanitize_utf8_buffer(
         char*& buf,
         size_t& buf_size,
         size_t buf_occupied,
         size_t simdjson_padding
 ) {
-    // Build sanitized content in a temporary buffer, replacing invalid UTF-8 sequences
-    // with the Unicode replacement character U+FFFD (0xEF 0xBF 0xBD).
-    std::string sanitized;
+    // Early return for empty or null buffer
+    if (nullptr == buf || 0 == buf_occupied) {
+        return {buf_occupied, {}};
+    }
 
-    // We use a special key to count invalid UTF-8 sequences
-    // Using 0xFF as the key since it's never a valid UTF-8 byte
-    std::map<char, size_t> sanitized_char_counts;
+    // Sanitize by replacing invalid UTF-8 sequences with U+FFFD. Uses a lazy-copy approach:
+    // only allocate the output string when the first invalid sequence is found, and copy
+    // valid ranges in bulk rather than byte-by-byte for efficiency.
+    constexpr std::string_view cUtf8ReplacementChar{"\xEF\xBF\xBD", 3};
     constexpr char cInvalidUtf8Key = static_cast<char>(0xFF);
 
+    std::string sanitized;
+    std::map<char, size_t> sanitized_char_counts;
+    size_t copy_start = 0;
+
     size_t i = 0;
     while (i < buf_occupied) {
-        size_t seq_len = validate_utf8_sequence(buf, i, buf_occupied);
+        size_t const seq_len = validate_utf8_sequence(buf, i, buf_occupied);
         if (seq_len > 0) {
-            // Valid sequence - copy it
-            sanitized.append(buf + i, seq_len);
+            // Valid sequence - advance position (will be copied in bulk later)
             i += seq_len;
         } else {
-            // Invalid sequence - replace with U+FFFD
-            sanitized.append("\xEF\xBF\xBD", 3);
+            // Invalid sequence - need to sanitize
+            if (sanitized.empty()) {
+                // First invalid sequence found - allocate with extra space for replacements
+                sanitized.reserve(buf_occupied + 64);
+            }
+            // Copy the valid range before this invalid byte
+            sanitized.append(buf + copy_start, i - copy_start);
+            sanitized.append(cUtf8ReplacementChar);
             ++sanitized_char_counts[cInvalidUtf8Key];
             // Skip one byte and continue (maximal subpart replacement strategy)
             ++i;
+            copy_start = i;
         }
     }
 
-    // If no changes were made, return early
-    if (sanitized.size() == buf_occupied) {
+    // If no sanitization was needed, return early without any allocation
+    if (sanitized.empty()) {
         return {buf_occupied, {}};
     }
 
+    // Copy any remaining valid content after the last invalid byte
+    sanitized.append(buf + copy_start, buf_occupied - copy_start);
+
     // Grow buffer if needed to hold sanitized content
     if (sanitized.size() > buf_size) {
-        size_t new_buf_size = sanitized.size();
+        size_t const new_buf_size = sanitized.size();
         char* new_buf = new char[new_buf_size + simdjson_padding];
         delete[] buf;
         buf = new_buf;
diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp
index 54111d25a6..95033e18c3 100644
--- a/components/core/src/clp_s/Utils.hpp
+++ b/components/core/src/clp_s/Utils.hpp
@@ -65,10 +65,14 @@ class UriUtils {
 };
 
 /**
- * Result of sanitizing a JSON buffer, including statistics about what was sanitized.
+ * Result of sanitizing a buffer, including statistics about what was sanitized.
  */
-struct JsonSanitizeResult {
+struct BufferSanitizeResult {
+    /// New number of bytes occupied in the buffer after sanitization.
     size_t new_buf_occupied;
+    /// Map of sanitized characters to their occurrence counts.
+    /// For JSON control char sanitization: key is the actual control character (0x00-0x1F).
+    /// For UTF-8 sanitization: key is 0xFF (sentinel value for all invalid sequences).
     std::map<char, size_t> sanitized_char_counts;
 };
 
@@ -101,7 +105,7 @@ class StringUtils {
      *
      * Characters are escaped using unicode escape sequences (e.g., \x00 becomes \u0000).
      *
-     * @param buf Pointer to pointer to the buffer. May be reallocated if expansion is needed.
+     * @param buf Reference to pointer to the buffer. May be reallocated if expansion is needed.
      *             The caller's pointer will be updated to point to the new buffer if reallocation
      *             occurs. The caller is responsible for deleting the buffer.
      * @param buf_size Current allocated size of the buffer (excluding simdjson padding).
@@ -111,7 +115,7 @@ class StringUtils {
      * @param simdjson_padding Amount of padding required after buffer content.
      * @return Result containing new buf_occupied and counts of each sanitized character.
      */
-    static JsonSanitizeResult sanitize_json_buffer(
+    static BufferSanitizeResult sanitize_json_buffer(
             char*& buf,
             size_t& buf_size,
             size_t buf_occupied,
@@ -123,7 +127,7 @@ class StringUtils {
      * character (U+FFFD). This is used to fix malformed data that contains invalid UTF-8
      * which would cause JSON parsing errors.
      *
-     * @param buf Pointer to pointer to the buffer. May be reallocated if expansion is needed.
+     * @param buf Reference to pointer to the buffer. May be reallocated if expansion is needed.
      *            The caller's pointer will be updated to point to the new buffer if reallocation
      *            occurs. The caller is responsible for deleting the buffer.
      * @param buf_size Current allocated size of the buffer (excluding simdjson padding).
@@ -134,7 +138,7 @@ class StringUtils {
      * @return Result containing new buf_occupied and counts of each type of invalid sequence
      *         replaced.
      */
-    static JsonSanitizeResult sanitize_utf8_buffer(
+    static BufferSanitizeResult sanitize_utf8_buffer(
             char*& buf,
             size_t& buf_size,
             size_t buf_occupied,
@@ -148,7 +152,6 @@ class StringUtils {
      * @return the two byte hexadecimal representation of c as an array of two characters.
      */
     static std::array<char, 2> char_to_hex(char c) {
-        std::array<char, 2> ret;
         auto nibble_to_hex = [](char nibble) -> char {
             if ('\x00' <= nibble && nibble <= '\x09') {
                 return '0' + (nibble - '\x00');
@@ -157,7 +160,7 @@ class StringUtils {
             }
         };
 
-        return std::array<char, 2>{nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)};
+        return {nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)};
     }
 
     /**
diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp
index f7969ecbee..898d0a7be4 100644
--- a/components/core/tests/test-clp_s-StringUtils.cpp
+++ b/components/core/tests/test-clp_s-StringUtils.cpp
@@ -8,7 +8,7 @@
 
 #include "../src/clp_s/Utils.hpp"
 
-using clp_s::JsonSanitizeResult;
+using clp_s::BufferSanitizeResult;
 using clp_s::StringUtils;
 
 // We use C++14 string literals (the `s` suffix) to construct strings containing embedded null bytes
@@ -23,7 +23,7 @@ namespace {
  * Result structure containing both the sanitization result and the output string.
  */
 struct SanitizeResultWithOutput {
-    JsonSanitizeResult result;
+    BufferSanitizeResult result;
     std::string output;
 };
 

From 775f513350958e4578f7a3a474db4a98306b17ed Mon Sep 17 00:00:00 2001
From: Jack Luo <jack.luo@mail.utoronto.ca>
Date: Fri, 9 Jan 2026 04:29:17 +0800
Subject: [PATCH 11/11] style(test): use explicit while loop for clearer skip
 logic in count_replacement_chars

---
 components/core/tests/test-clp_s-StringUtils.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp
index 898d0a7be4..e3ef434ecb 100644
--- a/components/core/tests/test-clp_s-StringUtils.cpp
+++ b/components/core/tests/test-clp_s-StringUtils.cpp
@@ -332,12 +332,15 @@ auto sanitize_utf8_string(std::string_view input) -> std::string {
  */
 auto count_replacement_chars(std::string_view s) -> size_t {
     size_t count = 0;
-    for (size_t i = 0; i + 2 < s.size(); ++i) {
+    size_t i = 0;
+    while (i + 2 < s.size()) {
         if (static_cast<unsigned char>(s[i]) == 0xEF && static_cast<unsigned char>(s[i + 1]) == 0xBF
             && static_cast<unsigned char>(s[i + 2]) == 0xBD)
         {
             ++count;
-            i += 2;  // Skip past the replacement char
+            i += 3;  // Skip past the replacement char (3 bytes for U+FFFD)
+        } else {
+            ++i;
         }
     }
     return count;