From e31b06f126047b20fd4516f3a9d1aeabe09683cc Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Feb 2026 00:45:09 +0000
Subject: [PATCH 1/7] Initial plan


From 5067ec8ce21d8951a18b35a479add509305246b5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Feb 2026 00:48:14 +0000
Subject: [PATCH 2/7] Add ClusterFuzzLite fuzzing infrastructure

Co-authored-by: bact <128572+bact@users.noreply.github.com>
---
 .github/workflows/clusterfuzzlite.yml |  82 ++++++++++++++
 Dockerfile                            |  32 ++++--
 Dockerfile.app                        |  15 +++
 docker-compose.yml                    |   4 +-
 fuzz/README.md                        | 154 ++++++++++++++++++++++++++
 fuzz/build.sh                         |  28 +++++
 fuzz/fuzz_tokenize.py                 |  51 +++++++++
 fuzz/fuzz_util_normalize.py           |  49 ++++++++
 8 files changed, 405 insertions(+), 10 deletions(-)
 create mode 100644 .github/workflows/clusterfuzzlite.yml
 create mode 100644 Dockerfile.app
 create mode 100644 fuzz/README.md
 create mode 100755 fuzz/build.sh
 create mode 100644 fuzz/fuzz_tokenize.py
 create mode 100644 fuzz/fuzz_util_normalize.py

diff --git a/.github/workflows/clusterfuzzlite.yml b/.github/workflows/clusterfuzzlite.yml
new file mode 100644
index 000000000..949a18544
--- /dev/null
+++ b/.github/workflows/clusterfuzzlite.yml
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+
+name: ClusterFuzzLite
+
+on:
+  push:
+    branches:
+      - dev
+    paths-ignore:
+      - '**.cff'
+      - '**.json'
+      - '**.md'
+      - '**.rst'
+      - '**.txt'
+      - '**.yml'
+      - 'docs/**'
+  pull_request:
+    branches:
+      - dev
+    paths-ignore:
+      - '**.cff'
+      - '**.json'
+      - '**.md'
+      - '**.rst'
+      - '**.txt'
+      - '**.yml'
+      - 'docs/**'
+  schedule:
+    - cron: '0 6 * * *'  # Daily at 06:00 UTC
+
+# Avoid duplicate runs for the same source branch and repository.
+# For pull_request events, uses the source repo name from
+# github.event.pull_request.head.repo.full_name; otherwise uses github.repository.
+# For push events, uses the branch name from github.ref_name.
+# For pull_request events, uses the source branch name from github.head_ref.
+# This ensures events for the same repo and branch share the same group,
+# and avoids cross-fork collisions when branch names are reused.
+concurrency:
+  group: >-
+    ${{ github.workflow }}-${{
+      github.event.pull_request.head.repo.full_name || github.repository
+    }}-${{ github.head_ref || github.ref_name }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  issues: write
+
+jobs:
+  fuzzing:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        sanitizer: [address]
+    steps:
+      - name: Build Fuzzers (${{ matrix.sanitizer }})
+        id: build
+        uses: google/clusterfuzzlite/actions/build_fuzzers@v1
+        with:
+          sanitizer: ${{ matrix.sanitizer }}
+          language: python
+
+      - name: Run Fuzzers (${{ matrix.sanitizer }})
+        id: run
+        uses: google/clusterfuzzlite/actions/run_fuzzers@v1
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          fuzz-seconds: 300
+          mode: 'code-change'
+          sanitizer: ${{ matrix.sanitizer }}
+          storage-repo: https://${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git
+          storage-repo-branch: gh-pages
+          storage-repo-branch-coverage: gh-pages
+
+      - name: Upload crash artifacts
+        if: failure() && steps.run.outcome == 'failure'
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.sanitizer }}-artifacts
+          path: ./out/artifacts
diff --git a/Dockerfile b/Dockerfile
index 7cbf8c3cb..7368893fe 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,15 +1,29 @@
-# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
+# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
 # SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0
 
-FROM python:3.12
+# Dockerfile for ClusterFuzzLite fuzzing
+# This extends the OSS-Fuzz base builder image for Python projects
 
-WORKDIR /app
+FROM gcr.io/oss-fuzz-base/base-builder-python
 
-COPY . .
+# Install system dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    libicu-dev \
+    pkg-config && \
+    rm -rf /var/lib/apt/lists/*
 
-RUN apt-get update && apt-get install -y --no-install-recommends build-essential libicu-dev python3-pip python3-venv pkg-config && rm -rf /var/lib/apt/lists/*
-ENV VIRTUAL_ENV=/opt/venv
-RUN python3 -m venv $VIRTUAL_ENV
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
-RUN pip install -e ".[full]" && pip cache purge
+# Copy repository to $SRC/pythainlp
+COPY . $SRC/pythainlp
+
+# Set working directory
+WORKDIR $SRC/pythainlp
+
+# Install pythainlp in development mode with minimal dependencies
+# This installs the package without heavy ML dependencies to speed up builds
+RUN pip install --no-cache-dir -e .
+
+# Copy build script
+COPY fuzz/build.sh $SRC/
diff --git a/Dockerfile.app b/Dockerfile.app
new file mode 100644
index 000000000..7cbf8c3cb
--- /dev/null
+++ b/Dockerfile.app
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+FROM python:3.12
+
+WORKDIR /app
+
+COPY . .
+
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential libicu-dev python3-pip python3-venv pkg-config && rm -rf /var/lib/apt/lists/*
+ENV VIRTUAL_ENV=/opt/venv
+RUN python3 -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+RUN pip install -e ".[full]" && pip cache purge
diff --git a/docker-compose.yml b/docker-compose.yml
index 200512fa5..f9d4c3016 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,6 +1,8 @@
 services:
   pythainlp:
-    build: .
+    build:
+      context: .
+      dockerfile: Dockerfile.app
     image: pythainlp:latest
     volumes:
       - .:/workspace
diff --git a/fuzz/README.md b/fuzz/README.md
new file mode 100644
index 000000000..126f1bc0f
--- /dev/null
+++ b/fuzz/README.md
@@ -0,0 +1,154 @@
+# PyThaiNLP Fuzz Testing
+
+This directory contains fuzz testing infrastructure using [ClusterFuzzLite](https://google.github.io/clusterfuzzlite/) and [Atheris](https://github.com/google/atheris).
+
+## Overview
+
+Fuzz testing helps discover edge cases, crashes, and potential security vulnerabilities by feeding random inputs to functions. This setup uses:
+
+- **ClusterFuzzLite**: Google's continuous fuzzing solution for GitHub projects
+- **Atheris**: Coverage-guided Python fuzzing engine
+- **AddressSanitizer**: Memory safety checks
+
+## Directory Structure
+
+```
+fuzz/
+├── build.sh                   # Build script for compiling fuzzers
+├── fuzz_tokenize.py           # Fuzzer for word_tokenize()
+├── fuzz_util_normalize.py     # Fuzzer for normalize()
+└── README.md                  # This file
+```
+
+## Current Fuzzing Targets
+
+### 1. `fuzz_tokenize.py`
+Tests `pythainlp.tokenize.word_tokenize()` with random Unicode input to ensure:
+- No crashes on malformed input
+- Proper handling of edge cases
+- Memory safety
+
+### 2. `fuzz_util_normalize.py`
+Tests `pythainlp.util.normalize()` with random Unicode input to ensure:
+- No crashes on malformed input
+- Proper string normalization
+- Type safety
+
+## Local Testing
+
+To test fuzzers locally:
+
+```bash
+# Install atheris
+pip install atheris
+
+# Run a specific fuzzer for 60 seconds
+python fuzz/fuzz_tokenize.py -max_total_time=60
+
+# Run with specific corpus directory
+python fuzz/fuzz_tokenize.py corpus_dir/ -max_total_time=60
+```
+
+## CI/CD Integration
+
+Fuzzing runs automatically via GitHub Actions:
+- On pull requests to `dev` branch (focuses on code changes)
+- On push to `dev` branch
+- Daily at 06:00 UTC (full fuzzing run)
+
+Configuration: `.github/workflows/clusterfuzzlite.yml`
+
+## Adding New Fuzzers
+
+To add a new fuzzing target:
+
+1. Create a new file `fuzz/fuzz_<module_name>.py`:
+
+```python
+# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileType: SOURCE
+"""Fuzzing harness for pythainlp.<module>.<function>()"""
+
+import sys
+import atheris
+import pythainlp.<module>
+
+
+def TestOneInput(data: bytes) -> None:
+    """Fuzz target for <function>."""
+    fdp = atheris.FuzzedDataProvider(data)
+    
+    try:
+        # Generate test input
+        text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes())
+        
+        # Call target function
+        result = pythainlp.<module>.<function>(text)
+        
+        # Validate output
+        assert isinstance(result, <expected_type>)
+        
+    except (ValueError, TypeError, UnicodeDecodeError):
+        # Expected exceptions
+        pass
+
+
+def main() -> None:
+    """Entry point for the fuzzer."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
+```
+
+2. The fuzzer will be automatically discovered and built by `build.sh`
+
+3. No changes needed to GitHub Actions workflow
+
+## Expansion Plan
+
+Future fuzzing targets to consider:
+
+### High Priority
+- **spell/** - Spelling correction functions
+- **soundex/** - Phonetic encoding functions
+- **transliterate/** - Romanization functions
+
+### Medium Priority
+- **corpus/** - Data loading and corpus functions
+- **tag/** - Part-of-speech tagging
+- **parse/** - Parsing functions
+
+### Low Priority
+- **classify/** - Classification functions
+- **generate/** - Text generation functions
+- **summarize/** - Summarization functions
+
+## Troubleshooting
+
+### Fuzzer Crashes
+If a fuzzer finds a crash:
+1. Check the GitHub Actions artifacts for crash reports
+2. Reproduce locally: `python fuzz/fuzz_<name>.py <crash_file>`
+3. Fix the underlying issue in the target function
+4. Re-run fuzzer to verify fix
+
+### Performance Issues
+- Adjust fuzzing time in `.github/workflows/clusterfuzzlite.yml`
+- Default is 300 seconds (5 minutes) per fuzzer
+- For longer sessions, increase the value
+
+### False Positives
+- Update the exception handling in the fuzzer
+- Add expected exceptions to the `except` block
+- Document the reasoning in comments
+
+## Resources
+
+- [ClusterFuzzLite Documentation](https://google.github.io/clusterfuzzlite/)
+- [Atheris Documentation](https://github.com/google/atheris)
+- [OSS-Fuzz](https://github.com/google/oss-fuzz)
+- [libFuzzer Tutorial](https://github.com/google/fuzzing/blob/master/tutorial/libFuzzerTutorial.md)
diff --git a/fuzz/build.sh b/fuzz/build.sh
new file mode 100755
index 000000000..485b9069f
--- /dev/null
+++ b/fuzz/build.sh
@@ -0,0 +1,28 @@
+#!/bin/bash -eu
+# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileType: SOURCE
+
+# Build script for ClusterFuzzLite fuzzing harnesses
+# This script installs atheris and compiles all fuzzing harnesses
+
+echo "Building PyThaiNLP fuzz targets..."
+
+# Install atheris for Python fuzzing
+pip install atheris
+
+# Find all fuzz_*.py files in the fuzz directory
+for fuzzer in "${SRC}/pythainlp/fuzz"/fuzz_*.py; do
+    fuzzer_basename=$(basename -s .py "$fuzzer")
+    fuzzer_package="fuzz.${fuzzer_basename}"
+
+    echo "Compiling ${fuzzer_basename}..."
+
+    # Compile fuzzer with atheris
+    python -m atheris.instrument_libfuzzer "${fuzzer}" "${OUT}/${fuzzer_basename}"
+
+    # Make fuzzer executable
+    chmod +x "${OUT}/${fuzzer_basename}"
+done
+
+echo "Build completed successfully!"
diff --git a/fuzz/fuzz_tokenize.py b/fuzz/fuzz_tokenize.py
new file mode 100644
index 000000000..b2f46d7d5
--- /dev/null
+++ b/fuzz/fuzz_tokenize.py
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileType: SOURCE
+"""Fuzzing harness for pythainlp.tokenize.word_tokenize()
+
+This fuzzer tests the word_tokenize function with random Unicode input
+to discover edge cases, crashes, and potential security issues.
+"""
+
+import sys
+
+import atheris
+
+import pythainlp.tokenize
+
+
+def TestOneInput(data: bytes) -> None:
+    """Fuzz target for word_tokenize.
+
+    :param bytes data: Random input bytes from the fuzzer
+    """
+    fdp = atheris.FuzzedDataProvider(data)
+
+    try:
+        # Generate random Unicode string
+        text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes())
+
+        # Test word_tokenize with default engine
+        result = pythainlp.tokenize.word_tokenize(text)
+
+        # Validate output type
+        assert isinstance(result, list), f"Expected list, got {type(result)}"
+        assert all(isinstance(token, str) for token in result), \
+            "All tokens should be strings"
+
+    except (ValueError, TypeError, UnicodeDecodeError):
+        # Expected exceptions - these are acceptable
+        pass
+    except Exception:
+        # Unexpected exceptions - re-raise for investigation
+        raise
+
+
+def main() -> None:
+    """Entry point for the fuzzer."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/fuzz/fuzz_util_normalize.py b/fuzz/fuzz_util_normalize.py
new file mode 100644
index 000000000..ac16c0d27
--- /dev/null
+++ b/fuzz/fuzz_util_normalize.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileType: SOURCE
+"""Fuzzing harness for pythainlp.util.normalize()
+
+This fuzzer tests the normalize function with random Unicode input
+to discover edge cases, crashes, and potential security issues.
+"""
+
+import sys
+
+import atheris
+
+import pythainlp.util
+
+
+def TestOneInput(data: bytes) -> None:
+    """Fuzz target for normalize.
+
+    :param bytes data: Random input bytes from the fuzzer
+    """
+    fdp = atheris.FuzzedDataProvider(data)
+
+    try:
+        # Generate random Unicode string
+        text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes())
+
+        # Test normalize
+        result = pythainlp.util.normalize(text)
+
+        # Validate output type
+        assert isinstance(result, str), f"Expected str, got {type(result)}"
+
+    except (ValueError, TypeError, UnicodeDecodeError):
+        # Expected exceptions - these are acceptable
+        pass
+    except Exception:
+        # Unexpected exceptions - re-raise for investigation
+        raise
+
+
+def main() -> None:
+    """Entry point for the fuzzer."""
+    atheris.Setup(sys.argv, TestOneInput)
+    atheris.Fuzz()
+
+
+if __name__ == "__main__":
+    main()

From 9f9dce4f18507a878cd7b7aeb91a7e15eae22dee Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Feb 2026 00:49:11 +0000
Subject: [PATCH 3/7] Fix markdown line length in fuzz/README.md

Co-authored-by: bact <128572+bact@users.noreply.github.com>
---
 fuzz/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fuzz/README.md b/fuzz/README.md
index 126f1bc0f..77c73bb07 100644
--- a/fuzz/README.md
+++ b/fuzz/README.md
@@ -1,6 +1,7 @@
 # PyThaiNLP Fuzz Testing
 
-This directory contains fuzz testing infrastructure using [ClusterFuzzLite](https://google.github.io/clusterfuzzlite/) and [Atheris](https://github.com/google/atheris).
+This directory contains fuzz testing infrastructure using
+[ClusterFuzzLite](https://google.github.io/clusterfuzzlite/) and [Atheris](https://github.com/google/atheris).
 
 ## Overview
 

From 604792e4ba2ecf336206ff1cd2560cfc0c4a9c19 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Feb 2026 01:18:24 +0000
Subject: [PATCH 4/7] Move fuzzing Dockerfile to fuzz/ directory for clarity

Co-authored-by: bact <128572+bact@users.noreply.github.com>
---
 .github/workflows/clusterfuzzlite.yml |  1 +
 Dockerfile                            | 32 ++++++++-------------------
 Dockerfile.app                        | 15 -------------
 docker-compose.yml                    |  4 +---
 fuzz/Dockerfile                       | 29 ++++++++++++++++++++++++
 fuzz/README.md                        |  1 +
 6 files changed, 41 insertions(+), 41 deletions(-)
 delete mode 100644 Dockerfile.app
 create mode 100644 fuzz/Dockerfile

diff --git a/.github/workflows/clusterfuzzlite.yml b/.github/workflows/clusterfuzzlite.yml
index 949a18544..4aac63141 100644
--- a/.github/workflows/clusterfuzzlite.yml
+++ b/.github/workflows/clusterfuzzlite.yml
@@ -61,6 +61,7 @@ jobs:
         with:
           sanitizer: ${{ matrix.sanitizer }}
           language: python
+          dockerfile-path: fuzz/Dockerfile
 
       - name: Run Fuzzers (${{ matrix.sanitizer }})
         id: run
diff --git a/Dockerfile b/Dockerfile
index 7368893fe..7cbf8c3cb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,29 +1,15 @@
-# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
+# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
 # SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0
 
-# Dockerfile for ClusterFuzzLite fuzzing
-# This extends the OSS-Fuzz base builder image for Python projects
+FROM python:3.12
 
-FROM gcr.io/oss-fuzz-base/base-builder-python
+WORKDIR /app
 
-# Install system dependencies
-RUN apt-get update && \
-    apt-get install -y --no-install-recommends \
-    build-essential \
-    libicu-dev \
-    pkg-config && \
-    rm -rf /var/lib/apt/lists/*
+COPY . .
 
-# Copy repository to $SRC/pythainlp
-COPY . $SRC/pythainlp
-
-# Set working directory
-WORKDIR $SRC/pythainlp
-
-# Install pythainlp in development mode with minimal dependencies
-# This installs the package without heavy ML dependencies to speed up builds
-RUN pip install --no-cache-dir -e .
-
-# Copy build script
-COPY fuzz/build.sh $SRC/
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential libicu-dev python3-pip python3-venv pkg-config && rm -rf /var/lib/apt/lists/*
+ENV VIRTUAL_ENV=/opt/venv
+RUN python3 -m venv $VIRTUAL_ENV
+ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+RUN pip install -e ".[full]" && pip cache purge
diff --git a/Dockerfile.app b/Dockerfile.app
deleted file mode 100644
index 7cbf8c3cb..000000000
--- a/Dockerfile.app
+++ /dev/null
@@ -1,15 +0,0 @@
-# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project
-# SPDX-FileType: SOURCE
-# SPDX-License-Identifier: Apache-2.0
-
-FROM python:3.12
-
-WORKDIR /app
-
-COPY . .
-
-RUN apt-get update && apt-get install -y --no-install-recommends build-essential libicu-dev python3-pip python3-venv pkg-config && rm -rf /var/lib/apt/lists/*
-ENV VIRTUAL_ENV=/opt/venv
-RUN python3 -m venv $VIRTUAL_ENV
-ENV PATH="$VIRTUAL_ENV/bin:$PATH"
-RUN pip install -e ".[full]" && pip cache purge
diff --git a/docker-compose.yml b/docker-compose.yml
index f9d4c3016..200512fa5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,8 +1,6 @@
 services:
   pythainlp:
-    build:
-      context: .
-      dockerfile: Dockerfile.app
+    build: .
     image: pythainlp:latest
     volumes:
       - .:/workspace
diff --git a/fuzz/Dockerfile b/fuzz/Dockerfile
new file mode 100644
index 000000000..7368893fe
--- /dev/null
+++ b/fuzz/Dockerfile
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: 2026 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+
+# Dockerfile for ClusterFuzzLite fuzzing
+# This extends the OSS-Fuzz base builder image for Python projects
+
+FROM gcr.io/oss-fuzz-base/base-builder-python
+
+# Install system dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    libicu-dev \
+    pkg-config && \
+    rm -rf /var/lib/apt/lists/*
+
+# Copy repository to $SRC/pythainlp
+COPY . $SRC/pythainlp
+
+# Set working directory
+WORKDIR $SRC/pythainlp
+
+# Install pythainlp in development mode with minimal dependencies
+# This installs the package without heavy ML dependencies to speed up builds
+RUN pip install --no-cache-dir -e .
+
+# Copy build script
+COPY fuzz/build.sh $SRC/
diff --git a/fuzz/README.md b/fuzz/README.md
index 77c73bb07..20ee85c80 100644
--- a/fuzz/README.md
+++ b/fuzz/README.md
@@ -15,6 +15,7 @@ Fuzz testing helps discover edge cases, crashes, and potential security vulnerab
 
 ```
 fuzz/
+├── Dockerfile                 # Docker image for ClusterFuzzLite fuzzing
 ├── build.sh                   # Build script for compiling fuzzers
 ├── fuzz_tokenize.py           # Fuzzer for word_tokenize()
 ├── fuzz_util_normalize.py     # Fuzzer for normalize()

From 499966b830fca6bb3a04325654adacdb7a82c7e5 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Feb 2026 02:44:49 +0000
Subject: [PATCH 5/7] Apply code review suggestions from Copilot PR reviewer

Co-authored-by: bact <128572+bact@users.noreply.github.com>
---
 .github/workflows/clusterfuzzlite.yml |  6 ++----
 fuzz/Dockerfile                       |  4 ++--
 fuzz/README.md                        |  2 +-
 fuzz/build.sh                         | 13 ++++++-------
 fuzz/fuzz_tokenize.py                 | 11 +++++------
 fuzz/fuzz_util_normalize.py           |  7 +++----
 6 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/clusterfuzzlite.yml b/.github/workflows/clusterfuzzlite.yml
index 4aac63141..d91d8911b 100644
--- a/.github/workflows/clusterfuzzlite.yml
+++ b/.github/workflows/clusterfuzzlite.yml
@@ -13,7 +13,6 @@ on:
       - '**.md'
       - '**.rst'
       - '**.txt'
-      - '**.yml'
       - 'docs/**'
   pull_request:
     branches:
@@ -24,7 +23,6 @@ on:
       - '**.md'
       - '**.rst'
       - '**.txt'
-      - '**.yml'
       - 'docs/**'
   schedule:
     - cron: '0 6 * * *'  # Daily at 06:00 UTC
@@ -44,7 +42,7 @@ concurrency:
   cancel-in-progress: true
 
 permissions:
-  contents: read
+  contents: write
   issues: write
 
 jobs:
@@ -69,7 +67,7 @@ jobs:
         with:
           github-token: ${{ secrets.GITHUB_TOKEN }}
           fuzz-seconds: 300
-          mode: 'code-change'
+          mode: ${{ github.event_name == 'pull_request' && 'code-change' || 'batch' }}
           sanitizer: ${{ matrix.sanitizer }}
           storage-repo: https://${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git
           storage-repo-branch: gh-pages
diff --git a/fuzz/Dockerfile b/fuzz/Dockerfile
index 7368893fe..b59069fc0 100644
--- a/fuzz/Dockerfile
+++ b/fuzz/Dockerfile
@@ -1,6 +1,6 @@
 # SPDX-FileCopyrightText: 2026 PyThaiNLP Project
-# SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileType: SOURCE
 
 # Dockerfile for ClusterFuzzLite fuzzing
 # This extends the OSS-Fuzz base builder image for Python projects
@@ -25,5 +25,5 @@ WORKDIR $SRC/pythainlp
 # This installs the package without heavy ML dependencies to speed up builds
 RUN pip install --no-cache-dir -e .
 
-# Copy build script
+# Copy build script to $SRC/build.sh as expected by OSS-Fuzz/ClusterFuzzLite
 COPY fuzz/build.sh $SRC/
diff --git a/fuzz/README.md b/fuzz/README.md
index 20ee85c80..9d8dc3fbe 100644
--- a/fuzz/README.md
+++ b/fuzz/README.md
@@ -106,7 +106,7 @@ if __name__ == "__main__":
     main()
 ```
 
-2. The fuzzer will be automatically discovered and built by `build.sh`
+2. Ensure the new fuzzer file name follows the ``fuzz_*.py`` pattern so it can be discovered by ``build.sh``, and run ``bash fuzz/build.sh`` locally to verify that your fuzzer is picked up and built.
 
 3. No changes needed to GitHub Actions workflow
 
diff --git a/fuzz/build.sh b/fuzz/build.sh
index 485b9069f..d45f4ed46 100755
--- a/fuzz/build.sh
+++ b/fuzz/build.sh
@@ -4,22 +4,21 @@
 # SPDX-FileType: SOURCE
 
 # Build script for ClusterFuzzLite fuzzing harnesses
-# This script installs atheris and compiles all fuzzing harnesses
+# This script installs atheris and prepares all fuzzing harnesses
 
 echo "Building PyThaiNLP fuzz targets..."
 
-# Install atheris for Python fuzzing
-pip install atheris
+# Install atheris for Python fuzzing with version constraint
+pip install "atheris>=2.3.0"
 
 # Find all fuzz_*.py files in the fuzz directory
 for fuzzer in "${SRC}/pythainlp/fuzz"/fuzz_*.py; do
     fuzzer_basename=$(basename -s .py "$fuzzer")
-    fuzzer_package="fuzz.${fuzzer_basename}"
 
-    echo "Compiling ${fuzzer_basename}..."
+    echo "Preparing ${fuzzer_basename}..."
 
-    # Compile fuzzer with atheris
-    python -m atheris.instrument_libfuzzer "${fuzzer}" "${OUT}/${fuzzer_basename}"
+    # Copy fuzzer to output directory (instrumentation happens at runtime)
+    cp "${fuzzer}" "${OUT}/${fuzzer_basename}"
 
     # Make fuzzer executable
     chmod +x "${OUT}/${fuzzer_basename}"
diff --git a/fuzz/fuzz_tokenize.py b/fuzz/fuzz_tokenize.py
index b2f46d7d5..c07e0fc64 100644
--- a/fuzz/fuzz_tokenize.py
+++ b/fuzz/fuzz_tokenize.py
@@ -18,6 +18,7 @@ def TestOneInput(data: bytes) -> None:
     """Fuzz target for word_tokenize.
 
     :param bytes data: Random input bytes from the fuzzer
+    :rtype: None
     """
     fdp = atheris.FuzzedDataProvider(data)
 
@@ -29,16 +30,14 @@ def TestOneInput(data: bytes) -> None:
         result = pythainlp.tokenize.word_tokenize(text)
 
         # Validate output type
-        assert isinstance(result, list), f"Expected list, got {type(result)}"
-        assert all(isinstance(token, str) for token in result), \
-            "All tokens should be strings"
+        if not isinstance(result, list):
+            raise TypeError(f"Expected list, got {type(result)}")
+        if not all(isinstance(token, str) for token in result):
+            raise TypeError("All tokens should be strings")
 
     except (ValueError, TypeError, UnicodeDecodeError):
         # Expected exceptions - these are acceptable
         pass
-    except Exception:
-        # Unexpected exceptions - re-raise for investigation
-        raise
 
 
 def main() -> None:
diff --git a/fuzz/fuzz_util_normalize.py b/fuzz/fuzz_util_normalize.py
index ac16c0d27..c8365c118 100644
--- a/fuzz/fuzz_util_normalize.py
+++ b/fuzz/fuzz_util_normalize.py
@@ -18,6 +18,7 @@ def TestOneInput(data: bytes) -> None:
     """Fuzz target for normalize.
 
     :param bytes data: Random input bytes from the fuzzer
+    :rtype: None
     """
     fdp = atheris.FuzzedDataProvider(data)
 
@@ -29,14 +30,12 @@ def TestOneInput(data: bytes) -> None:
         result = pythainlp.util.normalize(text)
 
         # Validate output type
-        assert isinstance(result, str), f"Expected str, got {type(result)}"
+        if not isinstance(result, str):
+            raise TypeError(f"Expected str, got {type(result)}")
 
     except (ValueError, TypeError, UnicodeDecodeError):
         # Expected exceptions - these are acceptable
         pass
-    except Exception:
-        # Unexpected exceptions - re-raise for investigation
-        raise
 
 
 def main() -> None:

From 7f258cebce48af662d22a37e94be512b437c5909 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Feb 2026 11:40:31 +0000
Subject: [PATCH 6/7] Apply second round of code review suggestions

- Fix SPDX header order (FileCopyrightText, FileType, License-Identifier)
- Add :rtype: None to main() docstrings for complete type annotations
- Replace assert with explicit TypeError in README example
- Add nullglob guard to build.sh for loop
- Pin atheris to exact version (==2.3.0) for security

Co-authored-by: bact <128572+bact@users.noreply.github.com>
---
 fuzz/README.md              | 3 ++-
 fuzz/build.sh               | 7 ++++---
 fuzz/fuzz_tokenize.py       | 7 +++++--
 fuzz/fuzz_util_normalize.py | 7 +++++--
 4 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/fuzz/README.md b/fuzz/README.md
index 9d8dc3fbe..82b97ffba 100644
--- a/fuzz/README.md
+++ b/fuzz/README.md
@@ -89,7 +89,8 @@ def TestOneInput(data: bytes) -> None:
         result = pythainlp.<module>.<function>(text)
         
         # Validate output
-        assert isinstance(result, <expected_type>)
+        if not isinstance(result, <expected_type>):
+            raise TypeError(f"Expected <expected_type>, got {type(result)}")
         
     except (ValueError, TypeError, UnicodeDecodeError):
         # Expected exceptions
diff --git a/fuzz/build.sh b/fuzz/build.sh
index d45f4ed46..f1e15544f 100755
--- a/fuzz/build.sh
+++ b/fuzz/build.sh
@@ -1,18 +1,19 @@
 #!/bin/bash -eu
 # SPDX-FileCopyrightText: 2026 PyThaiNLP Project
-# SPDX-License-Identifier: Apache-2.0
 # SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
 
 # Build script for ClusterFuzzLite fuzzing harnesses
 # This script installs atheris and prepares all fuzzing harnesses
 
 echo "Building PyThaiNLP fuzz targets..."
 
-# Install atheris for Python fuzzing with version constraint
-pip install "atheris>=2.3.0"
+# Install atheris for Python fuzzing with pinned version for security
+pip install "atheris==2.3.0"
 
 # Find all fuzz_*.py files in the fuzz directory
 for fuzzer in "${SRC}/pythainlp/fuzz"/fuzz_*.py; do
+    [[ -e "$fuzzer" ]] || continue
     fuzzer_basename=$(basename -s .py "$fuzzer")
 
     echo "Preparing ${fuzzer_basename}..."
diff --git a/fuzz/fuzz_tokenize.py b/fuzz/fuzz_tokenize.py
index c07e0fc64..57555955d 100644
--- a/fuzz/fuzz_tokenize.py
+++ b/fuzz/fuzz_tokenize.py
@@ -1,6 +1,6 @@
 # SPDX-FileCopyrightText: 2026 PyThaiNLP Project
-# SPDX-License-Identifier: Apache-2.0
 # SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
 """Fuzzing harness for pythainlp.tokenize.word_tokenize()
 
 This fuzzer tests the word_tokenize function with random Unicode input
@@ -41,7 +41,10 @@ def TestOneInput(data: bytes) -> None:
 
 
 def main() -> None:
-    """Entry point for the fuzzer."""
+    """Entry point for the fuzzer.
+    
+    :rtype: None
+    """
     atheris.Setup(sys.argv, TestOneInput)
     atheris.Fuzz()
 
diff --git a/fuzz/fuzz_util_normalize.py b/fuzz/fuzz_util_normalize.py
index c8365c118..a1d486eb6 100644
--- a/fuzz/fuzz_util_normalize.py
+++ b/fuzz/fuzz_util_normalize.py
@@ -1,6 +1,6 @@
 # SPDX-FileCopyrightText: 2026 PyThaiNLP Project
-# SPDX-License-Identifier: Apache-2.0
 # SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
 """Fuzzing harness for pythainlp.util.normalize()
 
 This fuzzer tests the normalize function with random Unicode input
@@ -39,7 +39,10 @@ def TestOneInput(data: bytes) -> None:
 
 
 def main() -> None:
-    """Entry point for the fuzzer."""
+    """Entry point for the fuzzer.
+    
+    :rtype: None
+    """
     atheris.Setup(sys.argv, TestOneInput)
     atheris.Fuzz()
 

From c7cc3561ccb57bd8d1935ccf04820baaf10127eb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Thu, 5 Feb 2026 13:11:59 +0000
Subject: [PATCH 7/7] Add corpus storage best practices and fuzzing artifact
 exclusions

- Add comprehensive "Corpus Storage Best Practices" section to fuzz/README.md
  - Minimize and de-duplicate guidelines with libFuzzer commands
  - Data sanitization guidelines (no sensitive/production data)
  - Dedicated storage explanation (gh-pages branch usage)
  - Crash monitoring and handling procedures
  - Security considerations for public/private repos
  - Corpus management command reference
- Update .gitignore to exclude fuzzing artifacts
  - Local corpus directories (fuzz/corpus/, fuzz/crashes/, fuzz/artifacts/)
  - Fuzzer output files (crash-*, leak-*, timeout-*, oom-*)
  - Profiling data (*.profraw, *.profdata)

Co-authored-by: bact <128572+bact@users.noreply.github.com>
---
 .gitignore     |  11 ++++
 fuzz/README.md | 139 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index ba6490da6..0df3bf34d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -129,3 +129,14 @@ logs/
 # Temp files
 *.tmp
 *.temp
+
+# Fuzzing artifacts
+fuzz/corpus/
+fuzz/crashes/
+fuzz/artifacts/
+fuzz/*.profraw
+fuzz/*.profdata
+crash-*
+leak-*
+timeout-*
+oom-*
diff --git a/fuzz/README.md b/fuzz/README.md
index 82b97ffba..1b9b67513 100644
--- a/fuzz/README.md
+++ b/fuzz/README.md
@@ -107,7 +107,9 @@ if __name__ == "__main__":
     main()
 ```
 
-2. Ensure the new fuzzer file name follows the ``fuzz_*.py`` pattern so it can be discovered by ``build.sh``, and run ``bash fuzz/build.sh`` locally to verify that your fuzzer is picked up and built.
+2. Ensure the new fuzzer file name follows the ``fuzz_*.py`` pattern so it can be discovered by
+   ``build.sh``, and run ``bash fuzz/build.sh`` locally to verify that your fuzzer is picked up
+   and built.
 
 3. No changes needed to GitHub Actions workflow
 
@@ -155,3 +157,138 @@ If a fuzzer finds a crash:
 - [Atheris Documentation](https://github.com/google/atheris)
 - [OSS-Fuzz](https://github.com/google/oss-fuzz)
 - [libFuzzer Tutorial](https://github.com/google/fuzzing/blob/master/tutorial/libFuzzerTutorial.md)
+
+## Corpus Storage Best Practices
+
+The fuzzing corpus (test inputs that trigger interesting code paths) is automatically managed by
+ClusterFuzzLite and stored in the `gh-pages` branch. However, if you need to manually manage corpus
+data, follow these best practices:
+
+### 1. Minimize and De-duplicate
+
+Keep only the smallest, most unique set of inputs:
+
+```bash
+# Use libFuzzer's merge feature to minimize corpus
+python fuzz/fuzz_tokenize.py -merge=1 minimized_corpus/ original_corpus/
+
+# This keeps only inputs that trigger unique code coverage
+```
+
+The `-merge=1` flag tells libFuzzer to:
+- Remove duplicate inputs that cover the same code paths
+- Keep the smallest input for each unique coverage pattern
+- Output the minimized corpus to the first directory
+
+### 2. Sanitize the Data
+
+**Never use sensitive production data for fuzzing:**
+
+- ✅ Use synthetic test data
+- ✅ Use publicly available sample data
+- ✅ Generate random valid inputs
+- ❌ Do not use real user data
+- ❌ Do not use data containing secrets, passwords, or API keys
+- ❌ Do not use data with personally identifiable information (PII)
+
+**Before committing any corpus:**
+```bash
+# Review corpus files for sensitive data
+find corpus/ -type f -exec head -n 5 {} \;
+
+# Check for common patterns
+grep -r "password\|api_key\|secret\|token" corpus/
+```
+
+### 3. Use Dedicated Storage
+
+**ClusterFuzzLite automatically stores corpus in `gh-pages` branch**, which is separate from the main codebase. This is the recommended approach.
+
+**If storing corpus locally or in version control:**
+- ❌ Do NOT add corpus to the main branch with `git add fuzz/corpus/`
+- ✅ Use a dedicated branch (e.g., `fuzzing-data` or `gh-pages`)
+- ✅ Use GitHub Actions artifacts (already configured)
+- ✅ Use external storage (S3, GCS) for large corpora
+
+**Note:** The `.gitignore` is configured to exclude local corpus artifacts:
+- `fuzz/corpus/` - Corpus files
+- `fuzz/crashes/` - Crash-triggering inputs
+- `fuzz/artifacts/` - Build artifacts
+- `crash-*`, `leak-*`, `timeout-*`, `oom-*` - Fuzzer output files
+
+### 4. Monitor for Crashes
+
+**Never commit a crash-triggering input without fixing the bug first.**
+
+**When a crash is found:**
+
+1. **Reproduce the crash locally:**
+   ```bash
+   # ClusterFuzzLite saves crashes in artifacts
+   python fuzz/fuzz_tokenize.py crash-file
+   ```
+
+2. **Debug and fix the underlying bug:**
+   - Identify the root cause in the target function
+   - Write a unit test that reproduces the issue
+   - Fix the bug in the codebase
+
+3. **Verify the fix:**
+   ```bash
+   # Re-run the fuzzer with the crash input
+   python fuzz/fuzz_tokenize.py crash-file
+   # Should not crash after fix
+   ```
+
+4. **Add as regression test:**
+   ```python
+   # In tests/test_tokenize.py
+   def test_crash_regression_issue_1234():
+       """Regression test for crash found by fuzzer."""
+       # Use the crash-triggering input as a test case
+       result = word_tokenize("...")
+       assert isinstance(result, list)
+   ```
+
+5. **Only then add to corpus:**
+   ```bash
+   # After bug is fixed, add input to corpus for future testing
+   cp crash-file fuzz/corpus/tokenize/
+   ```
+
+### Security Considerations
+
+**Corpus storage in public gh-pages branch is safe for open-source projects:**
+- ✅ Corpus contains only test inputs (strings, bytes)
+- ✅ Does not contain code execution artifacts
+- ✅ Follows standard OSS fuzzing practices (OSS-Fuzz, ClusterFuzzLite)
+
+**Crash artifacts have limited exposure:**
+- Uploaded as GitHub Actions artifacts (not to gh-pages)
+- Have configurable retention period (default: 90 days)
+- Only accessible to repository collaborators
+
+**For private repositories with sensitive concerns:**
+- Consider using a private storage-repo-branch
+- Or disable corpus persistence by removing `storage-repo` parameters from workflow
+- Fuzzing will still work, just won't persist corpus between runs
+
+### Corpus Management Commands
+
+```bash
+# View corpus statistics
+python fuzz/fuzz_tokenize.py corpus/ -runs=0
+
+# Minimize corpus (keep unique inputs only)
+python fuzz/fuzz_tokenize.py -merge=1 minimized/ corpus/
+
+# Find minimum reproducer for a crash
+python fuzz/fuzz_tokenize.py -minimize_crash=1 crash-file
+
+# Run fuzzer with existing corpus
+python fuzz/fuzz_tokenize.py corpus/ -max_total_time=60
+
+# Check corpus coverage
+python fuzz/fuzz_tokenize.py corpus/ -runs=0 -print_coverage=1
+```
+