From e31b06f126047b20fd4516f3a9d1aeabe09683cc Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 00:45:09 +0000 Subject: [PATCH 1/7] Initial plan From 5067ec8ce21d8951a18b35a479add509305246b5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 00:48:14 +0000 Subject: [PATCH 2/7] Add ClusterFuzzLite fuzzing infrastructure Co-authored-by: bact <128572+bact@users.noreply.github.com> --- .github/workflows/clusterfuzzlite.yml | 82 ++++++++++++++ Dockerfile | 32 ++++-- Dockerfile.app | 15 +++ docker-compose.yml | 4 +- fuzz/README.md | 154 ++++++++++++++++++++++++++ fuzz/build.sh | 28 +++++ fuzz/fuzz_tokenize.py | 51 +++++++++ fuzz/fuzz_util_normalize.py | 49 ++++++++ 8 files changed, 405 insertions(+), 10 deletions(-) create mode 100644 .github/workflows/clusterfuzzlite.yml create mode 100644 Dockerfile.app create mode 100644 fuzz/README.md create mode 100755 fuzz/build.sh create mode 100644 fuzz/fuzz_tokenize.py create mode 100644 fuzz/fuzz_util_normalize.py diff --git a/.github/workflows/clusterfuzzlite.yml b/.github/workflows/clusterfuzzlite.yml new file mode 100644 index 000000000..949a18544 --- /dev/null +++ b/.github/workflows/clusterfuzzlite.yml @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 + +name: ClusterFuzzLite + +on: + push: + branches: + - dev + paths-ignore: + - '**.cff' + - '**.json' + - '**.md' + - '**.rst' + - '**.txt' + - '**.yml' + - 'docs/**' + pull_request: + branches: + - dev + paths-ignore: + - '**.cff' + - '**.json' + - '**.md' + - '**.rst' + - '**.txt' + - '**.yml' + - 'docs/**' + schedule: + - cron: '0 6 * * *' # Daily at 06:00 UTC + +# Avoid duplicate runs for the same source branch and repository. +# For pull_request events, uses the source repo name from +# github.event.pull_request.head.repo.full_name; otherwise uses github.repository. +# For push events, uses the branch name from github.ref_name. +# For pull_request events, uses the source branch name from github.head_ref. +# This ensures events for the same repo and branch share the same group, +# and avoids cross-fork collisions when branch names are reused. +concurrency: + group: >- + ${{ github.workflow }}-${{ + github.event.pull_request.head.repo.full_name || github.repository + }}-${{ github.head_ref || github.ref_name }} + cancel-in-progress: true + +permissions: + contents: read + issues: write + +jobs: + fuzzing: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + sanitizer: [address] + steps: + - name: Build Fuzzers (${{ matrix.sanitizer }}) + id: build + uses: google/clusterfuzzlite/actions/build_fuzzers@v1 + with: + sanitizer: ${{ matrix.sanitizer }} + language: python + + - name: Run Fuzzers (${{ matrix.sanitizer }}) + id: run + uses: google/clusterfuzzlite/actions/run_fuzzers@v1 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + fuzz-seconds: 300 + mode: 'code-change' + sanitizer: ${{ matrix.sanitizer }} + storage-repo: https://${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git + storage-repo-branch: gh-pages + storage-repo-branch-coverage: gh-pages + + - name: Upload crash artifacts + if: failure() && steps.run.outcome == 'failure' + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.sanitizer }}-artifacts + path: ./out/artifacts diff --git a/Dockerfile b/Dockerfile index 7cbf8c3cb..7368893fe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,15 +1,29 @@ -# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 -FROM python:3.12 +# Dockerfile for ClusterFuzzLite fuzzing +# This extends the OSS-Fuzz base builder image for Python projects -WORKDIR /app +FROM gcr.io/oss-fuzz-base/base-builder-python -COPY . . +# Install system dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + libicu-dev \ + pkg-config && \ + rm -rf /var/lib/apt/lists/* -RUN apt-get update && apt-get install -y --no-install-recommends build-essential libicu-dev python3-pip python3-venv pkg-config && rm -rf /var/lib/apt/lists/* -ENV VIRTUAL_ENV=/opt/venv -RUN python3 -m venv $VIRTUAL_ENV -ENV PATH="$VIRTUAL_ENV/bin:$PATH" -RUN pip install -e ".[full]" && pip cache purge +# Copy repository to $SRC/pythainlp +COPY . $SRC/pythainlp + +# Set working directory +WORKDIR $SRC/pythainlp + +# Install pythainlp in development mode with minimal dependencies +# This installs the package without heavy ML dependencies to speed up builds +RUN pip install --no-cache-dir -e . + +# Copy build script +COPY fuzz/build.sh $SRC/ diff --git a/Dockerfile.app b/Dockerfile.app new file mode 100644 index 000000000..7cbf8c3cb --- /dev/null +++ b/Dockerfile.app @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 + +FROM python:3.12 + +WORKDIR /app + +COPY . . + +RUN apt-get update && apt-get install -y --no-install-recommends build-essential libicu-dev python3-pip python3-venv pkg-config && rm -rf /var/lib/apt/lists/* +ENV VIRTUAL_ENV=/opt/venv +RUN python3 -m venv $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +RUN pip install -e ".[full]" && pip cache purge diff --git a/docker-compose.yml b/docker-compose.yml index 200512fa5..f9d4c3016 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,8 @@ services: pythainlp: - build: . + build: + context: . + dockerfile: Dockerfile.app image: pythainlp:latest volumes: - .:/workspace diff --git a/fuzz/README.md b/fuzz/README.md new file mode 100644 index 000000000..126f1bc0f --- /dev/null +++ b/fuzz/README.md @@ -0,0 +1,154 @@ +# PyThaiNLP Fuzz Testing + +This directory contains fuzz testing infrastructure using [ClusterFuzzLite](https://google.github.io/clusterfuzzlite/) and [Atheris](https://github.com/google/atheris). + +## Overview + +Fuzz testing helps discover edge cases, crashes, and potential security vulnerabilities by feeding random inputs to functions. This setup uses: + +- **ClusterFuzzLite**: Google's continuous fuzzing solution for GitHub projects +- **Atheris**: Coverage-guided Python fuzzing engine +- **AddressSanitizer**: Memory safety checks + +## Directory Structure + +``` +fuzz/ +├── build.sh # Build script for compiling fuzzers +├── fuzz_tokenize.py # Fuzzer for word_tokenize() +├── fuzz_util_normalize.py # Fuzzer for normalize() +└── README.md # This file +``` + +## Current Fuzzing Targets + +### 1. `fuzz_tokenize.py` +Tests `pythainlp.tokenize.word_tokenize()` with random Unicode input to ensure: +- No crashes on malformed input +- Proper handling of edge cases +- Memory safety + +### 2. `fuzz_util_normalize.py` +Tests `pythainlp.util.normalize()` with random Unicode input to ensure: +- No crashes on malformed input +- Proper string normalization +- Type safety + +## Local Testing + +To test fuzzers locally: + +```bash +# Install atheris +pip install atheris + +# Run a specific fuzzer for 60 seconds +python fuzz/fuzz_tokenize.py -max_total_time=60 + +# Run with specific corpus directory +python fuzz/fuzz_tokenize.py corpus_dir/ -max_total_time=60 +``` + +## CI/CD Integration + +Fuzzing runs automatically via GitHub Actions: +- On pull requests to `dev` branch (focuses on code changes) +- On push to `dev` branch +- Daily at 06:00 UTC (full fuzzing run) + +Configuration: `.github/workflows/clusterfuzzlite.yml` + +## Adding New Fuzzers + +To add a new fuzzing target: + +1. Create a new file `fuzz/fuzz_.py`: + +```python +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileType: SOURCE +"""Fuzzing harness for pythainlp..()""" + +import sys +import atheris +import pythainlp. + + +def TestOneInput(data: bytes) -> None: + """Fuzz target for .""" + fdp = atheris.FuzzedDataProvider(data) + + try: + # Generate test input + text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes()) + + # Call target function + result = pythainlp..(text) + + # Validate output + assert isinstance(result, ) + + except (ValueError, TypeError, UnicodeDecodeError): + # Expected exceptions + pass + + +def main() -> None: + """Entry point for the fuzzer.""" + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() +``` + +2. The fuzzer will be automatically discovered and built by `build.sh` + +3. No changes needed to GitHub Actions workflow + +## Expansion Plan + +Future fuzzing targets to consider: + +### High Priority +- **spell/** - Spelling correction functions +- **soundex/** - Phonetic encoding functions +- **transliterate/** - Romanization functions + +### Medium Priority +- **corpus/** - Data loading and corpus functions +- **tag/** - Part-of-speech tagging +- **parse/** - Parsing functions + +### Low Priority +- **classify/** - Classification functions +- **generate/** - Text generation functions +- **summarize/** - Summarization functions + +## Troubleshooting + +### Fuzzer Crashes +If a fuzzer finds a crash: +1. Check the GitHub Actions artifacts for crash reports +2. Reproduce locally: `python fuzz/fuzz_.py ` +3. Fix the underlying issue in the target function +4. Re-run fuzzer to verify fix + +### Performance Issues +- Adjust fuzzing time in `.github/workflows/clusterfuzzlite.yml` +- Default is 300 seconds (5 minutes) per fuzzer +- For longer sessions, increase the value + +### False Positives +- Update the exception handling in the fuzzer +- Add expected exceptions to the `except` block +- Document the reasoning in comments + +## Resources + +- [ClusterFuzzLite Documentation](https://google.github.io/clusterfuzzlite/) +- [Atheris Documentation](https://github.com/google/atheris) +- [OSS-Fuzz](https://github.com/google/oss-fuzz) +- [libFuzzer Tutorial](https://github.com/google/fuzzing/blob/master/tutorial/libFuzzerTutorial.md) diff --git a/fuzz/build.sh b/fuzz/build.sh new file mode 100755 index 000000000..485b9069f --- /dev/null +++ b/fuzz/build.sh @@ -0,0 +1,28 @@ +#!/bin/bash -eu +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileType: SOURCE + +# Build script for ClusterFuzzLite fuzzing harnesses +# This script installs atheris and compiles all fuzzing harnesses + +echo "Building PyThaiNLP fuzz targets..." + +# Install atheris for Python fuzzing +pip install atheris + +# Find all fuzz_*.py files in the fuzz directory +for fuzzer in "${SRC}/pythainlp/fuzz"/fuzz_*.py; do + fuzzer_basename=$(basename -s .py "$fuzzer") + fuzzer_package="fuzz.${fuzzer_basename}" + + echo "Compiling ${fuzzer_basename}..." + + # Compile fuzzer with atheris + python -m atheris.instrument_libfuzzer "${fuzzer}" "${OUT}/${fuzzer_basename}" + + # Make fuzzer executable + chmod +x "${OUT}/${fuzzer_basename}" +done + +echo "Build completed successfully!" diff --git a/fuzz/fuzz_tokenize.py b/fuzz/fuzz_tokenize.py new file mode 100644 index 000000000..b2f46d7d5 --- /dev/null +++ b/fuzz/fuzz_tokenize.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileType: SOURCE +"""Fuzzing harness for pythainlp.tokenize.word_tokenize() + +This fuzzer tests the word_tokenize function with random Unicode input +to discover edge cases, crashes, and potential security issues. +""" + +import sys + +import atheris + +import pythainlp.tokenize + + +def TestOneInput(data: bytes) -> None: + """Fuzz target for word_tokenize. + + :param bytes data: Random input bytes from the fuzzer + """ + fdp = atheris.FuzzedDataProvider(data) + + try: + # Generate random Unicode string + text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes()) + + # Test word_tokenize with default engine + result = pythainlp.tokenize.word_tokenize(text) + + # Validate output type + assert isinstance(result, list), f"Expected list, got {type(result)}" + assert all(isinstance(token, str) for token in result), \ + "All tokens should be strings" + + except (ValueError, TypeError, UnicodeDecodeError): + # Expected exceptions - these are acceptable + pass + except Exception: + # Unexpected exceptions - re-raise for investigation + raise + + +def main() -> None: + """Entry point for the fuzzer.""" + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() diff --git a/fuzz/fuzz_util_normalize.py b/fuzz/fuzz_util_normalize.py new file mode 100644 index 000000000..ac16c0d27 --- /dev/null +++ b/fuzz/fuzz_util_normalize.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileType: SOURCE +"""Fuzzing harness for pythainlp.util.normalize() + +This fuzzer tests the normalize function with random Unicode input +to discover edge cases, crashes, and potential security issues. +""" + +import sys + +import atheris + +import pythainlp.util + + +def TestOneInput(data: bytes) -> None: + """Fuzz target for normalize. + + :param bytes data: Random input bytes from the fuzzer + """ + fdp = atheris.FuzzedDataProvider(data) + + try: + # Generate random Unicode string + text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes()) + + # Test normalize + result = pythainlp.util.normalize(text) + + # Validate output type + assert isinstance(result, str), f"Expected str, got {type(result)}" + + except (ValueError, TypeError, UnicodeDecodeError): + # Expected exceptions - these are acceptable + pass + except Exception: + # Unexpected exceptions - re-raise for investigation + raise + + +def main() -> None: + """Entry point for the fuzzer.""" + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() From 9f9dce4f18507a878cd7b7aeb91a7e15eae22dee Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 00:49:11 +0000 Subject: [PATCH 3/7] Fix markdown line length in fuzz/README.md Co-authored-by: bact <128572+bact@users.noreply.github.com> --- fuzz/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fuzz/README.md b/fuzz/README.md index 126f1bc0f..77c73bb07 100644 --- a/fuzz/README.md +++ b/fuzz/README.md @@ -1,6 +1,7 @@ # PyThaiNLP Fuzz Testing -This directory contains fuzz testing infrastructure using [ClusterFuzzLite](https://google.github.io/clusterfuzzlite/) and [Atheris](https://github.com/google/atheris). +This directory contains fuzz testing infrastructure using +[ClusterFuzzLite](https://google.github.io/clusterfuzzlite/) and [Atheris](https://github.com/google/atheris). ## Overview From 604792e4ba2ecf336206ff1cd2560cfc0c4a9c19 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 01:18:24 +0000 Subject: [PATCH 4/7] Move fuzzing Dockerfile to fuzz/ directory for clarity Co-authored-by: bact <128572+bact@users.noreply.github.com> --- .github/workflows/clusterfuzzlite.yml | 1 + Dockerfile | 32 ++++++++------------------- Dockerfile.app | 15 ------------- docker-compose.yml | 4 +--- fuzz/Dockerfile | 29 ++++++++++++++++++++++++ fuzz/README.md | 1 + 6 files changed, 41 insertions(+), 41 deletions(-) delete mode 100644 Dockerfile.app create mode 100644 fuzz/Dockerfile diff --git a/.github/workflows/clusterfuzzlite.yml b/.github/workflows/clusterfuzzlite.yml index 949a18544..4aac63141 100644 --- a/.github/workflows/clusterfuzzlite.yml +++ b/.github/workflows/clusterfuzzlite.yml @@ -61,6 +61,7 @@ jobs: with: sanitizer: ${{ matrix.sanitizer }} language: python + dockerfile-path: fuzz/Dockerfile - name: Run Fuzzers (${{ matrix.sanitizer }}) id: run diff --git a/Dockerfile b/Dockerfile index 7368893fe..7cbf8c3cb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,29 +1,15 @@ -# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 -# Dockerfile for ClusterFuzzLite fuzzing -# This extends the OSS-Fuzz base builder image for Python projects +FROM python:3.12 -FROM gcr.io/oss-fuzz-base/base-builder-python +WORKDIR /app -# Install system dependencies -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential \ - libicu-dev \ - pkg-config && \ - rm -rf /var/lib/apt/lists/* +COPY . . -# Copy repository to $SRC/pythainlp -COPY . $SRC/pythainlp - -# Set working directory -WORKDIR $SRC/pythainlp - -# Install pythainlp in development mode with minimal dependencies -# This installs the package without heavy ML dependencies to speed up builds -RUN pip install --no-cache-dir -e . - -# Copy build script -COPY fuzz/build.sh $SRC/ +RUN apt-get update && apt-get install -y --no-install-recommends build-essential libicu-dev python3-pip python3-venv pkg-config && rm -rf /var/lib/apt/lists/* +ENV VIRTUAL_ENV=/opt/venv +RUN python3 -m venv $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" +RUN pip install -e ".[full]" && pip cache purge diff --git a/Dockerfile.app b/Dockerfile.app deleted file mode 100644 index 7cbf8c3cb..000000000 --- a/Dockerfile.app +++ /dev/null @@ -1,15 +0,0 @@ -# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project -# SPDX-FileType: SOURCE -# SPDX-License-Identifier: Apache-2.0 - -FROM python:3.12 - -WORKDIR /app - -COPY . . - -RUN apt-get update && apt-get install -y --no-install-recommends build-essential libicu-dev python3-pip python3-venv pkg-config && rm -rf /var/lib/apt/lists/* -ENV VIRTUAL_ENV=/opt/venv -RUN python3 -m venv $VIRTUAL_ENV -ENV PATH="$VIRTUAL_ENV/bin:$PATH" -RUN pip install -e ".[full]" && pip cache purge diff --git a/docker-compose.yml b/docker-compose.yml index f9d4c3016..200512fa5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,8 +1,6 @@ services: pythainlp: - build: - context: . - dockerfile: Dockerfile.app + build: . image: pythainlp:latest volumes: - .:/workspace diff --git a/fuzz/Dockerfile b/fuzz/Dockerfile new file mode 100644 index 000000000..7368893fe --- /dev/null +++ b/fuzz/Dockerfile @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 + +# Dockerfile for ClusterFuzzLite fuzzing +# This extends the OSS-Fuzz base builder image for Python projects + +FROM gcr.io/oss-fuzz-base/base-builder-python + +# Install system dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + libicu-dev \ + pkg-config && \ + rm -rf /var/lib/apt/lists/* + +# Copy repository to $SRC/pythainlp +COPY . $SRC/pythainlp + +# Set working directory +WORKDIR $SRC/pythainlp + +# Install pythainlp in development mode with minimal dependencies +# This installs the package without heavy ML dependencies to speed up builds +RUN pip install --no-cache-dir -e . + +# Copy build script +COPY fuzz/build.sh $SRC/ diff --git a/fuzz/README.md b/fuzz/README.md index 77c73bb07..20ee85c80 100644 --- a/fuzz/README.md +++ b/fuzz/README.md @@ -15,6 +15,7 @@ Fuzz testing helps discover edge cases, crashes, and potential security vulnerab ``` fuzz/ +├── Dockerfile # Docker image for ClusterFuzzLite fuzzing ├── build.sh # Build script for compiling fuzzers ├── fuzz_tokenize.py # Fuzzer for word_tokenize() ├── fuzz_util_normalize.py # Fuzzer for normalize() From 499966b830fca6bb3a04325654adacdb7a82c7e5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 02:44:49 +0000 Subject: [PATCH 5/7] Apply code review suggestions from Copilot PR reviewer Co-authored-by: bact <128572+bact@users.noreply.github.com> --- .github/workflows/clusterfuzzlite.yml | 6 ++---- fuzz/Dockerfile | 4 ++-- fuzz/README.md | 2 +- fuzz/build.sh | 13 ++++++------- fuzz/fuzz_tokenize.py | 11 +++++------ fuzz/fuzz_util_normalize.py | 7 +++---- 6 files changed, 19 insertions(+), 24 deletions(-) diff --git a/.github/workflows/clusterfuzzlite.yml b/.github/workflows/clusterfuzzlite.yml index 4aac63141..d91d8911b 100644 --- a/.github/workflows/clusterfuzzlite.yml +++ b/.github/workflows/clusterfuzzlite.yml @@ -13,7 +13,6 @@ on: - '**.md' - '**.rst' - '**.txt' - - '**.yml' - 'docs/**' pull_request: branches: @@ -24,7 +23,6 @@ on: - '**.md' - '**.rst' - '**.txt' - - '**.yml' - 'docs/**' schedule: - cron: '0 6 * * *' # Daily at 06:00 UTC @@ -44,7 +42,7 @@ concurrency: cancel-in-progress: true permissions: - contents: read + contents: write issues: write jobs: @@ -69,7 +67,7 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} fuzz-seconds: 300 - mode: 'code-change' + mode: ${{ github.event_name == 'pull_request' && 'code-change' || 'batch' }} sanitizer: ${{ matrix.sanitizer }} storage-repo: https://${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git storage-repo-branch: gh-pages diff --git a/fuzz/Dockerfile b/fuzz/Dockerfile index 7368893fe..b59069fc0 100644 --- a/fuzz/Dockerfile +++ b/fuzz/Dockerfile @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2026 PyThaiNLP Project -# SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileType: SOURCE # Dockerfile for ClusterFuzzLite fuzzing # This extends the OSS-Fuzz base builder image for Python projects @@ -25,5 +25,5 @@ WORKDIR $SRC/pythainlp # This installs the package without heavy ML dependencies to speed up builds RUN pip install --no-cache-dir -e . -# Copy build script +# Copy build script to $SRC/build.sh as expected by OSS-Fuzz/ClusterFuzzLite COPY fuzz/build.sh $SRC/ diff --git a/fuzz/README.md b/fuzz/README.md index 20ee85c80..9d8dc3fbe 100644 --- a/fuzz/README.md +++ b/fuzz/README.md @@ -106,7 +106,7 @@ if __name__ == "__main__": main() ``` -2. The fuzzer will be automatically discovered and built by `build.sh` +2. Ensure the new fuzzer file name follows the ``fuzz_*.py`` pattern so it can be discovered by ``build.sh``, and run ``bash fuzz/build.sh`` locally to verify that your fuzzer is picked up and built. 3. No changes needed to GitHub Actions workflow diff --git a/fuzz/build.sh b/fuzz/build.sh index 485b9069f..d45f4ed46 100755 --- a/fuzz/build.sh +++ b/fuzz/build.sh @@ -4,22 +4,21 @@ # SPDX-FileType: SOURCE # Build script for ClusterFuzzLite fuzzing harnesses -# This script installs atheris and compiles all fuzzing harnesses +# This script installs atheris and prepares all fuzzing harnesses echo "Building PyThaiNLP fuzz targets..." -# Install atheris for Python fuzzing -pip install atheris +# Install atheris for Python fuzzing with version constraint +pip install "atheris>=2.3.0" # Find all fuzz_*.py files in the fuzz directory for fuzzer in "${SRC}/pythainlp/fuzz"/fuzz_*.py; do fuzzer_basename=$(basename -s .py "$fuzzer") - fuzzer_package="fuzz.${fuzzer_basename}" - echo "Compiling ${fuzzer_basename}..." + echo "Preparing ${fuzzer_basename}..." - # Compile fuzzer with atheris - python -m atheris.instrument_libfuzzer "${fuzzer}" "${OUT}/${fuzzer_basename}" + # Copy fuzzer to output directory (instrumentation happens at runtime) + cp "${fuzzer}" "${OUT}/${fuzzer_basename}" # Make fuzzer executable chmod +x "${OUT}/${fuzzer_basename}" diff --git a/fuzz/fuzz_tokenize.py b/fuzz/fuzz_tokenize.py index b2f46d7d5..c07e0fc64 100644 --- a/fuzz/fuzz_tokenize.py +++ b/fuzz/fuzz_tokenize.py @@ -18,6 +18,7 @@ def TestOneInput(data: bytes) -> None: """Fuzz target for word_tokenize. :param bytes data: Random input bytes from the fuzzer + :rtype: None """ fdp = atheris.FuzzedDataProvider(data) @@ -29,16 +30,14 @@ def TestOneInput(data: bytes) -> None: result = pythainlp.tokenize.word_tokenize(text) # Validate output type - assert isinstance(result, list), f"Expected list, got {type(result)}" - assert all(isinstance(token, str) for token in result), \ - "All tokens should be strings" + if not isinstance(result, list): + raise TypeError(f"Expected list, got {type(result)}") + if not all(isinstance(token, str) for token in result): + raise TypeError("All tokens should be strings") except (ValueError, TypeError, UnicodeDecodeError): # Expected exceptions - these are acceptable pass - except Exception: - # Unexpected exceptions - re-raise for investigation - raise def main() -> None: diff --git a/fuzz/fuzz_util_normalize.py b/fuzz/fuzz_util_normalize.py index ac16c0d27..c8365c118 100644 --- a/fuzz/fuzz_util_normalize.py +++ b/fuzz/fuzz_util_normalize.py @@ -18,6 +18,7 @@ def TestOneInput(data: bytes) -> None: """Fuzz target for normalize. :param bytes data: Random input bytes from the fuzzer + :rtype: None """ fdp = atheris.FuzzedDataProvider(data) @@ -29,14 +30,12 @@ def TestOneInput(data: bytes) -> None: result = pythainlp.util.normalize(text) # Validate output type - assert isinstance(result, str), f"Expected str, got {type(result)}" + if not isinstance(result, str): + raise TypeError(f"Expected str, got {type(result)}") except (ValueError, TypeError, UnicodeDecodeError): # Expected exceptions - these are acceptable pass - except Exception: - # Unexpected exceptions - re-raise for investigation - raise def main() -> None: From 7f258cebce48af662d22a37e94be512b437c5909 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 11:40:31 +0000 Subject: [PATCH 6/7] Apply second round of code review suggestions - Fix SPDX header order (FileCopyrightText, FileType, License-Identifier) - Add :rtype: None to main() docstrings for complete type annotations - Replace assert with explicit TypeError in README example - Add nullglob guard to build.sh for loop - Pin atheris to exact version (==2.3.0) for security Co-authored-by: bact <128572+bact@users.noreply.github.com> --- fuzz/README.md | 3 ++- fuzz/build.sh | 7 ++++--- fuzz/fuzz_tokenize.py | 7 +++++-- fuzz/fuzz_util_normalize.py | 7 +++++-- 4 files changed, 16 insertions(+), 8 deletions(-) diff --git a/fuzz/README.md b/fuzz/README.md index 9d8dc3fbe..82b97ffba 100644 --- a/fuzz/README.md +++ b/fuzz/README.md @@ -89,7 +89,8 @@ def TestOneInput(data: bytes) -> None: result = pythainlp..(text) # Validate output - assert isinstance(result, ) + if not isinstance(result, ): + raise TypeError(f"Expected , got {type(result)}") except (ValueError, TypeError, UnicodeDecodeError): # Expected exceptions diff --git a/fuzz/build.sh b/fuzz/build.sh index d45f4ed46..f1e15544f 100755 --- a/fuzz/build.sh +++ b/fuzz/build.sh @@ -1,18 +1,19 @@ #!/bin/bash -eu # SPDX-FileCopyrightText: 2026 PyThaiNLP Project -# SPDX-License-Identifier: Apache-2.0 # SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 # Build script for ClusterFuzzLite fuzzing harnesses # This script installs atheris and prepares all fuzzing harnesses echo "Building PyThaiNLP fuzz targets..." -# Install atheris for Python fuzzing with version constraint -pip install "atheris>=2.3.0" +# Install atheris for Python fuzzing with pinned version for security +pip install "atheris==2.3.0" # Find all fuzz_*.py files in the fuzz directory for fuzzer in "${SRC}/pythainlp/fuzz"/fuzz_*.py; do + [[ -e "$fuzzer" ]] || continue fuzzer_basename=$(basename -s .py "$fuzzer") echo "Preparing ${fuzzer_basename}..." diff --git a/fuzz/fuzz_tokenize.py b/fuzz/fuzz_tokenize.py index c07e0fc64..57555955d 100644 --- a/fuzz/fuzz_tokenize.py +++ b/fuzz/fuzz_tokenize.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2026 PyThaiNLP Project -# SPDX-License-Identifier: Apache-2.0 # SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 """Fuzzing harness for pythainlp.tokenize.word_tokenize() This fuzzer tests the word_tokenize function with random Unicode input @@ -41,7 +41,10 @@ def TestOneInput(data: bytes) -> None: def main() -> None: - """Entry point for the fuzzer.""" + """Entry point for the fuzzer. + + :rtype: None + """ atheris.Setup(sys.argv, TestOneInput) atheris.Fuzz() diff --git a/fuzz/fuzz_util_normalize.py b/fuzz/fuzz_util_normalize.py index c8365c118..a1d486eb6 100644 --- a/fuzz/fuzz_util_normalize.py +++ b/fuzz/fuzz_util_normalize.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: 2026 PyThaiNLP Project -# SPDX-License-Identifier: Apache-2.0 # SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 """Fuzzing harness for pythainlp.util.normalize() This fuzzer tests the normalize function with random Unicode input @@ -39,7 +39,10 @@ def TestOneInput(data: bytes) -> None: def main() -> None: - """Entry point for the fuzzer.""" + """Entry point for the fuzzer. + + :rtype: None + """ atheris.Setup(sys.argv, TestOneInput) atheris.Fuzz() From c7cc3561ccb57bd8d1935ccf04820baaf10127eb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 13:11:59 +0000 Subject: [PATCH 7/7] Add corpus storage best practices and fuzzing artifact exclusions - Add comprehensive "Corpus Storage Best Practices" section to fuzz/README.md - Minimize and de-duplicate guidelines with libFuzzer commands - Data sanitization guidelines (no sensitive/production data) - Dedicated storage explanation (gh-pages branch usage) - Crash monitoring and handling procedures - Security considerations for public/private repos - Corpus management command reference - Update .gitignore to exclude fuzzing artifacts - Local corpus directories (fuzz/corpus/, fuzz/crashes/, fuzz/artifacts/) - Fuzzer output files (crash-*, leak-*, timeout-*, oom-*) - Profiling data (*.profraw, *.profdata) Co-authored-by: bact <128572+bact@users.noreply.github.com> --- .gitignore | 11 ++++ fuzz/README.md | 139 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 149 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index ba6490da6..0df3bf34d 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,14 @@ logs/ # Temp files *.tmp *.temp + +# Fuzzing artifacts +fuzz/corpus/ +fuzz/crashes/ +fuzz/artifacts/ +fuzz/*.profraw +fuzz/*.profdata +crash-* +leak-* +timeout-* +oom-* diff --git a/fuzz/README.md b/fuzz/README.md index 82b97ffba..1b9b67513 100644 --- a/fuzz/README.md +++ b/fuzz/README.md @@ -107,7 +107,9 @@ if __name__ == "__main__": main() ``` -2. Ensure the new fuzzer file name follows the ``fuzz_*.py`` pattern so it can be discovered by ``build.sh``, and run ``bash fuzz/build.sh`` locally to verify that your fuzzer is picked up and built. +2. Ensure the new fuzzer file name follows the ``fuzz_*.py`` pattern so it can be discovered by + ``build.sh``, and run ``bash fuzz/build.sh`` locally to verify that your fuzzer is picked up + and built. 3. No changes needed to GitHub Actions workflow @@ -155,3 +157,138 @@ If a fuzzer finds a crash: - [Atheris Documentation](https://github.com/google/atheris) - [OSS-Fuzz](https://github.com/google/oss-fuzz) - [libFuzzer Tutorial](https://github.com/google/fuzzing/blob/master/tutorial/libFuzzerTutorial.md) + +## Corpus Storage Best Practices + +The fuzzing corpus (test inputs that trigger interesting code paths) is automatically managed by +ClusterFuzzLite and stored in the `gh-pages` branch. However, if you need to manually manage corpus +data, follow these best practices: + +### 1. Minimize and De-duplicate + +Keep only the smallest, most unique set of inputs: + +```bash +# Use libFuzzer's merge feature to minimize corpus +python fuzz/fuzz_tokenize.py -merge=1 minimized_corpus/ original_corpus/ + +# This keeps only inputs that trigger unique code coverage +``` + +The `-merge=1` flag tells libFuzzer to: +- Remove duplicate inputs that cover the same code paths +- Keep the smallest input for each unique coverage pattern +- Output the minimized corpus to the first directory + +### 2. Sanitize the Data + +**Never use sensitive production data for fuzzing:** + +- ✅ Use synthetic test data +- ✅ Use publicly available sample data +- ✅ Generate random valid inputs +- ❌ Do not use real user data +- ❌ Do not use data containing secrets, passwords, or API keys +- ❌ Do not use data with personally identifiable information (PII) + +**Before committing any corpus:** +```bash +# Review corpus files for sensitive data +find corpus/ -type f -exec head -n 5 {} \; + +# Check for common patterns +grep -r "password\|api_key\|secret\|token" corpus/ +``` + +### 3. Use Dedicated Storage + +**ClusterFuzzLite automatically stores corpus in `gh-pages` branch**, which is separate from the main codebase. This is the recommended approach. + +**If storing corpus locally or in version control:** +- ❌ Do NOT add corpus to the main branch with `git add fuzz/corpus/` +- ✅ Use a dedicated branch (e.g., `fuzzing-data` or `gh-pages`) +- ✅ Use GitHub Actions artifacts (already configured) +- ✅ Use external storage (S3, GCS) for large corpora + +**Note:** The `.gitignore` is configured to exclude local corpus artifacts: +- `fuzz/corpus/` - Corpus files +- `fuzz/crashes/` - Crash-triggering inputs +- `fuzz/artifacts/` - Build artifacts +- `crash-*`, `leak-*`, `timeout-*`, `oom-*` - Fuzzer output files + +### 4. Monitor for Crashes + +**Never commit a crash-triggering input without fixing the bug first.** + +**When a crash is found:** + +1. **Reproduce the crash locally:** + ```bash + # ClusterFuzzLite saves crashes in artifacts + python fuzz/fuzz_tokenize.py crash-file + ``` + +2. **Debug and fix the underlying bug:** + - Identify the root cause in the target function + - Write a unit test that reproduces the issue + - Fix the bug in the codebase + +3. **Verify the fix:** + ```bash + # Re-run the fuzzer with the crash input + python fuzz/fuzz_tokenize.py crash-file + # Should not crash after fix + ``` + +4. **Add as regression test:** + ```python + # In tests/test_tokenize.py + def test_crash_regression_issue_1234(): + """Regression test for crash found by fuzzer.""" + # Use the crash-triggering input as a test case + result = word_tokenize("...") + assert isinstance(result, list) + ``` + +5. **Only then add to corpus:** + ```bash + # After bug is fixed, add input to corpus for future testing + cp crash-file fuzz/corpus/tokenize/ + ``` + +### Security Considerations + +**Corpus storage in public gh-pages branch is safe for open-source projects:** +- ✅ Corpus contains only test inputs (strings, bytes) +- ✅ Does not contain code execution artifacts +- ✅ Follows standard OSS fuzzing practices (OSS-Fuzz, ClusterFuzzLite) + +**Crash artifacts have limited exposure:** +- Uploaded as GitHub Actions artifacts (not to gh-pages) +- Have configurable retention period (default: 90 days) +- Only accessible to repository collaborators + +**For private repositories with sensitive concerns:** +- Consider using a private storage-repo-branch +- Or disable corpus persistence by removing `storage-repo` parameters from workflow +- Fuzzing will still work, just won't persist corpus between runs + +### Corpus Management Commands + +```bash +# View corpus statistics +python fuzz/fuzz_tokenize.py corpus/ -runs=0 + +# Minimize corpus (keep unique inputs only) +python fuzz/fuzz_tokenize.py -merge=1 minimized/ corpus/ + +# Find minimum reproducer for a crash +python fuzz/fuzz_tokenize.py -minimize_crash=1 crash-file + +# Run fuzzer with existing corpus +python fuzz/fuzz_tokenize.py corpus/ -max_total_time=60 + +# Check corpus coverage +python fuzz/fuzz_tokenize.py corpus/ -runs=0 -print_coverage=1 +``` +