diff --git a/.github/workflows/clusterfuzzlite.yml b/.github/workflows/clusterfuzzlite.yml new file mode 100644 index 000000000..d91d8911b --- /dev/null +++ b/.github/workflows/clusterfuzzlite.yml @@ -0,0 +1,81 @@ +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 + +name: ClusterFuzzLite + +on: + push: + branches: + - dev + paths-ignore: + - '**.cff' + - '**.json' + - '**.md' + - '**.rst' + - '**.txt' + - 'docs/**' + pull_request: + branches: + - dev + paths-ignore: + - '**.cff' + - '**.json' + - '**.md' + - '**.rst' + - '**.txt' + - 'docs/**' + schedule: + - cron: '0 6 * * *' # Daily at 06:00 UTC + +# Avoid duplicate runs for the same source branch and repository. +# For pull_request events, uses the source repo name from +# github.event.pull_request.head.repo.full_name; otherwise uses github.repository. +# For push events, uses the branch name from github.ref_name. +# For pull_request events, uses the source branch name from github.head_ref. +# This ensures events for the same repo and branch share the same group, +# and avoids cross-fork collisions when branch names are reused. 
+concurrency: + group: >- + ${{ github.workflow }}-${{ + github.event.pull_request.head.repo.full_name || github.repository + }}-${{ github.head_ref || github.ref_name }} + cancel-in-progress: true + +permissions: + contents: write + issues: write + +jobs: + fuzzing: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + sanitizer: [address] + steps: + - name: Build Fuzzers (${{ matrix.sanitizer }}) + id: build + uses: google/clusterfuzzlite/actions/build_fuzzers@v1 + with: + sanitizer: ${{ matrix.sanitizer }} + language: python + dockerfile-path: fuzz/Dockerfile + + - name: Run Fuzzers (${{ matrix.sanitizer }}) + id: run + uses: google/clusterfuzzlite/actions/run_fuzzers@v1 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + fuzz-seconds: 300 + mode: ${{ github.event_name == 'pull_request' && 'code-change' || 'batch' }} + sanitizer: ${{ matrix.sanitizer }} + storage-repo: https://${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git + storage-repo-branch: gh-pages + storage-repo-branch-coverage: gh-pages + + - name: Upload crash artifacts + if: failure() && steps.run.outcome == 'failure' + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.sanitizer }}-artifacts + path: ./out/artifacts diff --git a/.gitignore b/.gitignore index ba6490da6..0df3bf34d 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,14 @@ logs/ # Temp files *.tmp *.temp + +# Fuzzing artifacts +fuzz/corpus/ +fuzz/crashes/ +fuzz/artifacts/ +fuzz/*.profraw +fuzz/*.profdata +crash-* +leak-* +timeout-* +oom-* diff --git a/fuzz/Dockerfile b/fuzz/Dockerfile new file mode 100644 index 000000000..b59069fc0 --- /dev/null +++ b/fuzz/Dockerfile @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileType: SOURCE + +# Dockerfile for ClusterFuzzLite fuzzing +# This extends the OSS-Fuzz base builder image for Python projects + +FROM gcr.io/oss-fuzz-base/base-builder-python + +# Install system dependencies +RUN 
apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + libicu-dev \ + pkg-config && \ + rm -rf /var/lib/apt/lists/* + +# Copy repository to $SRC/pythainlp +COPY . $SRC/pythainlp + +# Set working directory +WORKDIR $SRC/pythainlp + +# Install pythainlp in development mode with minimal dependencies +# This installs the package without heavy ML dependencies to speed up builds +RUN pip install --no-cache-dir -e . + +# Copy build script to $SRC/build.sh as expected by OSS-Fuzz/ClusterFuzzLite +COPY fuzz/build.sh $SRC/ diff --git a/fuzz/README.md b/fuzz/README.md new file mode 100644 index 000000000..1b9b67513 --- /dev/null +++ b/fuzz/README.md @@ -0,0 +1,294 @@ +# PyThaiNLP Fuzz Testing + +This directory contains fuzz testing infrastructure using +[ClusterFuzzLite](https://google.github.io/clusterfuzzlite/) and [Atheris](https://github.com/google/atheris). + +## Overview + +Fuzz testing helps discover edge cases, crashes, and potential security vulnerabilities by feeding random inputs to functions. This setup uses: + +- **ClusterFuzzLite**: Google's continuous fuzzing solution for GitHub projects +- **Atheris**: Coverage-guided Python fuzzing engine +- **AddressSanitizer**: Memory safety checks + +## Directory Structure + +``` +fuzz/ +├── Dockerfile # Docker image for ClusterFuzzLite fuzzing +├── build.sh # Build script for compiling fuzzers +├── fuzz_tokenize.py # Fuzzer for word_tokenize() +├── fuzz_util_normalize.py # Fuzzer for normalize() +└── README.md # This file +``` + +## Current Fuzzing Targets + +### 1. `fuzz_tokenize.py` +Tests `pythainlp.tokenize.word_tokenize()` with random Unicode input to ensure: +- No crashes on malformed input +- Proper handling of edge cases +- Memory safety + +### 2. 
`fuzz_util_normalize.py` +Tests `pythainlp.util.normalize()` with random Unicode input to ensure: +- No crashes on malformed input +- Proper string normalization +- Type safety + +## Local Testing + +To test fuzzers locally: + +```bash +# Install atheris +pip install atheris + +# Run a specific fuzzer for 60 seconds +python fuzz/fuzz_tokenize.py -max_total_time=60 + +# Run with specific corpus directory +python fuzz/fuzz_tokenize.py corpus_dir/ -max_total_time=60 +``` + +## CI/CD Integration + +Fuzzing runs automatically via GitHub Actions: +- On pull requests to `dev` branch (focuses on code changes) +- On push to `dev` branch +- Daily at 06:00 UTC (full fuzzing run) + +Configuration: `.github/workflows/clusterfuzzlite.yml` + +## Adding New Fuzzers + +To add a new fuzzing target: + +1. Create a new file `fuzz/fuzz_<target>.py`: + +```python +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileType: SOURCE +"""Fuzzing harness for pythainlp.<module>.<function>()""" + +import sys +import atheris +import pythainlp.<module> + + +def TestOneInput(data: bytes) -> None: + """Fuzz target for <function>.""" + fdp = atheris.FuzzedDataProvider(data) + +    try: + # Generate test input + text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes()) + + # Call target function + result = pythainlp.<module>.<function>(text) + + # Validate output + if not isinstance(result, <expected_type>): + raise TypeError(f"Expected <expected_type>, got {type(result)}") + + except (ValueError, TypeError, UnicodeDecodeError): + # Expected exceptions + pass + + +def main() -> None: + """Entry point for the fuzzer.""" + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() +``` + +2. Ensure the new fuzzer file name follows the ``fuzz_*.py`` pattern so it can be discovered by + ``build.sh``, and run ``bash fuzz/build.sh`` locally to verify that your fuzzer is picked up + and built. + +3.
No changes are needed to the GitHub Actions workflow + +## Expansion Plan + +Future fuzzing targets to consider: + +### High Priority +- **spell/** - Spelling correction functions +- **soundex/** - Phonetic encoding functions +- **transliterate/** - Romanization functions + +### Medium Priority +- **corpus/** - Data loading and corpus functions +- **tag/** - Part-of-speech tagging +- **parse/** - Parsing functions + +### Low Priority +- **classify/** - Classification functions +- **generate/** - Text generation functions +- **summarize/** - Summarization functions + +## Troubleshooting + +### Fuzzer Crashes +If a fuzzer finds a crash: +1. Check the GitHub Actions artifacts for crash reports +2. Reproduce locally: `python fuzz/fuzz_<target>.py <crash-file>` +3. Fix the underlying issue in the target function +4. Re-run fuzzer to verify fix + +### Performance Issues +- Adjust fuzzing time in `.github/workflows/clusterfuzzlite.yml` +- Default is 300 seconds (5 minutes) per fuzzer +- For longer sessions, increase the value + +### False Positives +- Update the exception handling in the fuzzer +- Add expected exceptions to the `except` block +- Document the reasoning in comments + +## Resources + +- [ClusterFuzzLite Documentation](https://google.github.io/clusterfuzzlite/) +- [Atheris Documentation](https://github.com/google/atheris) +- [OSS-Fuzz](https://github.com/google/oss-fuzz) +- [libFuzzer Tutorial](https://github.com/google/fuzzing/blob/master/tutorial/libFuzzerTutorial.md) + +## Corpus Storage Best Practices + +The fuzzing corpus (test inputs that trigger interesting code paths) is automatically managed by +ClusterFuzzLite and stored in the `gh-pages` branch. However, if you need to manually manage corpus +data, follow these best practices: + +### 1.
Minimize and De-duplicate + +Keep only the smallest, most unique set of inputs: + +```bash +# Use libFuzzer's merge feature to minimize corpus +python fuzz/fuzz_tokenize.py -merge=1 minimized_corpus/ original_corpus/ + +# This keeps only inputs that trigger unique code coverage +``` + +The `-merge=1` flag tells libFuzzer to: +- Remove duplicate inputs that cover the same code paths +- Keep the smallest input for each unique coverage pattern +- Output the minimized corpus to the first directory + +### 2. Sanitize the Data + +**Never use sensitive production data for fuzzing:** + +- ✅ Use synthetic test data +- ✅ Use publicly available sample data +- ✅ Generate random valid inputs +- ❌ Do not use real user data +- ❌ Do not use data containing secrets, passwords, or API keys +- ❌ Do not use data with personally identifiable information (PII) + +**Before committing any corpus:** +```bash +# Review corpus files for sensitive data +find corpus/ -type f -exec head -n 5 {} \; + +# Check for common patterns +grep -r "password\|api_key\|secret\|token" corpus/ +``` + +### 3. Use Dedicated Storage + +**ClusterFuzzLite automatically stores corpus in `gh-pages` branch**, which is separate from the main codebase. This is the recommended approach. + +**If storing corpus locally or in version control:** +- ❌ Do NOT add corpus to the main branch with `git add fuzz/corpus/` +- ✅ Use a dedicated branch (e.g., `fuzzing-data` or `gh-pages`) +- ✅ Use GitHub Actions artifacts (already configured) +- ✅ Use external storage (S3, GCS) for large corpora + +**Note:** The `.gitignore` is configured to exclude local corpus artifacts: +- `fuzz/corpus/` - Corpus files +- `fuzz/crashes/` - Crash-triggering inputs +- `fuzz/artifacts/` - Build artifacts +- `crash-*`, `leak-*`, `timeout-*`, `oom-*` - Fuzzer output files + +### 4. Monitor for Crashes + +**Never commit a crash-triggering input without fixing the bug first.** + +**When a crash is found:** + +1. 
**Reproduce the crash locally:** + ```bash + # ClusterFuzzLite saves crashes in artifacts + python fuzz/fuzz_tokenize.py crash-file + ``` + +2. **Debug and fix the underlying bug:** + - Identify the root cause in the target function + - Write a unit test that reproduces the issue + - Fix the bug in the codebase + +3. **Verify the fix:** + ```bash + # Re-run the fuzzer with the crash input + python fuzz/fuzz_tokenize.py crash-file + # Should not crash after fix + ``` + +4. **Add as regression test:** + ```python + # In tests/test_tokenize.py + def test_crash_regression_issue_1234(): + """Regression test for crash found by fuzzer.""" + # Use the crash-triggering input as a test case + result = word_tokenize("...") + assert isinstance(result, list) + ``` + +5. **Only then add to corpus:** + ```bash + # After bug is fixed, add input to corpus for future testing + cp crash-file fuzz/corpus/tokenize/ + ``` + +### Security Considerations + +**Corpus storage in public gh-pages branch is safe for open-source projects:** +- ✅ Corpus contains only test inputs (strings, bytes) +- ✅ Does not contain code execution artifacts +- ✅ Follows standard OSS fuzzing practices (OSS-Fuzz, ClusterFuzzLite) + +**Crash artifacts have limited exposure:** +- Uploaded as GitHub Actions artifacts (not to gh-pages) +- Have configurable retention period (default: 90 days) +- Only accessible to repository collaborators + +**For private repositories with sensitive concerns:** +- Consider using a private storage-repo-branch +- Or disable corpus persistence by removing `storage-repo` parameters from workflow +- Fuzzing will still work, just won't persist corpus between runs + +### Corpus Management Commands + +```bash +# View corpus statistics +python fuzz/fuzz_tokenize.py corpus/ -runs=0 + +# Minimize corpus (keep unique inputs only) +python fuzz/fuzz_tokenize.py -merge=1 minimized/ corpus/ + +# Find minimum reproducer for a crash +python fuzz/fuzz_tokenize.py -minimize_crash=1 crash-file + +# Run 
fuzzer with existing corpus +python fuzz/fuzz_tokenize.py corpus/ -max_total_time=60 + +# Check corpus coverage +python fuzz/fuzz_tokenize.py corpus/ -runs=0 -print_coverage=1 +``` + diff --git a/fuzz/build.sh b/fuzz/build.sh new file mode 100755 index 000000000..f1e15544f --- /dev/null +++ b/fuzz/build.sh @@ -0,0 +1,28 @@ +#!/bin/bash -eu +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 + +# Build script for ClusterFuzzLite fuzzing harnesses +# This script installs atheris and prepares all fuzzing harnesses + +echo "Building PyThaiNLP fuzz targets..." + +# Install atheris for Python fuzzing with pinned version for security +pip install "atheris==2.3.0" + +# Find all fuzz_*.py files in the fuzz directory +for fuzzer in "${SRC}/pythainlp/fuzz"/fuzz_*.py; do + [[ -e "$fuzzer" ]] || continue + fuzzer_basename=$(basename -s .py "$fuzzer") + + echo "Preparing ${fuzzer_basename}..." + + # Copy fuzzer to output directory (instrumentation happens at runtime) + cp "${fuzzer}" "${OUT}/${fuzzer_basename}" + + # Make fuzzer executable + chmod +x "${OUT}/${fuzzer_basename}" +done + +echo "Build completed successfully!" diff --git a/fuzz/fuzz_tokenize.py b/fuzz/fuzz_tokenize.py new file mode 100644 index 000000000..57555955d --- /dev/null +++ b/fuzz/fuzz_tokenize.py @@ -0,0 +1,53 @@ +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +"""Fuzzing harness for pythainlp.tokenize.word_tokenize() + +This fuzzer tests the word_tokenize function with random Unicode input +to discover edge cases, crashes, and potential security issues. +""" + +import sys + +import atheris + +import pythainlp.tokenize + + +def TestOneInput(data: bytes) -> None: + """Fuzz target for word_tokenize. 
+ + :param bytes data: Random input bytes from the fuzzer + :rtype: None + """ + fdp = atheris.FuzzedDataProvider(data) + + try: + # Generate random Unicode string + text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes()) + + # Test word_tokenize with default engine + result = pythainlp.tokenize.word_tokenize(text) + + # Validate output type + if not isinstance(result, list): + raise TypeError(f"Expected list, got {type(result)}") + if not all(isinstance(token, str) for token in result): + raise TypeError("All tokens should be strings") + + except (ValueError, TypeError, UnicodeDecodeError): + # Expected exceptions - these are acceptable + pass + + +def main() -> None: + """Entry point for the fuzzer. + + :rtype: None + """ + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main() diff --git a/fuzz/fuzz_util_normalize.py b/fuzz/fuzz_util_normalize.py new file mode 100644 index 000000000..a1d486eb6 --- /dev/null +++ b/fuzz/fuzz_util_normalize.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +"""Fuzzing harness for pythainlp.util.normalize() + +This fuzzer tests the normalize function with random Unicode input +to discover edge cases, crashes, and potential security issues. +""" + +import sys + +import atheris + +import pythainlp.util + + +def TestOneInput(data: bytes) -> None: + """Fuzz target for normalize. 
+ + :param bytes data: Random input bytes from the fuzzer + :rtype: None + """ + fdp = atheris.FuzzedDataProvider(data) + + try: + # Generate random Unicode string + text = fdp.ConsumeUnicodeNoSurrogates(fdp.remaining_bytes()) + + # Test normalize + result = pythainlp.util.normalize(text) + + # Validate output type + if not isinstance(result, str): + raise TypeError(f"Expected str, got {type(result)}") + + except (ValueError, TypeError, UnicodeDecodeError): + # Expected exceptions - these are acceptable + pass + + +def main() -> None: + """Entry point for the fuzzer. + + :rtype: None + """ + atheris.Setup(sys.argv, TestOneInput) + atheris.Fuzz() + + +if __name__ == "__main__": + main()