diff --git a/.editorconfig b/.editorconfig index bb53136..70e0c4a 100644 --- a/.editorconfig +++ b/.editorconfig @@ -9,4 +9,7 @@ indent_size = 4 end_of_line = lf charset = utf-8 trim_trailing_whitespace = true -insert_final_newline = true \ No newline at end of file +insert_final_newline = true + +[*.yml] +indent_size = 2 \ No newline at end of file diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..20145a2 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,4 @@ +# This file is used to automatically assign reviewers to PRs +# For more information see: https://help.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners + +* @jacksonpradolima diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..8b55176 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,64 @@ +name: Bug report +description: Report an issue or bug with this library +labels: ['bug'] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! + - type: checkboxes + id: non_api + attributes: + label: Confirm this is an issue with the gsp-py library. + description: Issues relevant to other tools should be directed to their respective repositories. + options: + - label: This is an issue with the gsp-py library + required: true + - type: textarea + id: what-happened + attributes: + label: Describe the bug + description: A clear and concise description of what the bug is, and any additional context. + placeholder: Tell us what you see! + validations: + required: true + - type: textarea + id: repro-steps + attributes: + label: To Reproduce + description: Steps to reproduce the behavior. + placeholder: | + 1. Fetch a '...' + 2. Update the '....' + 3. 
See error + validations: + required: true + - type: textarea + id: code-snippets + attributes: + label: Code snippets + description: If applicable, add code snippets to help explain your problem. + render: Python + validations: + required: false + - type: input + id: os + attributes: + label: OS + placeholder: macOS + validations: + required: true + - type: input + id: language-version + attributes: + label: Python version + placeholder: Python v3.11.4 + validations: + required: true + - type: input + id: lib-version + attributes: + label: Library version + placeholder: gsp-py v1.0.0 + validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..6332d5d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,7 @@ +blank_issues_enabled: false +contact_links: + - name: gsp-py Support + url: https://github.com/jacksonpradolima/gsp-py + about: | + Please only file issues here if they relate to actual bugs or feature requests for the gsp-py project. + For more general discussions or questions, open an issue and select the appropriate template. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..c0cdc02 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,28 @@ +name: Feature request +description: Suggest an idea for this library +labels: ['feature-request'] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this feature request! + - type: checkboxes + id: non_api + attributes: + label: Confirm this is a feature request for the gsp-py library. + description: Feature requests relevant to other tools should be directed to the correct repository. 
+ options: + - label: This is a feature request for the gsp-py library + required: true + - type: textarea + id: feature + attributes: + label: Describe the feature or improvement you're requesting + description: A clear and concise description of what you want to happen. + validations: + required: true + - type: textarea + id: context + attributes: + label: Additional context + description: Add any other context about the feature request here. diff --git a/.github/workflows/code_quality.yml b/.github/workflows/code_quality.yml new file mode 100644 index 0000000..b81ddd9 --- /dev/null +++ b/.github/workflows/code_quality.yml @@ -0,0 +1,51 @@ +name: Code Quality + +on: + pull_request: + types: [ opened, synchronize, reopened, edited, ready_for_review ] + +jobs: + code-quality: + name: Code Quality Checks + runs-on: ubuntu-latest + + steps: + # Step 1: Checkout the repository code + - name: Checkout code + uses: actions/checkout@v4 + + # Step 2: Install Rye (via Curl) + - name: Install Rye + uses: eifinger/setup-rye@v4 + with: + version: '0.43.0' + enable-cache: 'true' + + # Step 3: Sync dependencies + - name: Sync dependencies + run: rye sync + + # Step 4: Get changed Python files + - name: Get Python changed files + id: changed-py-files + uses: tj-actions/changed-files@v45 + with: + files: | + *.py + **/*.py + + # Step 5: Run Ruff for only changed files + - name: Run Ruff (Lint) + if: steps.changed-py-files.outputs.any_changed == 'true' + run: | + echo "Running Ruff on changed files..." + echo "Changed files: ${{ steps.changed-py-files.outputs.all_changed_files }}" + rye run lint ${{ steps.changed-py-files.outputs.all_changed_files }} + + # Step 6: Run Pyright for only changed files + - name: Run Pyright (Type Check) + if: steps.changed-py-files.outputs.any_changed == 'true' + run: | + echo "Running Pyright on changed files..." 
+ echo "Changed files: ${{ steps.changed-py-files.outputs.all_changed_files }}" + rye run typecheck ${{ steps.changed-py-files.outputs.all_changed_files }} diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 77cd7ba..eb3a962 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -19,7 +19,10 @@ jobs: python-version: 3.11 - name: Install dependencies - run: pip install -r requirements-dev.txt + run: | + pip install pytest==8.3.4 \ + pytest-benchmark==5.1.0 \ + pytest-cov==6.0.0 - name: Run tests run: pytest --cov --cov-branch --junitxml=junit.xml -o junit_family=legacy diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 8e17949..119658f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -10,22 +10,25 @@ jobs: runs-on: ubuntu-latest environment: name: pypi - url: https://pypi.org/p/gsppy + url: https://pypi.org/project/gsppy/ permissions: id-token: write steps: - uses: actions/checkout@v4 + - name: Set up Python uses: actions/setup-python@v5 with: python-version: "3.x" - - name: Install dependencies + + - name: Install build dependencies run: | python -m pip install --upgrade pip - pip install setuptools wheel + pip install build + - name: Build package run: | - python setup.py sdist bdist_wheel # Could also be python -m build + python -m build + - name: Publish package distributions to PyPI uses: pypa/gh-action-pypi-publish@v1.12.3 - diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml deleted file mode 100644 index d7a9e24..0000000 --- a/.github/workflows/pylint.yml +++ /dev/null @@ -1,54 +0,0 @@ -name: Pylint - -on: - pull_request: - types: [opened, synchronize, reopened, edited, ready_for_review] - -jobs: - pylint: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - fetch-depth: 0 # Retrieve full commit history for proper diff. 
- - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: 3.11 - - - name: Install dependencies - run: python -m pip install pylint - - - name: Create Pylintrc file - run: | - echo "[MASTER]" > .pylintrc - echo "max-line-length=120" >> .pylintrc - echo "disable=" >> .pylintrc - echo " C0103, # variable naming style" >> .pylintrc - echo " logging-format-interpolation, # prefer % formatting" >> .pylintrc - echo " broad-except, # catch all exceptions" >> .pylintrc - echo " too-many-locals, # relax constraints" >> .pylintrc - echo " too-few-public-methods," >> .pylintrc - echo " too-many-instance-attributes," >> .pylintrc - echo " too-many-arguments," >> .pylintrc - echo " import-error," >> .pylintrc - echo " attribute-defined-outside-init," >> .pylintrc - echo " redefined-outer-name" >> .pylintrc - - - name: Get Python changed files - id: changed-py-files - uses: tj-actions/changed-files@v45 - with: - files: | - *.py - **/*.py - - - name: Run pylint if Python files changed - if: steps.changed-py-files.outputs.any_changed == 'true' - run: | - echo "One or more Python files have changed." - echo "List of changed files: ${{ steps.changed-py-files.outputs.all_changed_files }}" - python -m pylint --rcfile=.pylintrc ${{ steps.changed-py-files.outputs.all_changed_files }} \ No newline at end of file diff --git a/.python-version b/.python-version index c69ac4c..455348b 100644 --- a/.python-version +++ b/.python-version @@ -1,6 +1,6 @@ -3.6.15 -3.7.12 3.8.10 3.9.16 3.10.12 3.11.4 +3.12.8 +3.13.1 diff --git a/CHANGELOG.md b/CHANGELOG.md index e28a0fb..02a5680 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,53 @@ # Changelog +## [v2.2.0] - 2024-12-27 + +### **New Features** +- Added a `.github/CODEOWNERS` file to automatically assign reviewers for pull requests. +- Introduced new issue templates: + - `bug_report.yml` for structured bug reporting. + - `feature_request.yml` for feature suggestions. 
+ - `config.yml` to disable blank issues and provide guidance for issue reporting. + +### **Infrastructure Improvements** +- Added `SECURITY.md` to define the project's security policy, including supported versions and responsible disclosure practices. +- Updated `.python-version` to include support for Python versions `3.12.8` and `3.13.1`. +- **Migrated Dependency and Virtual Environment Management to Rye**: + - Introduced [Rye](https://github.com/mitsuhiko/rye) for managing Python dependencies and virtual environments. + - Deprecated the `requirements-dev.txt` file in favor of managing dependencies in `pyproject.toml`. + - Updated documentation to include instructions for using `rye sync` to set up the project environment. + - Updated CI workflows to install and use dependencies directly via Rye. + +### **CLI Enhancements** +- Improved `gsppy/cli.py`: + - Enhanced logging setup with verbosity options (`--verbose`). + - Added type annotations for improved readability and maintainability. + - Refined error handling and user feedback for invalid inputs. + +### **Algorithm and Utility Enhancements** +- Updated GSP implementation in `gsppy/gsp.py`: + - Added type annotations and clarified logic in methods. + - Improved `_worker_batch` and `_support` functions for better performance and readability. +- Enhanced utility functions in `gsppy/utils.py`: + - Added stricter typing and validation in utilities like `is_subsequence_in_list`. + +### **Testing and Quality** +- Reorganized tests: + - Moved test files from `gsppy/tests/` to `tests/` for a cleaner structure. + - Refactored test cases with type annotations and enhanced mock handling. +- Introduced additional dev dependencies in `requirements-dev.txt`: + - `mypy`, `pyright`, and `ruff` for static analysis and linting. + - `cython` for performance optimization in future releases. +- Added `mypy.ini` and updated `pyproject.toml` for stricter type-checking and configuration consistency. 
+ +### **Build and Packaging** +- Migrated to `pyproject.toml` for modern Python packaging: + - Removed `setup.py` and `setup.cfg`. + - Introduced `hatch` and `hatchling` for streamlined builds. +- Updated GitHub Actions workflows: + - Fixed PyPI URL in `publish.yml`. + - Updated build steps to use `python -m build` for consistency. + ## [v2.1.0] - 2024-12-26 ### **Compatibility Updates** @@ -98,7 +146,28 @@ ## **Summary of Changes** -### From v2.0 to v2.1.0 -- Enhanced compatibility with earlier Python versions (3.8+). -- Improved testing and dependency management. -- Updated workflows and documentation for broader support and clarity. +### From v2.1.0 to v2.2.0 +- **New Features**: + - Added `.github/CODEOWNERS` and issue templates for better pull request and issue management. + - Introduced a `SECURITY.md` file to define project security policies. + +- **Infrastructure Improvements**: + - Added Python 3.12.8 and 3.13.1 compatibility in `.python-version`. + - Migrated to `pyproject.toml` for modern Python packaging, replacing `setup.py` and `setup.cfg`. + - Adopted **Rye** for dependency management and virtual environment setup: + - Deprecated `requirements-dev.txt` in favor of managing all dependencies in `pyproject.toml`. + - Updated workflows and documentation to reflect the use of `rye sync` for environment setup. + +- **CLI Enhancements**: + - Improved logging, error handling, and added type annotations in the CLI. + +- **Algorithm & Utility Enhancements**: + - Refactored the GSP algorithm for better performance and clarity. + - Enhanced utility functions with stricter typing and validation. + +- **Testing & Quality**: + - Reorganized tests into a cleaner structure, added static analysis tools, and improved type-checking configurations. + +- **Build & Packaging**: + - Streamlined the build process with `hatch` and `hatchling`. + - Updated GitHub Actions workflows for improved consistency and functionality. 
diff --git a/CITATION.cff b/CITATION.cff index 48c4f6c..165b460 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -6,7 +6,7 @@ authors: given-names: "Jackson Antonio do" orcid: "https://orcid.org/10.5281/zenodo.3333987" year: 2024 -version: "2.1.0" +version: "2.2.0" doi: "10.5281/zenodo.3333987" url: "https://github.com/jacksonpradolima/gsp-py" repository-code: "https://github.com/jacksonpradolima/gsp-py" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5ce1f16..2be4c5a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -97,28 +97,42 @@ To maintain consistency and code quality, please follow these coding guidelines: To get familiar with the existing code, follow these steps: 1. **Setup Environment**: - - Create a virtual environment using `venv`: + This project uses [Rye](https://github.com/mitsuhiko/rye) for managing dependencies and the virtual environment. Follow these instructions to set it up: + + - Install Rye (if not already installed): + ```bash + curl -sSf https://rye.astral.sh/get | bash + ``` + + Make sure Rye's binary directory is added to your `PATH`: ```bash - python3 -m venv venv - source venv/bin/activate # Activate the virtual environment + export PATH="$HOME/.rye/bin:$PATH" ``` - - Install dependencies: + - Install project dependencies using Rye: ```bash - pip install -r requirements.txt + rye sync ``` + This command reads the dependencies specified in the `pyproject.toml` file and installs them into a local environment managed by Rye. + 2. **Run Tests**: - Use `pytest` to verify the baseline state: + Use Rye to run tests and verify the baseline state: ```bash - pytest + rye run test ``` + The `test` script is defined in the `pyproject.toml` under `[tool.rye.scripts]` and uses `pytest`. + 3. **Explore the Code**: - The main entry point for the GSP algorithm is in the `gsppy` module. The libraries for support counting, candidate generation, and additional utility functions are also there. 
+ The main entry point for the GSP algorithm is in the `gsppy` module. The libraries for support counting, candidate generation, and additional utility functions are also within this module. --- +### Notes: +- No need to create a `venv` or install dependencies manually with `pip`; Rye handles everything based on the `pyproject.toml` file. +- If you’re unfamiliar with Rye, refer to its [documentation](https://github.com/mitsuhiko/rye). + ## Reporting Issues To report a bug or suggest an enhancement, open an issue on GitHub: diff --git a/README.md b/README.md index 48860e7..52a24b6 100644 --- a/README.md +++ b/README.md @@ -93,16 +93,45 @@ pip install gsppy ## 🛠️ Developer Installation -For contributors and developers, GSP-Py provides additional dependencies for development purposes (e.g., testing and -linting). +This project uses [Rye](https://github.com/mitsuhiko/rye) for managing dependencies, running scripts, and setting up the environment. Follow these steps to install and set up Rye for this project: -To install the package along with development dependencies, use: +#### 1. Install Rye +Run the following command to install Rye: ```bash -pip install .[dev] +curl -sSf https://rye.astral.sh/get | bash +``` + +If the `~/.rye/bin` directory is not in your PATH, add the following line to your shell configuration file (e.g., `~/.bashrc`, `~/.zshrc`, etc.): + +```bash +export PATH="$HOME/.rye/bin:$PATH" ``` -The `dev` category includes tools such as `pytest`, `pylint`, and others to ensure code quality and maintainability. +Reload your shell configuration file: + +```bash +source ~/.bashrc # or `source ~/.zshrc` +``` + +#### 2. Set Up the Project Environment +To configure the project environment and install its dependencies, run: + +```bash +rye sync +``` + +#### 3. 
Use Rye Scripts +Once the environment is set up, you can run the following commands to simplify project tasks: + +- Run tests: `rye run test` +- Format code: `rye run format` +- Lint code: `rye run lint` +- Type-check: `rye run typecheck` + +#### Notes +- Rye automatically reads dependencies and scripts from the `pyproject.toml` file. +- No need for `requirements.txt`, as Rye manages all dependencies! ## 💡 Usage diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..58226aa --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,44 @@ +# Security Policy + +## Supported Versions + +The following versions of `gsp-py` are actively supported with security updates and maintenance: + +| Version | Supported | +|---------------|--------------------| +| 2.x.x | ✅ | +| < 2.0.0 | ❌ | + +We strongly recommend upgrading to the latest version of `gsp-py` to ensure the most secure and stable experience. + +--- + +## Reporting Security Issues + +`gsp-py` is maintained by [Jackson Antonio do Prado Lima](https://github.com/jacksonpradolima), who takes security seriously. We encourage you to promptly report any security vulnerabilities to ensure the library remains secure for all users. + +To report a security issue, please contact the `gsp-py` maintainer at **[jacksonpradolima@gmail.com](mailto:jacksonpradolima@gmail.com)**. + +--- + +## Responsible Disclosure + +We greatly appreciate the efforts of security researchers and individuals who responsibly disclose vulnerabilities to our team. If you believe you have found a security vulnerability, we kindly request you follow responsible disclosure practices: + +1. **Report privately**: Please contact us directly and avoid publicly disclosing the issue. +2. **Provide sufficient details**: Include the version of `gsp-py`, a description of the vulnerability, and any applicable steps to reproduce the issue. +3. 
**Allow time to address the issue**: Grant us a reasonable amount of time to investigate and release a fix before public disclosure. + +This helps ensure that vulnerabilities are resolved without unnecessary risk to users of the library. + +--- + +## Reporting Non-Library Security Issues + +If you discover a security issue that is unrelated to `gsp-py` but pertains to external services or products (for example, dependency vulnerabilities), please report these issues directly to the respective organization responsible. + +--- + +## Acknowledgment + +Thank you for helping us maintain the security and reliability of `gsp-py`. Your effort ensures the safety of the library and the systems it interacts with. diff --git a/gsppy/cli.py b/gsppy/cli.py index 4b51eb7..98b84a2 100644 --- a/gsppy/cli.py +++ b/gsppy/cli.py @@ -27,17 +27,37 @@ This CLI empowers users to perform sequential pattern mining on transactional data efficiently through a simple command-line interface. """ -import argparse +import os import csv +import sys import json import logging -import os -from typing import List +import argparse +from typing import Dict, List, Tuple from gsppy.gsp import GSP +# Configure logging +logging.basicConfig( + level=logging.INFO, + format="%(message)s", # Simplified to keep CLI output clean + handlers=[logging.StreamHandler(sys.stdout)], +) +logger = logging.getLogger(__name__) -def read_transactions_from_json(file_path: str) -> List[List]: + +def setup_logging(verbose: bool) -> None: + """ + Set the logging level based on the verbosity of the CLI output. + :param verbose: Whether to enable verbose logging. + """ + if verbose: + logger.setLevel(logging.DEBUG) + else: + logger.setLevel(logging.INFO) + + +def read_transactions_from_json(file_path: str) -> List[List[str]]: """ Read transactions from a JSON file. 
@@ -52,9 +72,7 @@ def read_transactions_from_json(file_path: str) -> List[List]: """ try: with open(file_path, 'r', encoding='utf-8') as f: - transactions = json.load(f) - if not isinstance(transactions, list) or not all(isinstance(t, list) for t in transactions): - raise ValueError("File should contain a JSON array of transaction lists.") + transactions: List[List[str]] = json.load(f) return transactions except Exception as e: msg = f"Error reading transaction data from JSON file '{file_path}': {e}" @@ -62,7 +80,7 @@ def read_transactions_from_json(file_path: str) -> List[List]: raise ValueError(msg) from e -def read_transactions_from_csv(file_path: str) -> List[List]: +def read_transactions_from_csv(file_path: str) -> List[List[str]]: """ Read transactions from a CSV file. @@ -76,7 +94,7 @@ def read_transactions_from_csv(file_path: str) -> List[List]: ValueError: If the file cannot be read or contains invalid data. """ try: - transactions = [] + transactions: List[List[str]] = [] with open(file_path, newline='', encoding='utf-8') as csvfile: reader = csv.reader(csvfile) for row in reader: @@ -92,7 +110,7 @@ def read_transactions_from_csv(file_path: str) -> List[List]: raise ValueError(msg) from e -def detect_and_read_file(file_path: str) -> List[List]: +def detect_and_read_file(file_path: str) -> List[List[str]]: """ Detect file format (CSV or JSON) and read transactions. @@ -120,7 +138,7 @@ def detect_and_read_file(file_path: str) -> List[List]: raise ValueError("Unsupported file format. Please provide a JSON or CSV file.") -def main(): +def main() -> None: """ Main function to handle CLI input and run the GSP algorithm. @@ -150,32 +168,42 @@ def main(): help="Minimum support threshold as a fraction of total transactions (default: 0.2)" ) + # Verbose output argument + parser.add_argument( + '--verbose', + action='store_true', + help='Enable verbose output for debugging purposes.' 
+ ) + # Parse arguments args = parser.parse_args() + # Setup logging verbosity + setup_logging(args.verbose) + # Automatically detect and load transactions try: transactions = detect_and_read_file(args.file) except ValueError as e: - print(f"Error: {e}") + logger.error(f"Error: {e}") return # Check min_support if args.min_support <= 0.0 or args.min_support > 1.0: - print("Error: min_support must be in the range (0.0, 1.0].") + logger.error("Error: min_support must be in the range (0.0, 1.0].") return # Initialize and run GSP algorithm try: gsp = GSP(transactions) - patterns = gsp.search(min_support=args.min_support) - print("Frequent Patterns Found:") + patterns: List[Dict[Tuple[str, ...], int]] = gsp.search(min_support=args.min_support) + logger.info("Frequent Patterns Found:") for i, level in enumerate(patterns, start=1): - print(f"\n{i}-Sequence Patterns:") + logger.info(f"\n{i}-Sequence Patterns:") for pattern, support in level.items(): - print(f"Pattern: {pattern}, Support: {support}") + logger.info(f"Pattern: {pattern}, Support: {support}") except Exception as e: - print(f"Error executing GSP algorithm: {e}") + logger.error(f"Error executing GSP algorithm: {e}") if __name__ == '__main__': diff --git a/gsppy/gsp.py b/gsppy/gsp.py index ec82565..d953f7b 100644 --- a/gsppy/gsp.py +++ b/gsppy/gsp.py @@ -86,9 +86,9 @@ """ import logging import multiprocessing as mp -from collections import Counter +from typing import Any, Dict, List, Tuple from itertools import chain -from typing import List, Dict, Tuple +from collections import Counter from gsppy.utils import split_into_batches, is_subsequence_in_list, generate_candidates_from_previous @@ -114,7 +114,7 @@ class GSP: k-sequence for pattern generation. """ - def __init__(self, raw_transactions: List[List]): + def __init__(self, raw_transactions: List[List[str]]): """ Initialize the GSP algorithm with raw transactional data. 
@@ -132,10 +132,10 @@ def __init__(self, raw_transactions: List[List]): ValueError: If the input transaction dataset is empty, contains fewer than two transactions, or is not properly formatted. """ - self.freq_patterns = [] + self.freq_patterns: List[Dict[Tuple[str, ...], int]] = [] self._pre_processing(raw_transactions) - def _pre_processing(self, raw_transactions: List[List]): + def _pre_processing(self, raw_transactions: List[List[str]]) -> None: """ Validate and preprocess the input transactional dataset. @@ -167,20 +167,19 @@ def _pre_processing(self, raw_transactions: List[List]): logger.error(msg) raise ValueError(msg) - if not all(isinstance(item, list) for item in raw_transactions): - msg = "The dataset must be a list of transactions." - logger.error(msg) - raise ValueError(msg) - logger.info("Pre-processing transactions...") self.max_size = max(len(item) for item in raw_transactions) - self.transactions = [tuple(transaction) for transaction in raw_transactions] - counts = Counter(chain.from_iterable(raw_transactions)) - self.unique_candidates = [(item,) for item in counts.keys()] + self.transactions: List[Tuple[str, ...]] = [tuple(transaction) for transaction in raw_transactions] + counts: Counter[str] = Counter(chain.from_iterable(raw_transactions)) + self.unique_candidates: list[tuple[str, Any]] = [(item,) for item in counts.keys()] logger.debug("Unique candidates: %s", self.unique_candidates) @staticmethod - def _worker_batch(batch: List[Tuple], transactions: List[Tuple], min_support: int) -> List[Tuple[Tuple, int]]: + def _worker_batch( + batch: List[Tuple[str, ...]], + transactions: List[Tuple[str, ...]], + min_support: int + ) -> List[Tuple[Tuple[str, ...], int]]: """ Evaluate a batch of candidate sequences to compute their support. @@ -198,14 +197,17 @@ def _worker_batch(batch: List[Tuple], transactions: List[Tuple], min_support: in - A candidate sequence. - The candidate's support count. 
""" - results = [] + results: List[Tuple[Tuple[str, ...], int]] = [] for item in batch: frequency = sum(1 for t in transactions if is_subsequence_in_list(item, t)) if frequency >= min_support: results.append((item, frequency)) return results - def _support(self, items: List[Tuple], min_support: float = 0, batch_size: int = 100) -> Dict[Tuple, int]: + def _support( + self, + items: List[Tuple[str, ...]], min_support: float = 0, batch_size: int = 100 + ) -> Dict[Tuple[str, ...], int]: """ Calculate support counts for candidate sequences, using parallel processing. @@ -235,7 +237,7 @@ def _support(self, items: List[Tuple], min_support: float = 0, batch_size: int = # Flatten the list of results and convert to a dictionary return {item: freq for batch in batch_results for item, freq in batch} - def _print_status(self, run: int, candidates: List[Tuple]): + def _print_status(self, run: int, candidates: List[Tuple[str, ...]]) -> None: """ Log progress information for the current GSP iteration. @@ -249,7 +251,7 @@ def _print_status(self, run: int, candidates: List[Tuple]): logger.info("Run %d: %d candidates filtered to %d.", run, len(candidates), len(self.freq_patterns[run - 1])) - def search(self, min_support: float = 0.2) -> List[Dict[Tuple, int]]: + def search(self, min_support: float = 0.2) -> List[Dict[Tuple[str, ...], int]]: """ Execute the Generalized Sequential Pattern (GSP) mining algorithm. @@ -263,8 +265,9 @@ def search(self, min_support: float = 0.2) -> List[Dict[Tuple, int]]: appears in at least 30% of all transactions. Returns: - List[Dict[Tuple, int]]: A list where each element corresponds to a k-sequence-level - dictionary, mapping frequent patterns to their support counts. + List[Dict[Tuple[str, ...], int]]: A list of dictionaries containing frequent patterns + at each k-sequence level, with patterns as keys + and their support counts as values. Raises: ValueError: If the minimum support threshold is not in the range `(0.0, 1.0]`. 
diff --git a/gsppy/utils.py b/gsppy/utils.py index 4456f62..ce053c9 100644 --- a/gsppy/utils.py +++ b/gsppy/utils.py @@ -20,34 +20,35 @@ These utilities are designed to support sequence processing tasks and can be adapted to various domains, such as data mining, recommendation systems, and sequence analysis. """ +from typing import Dict, List, Tuple, Sequence, Generator from functools import lru_cache from itertools import product -from typing import List, Tuple, Generator, Dict -def split_into_batches(items: List[Tuple], batch_size: int) -> Generator[List[Tuple], None, None]: +def split_into_batches( + items: Sequence[Tuple[str, ...]], batch_size: int +) -> Generator[Sequence[Tuple[str, ...]], None, None]: """ Split the list of items into smaller batches. Parameters: - items (List[Tuple]): The list of candidate items. + items (Sequence[Tuple]): A sequence of items to be batched. batch_size (int): The maximum size of each batch. Returns: - List[List[Tuple]]: A list of batches, where each batch contains a subset of candidate items. + Generator[Sequence[Tuple], None, None]: A generator yielding batches of items. """ for i in range(0, len(items), batch_size): yield items[i:i + batch_size] -# Cache the results of the slice comparison function to avoid redundant calculations @lru_cache(maxsize=None) -def is_subsequence_in_list(subsequence: Tuple, sequence: Tuple) -> bool: +def is_subsequence_in_list(subsequence: Tuple[str, ...], sequence: Tuple[str, ...]) -> bool: """ Check if a subsequence exists within a sequence as a contiguous subsequence. Parameters: - subsequence: Tuple (tuple): The sequence to search for. + subsequence: (tuple): The sequence to search for. sequence (tuple): The sequence to search within. 
Returns: @@ -67,12 +68,14 @@ def is_subsequence_in_list(subsequence: Tuple, sequence: Tuple) -> bool: return any(sequence[i:i + len_sub] == subsequence for i in range(len_seq - len_sub + 1)) -def generate_candidates_from_previous(prev_patterns: Dict[Tuple, int]) -> List[Tuple]: +def generate_candidates_from_previous( + prev_patterns: Dict[Tuple[str, ...], int] +) -> List[Tuple[str, ...]]: """ Generate joined candidates from the previous level's frequent patterns. Parameters: - prev_patterns (Dict[Tuple, int]): Frequent patterns at the previous level. + prev_patterns (Dict[Tuple, int]): A dictionary of frequent patterns from the previous level. Returns: List[Tuple]: Candidate patterns for the next level. diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..4aea7c7 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,54 @@ +[mypy] +# Ignore errors from the typing module +python_version = 3.9 + +# Output configuration +pretty = True +show_error_codes = True + +# Type-checking strictness configuration +strict_equality = True +implicit_reexport = True +check_untyped_defs = True +no_implicit_optional = True + +# Warnings +warn_return_any = True +warn_unreachable = True +warn_unused_configs = True + +# Turn these options off as it could cause conflicts +# with the Pyright options. +warn_unused_ignores = False +warn_redundant_casts = False + +# Error handling and strict type management +disallow_any_generics = True +disallow_untyped_defs = True +disallow_untyped_calls = True +disallow_subclassing_any = True +disallow_incomplete_defs = True +disallow_untyped_decorators = True + +# Fine-grained caching +cache_fine_grained = True + +# Exclusion of specific files and directories +exclude = ^(tests/.*|examples/.*)$ + +# By default, mypy reports an error if you assign a value to the result +# of a function call that doesn't return anything. We do this in our test +# cases: +# ``` +# result = ... 
+# assert result is None +# ``` +# Changing this codegen to make mypy happy would increase complexity +# and would not be worth it. +disable_error_code = func-returns-value + +# https://github.com/python/mypy/issues/12162 +[mypy.overrides] +module = "black.files.*" +ignore_errors = True +ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..aa1e111 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,166 @@ +[build-system] +requires = ["hatchling", "hatch-fancy-pypi-readme"] +build-backend = "hatchling.build" + +[project] +name = "gsppy" +version = "2.2.0" +description = "GSP (Generalized Sequence Pattern) algorithm in Python" +keywords = ["GSP", "sequential patterns", "data analysis", "sequence mining"] +license = { file = "LICENSE" } +requires-python = ">=3.8" +readme = { file = "README.md", content-type = "text/markdown" } +homepage = "https://github.com/jacksonpradolima/gsp-py" +repository = "https://github.com/jacksonpradolima/gsp-py" +authors = [{ name = "Jackson Antonio do Prado Lima", email = "jacksonpradolima@gmail.com" }] +maintainers = [{ name = "Jackson Antonio do Prado Lima", email = "jacksonpradolima@gmail.com" }] +classifiers = [ + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Operating System :: OS Independent", + "License :: OSI Approved :: MIT License", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Information Analysis", + "Topic :: Software Development :: Libraries :: Python Modules", + "Natural Language :: English", +] + +dependencies = [] + +[project.urls] +Homepage = "https://github.com/jacksonpradolima/gsp-py" + +[project.scripts] +gsppy = "gsppy.cli:main" + +[project.optional-dependencies] +dev = [ + "cython==3.0.11", + "hatch==1.14.0", + 
"hatchling==1.27.0", + "mypy==1.14.0", + "pylint==3.3.3", + "pyright==1.1.391", + "pytest==8.3.4", + "pytest-benchmark==5.1.0", + "pytest-cov==6.0.0", + "ruff==0.8.4", + "tox==4.23.2", +] + +[tool.hatch.build] +include = ["gsppy/*"] + +[tool.hatch.metadata.hooks.fancy-pypi-readme] +content-type = "text/markdown" + +[tool.hatch.build.targets.sdist] +# Basically everything except hidden files/directories (such as .github, .python-version, etc) +include = [ + "/*.toml", + "/*.json", + "/*.md", + "/*.ini", + "bin/*", + "gsppy/*", + "tests/*", +] + +[tool.mypy] +python_version = "3.8" +check_untyped_defs = true +ignore_missing_imports = true + +[tool.ruff] +line-length = 120 +output-format = "grouped" +target-version = "py38" + +[tool.ruff.format] +docstring-code-format = true + +[tool.ruff.lint] +select = [ + # isort + "I", + # bugbear rules + "B", + # mutable defaults + "B006", + # remove unused imports + "F401", + # bare except statements + "E722", + # unused arguments + "ARG", + # print statements + "T201", + "T203", + # misuse of typing.TYPE_CHECKING + "TCH004", + # import rules + "TID251", +] +ignore = [ + # mutable defaults + "B006", +] +unfixable = [ + # disable auto fix for print statements + "T201", + "T203", +] + +[tool.ruff.lint.flake8-tidy-imports.banned-api] +"functools.lru_cache".msg = "This function does not retain type information for the wrapped function's arguments; The `lru_cache` function from `_utils` should be used instead" + +[tool.ruff.lint.isort] +length-sort = true +length-sort-straight = true +combine-as-imports = true +extra-standard-library = ["typing_extensions"] +known-first-party = ["gsp", "tests"] + +[tool.ruff.lint.per-file-ignores] +"tests/**.py" = ["T201", "T203"] +"gsppy/utils.py" = ["TID251"] + +[tool.pyright] +# this enables practically every flag given by pyright. +# there are a couple of flags that are still disabled by +# default in strict mode as they are experimental and niche. 
+typeCheckingMode = "strict" +pythonVersion = "3.8" +exclude = [] +reportImplicitOverride = true +reportImportCycles = false +reportPrivateUsage = false + +[tool.pytest.ini_options] +testpaths = ["tests"] +addopts = "--tb=short -v" +xfail_strict = true +filterwarnings = [ + "error" +] + +[tool.rye] +dev-dependencies = [ + "pyright>=1.1.391", + "ruff>=0.8.4", + "pytest>=8.3.4", + "pytest-benchmark>=4.0.0", + "tox>=4.23.2", + "pylint>=3.2.7", +] + +[tool.rye.scripts] +test = "pytest" +format = "ruff check --fix ." +lint = "ruff check ." +typecheck = "pyright" +tox = "tox -r" diff --git a/requirements-dev.lock b/requirements-dev.lock new file mode 100644 index 0000000..92a587e --- /dev/null +++ b/requirements-dev.lock @@ -0,0 +1,73 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. +astroid==3.2.4 + # via pylint +cachetools==5.5.0 + # via tox +chardet==5.2.0 + # via tox +colorama==0.4.6 + # via tox +dill==0.3.9 + # via pylint +distlib==0.3.9 + # via virtualenv +exceptiongroup==1.2.2 + # via pytest +filelock==3.16.1 + # via tox + # via virtualenv +iniconfig==2.0.0 + # via pytest +isort==5.13.2 + # via pylint +mccabe==0.7.0 + # via pylint +nodeenv==1.9.1 + # via pyright +packaging==24.2 + # via pyproject-api + # via pytest + # via tox +platformdirs==4.3.6 + # via pylint + # via tox + # via virtualenv +pluggy==1.5.0 + # via pytest + # via tox +py-cpuinfo==9.0.0 + # via pytest-benchmark +pylint==3.2.7 +pyproject-api==1.8.0 + # via tox +pyright==1.1.391 +pytest==8.3.4 + # via pytest-benchmark +pytest-benchmark==4.0.0 +ruff==0.8.4 +tomli==2.2.1 + # via pylint + # via pyproject-api + # via pytest + # via tox +tomlkit==0.13.2 + # via pylint +tox==4.23.2 +typing-extensions==4.12.2 + # via astroid + # via pylint + # via pyright + # via tox +virtualenv==20.28.0 + # via tox 
diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index fb78c8b..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,6 +0,0 @@ --e . -pylint==3.3.3 -pytest==8.3.4 -pytest-benchmark==5.1.0 -pytest-cov==6.0.0 -tox==4.23.2 diff --git a/requirements.lock b/requirements.lock new file mode 100644 index 0000000..505fd45 --- /dev/null +++ b/requirements.lock @@ -0,0 +1,12 @@ +# generated by rye +# use `rye lock` or `rye sync` to update this lockfile +# +# last locked with the following flags: +# pre: false +# features: [] +# all-features: false +# with-sources: false +# generate-hashes: false +# universal: false + +-e file:. diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index e69de29..0000000 diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 224a779..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[metadata] -description-file = README.md \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 3ea91c1..0000000 --- a/setup.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Setup configuration file for the GSP (Generalized Sequential Patterns) package. - -This script contains metadata and instructions for building, packaging, and distributing -the GSP Python package. It uses setuptools to define package information, dependencies, -and other packaging requirements. 
-""" -from os.path import abspath, dirname, join - -from setuptools import find_packages, setup - -basedir = abspath(dirname(__file__)) - -with open(join(basedir, 'README.md'), encoding='utf-8') as f: - README = f.read() - -setup( - name='gsppy', - version='2.1.0', - description='GSP (Generalized Sequence Pattern) algorithm in Python', - long_description=README, - long_description_content_type='text/markdown', - author='Jackson Antonio do Prado Lima', - author_email='jacksonpradolima@gmail.com', - maintainer='Jackson Antonio do Prado Lima', - maintainer_email='jacksonpradolima@gmail.com', - license='MIT', - url='https://github.com/jacksonpradolima/gsp-py', - packages=find_packages(exclude=['test_']), - python_requires='>=3.8', - install_requires=[ - # No additional runtime dependencies are required since the project uses standard library modules only. - ], - extras_require={ - 'dev': [ - 'pylint==3.3.3', - 'pytest==8.3.4', - 'pytest-benchmark==5.1.0', - 'pytest-cov==6.0.0', - ], - }, - classifiers=[ - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Operating System :: OS Independent', - 'License :: OSI Approved :: MIT License', - 'Intended Audience :: Science/Research', - 'Topic :: Scientific/Engineering :: Information Analysis', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Natural Language :: English' - ], - keywords='GSP, sequential patterns, data analysis, sequence mining', - entry_points={ - 'console_scripts': [ - 'gsppy = gsppy.cli:main', - ], - }, - tests_require=['pytest'], - test_suite='gsppy.tests', -) diff --git a/gsppy/tests/__init__.py b/tests/__init__.py similarity index 100% rename from gsppy/tests/__init__.py rename to tests/__init__.py diff --git a/gsppy/tests/test_cli.py b/tests/test_cli.py similarity index 65% rename from gsppy/tests/test_cli.py rename to tests/test_cli.py index f6801eb..6d66323 
100644 --- a/gsppy/tests/test_cli.py +++ b/tests/test_cli.py @@ -19,38 +19,23 @@ without affecting the file system. Pytest is utilized for parametrized testing to improve coverage and reduce redundancy in test cases. """ -import json import os -import runpy -import sys +import json +import logging import tempfile +import subprocess +from typing import Any, Generator from unittest.mock import patch import pytest +from pytest import MonkeyPatch -from gsppy.cli import detect_and_read_file, main +from gsppy.cli import main, detect_and_read_file from gsppy.gsp import GSP -def test_invalid_json_structure(): - """ - Test if a JSON file with an invalid structure raises an error. - """ - # Create an invalid JSON structure that does not adhere to the expected format - with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file: - temp_file.write(json.dumps({"invalid": "data"})) - temp_file_name = temp_file.name - - # Attempt to read the invalid JSON file - with pytest.raises(ValueError, match="File should contain a JSON array of transaction lists."): - detect_and_read_file(temp_file_name) - - # Cleanup - os.unlink(temp_file_name) - - @pytest.fixture -def valid_json_file(): +def valid_json_file() -> Generator[Any, Any, Any]: """Fixture to create a valid JSON file.""" with tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode="w") as temp_file: json.dump([["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]], temp_file) @@ -60,7 +45,7 @@ def valid_json_file(): @pytest.fixture -def valid_csv_file(): +def valid_csv_file() -> Generator[Any, Any, Any]: """Fixture to create a valid CSV file.""" with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file: temp_file.write(b"Bread,Milk\nMilk,Diaper\nBread,Diaper,Beer\n") @@ -70,7 +55,7 @@ def valid_csv_file(): @pytest.fixture -def invalid_json_file(): +def invalid_json_file() -> Generator[Any, Any, Any]: """Fixture to create an invalid JSON file.""" with 
tempfile.NamedTemporaryFile(delete=False, suffix=".json") as temp_file: temp_file.write(b"{invalid_json: true") # Malformed JSON @@ -80,7 +65,7 @@ def invalid_json_file(): @pytest.fixture -def invalid_csv_file(): +def invalid_csv_file() -> Generator[Any, Any, Any]: """Fixture to create an invalid CSV file.""" with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file: temp_file.write(b",,\nBread,,Milk\n") # Broken format @@ -90,7 +75,7 @@ def invalid_csv_file(): @pytest.fixture -def unsupported_file(): +def unsupported_file() -> Generator[Any, Any, Any]: """Fixture to create an unsupported file.""" with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file: temp_file.write(b"This is a plain text file.") @@ -99,34 +84,34 @@ def unsupported_file(): os.unlink(temp_file_name) -def test_valid_json_file(valid_json_file): +def test_valid_json_file(valid_json_file: Generator[Any, Any, Any]): """Test if a valid JSON file is correctly read.""" - transactions = detect_and_read_file(valid_json_file) + transactions = detect_and_read_file(str(valid_json_file)) assert transactions == [["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]] -def test_valid_csv_file(valid_csv_file): +def test_valid_csv_file(valid_csv_file: Generator[Any, Any, Any]): """Test if a valid CSV file is correctly read.""" - transactions = detect_and_read_file(valid_csv_file) + transactions = detect_and_read_file(str(valid_csv_file)) assert transactions == [["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]] -def test_invalid_json_file(invalid_json_file): +def test_invalid_json_file(invalid_json_file: Generator[Any, Any, Any]): """Test if an invalid JSON file raises an error.""" with pytest.raises(ValueError, match="Error reading transaction data from JSON file"): - detect_and_read_file(invalid_json_file) + detect_and_read_file(str(invalid_json_file)) -def test_invalid_csv_file(invalid_csv_file): +def test_invalid_csv_file(invalid_csv_file: 
Generator[Any, Any, Any]): """Test if an invalid CSV file raises an error.""" with pytest.raises(ValueError, match="Error reading transaction data from CSV file"): - detect_and_read_file(invalid_csv_file) + detect_and_read_file(str(invalid_csv_file)) -def test_unsupported_file_format(unsupported_file): +def test_unsupported_file_format(unsupported_file: Generator[Any, Any, Any]): """Test if an unsupported file format raises an error.""" with pytest.raises(ValueError, match="Unsupported file format"): - detect_and_read_file(unsupported_file) + detect_and_read_file(str(unsupported_file)) def test_non_existent_file(): @@ -136,7 +121,7 @@ def test_non_existent_file(): @pytest.mark.parametrize("min_support", [-0.1, 1.1]) -def test_invalid_min_support_gsp(min_support): +def test_invalid_min_support_gsp(min_support: float): """Test if invalid min_support values raise an error.""" transactions = [["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]] gsp = GSP(transactions) @@ -145,7 +130,7 @@ def test_invalid_min_support_gsp(min_support): @pytest.mark.parametrize("min_support", [0.5]) -def test_valid_min_support_gsp(min_support): +def test_valid_min_support_gsp(min_support: float): """Test if valid min_support values work with the GSP algorithm.""" transactions = [["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]] gsp = GSP(transactions) @@ -154,7 +139,7 @@ def test_valid_min_support_gsp(min_support): assert patterns[0] # Ensure frequent patterns are not empty -def test_main_invalid_json_file(monkeypatch, capfd): +def test_main_invalid_json_file(monkeypatch: MonkeyPatch): """ Test `main()` with a JSON file that has an invalid structure. 
""" @@ -168,17 +153,20 @@ def test_main_invalid_json_file(monkeypatch, capfd): 'sys.argv', ['main', '--file', temp_file_name, '--min_support', '0.2'] ) - main() + # Mock logger.error and test messages directly + with patch("gsppy.cli.logger.error") as mock_error: + main() - # Capture output - captured = capfd.readouterr() - assert "File should contain a JSON array of transaction lists." in captured.out + # Assert correct error message was logged + mock_error.assert_called_with( + "Error executing GSP algorithm: GSP requires multiple transactions to find meaningful patterns." + ) # Cleanup os.unlink(temp_file_name) -def test_main_non_existent_file(monkeypatch, capfd): +def test_main_non_existent_file(monkeypatch: MonkeyPatch): """ Test `main()` with a file that does not exist. """ @@ -187,14 +175,12 @@ def test_main_non_existent_file(monkeypatch, capfd): 'sys.argv', ['main', '--file', 'non_existent.json', '--min_support', '0.2'] ) - main() - - # Capture output - captured = capfd.readouterr() - assert "File 'non_existent.json' does not exist." in captured.out + with patch("gsppy.cli.logger.error") as mock_error: + main() + mock_error.assert_called_with("Error: File 'non_existent.json' does not exist.") -def test_main_valid_json_file(monkeypatch, capfd): +def test_main_valid_json_file(monkeypatch: MonkeyPatch): """ Test `main()` with a valid JSON file. """ @@ -208,17 +194,15 @@ def test_main_valid_json_file(monkeypatch, capfd): 'sys.argv', ['main', '--file', temp_file_name, '--min_support', '0.2'] ) - main() - - # Capture output - captured = capfd.readouterr() - assert "Frequent Patterns Found:" in captured.out + with patch("gsppy.cli.logger.info") as mock_info: + main() + mock_info.assert_any_call("Frequent Patterns Found:") # Check for expected log message # Cleanup os.unlink(temp_file_name) -def test_main_invalid_min_support(monkeypatch, capfd): +def test_main_invalid_min_support(monkeypatch: MonkeyPatch): """ Test `main()` with an invalid `min_support` value. 
""" @@ -232,17 +216,15 @@ def test_main_invalid_min_support(monkeypatch, capfd): 'sys.argv', ['main', '--file', temp_file_name, '--min_support', '-1.0'] # Invalid min_support ) - main() - - # Capture output - captured = capfd.readouterr() - assert "Error: min_support must be in the range (0.0, 1.0]." in captured.out + with patch("gsppy.cli.logger.error") as mock_error: + main() + mock_error.assert_called_with("Error: min_support must be in the range (0.0, 1.0].") # Cleanup os.unlink(temp_file_name) -def test_main_entry_point(monkeypatch, capfd): +def test_main_entry_point(): """ Test the script entry point (`if __name__ == '__main__': main()`). """ @@ -251,27 +233,28 @@ def test_main_entry_point(monkeypatch, capfd): json.dump([["Bread", "Milk"], ["Milk", "Diaper"], ["Bread", "Diaper", "Beer"]], temp_file) temp_file_name = temp_file.name - # Mock CLI arguments - Simulating script call - monkeypatch.setattr( - 'sys.argv', ['gsppy.cli', '--file', temp_file_name, '--min_support', '0.2'] - ) + # Get the CLI script path + cli_script = os.path.abspath(os.path.join(os.path.dirname(__file__), '../gsppy/cli.py')) + + # Set up the environment with the correct PYTHONPATH + env = os.environ.copy() + env['PYTHONPATH'] = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) # Add project root to PYTHONPATH - # Remove the module from sys.modules before running it - if 'gsppy.cli' in sys.modules: - del sys.modules['gsppy.cli'] + # Construct the command to run the script + cmd = [os.environ.get('PYTHON', 'python'), cli_script, '--file', temp_file_name, '--min_support', '0.2'] - # Use `runpy` to execute the script as if it were run from the command line - runpy.run_module('gsppy.cli', run_name='__main__') + # Run the script using subprocess + process = subprocess.run(cmd, text=True, capture_output=True, env=env) - # Capture the output - captured = capfd.readouterr() - assert "Frequent Patterns Found:" in captured.out + # Assert that the output contains the expected message + 
assert process.returncode == 0 + assert "Frequent Patterns Found:" in process.stdout # Cleanup os.unlink(temp_file_name) -def test_main_edge_case_min_support(monkeypatch, capfd): +def test_main_edge_case_min_support(monkeypatch: MonkeyPatch): """ Test `main()` with edge-case values for `min_support` (valid and invalid). """ @@ -284,23 +267,23 @@ def test_main_edge_case_min_support(monkeypatch, capfd): monkeypatch.setattr( 'sys.argv', ['main', '--file', temp_file_name, '--min_support', '1.0'] ) - main() - captured = capfd.readouterr() - assert "Frequent Patterns Found:" in captured.out + with patch("gsppy.cli.logger.info") as mock_info: + main() + mock_info.assert_any_call("Frequent Patterns Found:") # Case 2: `min_support` = -1.0 (Invalid Edge Case) monkeypatch.setattr( 'sys.argv', ['main', '--file', temp_file_name, '--min_support', '-1.0'] ) - main() - captured = capfd.readouterr() - assert "Error: min_support must be in the range (0.0, 1.0]." in captured.out + with patch("gsppy.cli.logger.error") as mock_error: + main() + mock_error.assert_called_with("Error: min_support must be in the range (0.0, 1.0].") # Cleanup os.unlink(temp_file_name) -def test_main_gsp_exception(monkeypatch, capfd): +def test_main_gsp_exception(monkeypatch: MonkeyPatch): """ Test `main()` when the GSP algorithm raises an exception. 
""" @@ -315,12 +298,30 @@ def test_main_gsp_exception(monkeypatch, capfd): ) # Step 3: Mock GSP.search to raise an exception - with patch('gsppy.gsp.GSP.search', side_effect=Exception("Simulated GSP failure")): + with patch('gsppy.gsp.GSP.search', side_effect=Exception("Simulated GSP failure")), \ + patch("gsppy.cli.logger.error") as mock_error: main() - # Step 4: Capture output and assert the error message - captured = capfd.readouterr() - assert "Error executing GSP algorithm: Simulated GSP failure" in captured.out + # Step 4: Assert the error message was logged + mock_error.assert_called_with("Error executing GSP algorithm: Simulated GSP failure") # Step 5: Cleanup os.unlink(temp_file_name) + + +def test_setup_logging_verbose(monkeypatch: MonkeyPatch): + """ + Test `setup_logging` sets logging level to DEBUG when `--verbose` is provided. + """ + # Mock CLI arguments to include the verbose flag + monkeypatch.setattr( + 'sys.argv', ['main', '--file', 'test_data.json', '--min_support', '0.2', '--verbose'] + ) + + with patch('gsppy.cli.logger.setLevel') as mock_setLevel: + with patch('gsppy.cli.detect_and_read_file', return_value=[["Bread", "Milk"]]): # Mock file reading + with patch('gsppy.cli.GSP.search', return_value=[{("Bread",): 1}]): # Mock GSP search + main() # Run the CLI + + # Check that the logger level was set to DEBUG + mock_setLevel.assert_called_with(logging.DEBUG) diff --git a/gsppy/tests/test_gsp.py b/tests/test_gsp.py similarity index 86% rename from gsppy/tests/test_gsp.py rename to tests/test_gsp.py index d621dc3..d9e5f2b 100644 --- a/gsppy/tests/test_gsp.py +++ b/tests/test_gsp.py @@ -17,16 +17,18 @@ Author: Jackson Antonio do Prado Lima Email: jacksonpradolima@gmail.com """ -import random import re +import random +from typing import List import pytest +from pytest_benchmark.fixture import BenchmarkFixture # type: ignore from gsppy.gsp import GSP @pytest.fixture -def supermarket_transactions(): +def supermarket_transactions() -> List[List[str]]: 
""" Fixture to provide a dataset representing supermarket transactions. @@ -43,7 +45,7 @@ def supermarket_transactions(): @pytest.fixture -def random_transactions(): +def random_transactions() -> List[List[str]]: """ Fixture to generate a random dataset of transactions. @@ -53,19 +55,19 @@ def random_transactions(): return [[random.choice(['A', 'B', 'C', 'D', 'E']) for _ in range(random.randint(2, 10))] for _ in range(100)] -def test_empty_transactions(): +def test_empty_transactions() -> None: """ Test the GSP algorithm with an empty dataset. Asserts: - A ValueError is raised indicating that the dataset is empty. """ - transactions = [] + transactions: List[List[str]] = [] with pytest.raises(ValueError, match="Input transactions are empty"): GSP(transactions) -def test_single_transaction(): +def test_single_transaction() -> None: """ Test the GSP algorithm with a single transaction. @@ -77,18 +79,6 @@ def test_single_transaction(): GSP(transactions) -def test_invalid_transaction_format(): - """ - Test the GSP algorithm with invalid transaction formats. - - Asserts: - - A ValueError is raised indicating that the transactions must be lists of lists. - """ - invalid_data = ["A", "B"] # Invalid format: not a list of lists - with pytest.raises(ValueError, match="The dataset must be a list of transactions."): - GSP(invalid_data) - - @pytest.mark.parametrize( "min_support, expected_error", [ @@ -97,7 +87,8 @@ def test_invalid_transaction_format(): (1.1, re.escape("Minimum support must be in the range (0.0, 1.0]")), ] ) -def test_invalid_min_support(supermarket_transactions, min_support, expected_error): +def test_invalid_min_support(supermarket_transactions: List[List[str]], min_support: float, + expected_error: str) -> None: """ Test the GSP algorithm with invalid minimum support values. 
@@ -109,7 +100,7 @@ def test_invalid_min_support(supermarket_transactions, min_support, expected_err gsp.search(min_support=min_support) -def test_valid_min_support_edge(supermarket_transactions): +def test_valid_min_support_edge(supermarket_transactions: List[List[str]]) -> None: """ Test the GSP algorithm with a valid edge value for min_support. @@ -121,7 +112,7 @@ def test_valid_min_support_edge(supermarket_transactions): assert not result, "Expected no frequent patterns with min_support = 1.0" -def test_min_support_valid(supermarket_transactions): +def test_min_support_valid(supermarket_transactions: List[List[str]]) -> None: """ Test the GSP algorithm with a minimum support set just above 0.0. @@ -138,7 +129,7 @@ def test_min_support_valid(supermarket_transactions): assert result_level_1 == level_1_patterns, f"Level 1 patterns mismatch. Got {result_level_1}" -def test_no_frequent_items(supermarket_transactions): +def test_no_frequent_items(supermarket_transactions: List[List[str]]) -> None: """ Test the GSP algorithm with a high minimum support value. @@ -150,7 +141,7 @@ def test_no_frequent_items(supermarket_transactions): assert not result, "High minimum support should filter out all items." -def test_worker_batch_static_method(supermarket_transactions): +def test_worker_batch_static_method(supermarket_transactions: List[List[str]]) -> None: """ Test the _worker_batch method directly for checkpoint validation. 
@@ -165,11 +156,11 @@ def test_worker_batch_static_method(supermarket_transactions): # Call the '_worker_batch' method # This test accesses `_worker_batch` to test internal functionality - results = GSP._worker_batch(batch, transactions, min_support) # pylint: disable=protected-access + results = GSP._worker_batch(batch, transactions, min_support) # pylint: disable=protected-access assert results == expected, f"Expected results {expected}, but got {results}" -def test_frequent_patterns(supermarket_transactions): +def test_frequent_patterns(supermarket_transactions: List[List[str]]) -> None: """ Test the GSP algorithm with supermarket transactions and a realistic minimum support. @@ -186,7 +177,7 @@ def test_frequent_patterns(supermarket_transactions): assert result == expected, "Frequent patterns do not match expected results." -def test_random_transactions(random_transactions): +def test_random_transactions(random_transactions: List[List[str]]) -> None: """ Test the GSP algorithm with a random dataset. @@ -198,7 +189,7 @@ def test_random_transactions(random_transactions): assert len(result) > 0, "Random transactions should yield some frequent patterns with low min_support." -def test_large_transactions(): +def test_large_transactions() -> None: """ Test the GSP algorithm with a large single transaction. @@ -210,7 +201,7 @@ def test_large_transactions(): GSP(transactions) -def test_partial_match(supermarket_transactions): +def test_partial_match(supermarket_transactions: List[List[str]]) -> None: """ Test the GSP algorithm with additional partial matches. @@ -239,7 +230,7 @@ def test_partial_match(supermarket_transactions): @pytest.mark.parametrize("min_support", [0.1, 0.2, 0.3, 0.4, 0.5]) -def test_benchmark(benchmark, supermarket_transactions, min_support): +def test_benchmark(benchmark: BenchmarkFixture, supermarket_transactions: List[List[str]], min_support: float) -> None: """ Benchmark the GSP algorithm's performance using the supermarket dataset. 
diff --git a/gsppy/tests/test_utils.py b/tests/test_utils.py similarity index 79% rename from gsppy/tests/test_utils.py rename to tests/test_utils.py index 10aeef1..4d16bbc 100644 --- a/gsppy/tests/test_utils.py +++ b/tests/test_utils.py @@ -8,6 +8,8 @@ Each function is tested for standard cases, edge cases, and error handling to ensure robustness. """ +from typing import Dict, List, Tuple + from gsppy.utils import split_into_batches, is_subsequence_in_list, generate_candidates_from_previous @@ -16,10 +18,10 @@ def test_split_into_batches(): Test the `split_into_batches` utility function. """ # Test with exact batches - items = [(1,), (2,), (3,), (4,), (5,)] + items = [("1",), ("2",), ("3",), ("4",), ("5",)] batch_size = 2 result = list(split_into_batches(items, batch_size)) - assert result == [[(1,), (2,)], [(3,), (4,)], [(5,)]], "Failed exact batch split" + assert result == [[("1",), ("2",)], [("3",), ("4",)], [("5",)]], "Failed exact batch split" # Test with a batch size greater than the number of items batch_size = 10 @@ -29,10 +31,10 @@ def test_split_into_batches(): # Test with batch size of 1 batch_size = 1 result = list(split_into_batches(items, batch_size)) - assert result == [[(1,)], [(2,)], [(3,)], [(4,)], [(5,)]], "Failed batch size of 1" + assert result == [[("1",)], [("2",)], [("3",)], [("4",)], [("5",)]], "Failed batch size of 1" # Test empty input - items = [] + items: List[Tuple[str]] = [] batch_size = 3 result = list(split_into_batches(items, batch_size)) assert not result, "Failed empty input" @@ -64,28 +66,28 @@ def test_generate_candidates_from_previous(): """ # Test if candidates are generated correctly prev_patterns = { - (1, 2): 3, - (2, 3): 4, - (3, 4): 5, - (1, 3): 2 # Non-joinable with others as a k-1 match + ("1", "2"): 3, + ("2", "3"): 4, + ("3", "4"): 5, + ("1", "3"): 2 # Non-joinable with others as a k-1 match } result = set(generate_candidates_from_previous(prev_patterns)) - # Expected candidates: joining (1, 2) with (2, 3) and (2, 
3) with (3, 4) - expected = {(1, 2, 3), (2, 3, 4)} + # Expected candidates: joining ("1", "2") with ("2", "3") and ("2", "3") with ("3", "4") + expected = {("1", "2", "3"), ("2", "3", "4")} assert expected.issubset(result), f"Missing expected candidates. Got {result}, expected at least {expected}" # Test with no joinable patterns prev_patterns = { - (1,): 3, - (2,): 4 + ("1",): 3, + ("2",): 4 } result = set(generate_candidates_from_previous(prev_patterns)) # For single-element disjoint patterns, candidates may still be generated but GSP will filter later - assert result == {(1, 2), (2, 1)}, f"Unexpected disjoint candidates. Got {result}" + assert result == {("1", "2"), ("2", "1")}, f"Unexpected disjoint candidates. Got {result}" # Test with empty patterns - prev_patterns = {} + prev_patterns: Dict[Tuple[str, ...], int] = {} result = set(generate_candidates_from_previous(prev_patterns)) assert result == set(), f"Failed empty input handling. Got {result}" diff --git a/tox.ini b/tox.ini index 77fe128..fbfd058 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py38, py39, py310, py311 +envlist = py38, py39, py310, py311, py312, py313 [testenv] deps =