diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index cd4600b..a13e59b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -33,7 +33,22 @@ jobs: run: mypy src - name: Run tests - run: pytest --cov=src/datafold --cov-report=xml + run: pytest --cov=src/datafold --cov-report=xml --cov-fail-under=80 + + - name: Check documentation consistency + run: python scripts/check-docs.py + + - name: Validate CHANGELOG + run: | + if ! grep -q "^## \[" CHANGELOG.md; then + echo "CHANGELOG.md must contain version headers" + exit 1 + fi + if ! grep -q "^## \[Unreleased\]" CHANGELOG.md; then + echo "CHANGELOG.md must contain [Unreleased] section" + exit 1 + fi + echo "CHANGELOG.md is valid" - name: Upload coverage uses: codecov/codecov-action@v4 diff --git a/CHANGELOG.md b/CHANGELOG.md index 88e4bf2..d5df80c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ All notable changes to DataFold Agent will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
-## [0.1.0] - 2024-12-18 +## [0.1.0] - 2025-01-15 ### Added @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - SQL connector supporting PostgreSQL, MySQL, ClickHouse, and SQLite - Freshness detection with configurable `max_age_hours` and baseline factor - Volume detection with `min_row_count` and statistical deviation thresholds + - Schema drift detection for column additions, removals, and type changes - Behavioral baseline learning from historical snapshots - SQLite state storage with migrations and retention policies @@ -61,7 +62,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Planned for 0.2.0 -- Schema drift detection - Prometheus metrics endpoint - PostgreSQL storage backend (for multi-agent setups) - BigQuery connector diff --git a/README.md b/README.md index 63a3644..627b9fa 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![CI](https://github.com/datafold/agent/actions/workflows/ci.yaml/badge.svg)](https://github.com/datafold/agent/actions/workflows/ci.yaml) [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/) -[![License: AGPL-3.0](https://img.shields.io/badge/License-AGPL--3.0-blue.svg)](https://opensource.org/licenses/MIT) +[![License: AGPL-3.0](https://img.shields.io/badge/License-AGPL--3.0-blue.svg)](https://www.gnu.org/licenses/agpl-3.0) ## Why DataFold? diff --git a/docs/architecture.md b/docs/architecture.md index 1f4d448..0427e25 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -46,8 +46,8 @@ Internal design and component overview. 
│ │ │ (SQLite) │ │ - postgres │ │ │ │ - mysql │ │ - snapshots │ - │ - clickhouse │ │ - alert_states │ - │ - sqlite │ │ - delivery_log │ + │ - clickhouse │ │ - alert_state │ + │ - sqlite │ │ - deliveries │ └────────┬────────┘ └────────┬────────┘ │ │ │ ┌────────────────────┘ @@ -270,30 +270,30 @@ CREATE TABLE snapshots ( ); -- Alert state (per source + target) -CREATE TABLE alert_states ( - id INTEGER PRIMARY KEY, +CREATE TABLE alert_state ( source_name TEXT NOT NULL, target_name TEXT NOT NULL, - notified_status TEXT, - notified_reason_hash TEXT, - last_change_at TEXT, + notified_status TEXT NOT NULL, + notified_reason_hash TEXT NOT NULL, + last_change_at TEXT NOT NULL, last_sent_at TEXT, cooldown_until TEXT, - UNIQUE(source_name, target_name) + PRIMARY KEY (source_name, target_name) ); -- Delivery log -CREATE TABLE delivery_log ( - id INTEGER PRIMARY KEY, +CREATE TABLE deliveries ( + id INTEGER PRIMARY KEY AUTOINCREMENT, source_name TEXT NOT NULL, target_name TEXT NOT NULL, event_type TEXT NOT NULL, - payload_hash TEXT, - delivered_at TEXT NOT NULL, + payload_hash TEXT NOT NULL, + sent_at TEXT NOT NULL, success INTEGER NOT NULL, status_code INTEGER, latency_ms INTEGER, - error_message TEXT + error_message TEXT, + attempts INTEGER NOT NULL DEFAULT 1 ); -- Schema metadata diff --git a/docs/configuration.md b/docs/configuration.md index 453af14..1bdafdb 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -8,7 +8,8 @@ DataFold looks for configuration in this order: 1. `--config` CLI argument 2. `./datafold.yaml` 3. `./datafold.yml` -4. `~/.datafold/config.yaml` +4. `~/.config/datafold/datafold.yaml` +5. 
#!/usr/bin/env python3
"""Check documentation consistency with code.

Every ``check_*`` function inspects a few repository files and returns a list
of human-readable problem descriptions; an empty list means the check passed.
Files that do not exist are skipped rather than reported, so the script can
run against partial checkouts.

All files are read as UTF-8 regardless of the platform's locale encoding,
because the checked documents contain non-ASCII text (e.g. box-drawing
characters in ``docs/architecture.md``).
"""

from __future__ import annotations

import re
import sys
from pathlib import Path

# Default repository root: the parent of the ``scripts/`` directory.
REPO_ROOT = Path(__file__).resolve().parent.parent

# Stale table names that must not appear in the docs, mapped to the current name.
_RENAMED_TABLES = {
    "alert_states": "alert_state",
    "delivery_log": "deliveries",
}

# Anchored to the start of a line so that keys such as ``target-version`` or
# ``python_version`` are not mistaken for the project version.
_PROJECT_VERSION_RE = re.compile(
    r"""^[ \t]*version\s*=\s*["']([^"']+)["']""", re.MULTILINE
)
_INIT_VERSION_RE = re.compile(
    r"""^[ \t]*__version__\s*=\s*["']([^"']+)["']""", re.MULTILINE
)


def _resolve_root(root: Path | None) -> Path:
    """Return *root* as a ``Path``, falling back to ``REPO_ROOT`` when ``None``."""
    return REPO_ROOT if root is None else Path(root)


def _read_text(path: Path) -> str | None:
    """Return the UTF-8 text of *path*, or ``None`` when it is not a regular file."""
    if not path.is_file():
        return None
    return path.read_text(encoding="utf-8")


def check_license_consistency(root: Path | None = None) -> list[str]:
    """Check that no file still advertises the MIT license.

    Args:
        root: Repository root to inspect; defaults to ``REPO_ROOT``.

    Returns:
        One message per leftover MIT reference in ``pyproject.toml`` or
        ``README.md``, plus one if ``LICENSE`` lacks the AGPL title line.
    """
    base = _resolve_root(root)
    errors: list[str] = []

    pyproject = _read_text(base / "pyproject.toml")
    if pyproject is not None:
        if 'license = "MIT"' in pyproject:
            errors.append("pyproject.toml: License should be AGPL-3.0, not MIT")
        if "MIT License" in pyproject:
            errors.append("pyproject.toml: Classifier should use AGPL, not MIT")

    readme = _read_text(base / "README.md")
    if readme is not None and "opensource.org/licenses/MIT" in readme:
        errors.append("README.md: License link should point to AGPL, not MIT")

    license_text = _read_text(base / "LICENSE")
    if (
        license_text is not None
        and "GNU AFFERO GENERAL PUBLIC LICENSE" not in license_text
    ):
        errors.append("LICENSE: Should contain AGPL-3.0 license text")

    return errors


def check_table_names_in_docs(root: Path | None = None) -> list[str]:
    """Check that ``docs/architecture.md`` does not use stale table names.

    The check is skipped (returns ``[]``) when
    ``src/datafold/storage/sqlite.py`` is absent.

    Args:
        root: Repository root to inspect; defaults to ``REPO_ROOT``.

    Returns:
        One message per stale name found, in ``_RENAMED_TABLES`` order.
    """
    base = _resolve_root(root)

    if not (base / "src" / "datafold" / "storage" / "sqlite.py").exists():
        return []

    content = _read_text(base / "docs" / "architecture.md")
    if content is None:
        return []

    return [
        f"docs/architecture.md: Use '{correct}' instead of '{wrong}'"
        for wrong, correct in _RENAMED_TABLES.items()
        if wrong in content
    ]


def check_config_paths(root: Path | None = None) -> list[str]:
    """Check that ``docs/configuration.md`` does not list the old config path.

    The check is skipped (returns ``[]``) when ``src/datafold/config.py`` is
    absent.

    Args:
        root: Repository root to inspect; defaults to ``REPO_ROOT``.

    Returns:
        A single message when ``~/.datafold/config.yaml`` is still mentioned.
    """
    base = _resolve_root(root)

    if not (base / "src" / "datafold" / "config.py").exists():
        return []

    content = _read_text(base / "docs" / "configuration.md")
    if content is not None and "~/.datafold/config.yaml" in content:
        return [
            "docs/configuration.md: Use '~/.config/datafold/datafold.yaml' "
            "instead of '~/.datafold/config.yaml'"
        ]
    return []


def check_version_consistency(root: Path | None = None) -> list[str]:
    """Check that ``__version__`` agrees with the version in ``pyproject.toml``.

    A missing ``__init__.py``, or one without a literal ``__version__``
    assignment, is not reported.

    Args:
        root: Repository root to inspect; defaults to ``REPO_ROOT``.

    Returns:
        A message when ``pyproject.toml`` has no ``version`` key, or when the
        package's ``__version__`` differs from it.
    """
    base = _resolve_root(root)

    pyproject = _read_text(base / "pyproject.toml")
    if pyproject is None:
        return []

    project_match = _PROJECT_VERSION_RE.search(pyproject)
    if project_match is None:
        return ["pyproject.toml: Could not find version"]
    expected_version = project_match.group(1)

    init_source = _read_text(base / "src" / "datafold" / "__init__.py")
    if init_source is None:
        return []

    init_match = _INIT_VERSION_RE.search(init_source)
    if init_match is not None and init_match.group(1) != expected_version:
        return [
            f"src/datafold/__init__.py: Version '{init_match.group(1)}' "
            f"doesn't match pyproject.toml '{expected_version}'"
        ]
    return []


def main(root: Path | None = None) -> int:
    """Run all documentation checks and print a report.

    Args:
        root: Repository root to inspect; defaults to ``REPO_ROOT``.

    Returns:
        ``0`` when every check passes, ``1`` when any check reports a problem.
    """
    all_errors: list[str] = []

    print("Checking documentation consistency...")
    print()

    checks = [
        ("License consistency", check_license_consistency),
        ("Table names in docs", check_table_names_in_docs),
        ("Config paths in docs", check_config_paths),
        ("Version consistency", check_version_consistency),
    ]

    for name, check_fn in checks:
        print(f"  Checking {name}...")
        errors = check_fn(root)
        if errors:
            for error in errors:
                print(f"    ❌ {error}")
            all_errors.extend(errors)
        else:
            print("    ✓ OK")

    print()

    if all_errors:
        print(f"Found {len(all_errors)} issue(s)")
        return 1

    print("All documentation checks passed!")
    return 0


if __name__ == "__main__":
    sys.exit(main())