Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,22 @@ jobs:
run: mypy src

- name: Run tests
run: pytest --cov=src/datafold --cov-report=xml
run: pytest --cov=src/datafold --cov-report=xml --cov-fail-under=80

- name: Check documentation consistency
run: python scripts/check-docs.py

- name: Validate CHANGELOG
run: |
if ! grep -q "^## \[" CHANGELOG.md; then
echo "CHANGELOG.md must contain version headers"
exit 1
fi
if ! grep -q "^## \[Unreleased\]" CHANGELOG.md; then
echo "CHANGELOG.md must contain [Unreleased] section"
exit 1
fi
echo "CHANGELOG.md is valid"

- name: Upload coverage
uses: codecov/codecov-action@v4
Expand Down
4 changes: 2 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@ All notable changes to DataFold Agent will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.0] - 2024-12-18
## [0.1.0] - 2025-01-15

### Added

- **Core Functionality**
- SQL connector supporting PostgreSQL, MySQL, ClickHouse, and SQLite
- Freshness detection with configurable `max_age_hours` and baseline factor
- Volume detection with `min_row_count` and statistical deviation thresholds
- Schema drift detection for column additions, removals, and type changes
- Behavioral baseline learning from historical snapshots
- SQLite state storage with migrations and retention policies

Expand Down Expand Up @@ -61,7 +62,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Planned for 0.2.0

- Schema drift detection
- Prometheus metrics endpoint
- PostgreSQL storage backend (for multi-agent setups)
- BigQuery connector
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

[![CI](https://github.com/datafold/agent/actions/workflows/ci.yaml/badge.svg)](https://github.com/datafold/agent/actions/workflows/ci.yaml)
[![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
[![License: AGPL-3.0](https://img.shields.io/badge/License-AGPL--3.0-blue.svg)](https://opensource.org/licenses/MIT)
[![License: AGPL-3.0](https://img.shields.io/badge/License-AGPL--3.0-blue.svg)](https://www.gnu.org/licenses/agpl-3.0)

## Why DataFold?

Expand Down
26 changes: 13 additions & 13 deletions docs/architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ Internal design and component overview.
│ │ │ (SQLite) │
│ - postgres │ │ │
│ - mysql │ │ - snapshots │
│ - clickhouse │ │ - alert_states
│ - sqlite │ │ - delivery_log
│ - clickhouse │ │ - alert_state
│ - sqlite │ │ - deliveries
└────────┬────────┘ └────────┬────────┘
│ │
│ ┌────────────────────┘
Expand Down Expand Up @@ -270,30 +270,30 @@ CREATE TABLE snapshots (
);

-- Alert state (per source + target)
CREATE TABLE alert_states (
id INTEGER PRIMARY KEY,
CREATE TABLE alert_state (
source_name TEXT NOT NULL,
target_name TEXT NOT NULL,
notified_status TEXT,
notified_reason_hash TEXT,
last_change_at TEXT,
notified_status TEXT NOT NULL,
notified_reason_hash TEXT NOT NULL,
last_change_at TEXT NOT NULL,
last_sent_at TEXT,
cooldown_until TEXT,
UNIQUE(source_name, target_name)
PRIMARY KEY (source_name, target_name)
);

-- Delivery log
CREATE TABLE delivery_log (
id INTEGER PRIMARY KEY,
CREATE TABLE deliveries (
id INTEGER PRIMARY KEY AUTOINCREMENT,
source_name TEXT NOT NULL,
target_name TEXT NOT NULL,
event_type TEXT NOT NULL,
payload_hash TEXT,
delivered_at TEXT NOT NULL,
payload_hash TEXT NOT NULL,
sent_at TEXT NOT NULL,
success INTEGER NOT NULL,
status_code INTEGER,
latency_ms INTEGER,
error_message TEXT
error_message TEXT,
attempts INTEGER NOT NULL DEFAULT 1
);

-- Schema metadata
Expand Down
3 changes: 2 additions & 1 deletion docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ DataFold looks for configuration in this order:
1. `--config` CLI argument
2. `./datafold.yaml`
3. `./datafold.yml`
4. `~/.datafold/config.yaml`
4. `~/.config/datafold/datafold.yaml`
5. `/etc/datafold/datafold.yaml`

## Full Schema

Expand Down
7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ name = "datafold-agent"
version = "0.1.0"
description = "Automated data quality & drift detection agent"
readme = "README.md"
license = "MIT"
license = "AGPL-3.0"
requires-python = ">=3.10"
authors = [
{ name = "DataFold Team" }
Expand All @@ -17,7 +17,7 @@ classifiers = [
"Development Status :: 4 - Beta",
"Environment :: Console",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"License :: OSI Approved :: GNU Affero General Public License v3",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
Expand Down Expand Up @@ -97,8 +97,11 @@ source = ["src/datafold"]
branch = true

[tool.coverage.report]
fail_under = 80
show_missing = true
exclude_lines = [
"pragma: no cover",
"if TYPE_CHECKING:",
"raise NotImplementedError",
"if __name__ == .__main__.:",
]
178 changes: 178 additions & 0 deletions scripts/check-docs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
#!/usr/bin/env python3
"""Check documentation consistency with code."""

from __future__ import annotations

import re
import sys
from pathlib import Path

# Directory two levels above this file (the parent of the directory that
# contains this script); every check below resolves its paths against it.
REPO_ROOT = Path(__file__).parent.parent


def check_license_consistency() -> list[str]:
"""Check that license is consistent across files."""
errors = []
expected_license = "AGPL-3.0"

# Check pyproject.toml
pyproject = REPO_ROOT / "pyproject.toml"
if pyproject.exists():
content = pyproject.read_text()
if 'license = "MIT"' in content:
errors.append("pyproject.toml: License should be AGPL-3.0, not MIT")
if "MIT License" in content:
errors.append("pyproject.toml: Classifier should use AGPL, not MIT")

# Check README.md
readme = REPO_ROOT / "README.md"
if readme.exists():
content = readme.read_text()
if "opensource.org/licenses/MIT" in content:
errors.append("README.md: License link should point to AGPL, not MIT")

# Check LICENSE file
license_file = REPO_ROOT / "LICENSE"
if license_file.exists():
content = license_file.read_text()
if "GNU AFFERO GENERAL PUBLIC LICENSE" not in content:
errors.append("LICENSE: Should contain AGPL-3.0 license text")

return errors


def check_table_names_in_docs() -> list[str]:
"""Check that table names in docs match actual code."""
errors = []

# Read actual table names from sqlite.py
sqlite_file = REPO_ROOT / "src" / "datafold" / "storage" / "sqlite.py"
if not sqlite_file.exists():
return errors

sqlite_content = sqlite_file.read_text()

# Expected table names from code
expected_tables = {
"snapshots",
"alert_state",
"deliveries",
"schema_meta",
}

# Check architecture.md
arch_file = REPO_ROOT / "docs" / "architecture.md"
if arch_file.exists():
content = arch_file.read_text()

# Check for old/wrong table names
wrong_names = {
"alert_states": "alert_state",
"delivery_log": "deliveries",
}

for wrong, correct in wrong_names.items():
if wrong in content:
errors.append(
f"docs/architecture.md: Use '{correct}' instead of '{wrong}'"
)

return errors


def check_config_paths() -> list[str]:
"""Check that config file paths in docs match code."""
errors = []

# Read actual paths from config.py
config_file = REPO_ROOT / "src" / "datafold" / "config.py"
if not config_file.exists():
return errors

config_content = config_file.read_text()

# Check configuration.md
config_doc = REPO_ROOT / "docs" / "configuration.md"
if config_doc.exists():
content = config_doc.read_text()

# Check for wrong paths
if "~/.datafold/config.yaml" in content:
errors.append(
"docs/configuration.md: Use '~/.config/datafold/datafold.yaml' "
"instead of '~/.datafold/config.yaml'"
)

return errors


def check_version_consistency() -> list[str]:
"""Check version consistency across files."""
errors = []

# Get version from pyproject.toml
pyproject = REPO_ROOT / "pyproject.toml"
if not pyproject.exists():
return errors

pyproject_content = pyproject.read_text()
version_match = re.search(r'version\s*=\s*"([^"]+)"', pyproject_content)
if not version_match:
errors.append("pyproject.toml: Could not find version")
return errors

expected_version = version_match.group(1)

# Check __init__.py
init_file = REPO_ROOT / "src" / "datafold" / "__init__.py"
if init_file.exists():
content = init_file.read_text()
if f'__version__ = "{expected_version}"' not in content:
init_version = re.search(r'__version__\s*=\s*"([^"]+)"', content)
if init_version:
actual = init_version.group(1)
if actual != expected_version:
errors.append(
f"src/datafold/__init__.py: Version '{actual}' "
f"doesn't match pyproject.toml '{expected_version}'"
)

return errors


def main() -> int:
    """Execute every documentation check and print a report.

    Returns:
        ``1`` when at least one check reported a problem, ``0`` otherwise.
    """
    print("Checking documentation consistency...")
    print()

    # (label shown in the report, zero-argument callable returning messages)
    registry = (
        ("License consistency", check_license_consistency),
        ("Table names in docs", check_table_names_in_docs),
        ("Config paths in docs", check_config_paths),
        ("Version consistency", check_version_consistency),
    )

    collected: list[str] = []
    for label, run_check in registry:
        print(f" Checking {label}...")
        found = run_check()
        if not found:
            print(" ✓ OK")
            continue
        for message in found:
            print(f" ❌ {message}")
        collected.extend(found)

    print()

    if not collected:
        print("All documentation checks passed!")
        return 0

    print(f"Found {len(collected)} issue(s)")
    return 1


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
Loading