@dataclass
class OccurrenceCheckReport:
    """Findings and fix counts collected by one occurrence integrity check.

    The three lists hold primary keys of the offending rows. The two counters
    default to zero and are only shown in ``summary`` when non-zero.
    """

    # PKs of occurrences that have classifications but no determination.
    missing_determination: list[int] = field(default_factory=list)
    # PKs of occurrences that have no detections.
    orphaned_occurrences: list[int] = field(default_factory=list)
    # PKs of detections that have no occurrence.
    orphaned_detections: list[int] = field(default_factory=list)
    # Number of determinations repaired.
    fixed_determinations: int = 0
    # Number of occurrences deleted.
    deleted_occurrences: int = 0

    @property
    def has_issues(self) -> bool:
        """Return True when at least one of the three finding lists is non-empty."""
        findings = (
            self.missing_determination,
            self.orphaned_occurrences,
            self.orphaned_detections,
        )
        return any(findings)

    @property
    def summary(self) -> str:
        """Return a one-line, comma-separated description of the findings."""

        def describe(found: int, label: str, resolved: int = 0, verb: str = "") -> str:
            # The "(N verb)" suffix is only appended when something was resolved.
            text = f"{found} {label}"
            return f"{text} ({resolved} {verb})" if resolved else text

        fragments = []
        if self.missing_determination:
            fragments.append(
                describe(
                    len(self.missing_determination),
                    "missing determination",
                    self.fixed_determinations,
                    "fixed",
                )
            )
        if self.orphaned_occurrences:
            fragments.append(
                describe(
                    len(self.orphaned_occurrences),
                    "orphaned occurrences",
                    self.deleted_occurrences,
                    "deleted",
                )
            )
        if self.orphaned_detections:
            fragments.append(describe(len(self.orphaned_detections), "orphaned detections"))
        return ", ".join(fragments) or "No issues found"
def check_occurrences(
    project_id: int | None = None,
    fix: bool = False,
) -> OccurrenceCheckReport:
    """
    Check occurrence data integrity and optionally fix issues.

    Three checks run in order:

    1. Occurrences with a null determination although at least one of their
       detections has a classification. With ``fix=True`` each one is passed
       to ``update_occurrence_determination`` and counted as fixed when that
       call returns a truthy value.
    2. Occurrences with no detections. With ``fix=True`` they are deleted.
    3. Detections with no occurrence. These are only reported and logged.

    Args:
        project_id: Scope to a single project. None = all projects.
        fix: If True, auto-fix what can be fixed. If False (default), report only.

    Returns:
        OccurrenceCheckReport with findings and fix counts.
    """
    # Imported here rather than at module level, as in the original.
    from ami.main.models import Detection, Occurrence, update_occurrence_determination

    report = OccurrenceCheckReport()

    # Base querysets scoped by project
    occ_qs = Occurrence.objects.all()
    det_qs = Detection.objects.all()
    if project_id is not None:
        occ_qs = occ_qs.filter(project_id=project_id)
        det_qs = det_qs.filter(source_image__deployment__project_id=project_id)

    # Check 1: Missing determination
    # Occurrences with classifications but no determination set.
    # distinct() collapses the duplicate rows produced by the join.
    missing = occ_qs.filter(
        determination__isnull=True,
        detections__classifications__isnull=False,
    ).distinct()
    report.missing_determination = list(missing.values_list("pk", flat=True))

    if fix and report.missing_determination:
        for occ in missing.iterator():
            if update_occurrence_determination(occ, current_determination=None, save=True):
                report.fixed_determinations += 1
        logger.info(
            "Fixed %d/%d missing determinations",
            report.fixed_determinations,
            len(report.missing_determination),
        )

    # Check 2: Orphaned occurrences (no detections)
    orphaned_occ = occ_qs.annotate(det_count=Count("detections")).filter(det_count=0)
    report.orphaned_occurrences = list(orphaned_occ.values_list("pk", flat=True))

    if fix and report.orphaned_occurrences:
        # QuerySet.delete() returns (total, {model_label: count}). The total
        # also counts rows removed through cascades, so read the Occurrence
        # entry to report only the occurrences that were deleted.
        _, deleted_by_model = orphaned_occ.delete()
        report.deleted_occurrences = deleted_by_model.get(Occurrence._meta.label, 0)
        logger.info("Deleted %d orphaned occurrences", report.deleted_occurrences)

    # Check 3: Orphaned detections (no occurrence)
    orphaned_det = det_qs.filter(occurrence__isnull=True)
    report.orphaned_detections = list(orphaned_det.values_list("pk", flat=True))

    if report.orphaned_detections:
        # Only the first ten PKs are logged to keep the message short.
        logger.warning(
            "Found %d orphaned detections (no occurrence linked): %s",
            len(report.orphaned_detections),
            report.orphaned_detections[:10],
        )

    return report
class TestCheckOccurrences(TestCase):
    """Exercise check_occurrences() against a small single-project fixture."""

    def setUp(self):
        # Shared chain: project -> deployment -> event, plus one taxon and one source image.
        project = Project.objects.create(name="Integrity Test Project")
        deployment = Deployment.objects.create(name="Test Deployment", project=project)
        event = Event.objects.create(
            deployment=deployment,
            project=project,
            start=datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc),
        )
        self.project = project
        self.deployment = deployment
        self.event = event
        self.taxon = Taxon.objects.create(name="Test Species", rank=TaxonRank.SPECIES)
        self.source_image = SourceImage.objects.create(deployment=deployment, event=event)

    def _create_occurrence_with_classification(self, determination=None):
        """Helper: create occurrence -> detection -> classification chain."""
        occurrence = Occurrence.objects.create(
            project=self.project,
            event=self.event,
            deployment=self.deployment,
            determination=determination,
        )
        detection = Detection.objects.create(source_image=self.source_image, occurrence=occurrence)
        Classification.objects.create(detection=detection, taxon=self.taxon, score=0.9, terminal=True)
        return occurrence

    def _create_occurrence_without_determination(self):
        """Helper: classified occurrence whose determination is nulled via a queryset update."""
        occurrence = self._create_occurrence_with_classification(determination=None)
        # Force determination to None (save() would auto-set it)
        Occurrence.objects.filter(pk=occurrence.pk).update(determination=None)
        return occurrence

    def _create_orphaned_occurrence(self):
        """Helper: occurrence in the fixture project with no detections."""
        return Occurrence.objects.create(
            project=self.project,
            event=self.event,
            deployment=self.deployment,
        )

    def _run_check(self, **kwargs):
        """Helper: run check_occurrences() scoped to the fixture project."""
        return check_occurrences(project_id=self.project.pk, **kwargs)

    def test_no_issues(self):
        """Clean data should report no issues."""
        self._create_occurrence_with_classification(determination=self.taxon)
        report = self._run_check()
        self.assertFalse(report.has_issues)
        self.assertEqual(len(report.missing_determination), 0)
        self.assertEqual(len(report.orphaned_occurrences), 0)
        self.assertEqual(len(report.orphaned_detections), 0)

    def test_missing_determination_detected(self):
        """Occurrence with classification but null determination should be flagged."""
        occurrence = self._create_occurrence_without_determination()

        report = self._run_check()
        self.assertTrue(report.has_issues)
        self.assertIn(occurrence.pk, report.missing_determination)
        self.assertEqual(report.fixed_determinations, 0)

    def test_missing_determination_fixed(self):
        """With fix=True, missing determination should be repaired."""
        occurrence = self._create_occurrence_without_determination()

        report = self._run_check(fix=True)
        self.assertEqual(report.fixed_determinations, 1)

        occurrence.refresh_from_db()
        self.assertIsNotNone(occurrence.determination)

    def test_orphaned_occurrence_detected(self):
        """Occurrence with no detections should be flagged."""
        orphan = self._create_orphaned_occurrence()
        report = self._run_check()
        self.assertIn(orphan.pk, report.orphaned_occurrences)

    def test_orphaned_occurrence_fixed(self):
        """With fix=True, orphaned occurrences should be deleted."""
        orphan = self._create_orphaned_occurrence()
        report = self._run_check(fix=True)
        self.assertEqual(report.deleted_occurrences, 1)
        self.assertFalse(Occurrence.objects.filter(pk=orphan.pk).exists())

    def test_orphaned_detection_detected(self):
        """Detection with no occurrence should be flagged."""
        det = Detection.objects.create(source_image=self.source_image, occurrence=None)
        report = self._run_check()
        self.assertIn(det.pk, report.orphaned_detections)

    def test_project_filter(self):
        """Issues in other projects should not be reported."""
        other_project = Project.objects.create(name="Other Project")
        other_deployment = Deployment.objects.create(name="Other Dep", project=other_project)
        other_event = Event.objects.create(
            deployment=other_deployment,
            project=other_project,
            start=datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc),
        )
        # Orphaned occurrence that belongs to the other project only.
        Occurrence.objects.create(
            project=other_project,
            event=other_event,
            deployment=other_deployment,
        )

        report = self._run_check()
        self.assertEqual(len(report.orphaned_occurrences), 0)

    def test_report_summary(self):
        """Summary should be a non-empty string when issues exist."""
        self._create_orphaned_occurrence()
        report = self._run_check()
        self.assertTrue(report.has_issues)
        self.assertIsInstance(report.summary, str)
        self.assertGreater(len(report.summary), 0)
class Command(BaseCommand):
    """Run check_occurrences() and print one line per check plus a closing note."""

    help = "Check occurrence data integrity and optionally fix issues"

    def add_arguments(self, parser):
        """Register the --project-id and --fix options."""
        parser.add_argument(
            "--project-id",
            type=int,
            default=None,
            help="Scope to a single project ID",
        )
        parser.add_argument(
            "--fix",
            action="store_true",
            help="Auto-fix issues (missing determinations, orphaned occurrences)",
        )

    def _write_check_line(self, label, found, resolved=0, verb=""):
        """Print one report line: plain when resolved or clean, WARNING when issues remain."""
        if resolved:
            self.stdout.write(f" {label}: {found} found, {resolved} {verb}")
        elif found:
            self.stdout.write(self.style.WARNING(f" {label}: {found} found"))
        else:
            self.stdout.write(f" {label}: 0")

    def handle(self, *args, **options):
        """Run the check with the parsed options and write the report to stdout."""
        project_id = options["project_id"]
        fix = options["fix"]

        # Compare against None, as check_occurrences() does, so that an explicit
        # ID of 0 is not announced as "all projects".
        scope = f"project {project_id}" if project_id is not None else "all projects"
        self.stdout.write(f"Checking occurrence integrity for {scope}...")

        report = check_occurrences(project_id=project_id, fix=fix)

        # Fix counters are only shown when --fix was requested.
        fixed = report.fixed_determinations if fix else 0
        deleted = report.deleted_occurrences if fix else 0

        self._write_check_line("Missing determination", len(report.missing_determination), fixed, "fixed")
        self._write_check_line("Orphaned occurrences", len(report.orphaned_occurrences), deleted, "deleted")
        # Orphaned detections are never auto-fixed, so there is no resolved count.
        self._write_check_line("Orphaned detections", len(report.orphaned_detections))

        # Summary
        if not report.has_issues:
            self.stdout.write(self.style.SUCCESS("\nNo issues found."))
        elif not fix:
            self.stdout.write(self.style.NOTICE("\nRun with --fix to repair fixable issues."))
        elif fixed or deleted:
            self.stdout.write(self.style.SUCCESS("\nDone. Applied fixes."))
        else:
            # --fix was given but nothing could be repaired, so do not claim success.
            self.stdout.write(self.style.WARNING("\nNo fixes applied. Remaining issues need manual intervention."))
@celery_app.task()
def check_occurrences_task():
    """Run the occurrence integrity check in report-only mode and log the outcome.

    Logs a warning containing the summary when issues are found, otherwise an
    info message. Returns the report's summary string in both cases.
    """
    from ami.main.checks import check_occurrences

    outcome = check_occurrences(fix=False)
    found_issues = outcome.has_issues
    if not found_issues:
        logger.info("Occurrence integrity check passed")
        return outcome.summary

    logger.warning("Occurrence integrity issues: %s", outcome.summary)
    return outcome.summary
or never run, leaving occurrences with no determination +- Detections can become orphaned (no occurrence linked) if occurrence creation fails mid-pipeline +- Occurrences can become orphaned (no detections) if detections are deleted + +There's no mechanism to detect or repair these issues. On the demo environment, 481 occurrences with null determinations crashed the frontend UI (which doesn't handle `determination: null`). + +## Solution + +A reusable `check_occurrences()` function in `ami/main/checks.py` that detects and optionally fixes data integrity issues. Callable from a management command (manual), a celery periodic task (automated monitoring), and potentially post-pipeline-save. + +## Checks + +### 1. Missing determination +**Query:** Occurrences where `determination IS NULL` but at least one detection has a classification. +```python +Occurrence.objects.filter( + determination__isnull=True, + detections__classifications__isnull=False +).distinct() +``` +**Fix:** Call `update_occurrence_determination(occurrence, save=True)` for each. +**Severity:** Error — these should always have a determination. + +### 2. Orphaned occurrences +**Query:** Occurrences with zero detections. +```python +Occurrence.objects.annotate( + det_count=Count("detections") +).filter(det_count=0) +``` +**Fix:** Delete the occurrence (no useful data without detections). +**Severity:** Warning — may be legitimate during pipeline processing. + +### 3. Orphaned detections +**Query:** Detections where `occurrence IS NULL`. +```python +Detection.objects.filter(occurrence__isnull=True) +``` +**Fix:** Log only. Re-linking requires pipeline context (which source image, event, etc). Could potentially call `create_and_update_occurrences_for_detections()` but that's a heavier operation best left to manual intervention. +**Severity:** Warning. 
+ +## API + +### Core function + +```python +# ami/main/checks.py + +@dataclass +class OccurrenceCheckReport: + missing_determination: list[int] # occurrence PKs + orphaned_occurrences: list[int] # occurrence PKs (no detections) + orphaned_detections: list[int] # detection PKs (no occurrence) + fixed_determinations: int # count auto-fixed (when fix=True) + deleted_occurrences: int # count deleted (when fix=True) + + @property + def has_issues(self) -> bool: + return bool( + self.missing_determination + or self.orphaned_occurrences + or self.orphaned_detections + ) + + @property + def summary(self) -> str: + """Human-readable one-line summary.""" + ... + + +def check_occurrences( + project_id: int | None = None, + fix: bool = False, +) -> OccurrenceCheckReport: + """ + Check occurrence data integrity and optionally fix issues. + + Args: + project_id: Scope to a single project. None = all projects. + fix: If True, auto-fix what can be fixed (determinations, orphaned occurrences). + If False (default), report only. + + Returns: + OccurrenceCheckReport with findings and fix counts. + """ +``` + +### Management command + +``` +manage.py check_occurrences [--project-id N] [--fix] +``` + +Output format: +``` +Checking occurrence integrity... + Project: Vermont Atlas of Life (#5) + + Missing determination: 12 found, 12 fixed + Orphaned occurrences: 3 found, 3 deleted + Orphaned detections: 0 found + + Done. Fixed 15 issues. +``` + +Without `--fix`: +``` + Missing determination: 12 found + Orphaned occurrences: 3 found + Orphaned detections: 0 found + + Found 15 issues. Run with --fix to repair. +``` + +### Celery task + +```python +# ami/main/tasks.py + +@shared_task +def check_occurrences_task(): + """Periodic occurrence integrity check. 
Report-only, logs warnings.""" + report = check_occurrences(fix=False) + if report.has_issues: + logger.warning("Occurrence integrity issues found: %s", report.summary) + return report.summary +``` + +Registered via django-celery-beat admin (IntervalSchedule or CrontabSchedule). Not hardcoded in beat config — the team can set frequency via admin. Suggested: daily. + +## File locations + +| Component | Path | +|-----------|------| +| Core function | `ami/main/checks.py` | +| Management command | `ami/main/management/commands/check_occurrences.py` | +| Celery task | `ami/main/tasks.py` (new file) | +| Tests | `ami/main/tests.py` (`TestCheckOccurrences`) | + +## Future considerations + +- **Post-pipeline hook:** After `save_results()` completes, call `check_occurrences(project_id=job.project_id)` to catch issues immediately. Not in this PR — let's observe the patterns first via the periodic task. +- **Classification.save() signal:** Could trigger `update_occurrence_determination()` when classifications are added outside the pipeline path. Deferred — need to understand when this actually happens. +- **Metrics/alerting:** The periodic task could emit New Relic custom events or Sentry breadcrumbs for dashboarding. Deferred until we know the baseline.