Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions ami/main/checks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import logging
from dataclasses import dataclass, field

from django.db.models import Count

logger = logging.getLogger(__name__)


@dataclass
class OccurrenceCheckReport:
    """Findings from an occurrence integrity check, plus counts of applied fixes."""

    # PKs of occurrences that have classifications but no determination set.
    missing_determination: list[int] = field(default_factory=list)
    # PKs of occurrences with no detections at all.
    orphaned_occurrences: list[int] = field(default_factory=list)
    # PKs of detections not linked to any occurrence.
    orphaned_detections: list[int] = field(default_factory=list)
    # Repair counters; only non-zero when fixes were requested and applied.
    fixed_determinations: int = 0
    deleted_occurrences: int = 0

    @property
    def has_issues(self) -> bool:
        """True when any check found at least one problem."""
        return any((self.missing_determination, self.orphaned_occurrences, self.orphaned_detections))

    @property
    def summary(self) -> str:
        """One-line, human-readable summary of the findings (and fixes, if any)."""
        parts: list[str] = []
        if self.missing_determination:
            suffix = f" ({self.fixed_determinations} fixed)" if self.fixed_determinations else ""
            parts.append(f"{len(self.missing_determination)} missing determination{suffix}")
        if self.orphaned_occurrences:
            suffix = f" ({self.deleted_occurrences} deleted)" if self.deleted_occurrences else ""
            parts.append(f"{len(self.orphaned_occurrences)} orphaned occurrences{suffix}")
        if self.orphaned_detections:
            parts.append(f"{len(self.orphaned_detections)} orphaned detections")
        if not parts:
            return "No issues found"
        return ", ".join(parts)


def check_occurrences(
    project_id: int | None = None,
    fix: bool = False,
) -> OccurrenceCheckReport:
    """
    Check occurrence data integrity and optionally fix issues.

    Performs three checks:
      1. Occurrences with classifications but no determination set
         (fixable: recompute the determination).
      2. Occurrences with no detections at all (fixable: delete them).
      3. Detections not linked to any occurrence (report-only).

    Args:
        project_id: Scope to a single project. None = all projects.
        fix: If True, auto-fix what can be fixed. If False (default), report only.

    Returns:
        OccurrenceCheckReport with findings and fix counts.
    """
    from ami.main.models import Detection, Occurrence, update_occurrence_determination

    report = OccurrenceCheckReport()

    # Base querysets scoped by project
    occ_qs = Occurrence.objects.all()
    det_qs = Detection.objects.all()
    if project_id is not None:
        occ_qs = occ_qs.filter(project_id=project_id)
        # NOTE(review): elsewhere detections are typically scoped via
        # source_image__project_id; the deployment path is kept here and should
        # agree as long as SourceImage.project is backfilled from the
        # deployment — confirm before changing.
        det_qs = det_qs.filter(source_image__deployment__project_id=project_id)

    # Check 1: Missing determination
    # Occurrences with classifications but no determination set
    missing = occ_qs.filter(
        determination__isnull=True,
        detections__classifications__isnull=False,
    ).distinct()
    report.missing_determination = list(missing.values_list("pk", flat=True))

    if fix and report.missing_determination:
        for occ in missing.iterator():
            # One bad row must not abort the rest of the repair pass (or the
            # orphan checks below): log the failure and keep going.
            try:
                if update_occurrence_determination(occ, current_determination=None, save=True):
                    report.fixed_determinations += 1
            except Exception:
                logger.exception("Failed to fix missing determination for occurrence %s", occ.pk)
        logger.info(
            "Fixed %d/%d missing determinations",
            report.fixed_determinations,
            len(report.missing_determination),
        )

    # Check 2: Orphaned occurrences (no detections)
    orphaned_occ = occ_qs.annotate(det_count=Count("detections")).filter(det_count=0)
    report.orphaned_occurrences = list(orphaned_occ.values_list("pk", flat=True))

    if fix and report.orphaned_occurrences:
        # QuerySet.delete() returns the total across all cascaded models;
        # record only the Occurrence rows so the count is not inflated by
        # cascade-deleted related objects.
        _, per_model_counts = orphaned_occ.delete()
        deleted_occurrences = per_model_counts.get(Occurrence._meta.label, 0)
        report.deleted_occurrences = deleted_occurrences
        logger.info("Deleted %d orphaned occurrences", deleted_occurrences)

    # Check 3: Orphaned detections (no occurrence). Report-only: there is no
    # safe automatic fix for a detection with no occurrence.
    orphaned_det = det_qs.filter(occurrence__isnull=True)
    report.orphaned_detections = list(orphaned_det.values_list("pk", flat=True))

    if report.orphaned_detections:
        logger.warning(
            "Found %d orphaned detections (no occurrence linked): %s",
            len(report.orphaned_detections),
            report.orphaned_detections[:10],
        )

    return report
69 changes: 69 additions & 0 deletions ami/main/management/commands/check_occurrences.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import logging

from django.core.management.base import BaseCommand

from ami.main.checks import check_occurrences

logger = logging.getLogger(__name__)


Comment on lines +1 to +9
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

logging/logger are defined but never used in this command. Please remove the unused import/variable to avoid lint noise and keep the command minimal.

Suggested change
import logging
from django.core.management.base import BaseCommand
from ami.main.checks import check_occurrences
logger = logging.getLogger(__name__)
from django.core.management.base import BaseCommand
from ami.main.checks import check_occurrences

Copilot uses AI. Check for mistakes.
class Command(BaseCommand):
    help = "Check occurrence data integrity and optionally fix issues"

    def add_arguments(self, parser):
        parser.add_argument(
            "--project-id",
            type=int,
            default=None,
            help="Scope to a single project ID",
        )
        parser.add_argument(
            "--fix",
            action="store_true",
            help="Auto-fix issues (missing determinations, orphaned occurrences)",
        )

    def handle(self, *args, **options):
        """Run the integrity checks, print per-check results and a summary footer."""
        project_id = options["project_id"]
        fix = options["fix"]

        scope = f"project {project_id}" if project_id else "all projects"
        self.stdout.write(f"Checking occurrence integrity for {scope}...")

        report = check_occurrences(project_id=project_id, fix=fix)

        # Issues left unresolved after any attempted fixes (orphaned detections
        # are never auto-fixed; the other categories may be partially repaired).
        remaining_missing = max(len(report.missing_determination) - report.fixed_determinations, 0)
        remaining_orphaned = max(len(report.orphaned_occurrences) - report.deleted_occurrences, 0)
        remaining_issues = remaining_missing + remaining_orphaned + len(report.orphaned_detections)

        # Missing determination
        label = "Missing determination"
        count = len(report.missing_determination)
        if fix and count:
            # In --fix mode always show the fixed count (including 0) so
            # operators can tell whether anything was actually repaired.
            self.stdout.write(f" {label}: {count} found, {report.fixed_determinations} fixed")
        elif count:
            self.stdout.write(self.style.WARNING(f" {label}: {count} found"))
        else:
            self.stdout.write(f" {label}: 0")

        # Orphaned occurrences
        label = "Orphaned occurrences"
        count = len(report.orphaned_occurrences)
        if fix and count:
            self.stdout.write(f" {label}: {count} found, {report.deleted_occurrences} deleted")
        elif count:
            self.stdout.write(self.style.WARNING(f" {label}: {count} found"))
        else:
            self.stdout.write(f" {label}: 0")

        # Orphaned detections (report-only; never auto-fixed)
        label = "Orphaned detections"
        count = len(report.orphaned_detections)
        if count:
            self.stdout.write(self.style.WARNING(f" {label}: {count} found"))
        else:
            self.stdout.write(f" {label}: 0")

        # Summary footer: never report success while issues remain.
        if report.has_issues and not fix:
            self.stdout.write(self.style.NOTICE("\nRun with --fix to repair fixable issues."))
        elif fix and remaining_issues:
            self.stdout.write(
                self.style.WARNING(
                    f"\nDone. Applied fixes, but {remaining_issues} issue(s) still require attention."
                )
            )
        elif fix and report.has_issues:
            self.stdout.write(self.style.SUCCESS("\nDone. All fixable issues were repaired."))
        else:
            self.stdout.write(self.style.SUCCESS("\nNo issues found."))
Comment on lines +33 to +69
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don't end --fix runs with a success footer when issues remain.

Orphaned detections are never auto-fixed, and the other two categories can be only partially repaired. This branch still prints SUCCESS, so the command can look clean even when the counts above show unresolved problems.

🛠️ Suggested fix
         report = check_occurrences(project_id=project_id, fix=fix)
+        remaining_missing = max(len(report.missing_determination) - report.fixed_determinations, 0)
+        remaining_orphaned_occurrences = max(len(report.orphaned_occurrences) - report.deleted_occurrences, 0)
+        remaining_issues = remaining_missing + remaining_orphaned_occurrences + len(report.orphaned_detections)
@@
-        elif report.has_issues and fix:
-            self.stdout.write(self.style.SUCCESS("\nDone. Applied fixes."))
+        elif fix and remaining_issues:
+            self.stdout.write(
+                self.style.WARNING(
+                    f"\nDone. Applied fixes, but {remaining_issues} issue(s) still require attention."
+                )
+            )
+        elif fix:
+            self.stdout.write(self.style.SUCCESS("\nDone. All fixable issues were repaired."))
         else:
             self.stdout.write(self.style.SUCCESS("\nNo issues found."))
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
report = check_occurrences(project_id=project_id, fix=fix)
# Missing determination
label = "Missing determination"
count = len(report.missing_determination)
if fix and report.fixed_determinations:
self.stdout.write(f" {label}: {count} found, {report.fixed_determinations} fixed")
elif count:
self.stdout.write(self.style.WARNING(f" {label}: {count} found"))
else:
self.stdout.write(f" {label}: 0")
# Orphaned occurrences
label = "Orphaned occurrences"
count = len(report.orphaned_occurrences)
if fix and report.deleted_occurrences:
self.stdout.write(f" {label}: {count} found, {report.deleted_occurrences} deleted")
elif count:
self.stdout.write(self.style.WARNING(f" {label}: {count} found"))
else:
self.stdout.write(f" {label}: 0")
# Orphaned detections
label = "Orphaned detections"
count = len(report.orphaned_detections)
if count:
self.stdout.write(self.style.WARNING(f" {label}: {count} found"))
else:
self.stdout.write(f" {label}: 0")
# Summary
if report.has_issues and not fix:
self.stdout.write(self.style.NOTICE("\nRun with --fix to repair fixable issues."))
elif report.has_issues and fix:
self.stdout.write(self.style.SUCCESS("\nDone. Applied fixes."))
else:
self.stdout.write(self.style.SUCCESS("\nNo issues found."))
report = check_occurrences(project_id=project_id, fix=fix)
remaining_missing = max(len(report.missing_determination) - report.fixed_determinations, 0)
remaining_orphaned_occurrences = max(len(report.orphaned_occurrences) - report.deleted_occurrences, 0)
remaining_issues = remaining_missing + remaining_orphaned_occurrences + len(report.orphaned_detections)
# Missing determination
label = "Missing determination"
count = len(report.missing_determination)
if fix and report.fixed_determinations:
self.stdout.write(f" {label}: {count} found, {report.fixed_determinations} fixed")
elif count:
self.stdout.write(self.style.WARNING(f" {label}: {count} found"))
else:
self.stdout.write(f" {label}: 0")
# Orphaned occurrences
label = "Orphaned occurrences"
count = len(report.orphaned_occurrences)
if fix and report.deleted_occurrences:
self.stdout.write(f" {label}: {count} found, {report.deleted_occurrences} deleted")
elif count:
self.stdout.write(self.style.WARNING(f" {label}: {count} found"))
else:
self.stdout.write(f" {label}: 0")
# Orphaned detections
label = "Orphaned detections"
count = len(report.orphaned_detections)
if count:
self.stdout.write(self.style.WARNING(f" {label}: {count} found"))
else:
self.stdout.write(f" {label}: 0")
# Summary
if report.has_issues and not fix:
self.stdout.write(self.style.NOTICE("\nRun with --fix to repair fixable issues."))
elif fix and remaining_issues:
self.stdout.write(
self.style.WARNING(
f"\nDone. Applied fixes, but {remaining_issues} issue(s) still require attention."
)
)
elif fix:
self.stdout.write(self.style.SUCCESS("\nDone. All fixable issues were repaired."))
else:
self.stdout.write(self.style.SUCCESS("\nNo issues found."))
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@ami/main/management/commands/check_occurrences.py` around lines 33 - 69, The
summary footer currently prints SUCCESS on --fix runs even when unresolved
issues remain; update the final summary logic to compute remaining issues after
attempted fixes (e.g. remaining = max(0, len(report.missing_determination) -
(report.fixed_determinations or 0)) + max(0, len(report.orphaned_occurrences) -
(report.deleted_occurrences or 0)) + len(report.orphaned_detections)) and then:
if remaining > 0 print a NOTICE that unresolved issues remain (instead of
SUCCESS), if fix is true and remaining == 0 print SUCCESS ("Done. Applied
fixes."), if not fix and report.has_issues keep the existing NOTICE prompt,
otherwise print SUCCESS ("No issues found."). Use the existing symbols report,
fix, report.fixed_determinations, report.deleted_occurrences,
report.missing_determination, report.orphaned_occurrences,
report.orphaned_detections, and report.has_issues to implement this.

18 changes: 18 additions & 0 deletions ami/main/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import logging

from config import celery_app

logger = logging.getLogger(__name__)


@celery_app.task()
def check_occurrences_task():
    """Periodic occurrence integrity check. Report-only, logs warnings."""
    from ami.main.checks import check_occurrences

    report = check_occurrences(fix=False)
    if not report.has_issues:
        logger.info("Occurrence integrity check passed")
    else:
        logger.warning("Occurrence integrity issues: %s", report.summary)
    return report.summary
128 changes: 128 additions & 0 deletions ami/main/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from ami.exports.models import DataExport
from ami.jobs.models import VALID_JOB_TYPES, Job
from ami.main.checks import check_occurrences
from ami.main.models import (
Classification,
Deployment,
Expand Down Expand Up @@ -3744,3 +3745,130 @@ def test_list_pipelines_public_project_non_member(self):
self.client.force_authenticate(user=non_member)
response = self.client.get(url)
self.assertEqual(response.status_code, status.HTTP_200_OK)


class TestCheckOccurrences(TestCase):
    def setUp(self):
        self.project = Project.objects.create(name="Integrity Test Project")
        self.deployment = Deployment.objects.create(name="Test Deployment", project=self.project)
        self.event = Event.objects.create(
            deployment=self.deployment,
            project=self.project,
            start=datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc),
            # Event.group_by is required (non-null, no default); use a
            # deterministic value derived from the start date.
            group_by="2024-01-01",
        )
        self.taxon = Taxon.objects.create(name="Test Species", rank=TaxonRank.SPECIES)
        self.source_image = SourceImage.objects.create(
            deployment=self.deployment,
            event=self.event,
        )

    def _create_occurrence_with_classification(self, determination=None):
        """Helper: create occurrence -> detection -> classification chain."""
        occurrence = Occurrence.objects.create(
            project=self.project,
            event=self.event,
            deployment=self.deployment,
            determination=determination,
        )
        detection = Detection.objects.create(
            source_image=self.source_image,
            occurrence=occurrence,
        )
        Classification.objects.create(
            detection=detection,
            taxon=self.taxon,
            score=0.9,
            terminal=True,
            # Classification.timestamp is non-nullable; reuse the event start.
            timestamp=self.event.start,
        )
        return occurrence

    def test_no_issues(self):
        """Clean data should report no issues."""
        self._create_occurrence_with_classification(determination=self.taxon)
        report = check_occurrences(project_id=self.project.pk)
        self.assertFalse(report.has_issues)
        self.assertEqual(len(report.missing_determination), 0)
        self.assertEqual(len(report.orphaned_occurrences), 0)
        self.assertEqual(len(report.orphaned_detections), 0)

    def test_missing_determination_detected(self):
        """Occurrence with classification but null determination should be flagged."""
        occurrence = self._create_occurrence_with_classification(determination=None)
        # Force determination to None (save() would auto-set it)
        Occurrence.objects.filter(pk=occurrence.pk).update(determination=None)

        report = check_occurrences(project_id=self.project.pk)
        self.assertTrue(report.has_issues)
        self.assertIn(occurrence.pk, report.missing_determination)
        self.assertEqual(report.fixed_determinations, 0)

    def test_missing_determination_fixed(self):
        """With fix=True, missing determination should be repaired."""
        occurrence = self._create_occurrence_with_classification(determination=None)
        Occurrence.objects.filter(pk=occurrence.pk).update(determination=None)

        report = check_occurrences(project_id=self.project.pk, fix=True)
        self.assertEqual(report.fixed_determinations, 1)

        occurrence.refresh_from_db()
        self.assertIsNotNone(occurrence.determination)

    def test_orphaned_occurrence_detected(self):
        """Occurrence with no detections should be flagged."""
        orphan = Occurrence.objects.create(
            project=self.project,
            event=self.event,
            deployment=self.deployment,
        )
        report = check_occurrences(project_id=self.project.pk)
        self.assertIn(orphan.pk, report.orphaned_occurrences)

    def test_orphaned_occurrence_fixed(self):
        """With fix=True, orphaned occurrences should be deleted."""
        orphan = Occurrence.objects.create(
            project=self.project,
            event=self.event,
            deployment=self.deployment,
        )
        report = check_occurrences(project_id=self.project.pk, fix=True)
        self.assertEqual(report.deleted_occurrences, 1)
        self.assertFalse(Occurrence.objects.filter(pk=orphan.pk).exists())

    def test_orphaned_detection_detected(self):
        """Detection with no occurrence should be flagged."""
        det = Detection.objects.create(
            source_image=self.source_image,
            occurrence=None,
        )
        report = check_occurrences(project_id=self.project.pk)
        self.assertIn(det.pk, report.orphaned_detections)

    def test_project_filter(self):
        """Issues in other projects should not be reported."""
        other_project = Project.objects.create(name="Other Project")
        other_deployment = Deployment.objects.create(name="Other Dep", project=other_project)
        other_event = Event.objects.create(
            deployment=other_deployment,
            project=other_project,
            start=datetime.datetime(2024, 1, 1, tzinfo=datetime.timezone.utc),
            # group_by is required here as well.
            group_by="2024-01-01",
        )
        Occurrence.objects.create(
            project=other_project,
            event=other_event,
            deployment=other_deployment,
        )  # orphaned in other project

        report = check_occurrences(project_id=self.project.pk)
        self.assertEqual(len(report.orphaned_occurrences), 0)

    def test_report_summary(self):
        """Summary should be a non-empty string when issues exist."""
        Occurrence.objects.create(
            project=self.project,
            event=self.event,
            deployment=self.deployment,
        )
        report = check_occurrences(project_id=self.project.pk)
        self.assertTrue(report.has_issues)
        self.assertIsInstance(report.summary, str)
        self.assertGreater(len(report.summary), 0)
Loading
Loading