From 82986bc3ce627ba30c42be24597522ada941c204 Mon Sep 17 00:00:00 2001 From: iamheinrich <76793837+iamheinrich@users.noreply.github.com> Date: Thu, 15 May 2025 10:20:39 +0200 Subject: [PATCH 1/6] refactor: Status Quo --- doleus/datasets/base.py | 19 +-- doleus/storage/prediction_store.py | 110 ++++++++++--- tests/test_metric_classification.py | 235 +++++++++++++++------------- tests/test_metric_detection.py | 218 ++++++++++++-------------- 4 files changed, 317 insertions(+), 265 deletions(-) diff --git a/doleus/datasets/base.py b/doleus/datasets/base.py index fc48cd3..0404bd3 100644 --- a/doleus/datasets/base.py +++ b/doleus/datasets/base.py @@ -107,6 +107,7 @@ def add_model_predictions( self.prediction_store.add_predictions( predictions=predictions, model_id=model_id, + task=self.task, ) # ------------------------------------------------------------------------- @@ -161,27 +162,23 @@ def add_metadata_from_list(self, metadata_list: List[Dict[str, Any]]): for key, value in md_dict.items(): self.metadata_store.add_metadata(i, key, value) - def add_predefined_metadata(self, keys: Union[str, List[str]]): + def add_predefined_metadata(self, attribute: str) -> None: """Add predefined metadata using functions from ATTRIBUTE_FUNCTIONS. Parameters ---------- - keys : Union[str, List[str]] - Name(s) of predefined metadata function(s) to compute and add. + attribute : str + Name of predefined metadata function to compute and add. Available keys are defined in ATTRIBUTE_FUNCTIONS. Raises ------ ValueError - If any key is not found in ATTRIBUTE_FUNCTIONS. + If attribute is not found in ATTRIBUTE_FUNCTIONS. 
""" - if isinstance(keys, str): - keys = [keys] - - for key in keys: - if key not in ATTRIBUTE_FUNCTIONS: - raise ValueError(f"Unknown predefined metadata key: {key}") - self.add_metadata(key, ATTRIBUTE_FUNCTIONS[key]) + if attribute not in ATTRIBUTE_FUNCTIONS: + raise ValueError(f"Unknown predefined metadata attribute: {attribute}") + self.add_metadata(attribute, ATTRIBUTE_FUNCTIONS[attribute]) def add_metadata_from_dataframe(self, df): """Add metadata from a pandas DataFrame. diff --git a/doleus/storage/prediction_store.py b/doleus/storage/prediction_store.py index 31c5a3d..d4dedaf 100644 --- a/doleus/storage/prediction_store.py +++ b/doleus/storage/prediction_store.py @@ -1,9 +1,9 @@ -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union import torch from doleus.annotations import Annotations, BoundingBoxes, Labels -from doleus.utils import TaskType +from doleus.utils import Task, TaskType class PredictionStore: @@ -28,6 +28,7 @@ def add_predictions( self, predictions: Union[torch.Tensor, List[Dict[str, Any]]], model_id: str, + task: Optional[str] = None, ) -> None: """ Store predictions for a model. @@ -41,12 +42,16 @@ def add_predictions( with 'boxes', 'labels', and 'scores' keys. model_id : str Identifier of the specified model. + task : Optional[str], optional + The specific task (e.g., "multilabel", "multiclass"), by default None. """ - processed_predictions = self._process_predictions(predictions) + processed_predictions = self._process_predictions(predictions, task) self.predictions[model_id] = processed_predictions def _process_predictions( - self, predictions: Union[torch.Tensor, List[Dict[str, Any]], Annotations] + self, + predictions: Union[torch.Tensor, List[Dict[str, Any]], Annotations], + task: Optional[str] = None, ) -> Annotations: """Process raw predictions into the standard annotation format. 
@@ -57,14 +62,18 @@ def _process_predictions( - A torch.Tensor for classification tasks - A list of dictionaries for detection tasks - An already processed Annotations object + task : Optional[str], optional + The specific task (e.g., "multilabel", "multiclass"), by default None. + Returns ------- Annotations Processed predictions in standard annotation format. """ - if isinstance(predictions[0], Labels) or isinstance( - predictions[0], BoundingBoxes + if isinstance(predictions, Annotations) and ( + isinstance(predictions[0], Labels) + or isinstance(predictions[0], BoundingBoxes) ): return predictions @@ -78,43 +87,89 @@ def _process_predictions( num_samples = predictions.shape[0] - # If shape is [N], assume these are predicted labels (class IDs) - # If shape is [N, C], assume these are logits or probabilities if predictions.dim() == 1: + # Assume these are predicted labels (class IDs) for single-label tasks for i in range(num_samples): - label_val = predictions[i].unsqueeze(0) + label_val = predictions[i].unsqueeze(0) # Ensure [1] shape ann = Labels(datapoint_number=i, labels=label_val, scores=None) processed.add(ann) - elif predictions.dim() == 2: - # logits or probabilities of shape [N, C] - # currently we always interpret them as logits, with an argmax + elif predictions.dim() == 2: # Shape [N, C] for i in range(num_samples): - logit_row = predictions[i] - # "labels" is the top-1 predicted label - pred_label = logit_row.argmax(dim=0).unsqueeze(0) - scores = torch.softmax(logit_row, dim=0) - ann = Labels( - datapoint_number=i, - labels=pred_label, # shape [1] - scores=scores, # shape [self.num_classes] - ) + prediction_row = predictions[i] # This is the [C] tensor for the i-th sample + current_labels: torch.Tensor + current_scores: Optional[torch.Tensor] + + if task == Task.MULTILABEL.value: + if prediction_row.dtype in (torch.long, torch.int, torch.bool): + # Input is integer multi-hot + current_labels = prediction_row + current_scores = None + else: # Float 
input, assumed to be logits or probabilities + # Apply sigmoid if not already probabilities in [0,1] + if not (prediction_row.min() >= 0 and prediction_row.max() <= 1): + processed_scores_for_row = torch.sigmoid(prediction_row) + else: # Already probabilities + processed_scores_for_row = prediction_row + + current_labels = (processed_scores_for_row >= 0.5).long() # Default threshold 0.5 + current_scores = processed_scores_for_row + + ann = Labels( + datapoint_number=i, + labels=current_labels, + scores=current_scores, + ) + else: # Binary, Multiclass, or task is None (default to old behavior) + current_labels = prediction_row.argmax(dim=0).unsqueeze(0) # [1] tensor + + if prediction_row.dtype == torch.float: + current_scores = torch.softmax(prediction_row, dim=0) # [C] tensor + else: # Integer input + current_scores = None + + ann = Labels( + datapoint_number=i, + labels=current_labels, + scores=current_scores, + ) processed.add(ann) - else: - raise ValueError("Classification predictions must be 1D or 2D tensor.") + raise ValueError( + "Classification predictions must be a 1D or 2D tensor." + ) elif self.task_type == TaskType.DETECTION.value: if not isinstance(predictions, list): raise TypeError( "For detection, predictions must be a list of length N." ) + if not all(isinstance(p, dict) for p in predictions): + raise TypeError( + "Each item in detection predictions list must be a dictionary." + ) + # Each element should look like {"boxes": (M,4), "labels": (M,), "scores": (M,)} for i, pred_dict in enumerate(predictions): - boxes_xyxy = torch.tensor(pred_dict["boxes"], dtype=torch.float32) - labels = torch.tensor(pred_dict["labels"], dtype=torch.long) - scores = torch.tensor(pred_dict["scores"], dtype=torch.float32) + # Validate keys + required_keys = {"boxes", "labels", "scores"} + if not required_keys.issubset(pred_dict.keys()): + raise ValueError(f"Detection prediction dict for sample {i} missing keys. 
Required: {required_keys}") + + boxes_xyxy = torch.as_tensor(pred_dict["boxes"], dtype=torch.float32) + labels = torch.as_tensor(pred_dict["labels"], dtype=torch.long) + scores = torch.as_tensor(pred_dict["scores"], dtype=torch.float32) + + # Validate shapes + num_detections = boxes_xyxy.shape[0] + if not (boxes_xyxy.ndim == 2 and boxes_xyxy.shape[1] == 4): + raise ValueError(f"boxes for sample {i} must have shape (M,4)") + if not (labels.ndim == 1 and labels.shape[0] == num_detections): + raise ValueError(f"labels for sample {i} must have shape (M,)") + if not (scores.ndim == 1 and scores.shape[0] == num_detections): + raise ValueError(f"scores for sample {i} must have shape (M,)") + ann = BoundingBoxes( datapoint_number=i, @@ -123,6 +178,9 @@ def _process_predictions( scores=scores, ) processed.add(ann) + else: + raise ValueError(f"Unsupported task type: {self.task_type}") + return processed diff --git a/tests/test_metric_classification.py b/tests/test_metric_classification.py index 95113c1..d567910 100644 --- a/tests/test_metric_classification.py +++ b/tests/test_metric_classification.py @@ -1,146 +1,161 @@ import pytest import torch -from moonwatcher.dataset.dataset import MoonwatcherDataset -from moonwatcher.metric import calculate_metric_internal +from torch.utils.data import Dataset as TorchDataset + +from doleus.datasets.classification import DoleusClassification +from doleus.metric import calculate_metric_internal +from doleus.utils.data import Task class MockModel: - def __init__(self, name, task, device): + def __init__(self, name: str, task_type: str): self.name = name - self.task_type = task - self.device = device + self.task_type = task_type -class MockDataset(MoonwatcherDataset): - def __init__(self, name, labels, label_to_name): - self.name = name - self.labels = labels - self.label_to_name = label_to_name - self.dataset = labels +class MockTorchDataset(TorchDataset): + def __init__(self, img_labels: list): + self.img_labels = img_labels + def 
__len__(self): + return len(self.img_labels) -class MockLabel: - def __init__(self, label): - self.labels = torch.tensor([label]) + def __getitem__(self, idx): + return torch.empty(0), self.img_labels[idx] -def mock_do_predictions_exist(dataset_name, model_name): - return True +class MockLabel: + def __init__(self, labels): + if not isinstance(labels, torch.Tensor): + self.labels = torch.tensor(labels) + else: + self.labels = labels + +def test_multilabel_accuracy(): + model = MockModel(name="mock_multilabel_model", task_type=Task.MULTILABEL.value) + assert True == True + +def test_calculate_accuracy_binary(): + model = MockModel(name="mock_binary_model", task_type=Task.BINARY.value) + + groundtruths_labels = [MockLabel([0]), MockLabel([1]), MockLabel([0]), MockLabel([1])] + predictions_labels = [MockLabel([0]), MockLabel([1]), MockLabel([1]), MockLabel([1])] + + dataset_tensor_labels = [gt.labels for gt in groundtruths_labels] + mock_torch_dataset = MockTorchDataset(dataset_tensor_labels) + + doleus_dataset = DoleusClassification( + dataset=mock_torch_dataset, + name="mock_binary_dataset", + task=Task.BINARY.value, + num_classes=2, + label_to_name={0: "class0", 1: "class1"} + ) + relevant_ids = list(range(len(doleus_dataset))) + result = calculate_metric_internal( + model, relevant_ids, doleus_dataset, groundtruths_labels, predictions_labels, "Accuracy" + ) + assert result == 0.75, f"Expected Accuracy to be 0.75 but got {result}" -def mock_load_groundtruths(dataset_name): - return [MockLabel(0), MockLabel(1), MockLabel(0), MockLabel(1)] +def test_calculate_precision_binary(): + model = MockModel(name="mock_binary_model", task_type=Task.BINARY.value) + groundtruths_labels = [MockLabel([0]), MockLabel([1]), MockLabel([0]), MockLabel([1])] + predictions_labels = [MockLabel([0]), MockLabel([1]), MockLabel([1]), MockLabel([1])] + + dataset_tensor_labels = [gt.labels for gt in groundtruths_labels] + mock_torch_dataset = MockTorchDataset(dataset_tensor_labels) + + 
doleus_dataset = DoleusClassification( + dataset=mock_torch_dataset, + name="mock_binary_dataset", + task=Task.BINARY.value, + num_classes=2, + label_to_name={0: "class0", 1: "class1"} + ) + relevant_ids = list(range(len(doleus_dataset))) -def mock_load_predictions(dataset_name, model_name): - return [MockLabel(0), MockLabel(1), MockLabel(1), MockLabel(1)] + result = calculate_metric_internal( + model, relevant_ids, doleus_dataset, groundtruths_labels, predictions_labels, "Precision" + ) + assert result == pytest.approx(2/3), f"Expected Precision to be {2/3} but got {result}" -def mock_inference(model, dataset, device): - pass +def test_calculate_recall_binary(): + model = MockModel(name="mock_binary_model", task_type=Task.BINARY.value) + groundtruths_labels = [MockLabel([0]), MockLabel([1]), MockLabel([0]), MockLabel([1])] + predictions_labels = [MockLabel([0]), MockLabel([1]), MockLabel([1]), MockLabel([1])] + dataset_tensor_labels = [gt.labels for gt in groundtruths_labels] + mock_torch_dataset = MockTorchDataset(dataset_tensor_labels) -@pytest.fixture(autouse=True) -def patch_functions(monkeypatch): - monkeypatch.setattr( - "moonwatcher.utils.data_storage.do_predictions_exist", mock_do_predictions_exist + doleus_dataset = DoleusClassification( + dataset=mock_torch_dataset, + name="mock_binary_dataset", + task=Task.BINARY.value, + num_classes=2, + label_to_name={0: "class0", 1: "class1"} ) - monkeypatch.setattr( - "moonwatcher.utils.data_storage.load_groundtruths", mock_load_groundtruths - ) - monkeypatch.setattr( - "moonwatcher.utils.data_storage.load_predictions", mock_load_predictions - ) - monkeypatch.setattr("moonwatcher.inference.inference", mock_inference) - + relevant_ids = list(range(len(doleus_dataset))) -def load_data_for_testing(): - model = MockModel( - name="mock_model", task_type=TaskType.CLASSIFICATION.value, device="cpu" - ) - dataset = MockDataset( - name="mock_dataset", - labels=[0, 1, 0, 1], - label_to_name={0: "class0", 1: "class1"}, - ) - 
relevant_ids = list(range(len(dataset.labels))) - groundtruths_loaded = mock_load_groundtruths(dataset.name) - predictions_loaded = mock_load_predictions(dataset.name, model.name) - return model, relevant_ids, dataset, groundtruths_loaded, predictions_loaded - - -def test_calculate_accuracy(): - ( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - ) = load_data_for_testing() result = calculate_metric_internal( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - "Accuracy", + model, relevant_ids, doleus_dataset, groundtruths_labels, predictions_labels, "Recall" ) - assert result == 0.75, f"Expected Accuracy to be 0.75 but got {result}" + assert result == 1.0, f"Expected Recall to be 1.0 but got {result}" -def test_calculate_precision(): - ( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - ) = load_data_for_testing() - result = calculate_metric_internal( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - "Precision", +def test_calculate_f1_binary(): + model = MockModel(name="mock_binary_model", task_type=Task.BINARY.value) + groundtruths_labels = [MockLabel([0]), MockLabel([1]), MockLabel([0]), MockLabel([1])] + predictions_labels = [MockLabel([0]), MockLabel([1]), MockLabel([1]), MockLabel([1])] + + dataset_tensor_labels = [gt.labels for gt in groundtruths_labels] + mock_torch_dataset = MockTorchDataset(dataset_tensor_labels) + + doleus_dataset = DoleusClassification( + dataset=mock_torch_dataset, + name="mock_binary_dataset", + task=Task.BINARY.value, + num_classes=2, + label_to_name={0: "class0", 1: "class1"} ) - assert ( - result == 0.66667 - ), f"Expected Precision to be 0.66667 but got { - result}" - - -def test_calculate_recall(): - ( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - ) = load_data_for_testing() + relevant_ids = list(range(len(doleus_dataset))) + result = calculate_metric_internal( - 
model, relevant_ids, dataset, groundtruths_loaded, predictions_loaded, "Recall" + model, relevant_ids, doleus_dataset, groundtruths_labels, predictions_labels, "F1_Score" ) - assert result == 1.0, f"Expected Recall to be 1.0 but got {result}" + assert result == 0.8, f"Expected F1_Score to be 0.8 but got {result}" -def test_calculate_f1(): - ( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - ) = load_data_for_testing() +def test_calculate_accuracy_multilabel(): + model = MockModel(name="mock_multilabel_model", task_type=Task.MULTILABEL.value) + + groundtruths_labels = [ + MockLabel([1, 0, 1]), MockLabel([0, 1, 1]), + MockLabel([1, 1, 0]), MockLabel([0, 0, 1]) + ] + predictions_labels = [ + MockLabel([1, 0, 1]), MockLabel([0, 1, 0]), + MockLabel([1, 1, 0]), MockLabel([0, 1, 1]) + ] + + dataset_tensor_labels = [gt.labels for gt in groundtruths_labels] + mock_torch_dataset = MockTorchDataset(dataset_tensor_labels) + + doleus_dataset = DoleusClassification( + dataset=mock_torch_dataset, + name="mock_multilabel_dataset", + task=Task.MULTILABEL.value, + num_classes=3, + label_to_name={0: "classA", 1: "classB", 2: "classC"} + ) + relevant_ids = list(range(len(doleus_dataset))) + result = calculate_metric_internal( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - "F1_Score", + model, relevant_ids, doleus_dataset, groundtruths_labels, predictions_labels, "Accuracy" ) - assert result == 0.8, f"Expected F1_Score to be 0.8 but got {result}" + assert result == 0.5, f"Expected Multilabel Accuracy to be 0.5 but got {result}" if __name__ == "__main__": diff --git a/tests/test_metric_detection.py b/tests/test_metric_detection.py index 0989f6d..e5271f0 100644 --- a/tests/test_metric_detection.py +++ b/tests/test_metric_detection.py @@ -1,129 +1,111 @@ import pytest import torch -from moonwatcher.dataset.dataset import MoonwatcherDataset -from moonwatcher.metric import calculate_metric - - -class MockModel: - def 
__init__(self, name, task_type, device): - self.name = name - self.task_type = task_type - self.device = device - - -class MockDataset(MoonwatcherDataset): - def __init__(self, name): - self.name = name - self.dataset = [] - - -class MockDetection: - def __init__(self, boxes, labels, scores=None): - self.boxes = torch.tensor(boxes) - self.labels = torch.tensor(labels) - self.scores = torch.tensor(scores) if scores else None - - def to_dict(self): - result = {"boxes": self.boxes, "labels": self.labels} - if self.scores is not None: - result["scores"] = self.scores - return result - - -def mock_do_predictions_exist(dataset_name, model_name): - return True - - -def mock_load_groundtruths(dataset_name): - return [ - MockDetection([[50, 50, 150, 150]], [1]), - MockDetection([[30, 30, 120, 120]], [0]), - MockDetection([[10, 10, 100, 100]], [1]), - MockDetection([[40, 40, 140, 140]], [1]), - ] - - -def mock_load_predictions(dataset_name, model_name): - return [ - MockDetection([[50, 50, 150, 150]], [1], [0.9]), - MockDetection([[35, 35, 115, 115]], [0], [0.8]), - MockDetection([[15, 15, 105, 105]], [1], [0.75]), - MockDetection([[45, 45, 145, 145]], [1], [0.85]), - ] - - -def mock_inference(model, dataset, device): - pass - - -@pytest.fixture(autouse=True) -def patch_functions(monkeypatch): - monkeypatch.setattr( - "moonwatcher.utils.data_storage.do_predictions_exist", mock_do_predictions_exist - ) - monkeypatch.setattr( - "moonwatcher.utils.data_storage.load_groundtruths", mock_load_groundtruths +from torch.utils.data import Dataset as TorchDataset +from doleus.datasets.detection import DoleusDetection +from doleus.annotations import Annotations, BoundingBoxes +from doleus.utils.data import TaskType +from doleus.metrics.calculator import calculate_metric + +# Define a simple dummy dataset for detection +class DummyDetectionDataset(TorchDataset): + def __init__(self, num_samples=4): + self.num_samples = num_samples + # Predefined data (image placeholder, boxes, labels) + 
self.data = [ + ( + torch.randn(3, 100, 100), + torch.tensor([[10, 10, 50, 50], [60, 60, 90, 90]]), + torch.tensor([0, 1]) + ), + ( + torch.randn(3, 100, 100), + torch.tensor([[20, 20, 70, 70]]), + torch.tensor([2]) + ), + ( + torch.randn(3, 100, 100), + torch.tensor([[30, 30, 80, 80], [50, 50, 95, 95]]), + torch.tensor([1, 3]) + ), + ( + torch.randn(3, 100, 100), + torch.tensor([[40, 40, 90, 90]]), + torch.tensor([0]) + ), + ] + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + if idx >= self.num_samples: + raise IndexError("Index out of bounds") + return self.data[idx] + + +# --- Test Setup --- + +@pytest.fixture(scope="module") +def detection_data(): + """Prepares a DoleusDetection dataset with sample ground truths and predictions.""" + + # 1. Create DoleusDetection dataset + doleus_dataset = DoleusDetection( + dataset=DummyDetectionDataset(), + name="test_detection_dataset", ) - monkeypatch.setattr( - "moonwatcher.utils.data_storage.load_predictions", mock_load_predictions - ) - monkeypatch.setattr("moonwatcher.inference.inference", mock_inference) - -def load_data_for_testing(): - model = MockModel( - name="mock_model", task_type=TaskType.DETECTION.value, device="cpu" - ) - dataset = MockDataset(name="mock_dataset") - relevant_ids = list(range(4)) - groundtruths_loaded = mock_load_groundtruths(dataset.name) - predictions_loaded = mock_load_predictions(dataset.name, model.name) - return model, relevant_ids, dataset, groundtruths_loaded, predictions_loaded + # 2. Define sample predictions. Datapoint number corresponds to the index of the datapoint in the underlying dataset. 
+ predictions = [ + BoundingBoxes( + datapoint_number=0, + boxes_xyxy=torch.tensor([[12, 12, 48, 48], [65, 65, 88, 88]], dtype=torch.float32), + labels=torch.tensor([0, 1]), # Correct labels predicted + scores=torch.tensor([0.9, 0.85]), + ), + BoundingBoxes( + datapoint_number=1, + boxes_xyxy=torch.tensor([[25, 25, 75, 75], [5, 5, 15, 15]], dtype=torch.float32), + labels=torch.tensor([2, 0]), # Correct label + a false positive (class 0) + scores=torch.tensor([0.8, 0.5]), + ), + BoundingBoxes( + datapoint_number=2, + boxes_xyxy=torch.tensor([[30, 30, 80, 80]], dtype=torch.float32), + labels=torch.tensor([1]), # Predicts only label 1 (misses label 3) + scores=torch.tensor([0.75]), + ), + BoundingBoxes( + datapoint_number=3, + boxes_xyxy=torch.tensor([[42, 42, 88, 88], [10, 60, 30, 80]], dtype=torch.float32), + labels=torch.tensor([3, 2]), # Incorrect labels predicted (FP) + scores=torch.tensor([0.85, 0.6]), + ), + ] + # 3. Add predictions + doleus_dataset.add_model_predictions(predictions, model_id="test_model") + return doleus_dataset -def test_calculate_iou(): - ( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - ) = load_data_for_testing() - result = calculate_metric( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - "IntersectionOverUnion", - ) - assert ( - result > 0.75 - ), f"Expected IoU to be greater than 0.75 but got { - result}" +# --- Placeholder for Actual Tests --- +# (Tests will be added in the next step) -def test_calculate_map(): - ( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - ) = load_data_for_testing() - result = calculate_metric( - model, - relevant_ids, - dataset, - groundtruths_loaded, - predictions_loaded, - "mAP", - ) - assert ( - result > 0.7 - ), f"Expected mAP to be greater than 0.7 but got { - result}" +# Example of how to use the fixture (will be replaced by actual tests): +# def test_setup_works(detection_data): +# doleus_dataset, 
relevant_ids = detection_data +# assert doleus_dataset.name == "test_detection_dataset" +# assert len(relevant_ids) == 4 +# assert len(doleus_dataset.groundtruths) == 4 +# assert len(doleus_dataset.predictions) == 4 +# assert doleus_dataset.task_type == TaskType.DETECTION.value +# print("\nSetup seems okay.") if __name__ == "__main__": - pytest.main() + # You can run pytest directly: + pytest.main([__file__]) + # Or manually invoke the fixture for debugging: + # print("Debugging fixture setup:") + # data, ids = detection_data() + # test_setup_works((data, ids)) From 08ba4d4897cc5bb8db89d45fedac9b2d6ef08532 Mon Sep 17 00:00:00 2001 From: iamheinrich <76793837+iamheinrich@users.noreply.github.com> Date: Fri, 23 May 2025 19:13:44 +0200 Subject: [PATCH 2/6] refactor: refactored cumulated prediction store into base, classification and detection store. added multilabel support for predictions. --- doleus/annotations/classification.py | 19 +- doleus/datasets/base.py | 54 ++++-- doleus/storage/__init__.py | 14 +- doleus/storage/base_store.py | 122 +++++++++++++ doleus/storage/classification_store.py | 162 +++++++++++++++++ doleus/storage/detection_store.py | 85 +++++++++ doleus/storage/prediction_store.py | 238 ------------------------- 7 files changed, 437 insertions(+), 257 deletions(-) create mode 100644 doleus/storage/base_store.py create mode 100644 doleus/storage/classification_store.py create mode 100644 doleus/storage/detection_store.py delete mode 100644 doleus/storage/prediction_store.py diff --git a/doleus/annotations/classification.py b/doleus/annotations/classification.py index bb4441d..cb69381 100644 --- a/doleus/annotations/classification.py +++ b/doleus/annotations/classification.py @@ -13,7 +13,7 @@ class Labels(Annotation): """ def __init__( - self, datapoint_number: int, labels: Tensor, scores: Optional[Tensor] = None + self, datapoint_number: int, labels: Optional[Tensor], scores: Optional[Tensor] = None ): """Initialize a Labels instance. 
@@ -21,11 +21,18 @@ def __init__( ---------- datapoint_number : int Index for the corresponding data point. - labels : Tensor - A 1D integer tensor representing the label(s). + labels : Optional[Tensor] + A 1D integer tensor. For single-label tasks, this typically contains one class index + (e.g., `tensor([2])`). For multilabel tasks, this is typically a multi-hot encoded + tensor (e.g., `tensor([1, 0, 1])`). Can be `None` if only `scores` are provided. scores : Optional[Tensor], optional - A float tensor containing predicted probability scores (optional). + A 1D float tensor. For single-label tasks (e.g. multiclass), this usually contains + probabilities for each class (e.g., `tensor([0.1, 0.2, 0.7])`). For multilabel + tasks, this contains independent probabilities for each label (e.g., + `tensor([0.8, 0.1, 0.9])`). Optional. """ + if labels is None and scores is None: + raise ValueError("Either 'labels' or 'scores' must be provided but both are None.") super().__init__(datapoint_number) self.labels = labels self.scores = scores @@ -38,7 +45,9 @@ def to_dict(self) -> dict: dict Dictionary with keys 'labels' and optionally 'scores'. 
""" - output = {"labels": self.labels} + output = {} + if self.labels is not None: + output["labels"] = self.labels if self.scores is not None: output["scores"] = self.scores return output diff --git a/doleus/datasets/base.py b/doleus/datasets/base.py index 66ba1bf..4dd8fa4 100644 --- a/doleus/datasets/base.py +++ b/doleus/datasets/base.py @@ -7,10 +7,16 @@ from tqdm import tqdm from doleus.annotations import BoundingBoxes, Labels -from doleus.storage import GroundTruthStore, MetadataStore, PredictionStore +from doleus.storage import ( + ClassificationPredictionStore, + DetectionPredictionStore, + GroundTruthStore, + MetadataStore, +) from doleus.utils import ( ATTRIBUTE_FUNCTIONS, OPERATOR_DICT, + TaskType, get_current_timestamp, to_numpy_image, create_filename, @@ -71,7 +77,18 @@ def __init__( self.metadata["_timestamp"] = get_current_timestamp() self.groundtruth_store = GroundTruthStore(task_type=task_type, dataset=dataset) - self.prediction_store = PredictionStore(task_type=task_type) + + if self.task_type == TaskType.CLASSIFICATION.value: + if not self.task: + raise ValueError( + "For classification task_type, a specific 'task' (e.g., binary, multiclass, multilabel) must be provided." 
+ ) + self.prediction_store = ClassificationPredictionStore() + elif self.task_type == TaskType.DETECTION.value: + self.prediction_store = DetectionPredictionStore() + else: + raise ValueError(f"Unsupported task_type: {self.task_type} for PredictionStore assignment") + self.metadata_store = MetadataStore( num_datapoints=len(dataset), metadata=per_datapoint_metadata ) @@ -86,7 +103,7 @@ def __getattr__(self, attr): return getattr(self.dataset, attr) @abstractmethod - def _create_new_instance(self, dataset, indices): + def _create_new_instance(self, dataset, indices, slice_name): pass def add_model_predictions( @@ -106,10 +123,21 @@ def add_model_predictions( model_id : str Name of the model that generated these predictions """ + kwargs = {} + if self.task_type == TaskType.CLASSIFICATION.value: + kwargs['task'] = self.task + # Ensure predictions is a Tensor for classification + if not isinstance(predictions, torch.Tensor): + raise TypeError("For classification tasks, predictions must be a torch.Tensor.") + elif self.task_type == TaskType.DETECTION.value: + # Ensure predictions is a List[Dict] for detection + if not isinstance(predictions, list) or not all(isinstance(p, dict) for p in predictions): + raise TypeError("For detection tasks, predictions must be a list of dictionaries.") + self.prediction_store.add_predictions( predictions=predictions, model_id=model_id, - task=self.task, + **kwargs, ) # ------------------------------------------------------------------------- @@ -165,23 +193,27 @@ def add_metadata_from_list(self, metadata_list: List[Dict[str, Any]]): for key, value in md_dict.items(): self.metadata_store.add_metadata(i, key, value) - def add_predefined_metadata(self, attribute: str) -> None: + def add_predefined_metadata(self, keys: Union[str, List[str]]): """Add predefined metadata using functions from ATTRIBUTE_FUNCTIONS. Parameters ---------- - attribute : str - Name of predefined metadata function to compute and add. 
+ keys : Union[str, List[str]] + Name(s) of predefined metadata function(s) to compute and add. Available keys are defined in ATTRIBUTE_FUNCTIONS. Raises ------ ValueError - If attribute is not found in ATTRIBUTE_FUNCTIONS. + If any key is not found in ATTRIBUTE_FUNCTIONS. """ - if attribute not in ATTRIBUTE_FUNCTIONS: - raise ValueError(f"Unknown predefined metadata attribute: {attribute}") - self.add_metadata(attribute, ATTRIBUTE_FUNCTIONS[attribute]) + if isinstance(keys, str): + keys = [keys] + + for key in keys: + if key not in ATTRIBUTE_FUNCTIONS: + raise ValueError(f"Unknown predefined metadata key: {key}") + self.add_metadata(key, ATTRIBUTE_FUNCTIONS[key]) def add_metadata_from_dataframe(self, df): """Add metadata from a pandas DataFrame. diff --git a/doleus/storage/__init__.py b/doleus/storage/__init__.py index 0e1ded7..e958b4f 100644 --- a/doleus/storage/__init__.py +++ b/doleus/storage/__init__.py @@ -1,5 +1,13 @@ -from doleus.storage.ground_truth_store import GroundTruthStore +from doleus.storage.base_store import BasePredictionStore +from doleus.storage.classification_store import ClassificationPredictionStore +from doleus.storage.detection_store import DetectionPredictionStore +from doleus.storage.groundtruth_store import GroundTruthStore from doleus.storage.metadata_store import MetadataStore -from doleus.storage.prediction_store import PredictionStore -__all__ = ["MetadataStore", "PredictionStore", "GroundTruthStore"] +__all__ = [ + "BasePredictionStore", + "ClassificationPredictionStore", + "DetectionPredictionStore", + "GroundTruthStore", + "MetadataStore", +] diff --git a/doleus/storage/base_store.py b/doleus/storage/base_store.py new file mode 100644 index 0000000..eae633a --- /dev/null +++ b/doleus/storage/base_store.py @@ -0,0 +1,122 @@ +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Union + +import torch + +from doleus.annotations import Annotation, Annotations + + +class BasePredictionStore(ABC): + """Base storage 
for model predictions for a specific dataset instance.""" + + def __init__(self): + """Initialize the prediction store.""" + self.predictions: Dict[str, Annotations] = {} + + @abstractmethod + def add_predictions( + self, + predictions: Union[torch.Tensor, List[Dict[str, Any]]], + model_id: str, + **kwargs, + ) -> None: + """ + Store predictions for a model. + Actual implementation will depend on the task type (classification, detection). + + Parameters + ---------- + predictions : Union[torch.Tensor, List[Dict[str, Any]]] + Model predictions to store. For classification, this should be a + tensor of shape [N, C] where N is the number of samples and C is the + number of classes. For detection, this should be a list of dictionaries + with 'boxes', 'labels', and 'scores' keys. + model_id : str + Identifier of the specified model. + kwargs : dict + Additional arguments specific to the subclass implementation (e.g., 'task' for classification). + """ + pass + + @abstractmethod + def _process_predictions( + self, + predictions: Union[torch.Tensor, List[Dict[str, Any]], Annotations], + **kwargs, + ) -> Annotations: + """ + Process raw predictions into the standard annotation format. + Actual implementation will depend on the task type. + + Parameters + ---------- + predictions : Union[torch.Tensor, List[Dict[str, Any]], Annotations] + Raw predictions to process. + kwargs : dict + Additional arguments specific to the subclass implementation (e.g., 'task' for classification). + + Returns + ------- + Annotations + Processed predictions in standard annotation format. + """ + pass + + def get(self, model_id: str, datapoint_number: int) -> Annotation: + """Get a single annotation object by datapoint number. + + Parameters + ---------- + model_id : str + Identifier of the model to get predictions for. + datapoint_number : int + The ID of the sample in the dataset. + + Returns + ------- + Annotation + The specific Annotation object (e.g., Labels, BoundingBoxes) for the datapoint. 
+ """ + if model_id not in self.predictions: + raise KeyError(f"No predictions found for model: {model_id}") + return self.predictions[model_id][datapoint_number] + + def get_subset(self, model_id: str, indices: List[int]) -> Annotations: + """Get a subset of predictions for a specific model based on indices. + + Parameters + ---------- + model_id : str + Identifier of the model to get predictions for. + indices : List[int] + List of indices to get predictions for. + + Returns + ------- + Annotations + An Annotations object containing predictions for the specified indices. + """ + if model_id not in self.predictions: + raise KeyError(f"No predictions found for model: {model_id}") + + subset_annotations = Annotations() + for i in indices: + subset_annotations.add(self.predictions[model_id][i]) + return subset_annotations + + def get_predictions(self, model_id: str) -> Annotations: + """Get all predictions for a specific model. + + Parameters + ---------- + model_id : str + Identifier of the model to get predictions for. + + Returns + ------- + Annotations + An Annotations object containing all predictions for the specified model. 
+ """ + if model_id not in self.predictions: + raise KeyError(f"No predictions found for model: {model_id}") + return self.predictions[model_id] \ No newline at end of file diff --git a/doleus/storage/classification_store.py b/doleus/storage/classification_store.py new file mode 100644 index 0000000..003ee13 --- /dev/null +++ b/doleus/storage/classification_store.py @@ -0,0 +1,162 @@ +from typing import Any, Dict, List, Optional, Union + +import torch +from torch import Tensor + +from doleus.annotations import Annotations, Labels +from doleus.storage.base_store import BasePredictionStore +from doleus.utils import Task + + +class ClassificationPredictionStore(BasePredictionStore): + """Storage for classification model predictions.""" + + def add_predictions( + self, + predictions: torch.Tensor, + model_id: str, + task: str, + ) -> None: + """ + Store predictions for a classification model. + + Parameters + ---------- + predictions : torch.Tensor + Model predictions to store. + model_id : str + Identifier of the specified model. + task : str + The specific classification task (e.g., "multilabel", "multiclass", "binary"). + """ + if not isinstance(predictions, torch.Tensor): + raise TypeError("For classification, predictions must be a torch.Tensor.") + + processed_predictions = self._process_predictions(predictions, task=task) + self.predictions[model_id] = processed_predictions + + def _process_predictions( + self, + predictions: torch.Tensor, + task: str, + ) -> Annotations: + """Process raw classification predictions into the standard annotation format. + + The behavior depends on the `task` and the shape/dtype of `predictions`: + + - **Task.BINARY.value**: + - If `predictions` is 1D and `dtype` is float (scores for the positive class): + - `Labels.scores` will store the raw float score. + - `Labels.labels` will be `None`. + - If `predictions` is 1D and `dtype` is int (0 or 1): + - `Labels.labels` will store the integer label. + - `Labels.scores` will be `None`. 
+ - 2D predictions currently raise a ValueError. + + - **Task.MULTICLASS.value**: + - If `predictions` is 1D and `dtype` is int (class indices): + - `Labels.labels` will store the integer class index. + - `Labels.scores` will be `None`. + - If `predictions` is 1D and `dtype` is float: Raises ValueError (expected class indices). + - If `predictions` is 2D (shape [N, C]) and `dtype` is float (logits or probabilities per class): + - If values are outside [0,1] (suggesting logits), `torch.softmax` is applied along the class dimension. + - `Labels.scores` will store the [C] float tensor of probabilities. + - `Labels.labels` will store the class index derived from `argmax` of these scores/original logits. + - If `predictions` is 2D and `dtype` is int: Raises ValueError (expected float scores/logits). + + - **Task.MULTILABEL.value**: + - If `predictions` is 2D (shape [N, C]) and `dtype` is float (logits or probabilities per class): + - If values are outside [0,1] (suggesting logits), `torch.sigmoid` is applied element-wise. + - `Labels.scores` will store the [C] float tensor of probabilities. + - `Labels.labels` will be `None`. + - If `predictions` is 2D (shape [N, C]) and `dtype` is int (multi-hot encoded): + - `Labels.labels` will store the [C] integer tensor. + - `Labels.scores` will be `None`. + - Boolean inputs are not supported for multilabel. 1D predictions or other dtypes for 2D currently raise a ValueError. + + Parameters + ---------- + predictions : torch.Tensor + Raw predictions to process. Typically shape [N] or [N, C]. + task : str + The specific classification task ("binary", "multiclass", "multilabel"). + + Returns + ------- + Annotations + Processed predictions where each element is a `Labels` object. 
+ """ + processed = Annotations() + num_samples = predictions.shape[0] + + for i in range(num_samples): + current_labels: Optional[Tensor] = None + current_scores: Optional[Tensor] = None + + if task == Task.BINARY.value: + if predictions.dim() == 1: + if predictions.dtype.is_floating_point: + current_scores = predictions[i].unsqueeze(0) + current_labels = None # Scores are provided, so labels can be None + else: # Integer type + current_labels = predictions[i].unsqueeze(0) + current_scores = None + elif predictions.dim() == 2: + #TODO: We need to handle samplewise predictions at some point. + raise ValueError(f"{task} classification predictions must be 1D tensor. Got {predictions.dim()}D") + else: + raise ValueError(f"{task} classification predictions must be 1D or 2D tensor. Got {predictions.dim()}D") + + elif task == Task.MULTICLASS.value: + if predictions.dim() == 1: + if predictions.dtype.is_floating_point: + raise ValueError(f"For {task} with 1D predictions, dtype must be integer, got {predictions.dtype}") + else: # Integer type + current_labels = predictions[i].unsqueeze(0) + current_scores = None + elif predictions.dim() == 2: # Shape [N, C] + prediction_sample = predictions[i] # Shape [C] + if prediction_sample.dtype.is_floating_point: + current_labels = prediction_sample.argmax(dim=0).unsqueeze(0) + if torch.any(prediction_sample < 0) or torch.any(prediction_sample > 1): + current_scores = torch.softmax(prediction_sample, dim=0) + else: + # Assuming probabilities if values are within [0,1] + current_scores = prediction_sample + else: # Integer type + raise ValueError(f"For {task} with 2D predictions, dtype must be float (scores/logits), got {prediction_sample.dtype}") + else: + raise ValueError(f"{task} classification predictions must be 1D or 2D tensor. 
Got {predictions.dim()}D")
+
+            elif task == Task.MULTILABEL.value:
+                if predictions.dim() == 2:  # Expect [N, C]
+                    prediction_sample = predictions[i]  # Shape [C]
+
+                    if prediction_sample.dtype.is_floating_point:
+                        if torch.any(prediction_sample < 0) or torch.any(prediction_sample > 1):
+                            probabilities = torch.sigmoid(prediction_sample)
+                        else:
+                            probabilities = prediction_sample
+                        current_scores = probabilities
+                        current_labels = None
+
+                    elif prediction_sample.dtype in (torch.uint8, torch.int8, torch.int16, torch.int32, torch.int64):
+                        current_labels = prediction_sample.int()
+                        current_scores = None
+                    else:
+                        raise ValueError(
+                            f"For {task}, 2D prediction samples must be float or integer. Got {prediction_sample.dtype} for sample {i}"
+                        )
+                else:
+                    raise ValueError(
+                        f"{task} classification predictions must be a 2D tensor of shape [N, C]. "
+                        f"Got {predictions.dim()}D with shape {predictions.shape}"
+                    )
+
+            else:
+                raise ValueError(f"Unsupported task: {task}")
+
+            ann = Labels(datapoint_number=i, labels=current_labels, scores=current_scores)
+            processed.add(ann)
+
+        return processed
\ No newline at end of file
diff --git a/doleus/storage/detection_store.py b/doleus/storage/detection_store.py
new file mode 100644
index 0000000..d036ada
--- /dev/null
+++ b/doleus/storage/detection_store.py
@@ -0,0 +1,85 @@
+from typing import Any, Dict, List, Union
+
+import torch
+
+from doleus.annotations import Annotations, BoundingBoxes
+from doleus.storage.base_store import BasePredictionStore
+
+
+class DetectionPredictionStore(BasePredictionStore):
+    """Storage for detection model predictions."""
+
+    def add_predictions(
+        self,
+        predictions: List[Dict[str, Any]],
+        model_id: str,
+        **kwargs,
+    ) -> None:
+        """
+        Store predictions for a detection model.
+
+        Parameters
+        ----------
+        predictions : List[Dict[str, Any]]
+            Model predictions to store. This should be a list of dictionaries,
+            each with 'boxes', 'labels', and 'scores' keys.
+        model_id : str
+            Identifier of the specified model.
+ """ + if not isinstance(predictions, list): + raise TypeError("For detection, predictions must be a list.") + if not all(isinstance(p, dict) for p in predictions): + raise TypeError("Each item in detection predictions list must be a dictionary.") + + processed_predictions = self._process_predictions(predictions) + self.predictions[model_id] = processed_predictions + + def _process_predictions( + self, + predictions: List[Dict[str, Any]], + **kwargs, + ) -> Annotations: + """Process raw detection predictions into the standard annotation format. + + Parameters + ---------- + predictions : List[Dict[str, Any]] + Raw predictions to process. Must be a list of dictionaries. + + Returns + ------- + Annotations + Processed predictions in standard annotation format. + """ + processed = Annotations() + + for i, pred_dict in enumerate(predictions): + # Validate keys + required_keys = {"boxes", "labels", "scores"} + if not required_keys.issubset(pred_dict.keys()): + raise ValueError( + f"Detection prediction dict for sample {i} missing keys. 
" + f"Required: {required_keys}, Got: {list(pred_dict.keys())}" + ) + + boxes_xyxy = torch.as_tensor(pred_dict["boxes"], dtype=torch.float32) + labels = torch.as_tensor(pred_dict["labels"], dtype=torch.long) + scores = torch.as_tensor(pred_dict["scores"], dtype=torch.float32) + + # Validate shapes + num_detections = boxes_xyxy.shape[0] + if not (boxes_xyxy.ndim == 2 and boxes_xyxy.shape[1] == 4): + raise ValueError(f"boxes for sample {i} must have shape (M,4), Got: {boxes_xyxy.shape}") + if not (labels.ndim == 1 and labels.shape[0] == num_detections): + raise ValueError(f"labels for sample {i} must have shape (M,), Got: {labels.shape}") + if not (scores.ndim == 1 and scores.shape[0] == num_detections): + raise ValueError(f"scores for sample {i} must have shape (M,), Got: {scores.shape}") + + ann = BoundingBoxes( + datapoint_number=i, + boxes_xyxy=boxes_xyxy, + labels=labels, + scores=scores, + ) + processed.add(ann) + return processed \ No newline at end of file diff --git a/doleus/storage/prediction_store.py b/doleus/storage/prediction_store.py deleted file mode 100644 index 81092b5..0000000 --- a/doleus/storage/prediction_store.py +++ /dev/null @@ -1,238 +0,0 @@ -from typing import Any, Dict, List, Optional, Union - -import torch - -from doleus.annotations import Annotations, BoundingBoxes, Labels -from doleus.utils import Task, TaskType - - -class PredictionStore: - """Storage for model predictions for a specific dataset instance. - - Each Doleus Dataset has its own PredictionStore instance to manage - predictions from different models for that specific dataset. - """ - - def __init__(self, task_type: str): - """Initialize the prediction store. - - Parameters - ---------- - task_type : str - Type of task (e.g., "classification", "detection"). 
- """ - self.task_type = task_type - self.predictions: Dict[str, Annotations] = {} - - def add_predictions( - self, - predictions: Union[torch.Tensor, List[Dict[str, Any]]], - model_id: str, - task: Optional[str] = None, - ) -> None: - """ - Store predictions for a model. - - Parameters - ---------- - predictions : Union[torch.Tensor, List[Dict[str, Any]]] - Model predictions to store. For classification, this should be a - tensor of shape [N, C] where N is the number of samples and C is the - number of classes. For detection, this should be a list of dictionaries - with 'boxes', 'labels', and 'scores' keys. - model_id : str - Identifier of the specified model. - task : Optional[str], optional - The specific task (e.g., "multilabel", "multiclass"), by default None. - """ - processed_predictions = self._process_predictions(predictions, task) - self.predictions[model_id] = processed_predictions - - def _process_predictions( - self, - predictions: Union[torch.Tensor, List[Dict[str, Any]], Annotations], - task: Optional[str] = None, - ) -> Annotations: - """Process raw predictions into the standard annotation format. - - Parameters - ---------- - predictions : Union[torch.Tensor, List[Dict[str, Any]], Annotations] - Raw predictions to process. Can be: - - A torch.Tensor for classification tasks - - A list of dictionaries for detection tasks - - An already processed Annotations object - task : Optional[str], optional - The specific task (e.g., "multilabel", "multiclass"), by default None. - - - Returns - ------- - Annotations - Processed predictions in standard annotation format. - """ - if isinstance(predictions, Annotations) and ( - isinstance(predictions[0], Labels) - or isinstance(predictions[0], BoundingBoxes) - ): - return predictions - - processed = Annotations() - - if self.task_type == TaskType.CLASSIFICATION.value: - if not isinstance(predictions, torch.Tensor): - raise TypeError( - "For classification, predictions must be a torch.Tensor." 
- ) - - num_samples = predictions.shape[0] - - if predictions.dim() == 1: - # Assume these are predicted labels (class IDs) for single-label tasks - for i in range(num_samples): - label_val = predictions[i].unsqueeze(0) # Ensure [1] shape - ann = Labels(datapoint_number=i, labels=label_val, scores=None) - processed.add(ann) - - elif predictions.dim() == 2: # Shape [N, C] - for i in range(num_samples): - prediction_row = predictions[i] # This is the [C] tensor for the i-th sample - current_labels: torch.Tensor - current_scores: Optional[torch.Tensor] - - if task == Task.MULTILABEL.value: - if prediction_row.dtype in (torch.long, torch.int, torch.bool): - # Input is integer multi-hot - current_labels = prediction_row - current_scores = None - else: # Float input, assumed to be logits or probabilities - # Apply sigmoid if not already probabilities in [0,1] - if not (prediction_row.min() >= 0 and prediction_row.max() <= 1): - processed_scores_for_row = torch.sigmoid(prediction_row) - else: # Already probabilities - processed_scores_for_row = prediction_row - - current_labels = (processed_scores_for_row >= 0.5).long() # Default threshold 0.5 - current_scores = processed_scores_for_row - - ann = Labels( - datapoint_number=i, - labels=current_labels, - scores=current_scores, - ) - else: # Binary, Multiclass, or task is None (default to old behavior) - current_labels = prediction_row.argmax(dim=0).unsqueeze(0) # [1] tensor - - if prediction_row.dtype == torch.float: - current_scores = torch.softmax(prediction_row, dim=0) # [C] tensor - else: # Integer input - current_scores = None - - ann = Labels( - datapoint_number=i, - labels=current_labels, - scores=current_scores, - ) - processed.add(ann) - else: - raise ValueError( - "Classification predictions must be a 1D or 2D tensor." - ) - - elif self.task_type == TaskType.DETECTION.value: - if not isinstance(predictions, list): - raise TypeError( - "For detection, predictions must be a list of length N." 
- ) - if not all(isinstance(p, dict) for p in predictions): - raise TypeError( - "Each item in detection predictions list must be a dictionary." - ) - - - # Each element should look like {"boxes": (M,4), "labels": (M,), "scores": (M,)} - for i, pred_dict in enumerate(predictions): - # Validate keys - required_keys = {"boxes", "labels", "scores"} - if not required_keys.issubset(pred_dict.keys()): - raise ValueError(f"Detection prediction dict for sample {i} missing keys. Required: {required_keys}") - - boxes_xyxy = torch.as_tensor(pred_dict["boxes"], dtype=torch.float32) - labels = torch.as_tensor(pred_dict["labels"], dtype=torch.long) - scores = torch.as_tensor(pred_dict["scores"], dtype=torch.float32) - - # Validate shapes - num_detections = boxes_xyxy.shape[0] - if not (boxes_xyxy.ndim == 2 and boxes_xyxy.shape[1] == 4): - raise ValueError(f"boxes for sample {i} must have shape (M,4)") - if not (labels.ndim == 1 and labels.shape[0] == num_detections): - raise ValueError(f"labels for sample {i} must have shape (M,)") - if not (scores.ndim == 1 and scores.shape[0] == num_detections): - raise ValueError(f"scores for sample {i} must have shape (M,)") - - - ann = BoundingBoxes( - datapoint_number=i, - boxes_xyxy=boxes_xyxy, - labels=labels, - scores=scores, - ) - processed.add(ann) - else: - raise ValueError(f"Unsupported task type: {self.task_type}") - - - return processed - - def get(self, model_id: str, datapoint_number: int): - """Get annotation by datapoint number. - - Parameters - ---------- - datapoint_number : int - The ID of the sample in the dataset. - - Returns - ------- - Annotation - The annotation for the datapoint. - """ - if model_id not in self.predictions: - raise KeyError(f"No predictions found for model: {model_id}") - return self.predictions[model_id][datapoint_number] - - def get_subset(self, model_id: str, indices: List[int]) -> List[Any]: - """Get a subset of predictions for a specific model based on indices. 
- - Parameters - ---------- - model_id : str - Identifier of the model to get predictions for. - indices : List[int] - List of indices to get predictions for. - - Returns - ------- - List[Any] - List of predictions for the specified indices. - """ - if model_id not in self.predictions: - raise KeyError(f"No predictions found for model: {model_id}") - return [self.predictions[model_id][i] for i in indices] - - def get_predictions(self, model_id: str) -> List[Any]: - """Get all predictions for a specific model. - - Parameters - ---------- - model_id : str - Identifier of the model to get predictions for. - - Returns - ------- - List[Any] - List of all predictions for the specified model. - """ - if model_id not in self.predictions: - raise KeyError(f"No predictions found for model: {model_id}") - return self.predictions[model_id].annotations From 3a66cc5b15ad7cd564e4f344acb9fb3110a67fcf Mon Sep 17 00:00:00 2001 From: iamheinrich <76793837+iamheinrich@users.noreply.github.com> Date: Sat, 24 May 2025 08:54:53 +0200 Subject: [PATCH 3/6] refactor: refactored groundtruth store. 
now split up in detection and classification store --- doleus/datasets/base.py | 20 +-- doleus/datasets/classification.py | 19 ++- doleus/datasets/detection.py | 12 +- doleus/storage/__init__.py | 13 +- doleus/storage/base_store.py | 56 ++++++++- .../classification_ground_truth_store.py | 115 ++++++++++++++++++ ....py => classification_prediction_store.py} | 0 .../storage/detection_ground_truth_store.py | 89 ++++++++++++++ ...store.py => detection_prediction_store.py} | 0 doleus/storage/ground_truth_store.py | 89 -------------- 10 files changed, 296 insertions(+), 117 deletions(-) create mode 100644 doleus/storage/classification_ground_truth_store.py rename doleus/storage/{classification_store.py => classification_prediction_store.py} (100%) create mode 100644 doleus/storage/detection_ground_truth_store.py rename doleus/storage/{detection_store.py => detection_prediction_store.py} (100%) delete mode 100644 doleus/storage/ground_truth_store.py diff --git a/doleus/datasets/base.py b/doleus/datasets/base.py index 4dd8fa4..2e91e91 100644 --- a/doleus/datasets/base.py +++ b/doleus/datasets/base.py @@ -8,11 +8,9 @@ from doleus.annotations import BoundingBoxes, Labels from doleus.storage import ( - ClassificationPredictionStore, - DetectionPredictionStore, - GroundTruthStore, MetadataStore, ) +from doleus.storage.base_store import BasePredictionStore, BaseGroundTruthStore from doleus.utils import ( ATTRIBUTE_FUNCTIONS, OPERATOR_DICT, @@ -76,18 +74,10 @@ def __init__( self.metadata = metadata if metadata is not None else {} self.metadata["_timestamp"] = get_current_timestamp() - self.groundtruth_store = GroundTruthStore(task_type=task_type, dataset=dataset) - - if self.task_type == TaskType.CLASSIFICATION.value: - if not self.task: - raise ValueError( - "For classification task_type, a specific 'task' (e.g., binary, multiclass, multilabel) must be provided." 
- ) - self.prediction_store = ClassificationPredictionStore() - elif self.task_type == TaskType.DETECTION.value: - self.prediction_store = DetectionPredictionStore() - else: - raise ValueError(f"Unsupported task_type: {self.task_type} for PredictionStore assignment") + # Ground truth and prediction stores are initialized to None in the base class. + # Specific instantiations will be handled by subclasses (DoleusClassification, DoleusDetection). + self.groundtruth_store: Optional[BaseGroundTruthStore] = None + self.prediction_store: Optional[BasePredictionStore] = None self.metadata_store = MetadataStore( num_datapoints=len(dataset), metadata=per_datapoint_metadata diff --git a/doleus/datasets/classification.py b/doleus/datasets/classification.py index b4116df..1834aa4 100644 --- a/doleus/datasets/classification.py +++ b/doleus/datasets/classification.py @@ -5,6 +5,9 @@ from doleus.datasets.base import Doleus from doleus.utils import TaskType +from doleus.storage.classification_ground_truth_store import ClassificationGroundTruthStore +from doleus.storage.classification_prediction_store import ClassificationPredictionStore +from doleus.annotations import Annotations class DoleusClassification(Doleus): @@ -49,6 +52,12 @@ def __init__( metadata=metadata, per_datapoint_metadata=per_datapoint_metadata, ) + self.groundtruth_store = ClassificationGroundTruthStore( + dataset=self.dataset, + task=self.task, + num_classes=self.num_classes + ) + self.prediction_store = ClassificationPredictionStore() def _create_new_instance(self, dataset, indices, name): # TODO: Do we need to create a new dataset instance? 
@@ -64,8 +73,12 @@ def _create_new_instance(self, dataset, indices, name): per_datapoint_metadata=metadata_subset, ) - for model_id in self.prediction_store.predictions: - sliced_preds = self.prediction_store.get_subset(model_id, indices) - new_instance.prediction_store.add_predictions(sliced_preds, model_id) + # Correctly transfer sliced predictions + if self.prediction_store and self.prediction_store.predictions: + for model_id in self.prediction_store.predictions: + # get_subset already returns an Annotations object with re-indexed datapoint_numbers + sliced_preds_annotations = self.prediction_store.get_subset(model_id, indices) + # Directly assign the Annotations object to the new instance's store + new_instance.prediction_store.predictions[model_id] = sliced_preds_annotations return new_instance diff --git a/doleus/datasets/detection.py b/doleus/datasets/detection.py index 90a5497..867bc89 100644 --- a/doleus/datasets/detection.py +++ b/doleus/datasets/detection.py @@ -4,6 +4,9 @@ from doleus.datasets.base import Doleus from doleus.utils import TaskType +from doleus.storage.detection_ground_truth_store import DetectionGroundTruthStore +from doleus.storage.detection_prediction_store import DetectionPredictionStore +from doleus.annotations import Annotations class DoleusDetection(Doleus): @@ -40,6 +43,8 @@ def __init__( metadata=metadata, per_datapoint_metadata=per_datapoint_metadata, ) + self.groundtruth_store = DetectionGroundTruthStore(dataset=self.dataset) + self.prediction_store = DetectionPredictionStore() def _create_new_instance(self, dataset, indices, slice_name): subset = Subset(dataset, indices) @@ -52,8 +57,9 @@ def _create_new_instance(self, dataset, indices, slice_name): per_datapoint_metadata=new_metadata, ) - for model_id in self.prediction_store.predictions: - sliced_preds = self.prediction_store.get_subset(model_id, indices) - new_instance.prediction_store.add_predictions(sliced_preds, model_id) + if self.prediction_store and 
self.prediction_store.predictions: + for model_id in self.prediction_store.predictions: + sliced_preds_annotations = self.prediction_store.get_subset(model_id, indices) + new_instance.prediction_store.predictions[model_id] = sliced_preds_annotations return new_instance diff --git a/doleus/storage/__init__.py b/doleus/storage/__init__.py index e958b4f..b9a56c4 100644 --- a/doleus/storage/__init__.py +++ b/doleus/storage/__init__.py @@ -1,13 +1,16 @@ -from doleus.storage.base_store import BasePredictionStore -from doleus.storage.classification_store import ClassificationPredictionStore -from doleus.storage.detection_store import DetectionPredictionStore -from doleus.storage.groundtruth_store import GroundTruthStore +from doleus.storage.base_store import BasePredictionStore, BaseGroundTruthStore +from doleus.storage.classification_ground_truth_store import ClassificationGroundTruthStore +from doleus.storage.classification_prediction_store import ClassificationPredictionStore +from doleus.storage.detection_ground_truth_store import DetectionGroundTruthStore +from doleus.storage.detection_prediction_store import DetectionPredictionStore from doleus.storage.metadata_store import MetadataStore __all__ = [ + "BaseGroundTruthStore", "BasePredictionStore", + "ClassificationGroundTruthStore", "ClassificationPredictionStore", + "DetectionGroundTruthStore", "DetectionPredictionStore", - "GroundTruthStore", "MetadataStore", ] diff --git a/doleus/storage/base_store.py b/doleus/storage/base_store.py index eae633a..f2899a6 100644 --- a/doleus/storage/base_store.py +++ b/doleus/storage/base_store.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Union, Optional import torch @@ -119,4 +119,56 @@ def get_predictions(self, model_id: str) -> Annotations: """ if model_id not in self.predictions: raise KeyError(f"No predictions found for model: {model_id}") - return self.predictions[model_id] \ No newline 
at end of file + return self.predictions[model_id] + + +class BaseGroundTruthStore(ABC): + """Base storage for ground truth data for a specific dataset instance.""" + + def __init__(self, dataset: Any): + """ + Initialize the ground truth store. + + Parameters + ---------- + dataset : Any + The raw PyTorch dataset object. + """ + self.dataset = dataset + self.groundtruths: Optional[Annotations] = None + self.groundtruths = self._process_groundtruths() + + @abstractmethod + def _process_groundtruths(self) -> Annotations: + """ + Process raw ground truth data from the dataset into the standard annotation format. + Actual implementation will depend on the task type (classification, detection). + + Returns + ------- + Annotations + Processed ground truths in standard annotation format. + """ + pass + + def get(self, datapoint_number: int) -> Optional[Annotation]: + """ + Get a single ground truth annotation object by datapoint number. + + Parameters + ---------- + datapoint_number : int + The ID of the sample in the dataset. + + Returns + ------- + Optional[Annotation] + The specific Annotation object (e.g., Labels, BoundingBoxes) for the datapoint, + or None if not found. 
+ """ + if self.groundtruths is None: + return None + try: + return self.groundtruths[datapoint_number] + except KeyError: + return None \ No newline at end of file diff --git a/doleus/storage/classification_ground_truth_store.py b/doleus/storage/classification_ground_truth_store.py new file mode 100644 index 0000000..d8e204d --- /dev/null +++ b/doleus/storage/classification_ground_truth_store.py @@ -0,0 +1,115 @@ +import torch +from typing import Any + +from doleus.storage.base_store import BaseGroundTruthStore +from doleus.annotations import Annotations +from doleus.annotations.classification import Labels +from doleus.utils.data import Task + + +class ClassificationGroundTruthStore(BaseGroundTruthStore): + """Ground truth store for classification tasks.""" + + def __init__(self, dataset: Any, task: str, num_classes: int): + """ + Initialize the classification ground truth store. + + Parameters + ---------- + dataset : Any + The raw PyTorch dataset object. + task : str + The specific classification task (e.g., Task.BINARY.value, Task.MULTICLASS.value, Task.MULTILABEL.value). + num_classes : int + The number of classes for the task. + """ + self.task = task + self.num_classes = num_classes + super().__init__(dataset) + + def _process_groundtruths(self) -> Annotations: + """ + Process raw ground truth data from the dataset for classification tasks. + + Returns + ------- + Annotations + Processed ground truths in standard annotation format. + + Raises + ------ + ValueError + If the task is unsupported or if ground truth data is in an invalid format. + """ + processed_annotations = Annotations() + + for idx, data in enumerate(self.dataset): + # Assuming standard (image, label) structure for dataset items + if not (isinstance(data, (list, tuple)) and len(data) > 1): + raise ValueError( + f"Dataset item at index {idx} is not in the expected format (e.g., (image, target)). 
" + f"Got: {type(data)}" + ) + + label = data[1] + processed_label_tensor: torch.Tensor + + if self.task == Task.BINARY.value: + if not isinstance(label, torch.Tensor): + label = torch.tensor(label) + + if not (label.ndim == 0 or (label.ndim == 1 and label.numel() == 1)): + raise ValueError( + f"Binary ground truth for item {idx} must be a scalar or 1-element tensor. Got shape: {label.shape}" + ) + if not (label.item() == 0 or label.item() == 1): + raise ValueError( + f"Binary ground truth for item {idx} must be 0 or 1. Got: {label.item()}" + ) + processed_label_tensor = torch.tensor([label.item()], dtype=torch.long) + + elif self.task == Task.MULTICLASS.value: + if not isinstance(label, torch.Tensor): + label = torch.tensor(label) + + if not (label.ndim == 0 or (label.ndim == 1 and label.numel() == 1)): + raise ValueError( + f"Multiclass ground truth for item {idx} must be a scalar or 1-element tensor. Got shape: {label.shape}" + ) + label_value = label.item() + if not (0 <= label_value < self.num_classes): + raise ValueError( + f"Multiclass ground truth for item {idx} must be between 0 and {self.num_classes - 1}. Got: {label_value}" + ) + processed_label_tensor = torch.tensor([label_value], dtype=torch.long) + + elif self.task == Task.MULTILABEL.value: + if not isinstance(label, torch.Tensor): + try: + label = torch.tensor(label) + except Exception as e: + raise ValueError(f"Could not convert label for item {idx} to tensor: {label}. Error: {e}") + + if label.dim() != 1: + raise ValueError(f"Multilabel ground truth for item {idx} must be a 1D tensor. Got {label.dim()} dimensions.") + if label.shape[0] != self.num_classes: + raise ValueError( + f"Multilabel ground truth tensor shape for item {idx} must be ({self.num_classes},). Got {label.shape}." + ) + if not (label.dtype == torch.int or label.dtype == torch.long): + raise ValueError( + f"Multilabel ground truth tensor for item {idx} must be of integer type (torch.int or torch.long). Got {label.dtype}." 
+ ) + if not torch.all((label == 0) | (label == 1)): + raise ValueError( + f"Multilabel ground truth tensor for item {idx} must be multi-hot encoded (contain only 0s and 1s). Got: {label}" + ) + processed_label_tensor = label.long() + + else: + raise ValueError(f"Unsupported task for ClassificationGroundTruthStore: {self.task}") + + ann = Labels(datapoint_number=idx, labels=processed_label_tensor, scores=None) # scores=None for ground truth + processed_annotations.add(ann) + + return processed_annotations \ No newline at end of file diff --git a/doleus/storage/classification_store.py b/doleus/storage/classification_prediction_store.py similarity index 100% rename from doleus/storage/classification_store.py rename to doleus/storage/classification_prediction_store.py diff --git a/doleus/storage/detection_ground_truth_store.py b/doleus/storage/detection_ground_truth_store.py new file mode 100644 index 0000000..81f0fd9 --- /dev/null +++ b/doleus/storage/detection_ground_truth_store.py @@ -0,0 +1,89 @@ +import torch +from typing import Any + +from doleus.storage.base_store import BaseGroundTruthStore +from doleus.annotations import Annotations +from doleus.annotations.detection import BoundingBoxes + + +class DetectionGroundTruthStore(BaseGroundTruthStore): + """Ground truth store for detection tasks.""" + + def __init__(self, dataset: Any): + """ + Initialize the detection ground truth store. + + Parameters + ---------- + dataset : Any + The raw PyTorch dataset object. + """ + super().__init__(dataset) # This will call _process_groundtruths + + def _process_groundtruths(self) -> Annotations: + """ + Process raw ground truth data from the dataset for detection tasks. + + Returns + ------- + Annotations + Processed ground truths in standard annotation format. + + Raises + ------ + ValueError + If ground truth data is in an invalid format. 
+ """ + processed_annotations = Annotations() + + for idx, data in enumerate(self.dataset): + # Assuming standard (image, bounding_boxes, labels) structure for dataset items + if not (isinstance(data, (list, tuple)) and len(data) == 3): + raise ValueError( + f"Dataset item at index {idx} is not in the expected format (image, bounding_boxes, labels). " + f"Got {len(data)} elements of type: {type(data)}" + ) + + _, raw_boxes, raw_labels = data + + # Convert to tensors + if not isinstance(raw_boxes, torch.Tensor): + try: + bounding_boxes = torch.tensor(raw_boxes, dtype=torch.float32) + except Exception as e: + raise ValueError(f"Could not convert bounding_boxes for item {idx} to tensor: {raw_boxes}. Error: {e}") + else: + bounding_boxes = raw_boxes.float() # Ensure correct dtype + + if not isinstance(raw_labels, torch.Tensor): + try: + labels = torch.tensor(raw_labels, dtype=torch.long) + except Exception as e: + raise ValueError(f"Could not convert labels for item {idx} to tensor: {raw_labels}. Error: {e}") + else: + labels = raw_labels.long() # Ensure correct dtype + + # Validate shapes + # Assuming M is the number of detected objects for this datapoint + # Bounding boxes should be (M, 4) + if bounding_boxes.ndim != 2 or bounding_boxes.shape[1] != 4: + raise ValueError( + f"Bounding boxes for item {idx} must have shape (M, 4). Got shape: {bounding_boxes.shape}" + ) + + # Labels should be (M,) + num_detections = bounding_boxes.shape[0] + if not (labels.ndim == 1 and labels.shape[0] == num_detections): + raise ValueError( + f"Labels for item {idx} must have shape (M,). 
Got shape: {labels.shape}, expected M={num_detections}" + ) + + ann = BoundingBoxes( + datapoint_number=idx, + boxes_xyxy=bounding_boxes, + labels=labels, + scores=None + ) + processed_annotations.add(ann) + + return processed_annotations \ No newline at end of file diff --git a/doleus/storage/detection_store.py b/doleus/storage/detection_prediction_store.py similarity index 100% rename from doleus/storage/detection_store.py rename to doleus/storage/detection_prediction_store.py diff --git a/doleus/storage/ground_truth_store.py b/doleus/storage/ground_truth_store.py deleted file mode 100644 index 4426e63..0000000 --- a/doleus/storage/ground_truth_store.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import Any, Optional - -import torch - -from doleus.annotations import Annotations, BoundingBoxes, Labels -from doleus.utils import TaskType - - -class GroundTruthStore: - """Storage for ground truth annotations for a specific dataset instance. - - Each Doleus Dataset has its own GroundTruthStore instance to manage - ground truth annotations for that specific dataset. - """ - - def __init__(self, task_type: str, dataset: Any): - """Initialize the ground truth store. - - Parameters - ---------- - task_type : str - Type of task (e.g., "classification", "detection"). - dataset : Any - The underlying dataset to process ground truths from. - """ - self.task_type = task_type - self.dataset = dataset - self.groundtruths: Optional[Annotations] = None - self._process_groundtruths() - - def _process_groundtruths(self): - """Process and store ground truth annotations from the dataset.""" - groundtruths = Annotations() - - if self.task_type == TaskType.CLASSIFICATION.value: - for idx in range(len(self.dataset)): - data = self.dataset[idx] - if len(data) < 2: - raise ValueError( - f"Expected (image, label(s)) from dataset at index {idx}, got {len(data)} elements." 
- ) - _, labels = data - - # Convert label(s) to tensor of shape [N] if needed - if not isinstance(labels, torch.Tensor): - labels = torch.tensor(labels) - if labels.dim() == 0: - labels = labels.unsqueeze(0) - - ann = Labels(datapoint_number=idx, labels=labels) - groundtruths.add(ann) - - elif self.task_type == TaskType.DETECTION.value: - for idx in range(len(self.dataset)): - data = self.dataset[idx] - if len(data) != 3: - raise ValueError( - f"Expected (image, bounding_boxes, labels) for detection at index {idx}, got {len(data)} elements." - ) - _, bounding_boxes, labels = data - - if not isinstance(bounding_boxes, torch.Tensor): - bounding_boxes = torch.tensor(bounding_boxes, dtype=torch.float32) - if not isinstance(labels, torch.Tensor): - labels = torch.tensor(labels, dtype=torch.long) - - ann = BoundingBoxes( - datapoint_number=idx, boxes_xyxy=bounding_boxes, labels=labels - ) - groundtruths.add(ann) - - self.groundtruths = groundtruths - - def get(self, datapoint_number: int): - """Get annotation by datapoint number. - - Parameters - ---------- - datapoint_number : int - The ID of the sample in the dataset. - - Returns - ------- - Annotation - The annotation for the datapoint. 
- """ - if self.groundtruths is None: - raise ValueError("No ground truth annotations found") - return self.groundtruths[datapoint_number] From 1f282a3c7f9ea816f7f107dd0f045d3b20f7e17e Mon Sep 17 00:00:00 2001 From: Niklas Schmolenski Date: Sat, 24 May 2025 14:19:31 +0200 Subject: [PATCH 4/6] fix: minor fix --- doleus/metrics/calculator.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doleus/metrics/calculator.py b/doleus/metrics/calculator.py index 85b7a74..b53fba3 100644 --- a/doleus/metrics/calculator.py +++ b/doleus/metrics/calculator.py @@ -79,10 +79,7 @@ def _calculate_classification( try: gt_tensor = torch.stack([ann.labels.squeeze() for ann in groundtruths]) - pred_list = [ - ann.scores if ann.scores is not None else ann.labels.squeeze() - for ann in predictions - ] + pred_list = [ann.labels.squeeze() for ann in predictions] if not pred_list: raise ValueError("No predictions provided to compute the metric.") pred_tensor = torch.stack(pred_list) From 1198750bf4113c26bb9039772cdfbbdfad5453fb Mon Sep 17 00:00:00 2001 From: iamheinrich <76793837+iamheinrich@users.noreply.github.com> Date: Sat, 24 May 2025 20:03:50 +0200 Subject: [PATCH 5/6] fix: there was a discprency in the new more modular store structure when adding predictions. This is now solvedand re-indexing works fine. 
--- doleus/datasets/classification.py | 19 ++++------- doleus/storage/base_store.py | 15 ++++----- .../classification_prediction_store.py | 32 ++++++++++++++++++ doleus/storage/detection_prediction_store.py | 33 +++++++++++++++++++ 4 files changed, 79 insertions(+), 20 deletions(-) diff --git a/doleus/datasets/classification.py b/doleus/datasets/classification.py index 1834aa4..bb663cd 100644 --- a/doleus/datasets/classification.py +++ b/doleus/datasets/classification.py @@ -7,7 +7,6 @@ from doleus.utils import TaskType from doleus.storage.classification_ground_truth_store import ClassificationGroundTruthStore from doleus.storage.classification_prediction_store import ClassificationPredictionStore -from doleus.annotations import Annotations class DoleusClassification(Doleus): @@ -52,15 +51,14 @@ def __init__( metadata=metadata, per_datapoint_metadata=per_datapoint_metadata, ) + + # Instantiate the classification-specific stores self.groundtruth_store = ClassificationGroundTruthStore( - dataset=self.dataset, - task=self.task, - num_classes=self.num_classes + dataset=self.dataset, task=self.task, num_classes=self.num_classes ) self.prediction_store = ClassificationPredictionStore() def _create_new_instance(self, dataset, indices, name): - # TODO: Do we need to create a new dataset instance? 
subset = Subset(dataset, indices) metadata_subset = self.metadata_store.get_subset(indices) new_instance = DoleusClassification( @@ -73,12 +71,9 @@ def _create_new_instance(self, dataset, indices, name): per_datapoint_metadata=metadata_subset, ) - # Correctly transfer sliced predictions - if self.prediction_store and self.prediction_store.predictions: - for model_id in self.prediction_store.predictions: - # get_subset already returns an Annotations object with re-indexed datapoint_numbers - sliced_preds_annotations = self.prediction_store.get_subset(model_id, indices) - # Directly assign the Annotations object to the new instance's store - new_instance.prediction_store.predictions[model_id] = sliced_preds_annotations + # Copy sliced predictions directly to the new instance + for model_id in self.prediction_store.predictions: + sliced_preds = self.prediction_store.get_subset(model_id, indices) + new_instance.prediction_store.predictions[model_id] = sliced_preds return new_instance diff --git a/doleus/storage/base_store.py b/doleus/storage/base_store.py index f2899a6..7ffddaf 100644 --- a/doleus/storage/base_store.py +++ b/doleus/storage/base_store.py @@ -81,9 +81,13 @@ def get(self, model_id: str, datapoint_number: int) -> Annotation: raise KeyError(f"No predictions found for model: {model_id}") return self.predictions[model_id][datapoint_number] + @abstractmethod def get_subset(self, model_id: str, indices: List[int]) -> Annotations: """Get a subset of predictions for a specific model based on indices. + Each subclass must implement this method to handle re-indexing for their + specific annotation types (Labels for classification, BoundingBoxes for detection). + Parameters ---------- model_id : str @@ -94,15 +98,10 @@ def get_subset(self, model_id: str, indices: List[int]) -> Annotations: Returns ------- Annotations - An Annotations object containing predictions for the specified indices. 
+ An Annotations object containing predictions for the specified indices, + with datapoint_number values re-indexed starting from 0. """ - if model_id not in self.predictions: - raise KeyError(f"No predictions found for model: {model_id}") - - subset_annotations = Annotations() - for i in indices: - subset_annotations.add(self.predictions[model_id][i]) - return subset_annotations + pass def get_predictions(self, model_id: str) -> Annotations: """Get all predictions for a specific model. diff --git a/doleus/storage/classification_prediction_store.py b/doleus/storage/classification_prediction_store.py index 003ee13..43eebdb 100644 --- a/doleus/storage/classification_prediction_store.py +++ b/doleus/storage/classification_prediction_store.py @@ -35,6 +35,38 @@ def add_predictions( processed_predictions = self._process_predictions(predictions, task=task) self.predictions[model_id] = processed_predictions + def get_subset(self, model_id: str, indices: List[int]) -> Annotations: + """Get a subset of predictions for a specific model based on indices. + + Parameters + ---------- + model_id : str + Identifier of the model to get predictions for. + indices : List[int] + List of indices to get predictions for. + + Returns + ------- + Annotations + An Annotations object containing predictions for the specified indices, + with datapoint_number values re-indexed starting from 0. 
+ """ + if model_id not in self.predictions: + raise KeyError(f"No predictions found for model: {model_id}") + + subset_annotations = Annotations() + for new_idx, original_idx in enumerate(indices): + original_annotation = self.predictions[model_id][original_idx] + + # Create a new Labels annotation with re-indexed datapoint_number + new_annotation = Labels( + datapoint_number=new_idx, + labels=original_annotation.labels, + scores=original_annotation.scores + ) + subset_annotations.add(new_annotation) + return subset_annotations + def _process_predictions( self, predictions: torch.Tensor, diff --git a/doleus/storage/detection_prediction_store.py b/doleus/storage/detection_prediction_store.py index d036ada..3ffbb5b 100644 --- a/doleus/storage/detection_prediction_store.py +++ b/doleus/storage/detection_prediction_store.py @@ -34,6 +34,39 @@ def add_predictions( processed_predictions = self._process_predictions(predictions) self.predictions[model_id] = processed_predictions + def get_subset(self, model_id: str, indices: List[int]) -> Annotations: + """Get a subset of predictions for a specific model based on indices. + + Parameters + ---------- + model_id : str + Identifier of the model to get predictions for. + indices : List[int] + List of indices to get predictions for. + + Returns + ------- + Annotations + An Annotations object containing predictions for the specified indices, + with datapoint_number values re-indexed starting from 0. 
+ """ + if model_id not in self.predictions: + raise KeyError(f"No predictions found for model: {model_id}") + + subset_annotations = Annotations() + for new_idx, original_idx in enumerate(indices): + original_annotation = self.predictions[model_id][original_idx] + + # Create a new BoundingBoxes annotation with re-indexed datapoint_number + new_annotation = BoundingBoxes( + datapoint_number=new_idx, + boxes_xyxy=original_annotation.boxes_xyxy, + labels=original_annotation.labels, + scores=original_annotation.scores + ) + subset_annotations.add(new_annotation) + return subset_annotations + def _process_predictions( self, predictions: List[Dict[str, Any]], From 799289bee8e2be8cc9563e33abedec59a12115b5 Mon Sep 17 00:00:00 2001 From: iamheinrich <76793837+iamheinrich@users.noreply.github.com> Date: Sat, 24 May 2025 20:11:36 +0200 Subject: [PATCH 6/6] refactor: new folder structure --- doleus/datasets/base.py | 3 +- doleus/datasets/classification.py | 3 +- doleus/datasets/detection.py | 3 +- doleus/storage/__init__.py | 15 +++-- doleus/storage/ground_truth_store/__init__.py | 9 +++ doleus/storage/ground_truth_store/base.py | 56 +++++++++++++++++++ .../classification.py} | 2 +- .../detection.py} | 2 +- doleus/storage/metadata_store/__init__.py | 5 ++ .../store.py} | 2 +- doleus/storage/prediction_store/__init__.py | 9 +++ .../base.py} | 56 +------------------ .../classification.py} | 2 +- .../detection.py} | 2 +- 14 files changed, 100 insertions(+), 69 deletions(-) create mode 100644 doleus/storage/ground_truth_store/__init__.py create mode 100644 doleus/storage/ground_truth_store/base.py rename doleus/storage/{classification_ground_truth_store.py => ground_truth_store/classification.py} (98%) rename doleus/storage/{detection_ground_truth_store.py => ground_truth_store/detection.py} (97%) create mode 100644 doleus/storage/metadata_store/__init__.py rename doleus/storage/{metadata_store.py => metadata_store/store.py} (97%) create mode 100644 
doleus/storage/prediction_store/__init__.py rename doleus/storage/{base_store.py => prediction_store/base.py} (71%) rename doleus/storage/{classification_prediction_store.py => prediction_store/classification.py} (99%) rename doleus/storage/{detection_prediction_store.py => prediction_store/detection.py} (98%) diff --git a/doleus/datasets/base.py b/doleus/datasets/base.py index 2e91e91..9cd5a8c 100644 --- a/doleus/datasets/base.py +++ b/doleus/datasets/base.py @@ -9,8 +9,9 @@ from doleus.annotations import BoundingBoxes, Labels from doleus.storage import ( MetadataStore, + BasePredictionStore, + BaseGroundTruthStore, ) -from doleus.storage.base_store import BasePredictionStore, BaseGroundTruthStore from doleus.utils import ( ATTRIBUTE_FUNCTIONS, OPERATOR_DICT, diff --git a/doleus/datasets/classification.py b/doleus/datasets/classification.py index bb663cd..2912a93 100644 --- a/doleus/datasets/classification.py +++ b/doleus/datasets/classification.py @@ -5,8 +5,7 @@ from doleus.datasets.base import Doleus from doleus.utils import TaskType -from doleus.storage.classification_ground_truth_store import ClassificationGroundTruthStore -from doleus.storage.classification_prediction_store import ClassificationPredictionStore +from doleus.storage import ClassificationGroundTruthStore, ClassificationPredictionStore class DoleusClassification(Doleus): diff --git a/doleus/datasets/detection.py b/doleus/datasets/detection.py index 867bc89..2941a85 100644 --- a/doleus/datasets/detection.py +++ b/doleus/datasets/detection.py @@ -4,8 +4,7 @@ from doleus.datasets.base import Doleus from doleus.utils import TaskType -from doleus.storage.detection_ground_truth_store import DetectionGroundTruthStore -from doleus.storage.detection_prediction_store import DetectionPredictionStore +from doleus.storage import DetectionGroundTruthStore, DetectionPredictionStore from doleus.annotations import Annotations diff --git a/doleus/storage/__init__.py b/doleus/storage/__init__.py index 
b9a56c4..9f6726b 100644 --- a/doleus/storage/__init__.py +++ b/doleus/storage/__init__.py @@ -1,8 +1,13 @@ -from doleus.storage.base_store import BasePredictionStore, BaseGroundTruthStore -from doleus.storage.classification_ground_truth_store import ClassificationGroundTruthStore -from doleus.storage.classification_prediction_store import ClassificationPredictionStore -from doleus.storage.detection_ground_truth_store import DetectionGroundTruthStore -from doleus.storage.detection_prediction_store import DetectionPredictionStore +from doleus.storage.prediction_store import ( + BasePredictionStore, + ClassificationPredictionStore, + DetectionPredictionStore, +) +from doleus.storage.ground_truth_store import ( + BaseGroundTruthStore, + ClassificationGroundTruthStore, + DetectionGroundTruthStore, +) from doleus.storage.metadata_store import MetadataStore __all__ = [ diff --git a/doleus/storage/ground_truth_store/__init__.py b/doleus/storage/ground_truth_store/__init__.py new file mode 100644 index 0000000..e390fbd --- /dev/null +++ b/doleus/storage/ground_truth_store/__init__.py @@ -0,0 +1,9 @@ +from doleus.storage.ground_truth_store.base import BaseGroundTruthStore +from doleus.storage.ground_truth_store.classification import ClassificationGroundTruthStore +from doleus.storage.ground_truth_store.detection import DetectionGroundTruthStore + +__all__ = [ + "BaseGroundTruthStore", + "ClassificationGroundTruthStore", + "DetectionGroundTruthStore", +] \ No newline at end of file diff --git a/doleus/storage/ground_truth_store/base.py b/doleus/storage/ground_truth_store/base.py new file mode 100644 index 0000000..aca2289 --- /dev/null +++ b/doleus/storage/ground_truth_store/base.py @@ -0,0 +1,56 @@ +from abc import ABC, abstractmethod +from typing import Any, Optional + +from doleus.annotations import Annotation, Annotations + + +class BaseGroundTruthStore(ABC): + """Base storage for ground truth data for a specific dataset instance.""" + + def __init__(self, dataset: Any): 
+ """ + Initialize the ground truth store. + + Parameters + ---------- + dataset : Any + The raw PyTorch dataset object. + """ + self.dataset = dataset + self.groundtruths: Optional[Annotations] = None + self.groundtruths = self._process_groundtruths() + + @abstractmethod + def _process_groundtruths(self) -> Annotations: + """ + Process raw ground truth data from the dataset into the standard annotation format. + Actual implementation will depend on the task type (classification, detection). + + Returns + ------- + Annotations + Processed ground truths in standard annotation format. + """ + pass + + def get(self, datapoint_number: int) -> Optional[Annotation]: + """ + Get a single ground truth annotation object by datapoint number. + + Parameters + ---------- + datapoint_number : int + The ID of the sample in the dataset. + + Returns + ------- + Optional[Annotation] + The specific Annotation object (e.g., Labels, BoundingBoxes) for the datapoint, + or None if not found. + """ + if self.groundtruths is None: + return None + try: + return self.groundtruths[datapoint_number] + except KeyError: + return None \ No newline at end of file diff --git a/doleus/storage/classification_ground_truth_store.py b/doleus/storage/ground_truth_store/classification.py similarity index 98% rename from doleus/storage/classification_ground_truth_store.py rename to doleus/storage/ground_truth_store/classification.py index d8e204d..08458b3 100644 --- a/doleus/storage/classification_ground_truth_store.py +++ b/doleus/storage/ground_truth_store/classification.py @@ -1,7 +1,7 @@ import torch from typing import Any -from doleus.storage.base_store import BaseGroundTruthStore +from doleus.storage.ground_truth_store.base import BaseGroundTruthStore from doleus.annotations import Annotations from doleus.annotations.classification import Labels from doleus.utils.data import Task diff --git a/doleus/storage/detection_ground_truth_store.py b/doleus/storage/ground_truth_store/detection.py similarity 
index 97% rename from doleus/storage/detection_ground_truth_store.py rename to doleus/storage/ground_truth_store/detection.py index 81f0fd9..af5e5d7 100644 --- a/doleus/storage/detection_ground_truth_store.py +++ b/doleus/storage/ground_truth_store/detection.py @@ -1,7 +1,7 @@ import torch from typing import Any -from doleus.storage.base_store import BaseGroundTruthStore +from doleus.storage.ground_truth_store.base import BaseGroundTruthStore from doleus.annotations import Annotations from doleus.annotations.detection import BoundingBoxes diff --git a/doleus/storage/metadata_store/__init__.py b/doleus/storage/metadata_store/__init__.py new file mode 100644 index 0000000..c23a4dc --- /dev/null +++ b/doleus/storage/metadata_store/__init__.py @@ -0,0 +1,5 @@ +from doleus.storage.metadata_store.store import MetadataStore + +__all__ = [ + "MetadataStore", +] \ No newline at end of file diff --git a/doleus/storage/metadata_store.py b/doleus/storage/metadata_store/store.py similarity index 97% rename from doleus/storage/metadata_store.py rename to doleus/storage/metadata_store/store.py index b6a576e..5540458 100644 --- a/doleus/storage/metadata_store.py +++ b/doleus/storage/metadata_store/store.py @@ -74,4 +74,4 @@ def get_subset(self, indices: List[int]) -> List[Dict[str, Any]]: List[Dict[str, Any]] List of metadata dictionaries for the specified indices. 
""" - return [self.metadata[i] for i in indices] + return [self.metadata[i] for i in indices] \ No newline at end of file diff --git a/doleus/storage/prediction_store/__init__.py b/doleus/storage/prediction_store/__init__.py new file mode 100644 index 0000000..fe69628 --- /dev/null +++ b/doleus/storage/prediction_store/__init__.py @@ -0,0 +1,9 @@ +from doleus.storage.prediction_store.base import BasePredictionStore +from doleus.storage.prediction_store.classification import ClassificationPredictionStore +from doleus.storage.prediction_store.detection import DetectionPredictionStore + +__all__ = [ + "BasePredictionStore", + "ClassificationPredictionStore", + "DetectionPredictionStore", +] \ No newline at end of file diff --git a/doleus/storage/base_store.py b/doleus/storage/prediction_store/base.py similarity index 71% rename from doleus/storage/base_store.py rename to doleus/storage/prediction_store/base.py index 7ffddaf..27eb252 100644 --- a/doleus/storage/base_store.py +++ b/doleus/storage/prediction_store/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, List, Union, Optional +from typing import Any, Dict, List, Union import torch @@ -118,56 +118,4 @@ def get_predictions(self, model_id: str) -> Annotations: """ if model_id not in self.predictions: raise KeyError(f"No predictions found for model: {model_id}") - return self.predictions[model_id] - - -class BaseGroundTruthStore(ABC): - """Base storage for ground truth data for a specific dataset instance.""" - - def __init__(self, dataset: Any): - """ - Initialize the ground truth store. - - Parameters - ---------- - dataset : Any - The raw PyTorch dataset object. - """ - self.dataset = dataset - self.groundtruths: Optional[Annotations] = None - self.groundtruths = self._process_groundtruths() - - @abstractmethod - def _process_groundtruths(self) -> Annotations: - """ - Process raw ground truth data from the dataset into the standard annotation format. 
- Actual implementation will depend on the task type (classification, detection). - - Returns - ------- - Annotations - Processed ground truths in standard annotation format. - """ - pass - - def get(self, datapoint_number: int) -> Optional[Annotation]: - """ - Get a single ground truth annotation object by datapoint number. - - Parameters - ---------- - datapoint_number : int - The ID of the sample in the dataset. - - Returns - ------- - Optional[Annotation] - The specific Annotation object (e.g., Labels, BoundingBoxes) for the datapoint, - or None if not found. - """ - if self.groundtruths is None: - return None - try: - return self.groundtruths[datapoint_number] - except KeyError: - return None \ No newline at end of file + return self.predictions[model_id] \ No newline at end of file diff --git a/doleus/storage/classification_prediction_store.py b/doleus/storage/prediction_store/classification.py similarity index 99% rename from doleus/storage/classification_prediction_store.py rename to doleus/storage/prediction_store/classification.py index 43eebdb..6dfb815 100644 --- a/doleus/storage/classification_prediction_store.py +++ b/doleus/storage/prediction_store/classification.py @@ -4,7 +4,7 @@ from torch import Tensor from doleus.annotations import Annotations, Labels -from doleus.storage.base_store import BasePredictionStore +from doleus.storage.prediction_store.base import BasePredictionStore from doleus.utils import Task diff --git a/doleus/storage/detection_prediction_store.py b/doleus/storage/prediction_store/detection.py similarity index 98% rename from doleus/storage/detection_prediction_store.py rename to doleus/storage/prediction_store/detection.py index 3ffbb5b..9c6c9dd 100644 --- a/doleus/storage/detection_prediction_store.py +++ b/doleus/storage/prediction_store/detection.py @@ -3,7 +3,7 @@ import torch from doleus.annotations import Annotations, BoundingBoxes -from doleus.storage.base_store import BasePredictionStore +from 
doleus.storage.prediction_store.base import BasePredictionStore class DetectionPredictionStore(BasePredictionStore):