Malania 2007 benchmark #365

Merged Jul 6, 2024 · 92 commits

Commits
f553711
Initial Malania 2007 commit with preliminary threshold metric, the be…
benlonnqvist Apr 11, 2023
88e087d
Merge branch 'master' into malania2007
benlonnqvist Apr 11, 2023
661a3aa
Add correct visual angle
benlonnqvist Apr 12, 2023
2f00c62
Merge branch 'malania2007' of https://github.com/benlonnqvist/brain-s…
benlonnqvist Apr 12, 2023
d2aaf52
clean up threshold metric file
benlonnqvist Apr 12, 2023
0598170
Further threshold cleanup and ceiling compute verification
benlonnqvist Apr 12, 2023
9a66485
Remove python 3.8 requirement that came from typing.Literal
benlonnqvist Apr 12, 2023
d77642a
clean up benchmark file
benlonnqvist Apr 12, 2023
2113d2e
add assembly, stimulus, and benchmark general tests
benlonnqvist Apr 12, 2023
fdfe857
Merge branch 'master' into malania2007
benlonnqvist Apr 13, 2023
63b4b16
resolve Islam2021 conflict
benlonnqvist Apr 20, 2023
79e8fc1
Merge branch 'master' into malania2007
benlonnqvist Apr 20, 2023
15a897b
Add grid search, correct psychometric function, and pooled score
benlonnqvist May 31, 2023
7c65a5c
Merge branch 'master' into malania2007
benlonnqvist May 31, 2023
161eb67
Standardized vernier train and test set sizes
benlonnqvist May 31, 2023
04429c8
Merge branch 'malania2007' of https://github.com/benlonnqvist/brain-s…
benlonnqvist May 31, 2023
2e0fc3c
add goodness of fit check to discard fits to random responses
benlonnqvist Jun 8, 2023
ebfc04a
update ceilings to pooled expectations
benlonnqvist Jun 8, 2023
52b55e5
packaged commit
benlonnqvist Jun 9, 2023
2dff67d
add Malania2007 entries to lookup.csv
benlonnqvist Jun 9, 2023
f62017c
add Malania2007 to benchmark init
benlonnqvist Jun 9, 2023
87b0b71
fix indexing error in test___init__
benlonnqvist Jun 9, 2023
d938c3a
add missing year(s) in identifiers where they were missing
benlonnqvist Jun 9, 2023
a71737f
add S3-related tests to private access, as Travis throws a NoCredenti…
benlonnqvist Jun 9, 2023
32bfbe3
minor cleanup of threshold
benlonnqvist Jun 9, 2023
8ba609a
add a few comments to threshold functions
benlonnqvist Jun 16, 2023
306d4cb
Merge branch 'master' into malania2007
benlonnqvist Jun 20, 2023
53a3ea7
fix typo in precomputed feature test
benlonnqvist Jun 21, 2023
5997bb1
Merge branch 'malania2007' of https://github.com/benlonnqvist/brain-s…
benlonnqvist Jun 21, 2023
29bebe4
fix typos in dataset and assembly call functions
benlonnqvist Jun 21, 2023
e9070d0
Merge branch 'master' into malania2007
benlonnqvist Nov 3, 2023
8320491
Update Assembly type and name
benlonnqvist Nov 3, 2023
f6dab05
update scoring method with comment and update ceilings
benlonnqvist Nov 3, 2023
06e5d72
2.0 update for Malania2007
benlonnqvist Jan 12, 2024
5dc216f
Merge branch 'brain-score:master' into malania2007
benlonnqvist Jan 23, 2024
dffa729
update benchmark to 2.0 format w/ local test
benlonnqvist Jan 23, 2024
30184dd
update data registry format
benlonnqvist Jan 23, 2024
75f40f5
remove BIBTEX from threshold __init__
benlonnqvist Jan 23, 2024
bc054d0
update metric imports to 2.0
benlonnqvist Jan 23, 2024
a96aa52
re-add test_stimuli.py deleted by accident
benlonnqvist Jan 23, 2024
bde98eb
change nan filtering to subject_unique_id filtering
benlonnqvist Jan 24, 2024
d01863f
Merge branch 'brain-score:master' into malania2007
benlonnqvist Mar 18, 2024
b445b5c
add require_variance to model call
benlonnqvist Mar 18, 2024
296aacb
Merge branch 'malania2007' of https://github.com/benlonnqvist/brain-s…
benlonnqvist Mar 18, 2024
953c482
Merge branch 'master' into malania2007
benlonnqvist Mar 20, 2024
d24777d
Merge branch 'master' into malania2007
benlonnqvist Mar 26, 2024
d81e83d
Merge branch 'master' into malania2007
benlonnqvist Jun 7, 2024
ae03aaf
Merge branch 'master' into malania2007
benlonnqvist Jun 12, 2024
421e613
Merge branch 'master' into malania2007
benlonnqvist Jun 12, 2024
824ffd5
update stimulus and data assembly related information with arcane kno…
benlonnqvist Jun 13, 2024
199b993
more arcane knowledge, it's endless
benlonnqvist Jun 13, 2024
7ad67c4
Merge branch 'master' into malania2007
benlonnqvist Jun 13, 2024
c955c34
move packaging files
benlonnqvist Jun 13, 2024
d444e27
add packaging files
benlonnqvist Jun 13, 2024
f29ebe8
modify packaging paths
benlonnqvist Jun 13, 2024
44451e5
remove redundant files
benlonnqvist Jun 13, 2024
6e56cf6
remove redundant import
benlonnqvist Jun 13, 2024
48f007a
fix stimulus set / assembly link
benlonnqvist Jun 13, 2024
a7b500c
fix stimulus set / assembly indexing
benlonnqvist Jun 13, 2024
871974a
add image label to threshold elevation calculation
benlonnqvist Jun 13, 2024
9ee6bd5
change stimulus numbers in the test to be what they should be
benlonnqvist Jun 14, 2024
6136b5f
add calls to require_variance
benlonnqvist Jun 14, 2024
837b540
Merge branch 'brain-score:master' into malania2007
benlonnqvist Jun 24, 2024
b390216
fix test errors
benlonnqvist Jun 24, 2024
246ce43
fix bug with ceiling access
benlonnqvist Jun 26, 2024
4adbbb2
correct test with incorrect nan droppign
benlonnqvist Jun 26, 2024
3042ddf
fix wrong number of stimuli in the vernier only test
benlonnqvist Jun 26, 2024
26e1804
Merge branch 'master' into malania2007
benlonnqvist Jun 26, 2024
5af6fa8
Merge branch 'malania2007' of https://github.com/benlonnqvist/brain-s…
benlonnqvist Jun 26, 2024
786a166
add comment to explain the logic behind the scoring function
benlonnqvist Jun 26, 2024
cd3eefc
remove redundant comment
benlonnqvist Jun 26, 2024
5b68dd4
remove pool score
benlonnqvist Jun 26, 2024
58253b7
add metric tests
benlonnqvist Jun 26, 2024
10447e2
fix benchmark filtering issue and recompute ceilings to accommodate r…
benlonnqvist Jun 26, 2024
83c7643
Merge branch 'master' into malania2007
benlonnqvist Jun 26, 2024
6966bbf
fix superfluous test by switching it for another
benlonnqvist Jun 27, 2024
98540d2
add vernier acuity benchmark
benlonnqvist Jun 27, 2024
faa1194
update vernier acuity benchmark, ceilings, and mean ceiling
benlonnqvist Jun 28, 2024
63fc009
clean up benchmark file
benlonnqvist Jun 28, 2024
108ab9b
fix a few bugs with loading benchmarks and such
benlonnqvist Jun 28, 2024
f49c32c
fix some happy little bugs
benlonnqvist Jun 28, 2024
aaf49c1
Merge branch 'master' into malania2007
benlonnqvist Jul 2, 2024
3e9b1ff
add alexnet test
benlonnqvist Jul 2, 2024
cdae143
Merge branch 'master' into malania2007
benlonnqvist Jul 3, 2024
d1948d3
fix dataset argument error
benlonnqvist Jul 4, 2024
2756069
Merge branch 'master' into malania2007
benlonnqvist Jul 4, 2024
f363fd0
Merge branch 'malania2007' of https://github.com/benlonnqvist/brain-s…
benlonnqvist Jul 4, 2024
f06b683
Apply suggestions from code review
benlonnqvist Jul 5, 2024
6e322e3
add renaming suggestions
benlonnqvist Jul 5, 2024
c20bb5b
fix naming issues
benlonnqvist Jul 5, 2024
faa1c7d
remove out-dated aggregation dim in test
mschrimpf Jul 6, 2024
d10930f
fix identifier testing
mschrimpf Jul 6, 2024
13 changes: 13 additions & 0 deletions brainscore_vision/benchmarks/malania2007/__init__.py
@@ -0,0 +1,13 @@
from brainscore_vision import benchmark_registry
from . import benchmark

benchmark_registry['Malania2007.short2-threshold_elevation'] = lambda: benchmark._Malania2007Base('short2')
benchmark_registry['Malania2007.short4-threshold_elevation'] = lambda: benchmark._Malania2007Base('short4')
benchmark_registry['Malania2007.short6-threshold_elevation'] = lambda: benchmark._Malania2007Base('short6')
benchmark_registry['Malania2007.short8-threshold_elevation'] = lambda: benchmark._Malania2007Base('short8')
benchmark_registry['Malania2007.short16-threshold_elevation'] = lambda: benchmark._Malania2007Base('short16')
benchmark_registry['Malania2007.equal2-threshold_elevation'] = lambda: benchmark._Malania2007Base('equal2')
benchmark_registry['Malania2007.long2-threshold_elevation'] = lambda: benchmark._Malania2007Base('long2')
benchmark_registry['Malania2007.equal16-threshold_elevation'] = lambda: benchmark._Malania2007Base('equal16')
benchmark_registry['Malania2007.long16-threshold_elevation'] = lambda: benchmark._Malania2007Base('long16')
benchmark_registry['Malania2007.vernieracuity-threshold'] = lambda: benchmark._Malania2007VernierAcuity()
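
A minimal usage sketch of the registry above (not part of the diff; 'alexnet' mirrors the model used in test.py further down, everything else is the public brainscore_vision API):

from brainscore_vision import load_benchmark, load_model

# look up the lambda registered above and construct the benchmark
benchmark = load_benchmark('Malania2007.short2-threshold_elevation')
model = load_model('alexnet')
score = benchmark(model)  # runs the full threshold-elevation pipeline against the model
print(score)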
224 changes: 224 additions & 0 deletions brainscore_vision/benchmarks/malania2007/benchmark.py
@@ -0,0 +1,224 @@
from typing import Tuple
import numpy as np

import brainscore_vision
from brainio.assemblies import PropertyAssembly
from brainscore_vision.benchmarks import BenchmarkBase
from brainscore_vision.benchmark_helpers.screen import place_on_screen
from brainscore_vision import load_metric
from brainscore_vision.model_interface import BrainModel
from brainscore_vision.utils import LazyLoad
from brainscore_core.metrics import Score


BIBTEX = """@article{malania2007,
author = {Malania, Maka and Herzog, Michael H. and Westheimer, Gerald},
title = "{Grouping of contextual elements that affect vernier thresholds}",
journal = {Journal of Vision},
volume = {7},
number = {2},
pages = {1-1},
year = {2007},
issn = {1534-7362},
doi = {10.1167/7.2.1},
url = {https://doi.org/10.1167/7.2.1}
}"""

BASELINE_CONDITION = 'vernier_only'
DATASETS = ['short2-threshold_elevation', 'short4-threshold_elevation', 'short6-threshold_elevation',
'short8-threshold_elevation', 'short16-threshold_elevation', 'equal2-threshold_elevation',
'long2-threshold_elevation', 'equal16-threshold_elevation', 'long16-threshold_elevation',
'vernieracuity-threshold']
# Values in NUM_FLANKERS_PER_CONDITION denote the number of flankers in each condition.
# This is tracked because the benchmark scores threshold elevation, i.e., a comparison of two
# conditions (a flanker condition against the vernier-only baseline)
NUM_FLANKERS_PER_CONDITION = {'short2': 2, 'short4': 4, 'short6': 6, 'short8': 8,
'short16': 16, 'equal2': 2, 'long2': 2, 'equal16': 16,
'long16': 16, 'vernier_only': 0}
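
# For orientation, the quantity this benchmark family scores (a sketch; it assumes the
# 'threshold_elevation' metric defines elevation as a ratio of 75%-accuracy thresholds):
#     threshold_elevation = threshold(test_condition) / threshold(baseline_condition)
# so a model whose vernier threshold is unaffected by flankers would score an elevation of 1.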


class _Malania2007Base(BenchmarkBase):
"""
INFORMATION:

Benchmark DATASETS should be considered independent. This means that participant-specific across-condition data
should only ever be compared using the 'subject_unique_id'. In some conditions (short2, vernier_only, short16),
an additional observer was added from the original paper's plots: in these conditions, two experiments were
conducted independently, and one observer who did not overlap between the experiments was added to the
aggregate benchmark.

While humans and models perform the same testing task in this benchmark, a number of choices deviate slightly
from the human experiment. Each deviation is listed below alongside the reason for the departure and what the
'precisely faithful' alternative would be.

Benchmark Choices:

1) The number and type of fitting stimuli are choices not constrained by the human experiment. Currently, the
number of fitting stimuli is chosen to be relatively large, and hopefully sufficient for decoding in the
baseline condition in general.
- Precisely faithful alternative: Present text instructions to models as they were presented to humans.
* Why not this alternative? Since the experiment is about early visual perception, and there are currently
few/no models capable of a task like this, it would not be interesting.
- Somewhat faithful alternative: Present a smaller number of training stimuli, motivated by work like
Lee & DiCarlo (2023), bioRxiv (doi: https://doi.org/10.1101/2022.12.31.522402).
* Why not this alternative? Since the experiment is about early visual perception rather than perceptual
learning, and there are few/no models capable of such learning, it would not be interesting.
- Importantly, this means the benchmark examines the models' capability to support a task like this, rather
than their capability to learn it.
2) In the human experiment, stimuli were presented at exactly the foveal position. In the model experiment,
testing stimuli are presented at the foveal position ± 72 arcsec = 0.02 deg.
* Why this alternative? Since most models evaluated are test-time deterministic, we want a more precise
estimate of the threshold than a point estimate. Since human microsaccades over small distances (e.g., up to
360 arcsec = 6 arcmin = 0.1 deg) are generally uncontrolled and cannot be controlled for, we believe the tiny
jitter of 0.02 deg has no impact on the comparison under study, while improving the precision of threshold
estimates.

"""
def __init__(self, condition: str):
self.baseline_condition = BASELINE_CONDITION
self.condition = condition

# since this benchmark computes threshold elevation relative to a baseline condition, we omit
# a subject in those conditions where that subject did not perform both the baseline and the
# test condition
baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition))
condition_assembly = LazyLoad(lambda: load_assembly(self.condition))
self._assembly, self._baseline_assembly = filter_baseline_subjects(condition_assembly,
baseline_assembly)

self._assemblies = {'baseline_assembly': self._baseline_assembly,
'condition_assembly': self._assembly}
# condition names for this benchmark family are registered as bare dataset names (e.g. 'short2'),
# so stimulus-set identifiers can be built from them directly
self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.condition}')
self._baseline_stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.baseline_condition}')
self._stimulus_sets = {self.condition: self._stimulus_set,
                       self.baseline_condition: self._baseline_stimulus_set}
self._fitting_stimuli = brainscore_vision.load_stimulus_set(f'Malania2007.{self.condition}_fit')

self._metric = load_metric('threshold_elevation',
independent_variable='image_label',
baseline_condition=self.baseline_condition,
test_condition=self.condition,
threshold_accuracy=0.75)

self._visual_degrees = 2.986667
self._number_of_trials = 10  # arbitrary number of microsaccade trials, chosen to improve the precision of threshold estimates

super(_Malania2007Base, self).__init__(
identifier=f'Malania2007.{condition}', version=1,
ceiling_func=lambda: self._metric.ceiling(self._assemblies),
parent='Malania2007',
bibtex=BIBTEX)

def __call__(self, candidate: BrainModel):
model_responses = {}
candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli,
number_of_trials=2, require_variance=True)
for condition in (self.baseline_condition, self.condition):
stimulus_set = place_on_screen(
self._stimulus_sets[condition],
target_visual_degrees=candidate.visual_degrees(),
source_visual_degrees=self._visual_degrees
)
model_responses[condition] = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials,
require_variance=True)

raw_score = self._metric(model_responses, self._assemblies)

# Adjust score to ceiling
ceiling = self.ceiling
score = raw_score / ceiling

score.attrs['raw'] = raw_score
score.attrs['ceiling'] = ceiling
return score


class _Malania2007VernierAcuity(BenchmarkBase):
def __init__(self):
self.baseline_condition = BASELINE_CONDITION
self.conditions = DATASETS.copy()
self.conditions.remove('vernieracuity-threshold')

self._assemblies = {condition: {'baseline_assembly': self.get_assemblies(condition)['baseline_assembly'],
'condition_assembly': self.get_assemblies(condition)['condition_assembly']}
for condition in self.conditions}
self._stimulus_set = brainscore_vision.load_stimulus_set(f'Malania2007.{self.baseline_condition}')
# conditions here carry a '-threshold_elevation' suffix that must be stripped to build the
# stimulus-set identifier; str.replace is used because str.rstrip strips a character set, not a suffix
self._fitting_stimuli = {condition: brainscore_vision.load_stimulus_set(
                             f"Malania2007.{condition.replace('-threshold_elevation', '')}_fit")
                         for condition in self.conditions}

self._metric = load_metric('threshold',
independent_variable='image_label',
threshold_accuracy=0.75)

self._visual_degrees = 2.986667
self._number_of_trials = 10  # arbitrary number of microsaccade trials, chosen to improve the precision of threshold estimates

super(_Malania2007VernierAcuity, self).__init__(
identifier='Malania2007.vernieracuity-threshold', version=1,
ceiling_func=lambda: self.mean_ceiling(),
parent='Malania2007',
bibtex=BIBTEX)

def __call__(self, candidate: BrainModel):
scores = []
for condition in self.conditions:
candidate.start_task(BrainModel.Task.probabilities, fitting_stimuli=self._fitting_stimuli[condition],
number_of_trials=2, require_variance=True)
stimulus_set = place_on_screen(
self._stimulus_set,
target_visual_degrees=candidate.visual_degrees(),
source_visual_degrees=self._visual_degrees
)
model_response = candidate.look_at(stimulus_set, number_of_trials=self._number_of_trials,
require_variance=True)

raw_score = self._metric(model_response, self._assemblies[condition])
# Adjust score to ceiling
ceiling = self.ceiling
score = raw_score / ceiling
score.attrs['error'] = raw_score.error

score.attrs['raw'] = raw_score
score.attrs['ceiling'] = ceiling
scores.append(score)
# average the per-condition scores into a single score
mean_score = Score(np.mean(scores))
mean_score.attrs['error'] = np.mean([score.error for score in scores])
return mean_score

def get_assemblies(self, condition: str):
# strip the '-threshold_elevation' suffix (str.rstrip strips a character set, not a suffix)
condition = condition.replace('-threshold_elevation', '')
baseline_assembly = LazyLoad(lambda: load_assembly(self.baseline_condition))
condition_assembly = LazyLoad(lambda: load_assembly(condition))
assembly, baseline_assembly = filter_baseline_subjects(condition_assembly,
baseline_assembly)
return {'condition_assembly': assembly,
'baseline_assembly': baseline_assembly}

def mean_ceiling(self):
ceilings = []
errors = []
for assembly_name in self._assemblies.keys():
this_ceiling = self._metric.ceiling(self._assemblies[assembly_name]['baseline_assembly'])
ceilings.append(this_ceiling.values)
errors.append(this_ceiling.error)
mean_ceiling = Score(np.mean(ceilings))
mean_ceiling.attrs['error'] = np.mean(errors)
return mean_ceiling


def load_assembly(dataset: str) -> PropertyAssembly:
assembly = brainscore_vision.load_dataset(f'Malania2007.{dataset}')
return assembly


def filter_baseline_subjects(condition_assembly: PropertyAssembly,
baseline_assembly: PropertyAssembly
) -> Tuple[PropertyAssembly, PropertyAssembly]:
"""A function to select only the unique subjects that exist in the condition_assembly."""
non_nan_mask = ~np.isnan(condition_assembly.values)
unique_ids = condition_assembly.coords['subject'][non_nan_mask].values.tolist()

mask = baseline_assembly.coords['subject'].isin(unique_ids)
filtered_baseline_assembly = baseline_assembly.where(mask, drop=True)
filtered_condition_assembly = condition_assembly.where(mask, drop=True)
return filtered_condition_assembly, filtered_baseline_assembly
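
A toy sketch of filter_baseline_subjects (the assemblies and subject ids below are made up for illustration): subjects with a NaN threshold in the condition assembly are dropped from both assemblies.

import numpy as np
from brainio.assemblies import PropertyAssembly

condition = PropertyAssembly([0.02, np.nan, 0.05],
                             coords={'subject': ('subject', ['S1', 'S2', 'S3'])},
                             dims=['subject'])
baseline = PropertyAssembly([0.01, 0.02, 0.03],
                            coords={'subject': ('subject', ['S1', 'S2', 'S3'])},
                            dims=['subject'])
filtered_condition, filtered_baseline = filter_baseline_subjects(condition, baseline)
# 'S2' is dropped from both assemblies because its condition threshold is NaN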
63 changes: 63 additions & 0 deletions brainscore_vision/benchmarks/malania2007/test.py
@@ -0,0 +1,63 @@
import numpy as np
import pytest
from pytest import approx

from brainscore_vision import benchmark_registry, load_benchmark, load_model
from brainscore_vision.benchmarks.malania2007.benchmark import DATASETS


class TestBehavioral:
def test_count(self):
assert len(DATASETS) == 5 + 2 + 2 + 1

@pytest.mark.parametrize('dataset', DATASETS)
def test_in_pool(self, dataset):
identifier = f"Malania2007.{dataset}"
assert identifier in benchmark_registry

@pytest.mark.private_access
def test_mean_ceiling(self):
benchmarks = [f"Malania2007.{dataset}" for dataset in DATASETS]
benchmarks = [load_benchmark(benchmark) for benchmark in benchmarks]
ceilings = [benchmark.ceiling for benchmark in benchmarks]
mean_ceiling = np.mean(ceilings)
assert mean_ceiling == approx(0.5757928329186803, abs=0.001)

# these test values are for the pooled score ceiling
@pytest.mark.private_access
@pytest.mark.parametrize('dataset, expected_ceiling', [
('short2-threshold_elevation', approx(0.78719345, abs=0.001)),
('short4-threshold_elevation', approx(0.49998989, abs=0.001)),
('short6-threshold_elevation', approx(0.50590051, abs=0.001)),
('short8-threshold_elevation', approx(0.4426336, abs=0.001)),
('short16-threshold_elevation', approx(0.8383443, abs=0.001)),
('equal2-threshold_elevation', approx(0.56664015, abs=0.001)),
('long2-threshold_elevation', approx(0.46470421, abs=0.001)),
('equal16-threshold_elevation', approx(0.44087153, abs=0.001)),
('long16-threshold_elevation', approx(0.50996587, abs=0.001)),
('vernieracuity-threshold', approx(0.70168481, abs=0.001))
])
def test_dataset_ceiling(self, dataset, expected_ceiling):
benchmark = f"Malania2007.{dataset}"
benchmark = load_benchmark(benchmark)
ceiling = benchmark.ceiling
assert ceiling == expected_ceiling

@pytest.mark.parametrize('dataset, expected_score', [
('short2-threshold_elevation', approx(0.0, abs=0.001)),
('short4-threshold_elevation', approx(0.0, abs=0.001)),
('short6-threshold_elevation', approx(0.0, abs=0.001)),
('short8-threshold_elevation', approx(0.0, abs=0.001)),
('short16-threshold_elevation', approx(0.0, abs=0.001)),
('equal2-threshold_elevation', approx(0.0, abs=0.001)),
('long2-threshold_elevation', approx(0.0, abs=0.001)),
('equal16-threshold_elevation', approx(0.0, abs=0.001)),
('long16-threshold_elevation', approx(0.0, abs=0.001)),
('vernieracuity-threshold', approx(0.0, abs=0.001))
])
def test_model_score(self, dataset, expected_score):
benchmark = f"Malania2007.{dataset}"
benchmark = load_benchmark(benchmark)
model = load_model('alexnet')
model_score = benchmark(model)
assert model_score.values == expected_score
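
The public subset of these tests can be run locally by deselecting the private-access marker, e.g. pytest brainscore_vision/benchmarks/malania2007/test.py -m "not private_access" (assuming the pytest markers are registered as in the main brain-score repository; the private_access tests additionally require S3 credentials, as noted in the commit history above).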