fixed med3pa and added comparaison
lyna1404 committed Jul 16, 2024
1 parent d84fb04 commit b328161
Showing 47 changed files with 951 additions and 249,195 deletions.
8 changes: 8 additions & 0 deletions MED3pa/detectron/experiment.py
@@ -121,6 +121,14 @@ def save(self, file_path: str, file_name: str = 'detectron_results', save_config
        with open(file_name_path, 'w') as file:
            json.dump(self.test_results, file, indent=4)

        counts_dict = {}
        counts_dict['reference'] = self.cal_record.rejected_counts().tolist()
        counts_dict['test'] = self.test_record.rejected_counts().tolist()

        file_name_path_counts = os.path.join(file_path, 'rejection_counts.json')
        with open(file_name_path_counts, 'w') as file:
            json.dump(counts_dict, file, indent=4)

        if save_config:
            config_file_path = os.path.join(file_path, 'experiment_config.json')
            with open(config_file_path, 'w') as file:
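For reference, the block above writes the per-run rejection counts to a rejection_counts.json file next to the other experiment outputs. A minimal sketch of reading it back (the directory name is hypothetical; the file name and keys come from the code above):

import json
import os

# Hypothetical directory that was passed as file_path to save().
results_dir = './detectron_results'

with open(os.path.join(results_dir, 'rejection_counts.json'), 'r') as file:
    counts = json.load(file)

# Two lists of per-run rejection counts, as written by save().
reference_counts = counts['reference']
test_counts = counts['test']
print(len(reference_counts), len(test_counts))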
46 changes: 29 additions & 17 deletions MED3pa/detectron/strategies.py
@@ -103,26 +103,38 @@ def execute(calibration_records: DetectronRecordsManager, test_records:Detectron

        # Perform the Mann-Whitney U test
        u_statistic, p_value = stats.mannwhitneyu(cal_counts, test_counts, alternative='less')
        z_score = (test_mean - cal_mean) / cal_std

        # Calculate the z-scores for the test data
        z_scores = (test_counts[:, None] - cal_counts) / np.std(cal_counts)

        # Define thresholds for categorizing
        def categorize_z_score(z):
            if z <= 0:
                return 'no significant shift'
            elif abs(z) < 1:
                return 'small'
            elif abs(z) < 2:
                return 'moderate'
            else:
                return 'large'

        # Categorize each test count based on its z-score
        categories = np.array([categorize_z_score(z) for z in z_scores.flatten()])
        # Calculate the percentage of each category
        category_counts = pd.Series(categories).value_counts(normalize=True) * 100

        # Describe the significance of the shift based on the z-score
        significance_description = ""
        if z_score <= 0 :
            significance_description = "no significant shift"
        elif abs(z_score) < 1.0:
            significance_description = "Small"
        elif abs(z_score) < 2.0:
            significance_description = "Moderate"
        elif abs(z_score) < 3.0:
            significance_description = "Large"
        else:
            significance_description = "Very large"
        # Results dictionary including rank statistics
        significance_description = {
            'no shift': category_counts.get('no significant shift', 0),
            'small': category_counts.get('small', 0),
            'moderate': category_counts.get('moderate', 0),
            'large': category_counts.get('large', 0)
        }

        results = {
            'p_value': p_value,
            'u_statistic': u_statistic,
            'z-score':z_score,
            'shift significance' : significance_description
            'significance_description' : significance_description
        }

        return results
@@ -239,7 +251,7 @@ def trim_dataset(data, proportion_to_cut):
        # Define thresholds for categorizing
        def categorize_z_score(z):
            if z <= 0:
                return 'no shift'
                return 'no significant shift'
            elif abs(z) < 1:
                return 'small'
            elif abs(z) < 2:
@@ -257,7 +269,7 @@ def categorize_z_score(z):

        # Describe the significance of the shift based on the z-score
        significance_description = {
            'no shift': category_counts.get('no shift', 0),
            'no shift': category_counts.get('no significant shift', 0),
            'small': category_counts.get('small', 0),
            'moderate': category_counts.get('moderate', 0),
            'large': category_counts.get('large', 0)
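To illustrate the per-count categorization introduced in this file, here is a self-contained sketch using made-up rejection counts (the real code operates on counts taken from the Detectron records): each test count is compared against every calibration count, every pairwise z-score is bucketed, and the buckets are reported as percentages.

import numpy as np
import pandas as pd

# Made-up counts standing in for cal_counts / test_counts.
cal_counts = np.array([5, 6, 7, 6, 5])
test_counts = np.array([9, 10, 8])

# Pairwise z-scores: each test count against every calibration count.
z_scores = (test_counts[:, None] - cal_counts) / np.std(cal_counts)

def categorize_z_score(z):
    if z <= 0:
        return 'no significant shift'
    elif abs(z) < 1:
        return 'small'
    elif abs(z) < 2:
        return 'moderate'
    else:
        return 'large'

categories = np.array([categorize_z_score(z) for z in z_scores.flatten()])
# Percentage of pairs falling into each category.
category_counts = pd.Series(categories).value_counts(normalize=True) * 100
print(category_counts.to_dict())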
198 changes: 198 additions & 0 deletions MED3pa/med3pa/comparaison.py
@@ -0,0 +1,198 @@
import json
import os
from typing import Any, Dict, List, Tuple, Type, Union

import numpy as np
from sklearn.model_selection import train_test_split

from MED3pa.med3pa.models import *
from MED3pa.med3pa.uncertainty import *
from MED3pa.models.base import BaseModelManager
from MED3pa.med3pa.experiment import Med3paResults

class Med3paComparison:
"""
Class to compare the results of two Med3paExperiment instances.
"""
def __init__(self, results1_path: str, results2_path: str) -> None:
self.results1_path = os.path.abspath(results1_path)
self.results2_path = os.path.abspath(results2_path)
self.profiles_metrics_comparaison = {}
self.profiles_detectron_comparaison = {}
self.global_metrics_comparaison = {}
self.compare_profiles = False
self.compare_detectron = False
self._check_experiment_name()

def _check_experiment_name(self) -> None:
"""
Checks if the experiment_name in the config_file of both results paths is the same.
If not, raises a ValueError. Also sets the flag for Detectron comparison if applicable.
"""
config_file_1 = os.path.join(self.results1_path, 'experiment_config.json')
config_file_2 = os.path.join(self.results2_path, 'experiment_config.json')

with open(config_file_1, 'r') as f1, open(config_file_2, 'r') as f2:
config1 = json.load(f1)
config2 = json.load(f2)

if config1['experiment_name'] not in ["Med3paDetectronExperiment", "Med3paExperiment"]:
raise ValueError("Only Med3paDetectronExperiments & Med3paExperiments can be compared")

if config1['experiment_name'] != config2['experiment_name']:
raise ValueError("The two results are not from the same experiment.")

if config1['experiment_name'] == 'Med3paDetectronExperiment':
self.compare_detectron = True

def _check_experiment_tree(self) -> None:
"""
Checks if the experiment trees in the results paths are the same.
If they are, sets the flag for profile comparison.
"""
tree_file_1 = os.path.join(self.results1_path, 'test', 'tree.json')
tree_file_2 = os.path.join(self.results2_path, 'test', 'tree.json')

with open(tree_file_1, 'r') as f1, open(tree_file_2, 'r') as f2:
tree1 = json.load(f1)
tree2 = json.load(f2)

if tree1 == tree2:
self.compare_profiles = True

def compare_profiles_metrics(self):
"""
Compares profile metrics between two sets of results and stores them in a dictionary.
"""
combined = {}
profiles_file_1 = os.path.join(self.results1_path, 'test', 'profiles.json')
profiles_file_2 = os.path.join(self.results2_path, 'test', 'profiles.json')

with open(profiles_file_1, 'r') as f1, open(profiles_file_2, 'r') as f2:
profiles1 = json.load(f1)
profiles2 = json.load(f2)

for samples_ratio, dr_dict in profiles1.items():
if samples_ratio not in combined:
combined[samples_ratio] = {}
for dr, profiles in dr_dict.items():
for profile in profiles:
profile_path = " / ".join(profile["path"])
if profile_path not in combined[samples_ratio]:
combined[samples_ratio][profile_path] = {}
if dr not in combined[samples_ratio][profile_path]:
combined[samples_ratio][profile_path][dr] = {}
combined[samples_ratio][profile_path][dr]['metrics_1'] = profile["metrics"]

for samples_ratio, dr_dict in profiles2.items():
if samples_ratio not in combined:
combined[samples_ratio] = {}
for dr, profiles in dr_dict.items():
for profile in profiles:
profile_path = " / ".join(profile["path"])
if profile_path not in combined[samples_ratio]:
combined[samples_ratio][profile_path] = {}
if dr not in combined[samples_ratio][profile_path]:
combined[samples_ratio][profile_path][dr] = {}
combined[samples_ratio][profile_path][dr]['metrics_2'] = profile["metrics"]

self.profiles_metrics_comparaison = combined

def compare_profiles_detectron_results(self):
"""
Compares Detectron results between two sets of profiles and stores them in a dictionary.
"""
combined = {}
profiles_file_1 = os.path.join(self.results1_path, 'test', 'profiles.json')
profiles_file_2 = os.path.join(self.results2_path, 'test', 'profiles.json')

with open(profiles_file_1, 'r') as f1, open(profiles_file_2, 'r') as f2:
profiles1 = json.load(f1)
profiles2 = json.load(f2)

        # Determine the smallest positive samples_ratio (keys are strings, so compare them numerically)
        smallest_samples_ratio = min((k for k in profiles1.keys() if float(k) > 0), key=float)

        for profiles in [profiles1, profiles2]:
            if smallest_samples_ratio not in profiles:
                continue

            dr_dict = profiles[smallest_samples_ratio]

            if smallest_samples_ratio not in combined:
                combined[smallest_samples_ratio] = {}

            if "100" not in dr_dict:
                continue

            for profile in dr_dict["100"]:
                profile_path = " / ".join(profile["path"])
                if profile_path not in combined[smallest_samples_ratio]:
                    combined[smallest_samples_ratio][profile_path] = {}
                if "100" not in combined[smallest_samples_ratio][profile_path]:
                    combined[smallest_samples_ratio][profile_path]["100"] = {}

                if profiles is profiles1:
                    combined[smallest_samples_ratio][profile_path]["100"]['detectron_results_1'] = profile["detectron_results"]
                else:
                    combined[smallest_samples_ratio][profile_path]["100"]['detectron_results_2'] = profile["detectron_results"]

        self.profiles_detectron_comparaison = combined

    def compare_global_metrics(self):
        """
        Compares global metrics between two sets of results and stores them in a dictionary.
        """
        combined = {}
        file_1 = os.path.join(self.results1_path, 'test', 'metrics_dr.json')
        file_2 = os.path.join(self.results2_path, 'test', 'metrics_dr.json')

        with open(file_1, 'r') as f1, open(file_2, 'r') as f2:
            dr1 = json.load(f1)
            dr2 = json.load(f2)

        for dr in range(100, -1, -1):  # Iterating from 100 to 0
            dr_str = str(dr)
            combined[dr_str] = {}

            if dr_str in dr1:
                combined[dr_str]['metrics_dr_1'] = dr1[dr_str]
            if dr_str in dr2:
                combined[dr_str]['metrics_dr_2'] = dr2[dr_str]

        self.global_metrics_comparaison = combined

    def compare_experiments(self):
        """
        Compares the experiments by global metrics, profiles, and Detectron results if applicable.
        """
        self.compare_global_metrics()
        self._check_experiment_tree()
        if self.compare_profiles:
            self.compare_profiles_metrics()
        if self.compare_detectron:
            self.compare_profiles_detectron_results()

    def save(self, directory_path: str) -> None:
        """
        Saves the comparison results to a specified directory.
        Args:
            directory_path (str): The directory where the comparison results will be saved.
        """
        # Ensure the main directory exists
        os.makedirs(directory_path, exist_ok=True)

        global_comparaison_path = os.path.join(directory_path, 'global_metrics_comparaison.json')
        with open(global_comparaison_path, 'w') as f:
            json.dump(self.global_metrics_comparaison, f, indent=4)

        if self.profiles_detectron_comparaison and self.compare_detectron:
            profiles_detectron_path = os.path.join(directory_path, 'profiles_detectron_comparaison.json')
            with open(profiles_detectron_path, 'w') as f:
                json.dump(self.profiles_detectron_comparaison, f, indent=4)

        if self.profiles_metrics_comparaison and self.compare_profiles:
            profiles_metrics_path = os.path.join(directory_path, 'profiles_metrics_comparaison.json')
            with open(profiles_metrics_path, 'w') as f:
                json.dump(self.profiles_metrics_comparaison, f, indent=4)
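A minimal usage sketch of the new comparison class, assuming two result folders produced by the same kind of Med3pa experiment (the paths are hypothetical; each folder is expected to contain the experiment_config.json and test/ outputs read by the methods above):

from MED3pa.med3pa.comparaison import Med3paComparison

# Hypothetical result directories from two runs of the same experiment type.
comparison = Med3paComparison('./med3pa_results_run1', './med3pa_results_run2')
comparison.compare_experiments()

# Writes global_metrics_comparaison.json and, when applicable, the profile-level comparison files.
comparison.save('./med3pa_comparaison')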