Commit cc25eac
fixed some issues
lyna1404 committed Aug 15, 2024
1 parent bf1da2f
Showing 18 changed files with 94 additions and 1,528 deletions.
130 changes: 85 additions & 45 deletions MED3pa/med3pa/comparaison.py
@@ -24,12 +24,42 @@ def __init__(self, results1_path: str, results2_path: str) -> None:
self.profiles_detectron_comparaison = {}
self.global_metrics_comparaison = {}
self.models_evaluation_comparaison = {}
self.shared_profiles = {} # New variable to store shared profiles
self.config_file = {}
self.compare_profiles = False
self.compare_profiles = True
self.compare_detectron = False
self.mode = ""
self._check_experiment_name()

def identify_shared_profiles(self):
"""
Identifies the shared profiles between the two experiments and stores them in shared_profiles.
"""
profiles_file_1 = os.path.join(self.results1_path, 'test', 'profiles.json')
profiles_file_2 = os.path.join(self.results2_path, 'test', 'profiles.json')

with open(profiles_file_1, 'r') as f1, open(profiles_file_2, 'r') as f2:
profiles1 = json.load(f1)
profiles2 = json.load(f2)

shared = {}

for samples_ratio, dr_dict in profiles1.items():
if samples_ratio in profiles2: # Only proceed if samples_ratio is in both profiles
if samples_ratio not in shared:
shared[samples_ratio] = {}
for dr, profiles in dr_dict.items():
if dr in profiles2[samples_ratio]: # Only proceed if dr is in both profiles
for profile in profiles:
profile_path = " / ".join(profile["path"])
# Check if the profile_path exists in both profiles1 and profiles2
matching_profile = next((p for p in profiles2[samples_ratio][dr] if p["path"] == profile["path"]), None)
if matching_profile:
if profile_path not in shared[samples_ratio]:
shared[samples_ratio][profile_path] = profile["path"]

self.shared_profiles = shared # Store shared profiles
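
For illustration, a minimal, self-contained sketch of this intersection logic; the data below is hypothetical but follows the samples_ratio → dr → profile nesting that profiles.json uses:

profiles1 = {"0": {"100": [{"path": ["age > 50", "bmi <= 30"], "metrics": {"auc": 0.81}}]}}
profiles2 = {"0": {"100": [{"path": ["age > 50", "bmi <= 30"], "metrics": {"auc": 0.74}}],
                   "90": [{"path": ["age <= 50"], "metrics": {"auc": 0.70}}]}}

shared = {}
for samples_ratio, dr_dict in profiles1.items():
    if samples_ratio not in profiles2:
        continue
    shared.setdefault(samples_ratio, {})
    for dr, profiles in dr_dict.items():
        if dr not in profiles2[samples_ratio]:
            continue
        for profile in profiles:
            # A profile is shared when the same path appears under the same samples_ratio and dr
            if any(p["path"] == profile["path"] for p in profiles2[samples_ratio][dr]):
                shared[samples_ratio][" / ".join(profile["path"])] = profile["path"]

print(shared)  # {'0': {'age > 50 / bmi <= 30': ['age > 50', 'bmi <= 30']}}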

def _check_experiment_name(self) -> None:
"""
Checks if the experiment_name in the config_file of both results paths is the same.
@@ -69,10 +99,25 @@ def is_comparable(self) -> bool:
base_model_different = self.config_file['base_model']['different']

if self.compare_detectron:
params_different = self.config_file['med3pa_detectron_params']['different']
# Extract med3pa_detectron_params for comparison, excluding apc_model and ipc_model
params1 = self.config_file['med3pa_detectron_params']['med3pa_detectron_params1'].copy()
params2 = self.config_file['med3pa_detectron_params']['med3pa_detectron_params2'].copy()
# Remove apc_model and ipc_model from comparison
params1['med3pa_params'].pop('apc_model', None)
params1['med3pa_params'].pop('ipc_model', None)
params2['med3pa_params'].pop('apc_model', None)
params2['med3pa_params'].pop('ipc_model', None)
else:
params_different = self.config_file['med3pa_params']['different']
# Extract med3pa_params for comparison, excluding apc_model and ipc_model
params1 = self.config_file['med3pa_params']['med3pa_params1'].copy()
params2 = self.config_file['med3pa_params']['med3pa_params2'].copy()
params1.pop('apc_model', None)
params1.pop('ipc_model', None)
params2.pop('apc_model', None)
params2.pop('ipc_model', None)


params_different = (params1 != params2)
# Check the conditions for comparability
can_compare = False
if datasets_different and not base_model_different and not params_different:
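
One caveat worth noting: dict.copy() is shallow, so the pop calls on the nested med3pa_params dictionary above also mutate the original entries in config_file. A sketch of a side-effect-free equality check (the helper name is hypothetical, not part of the commit):

from copy import deepcopy

def params_equal_ignoring_models(params1: dict, params2: dict) -> bool:
    """Compare two parameter dicts while ignoring the apc_model and ipc_model entries."""
    p1, p2 = deepcopy(params1), deepcopy(params2)
    for params in (p1, p2):
        # Pop from the nested med3pa_params dict if present, else from the top level
        target = params.get('med3pa_params', params)
        target.pop('apc_model', None)
        target.pop('ipc_model', None)
    return p1 == p2

# params_equal_ignoring_models({'med3pa_params': {'ipc_model': 'xgb', 'metric': 'mse'}},
#                              {'med3pa_params': {'ipc_model': 'rf',  'metric': 'mse'}})  -> True
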
Expand Down Expand Up @@ -105,7 +150,8 @@ def _check_experiment_tree(self) -> None:

def compare_profiles_metrics(self):
"""
Compares profile metrics between two sets of results and stores them in a dictionary.
Compares profile metrics between two sets of results and stores them in a dictionary,
using only the shared profiles.
"""
combined = {}
profiles_file_1 = os.path.join(self.results1_path, 'test', 'profiles.json')
@@ -115,66 +161,61 @@ def compare_profiles_metrics(self):
profiles1 = json.load(f1)
profiles2 = json.load(f2)

for samples_ratio, dr_dict in profiles1.items():
if samples_ratio not in combined:
combined[samples_ratio] = {}
for dr, profiles in dr_dict.items():
for profile in profiles:
profile_path = " / ".join(profile["path"])
if profile_path not in combined[samples_ratio]:
combined[samples_ratio][profile_path] = {}
if dr not in combined[samples_ratio][profile_path]:
combined[samples_ratio][profile_path][dr] = {}
combined[samples_ratio][profile_path][dr]['metrics_1'] = profile["metrics"]

for samples_ratio, dr_dict in profiles2.items():
if samples_ratio not in combined:
combined[samples_ratio] = {}
for dr, profiles in dr_dict.items():
for profile in profiles:
profile_path = " / ".join(profile["path"])
for samples_ratio, profiles_dict in self.shared_profiles.items():
combined[samples_ratio] = {}
for profile_path_list in profiles_dict.values():
profile_path = " / ".join(profile_path_list) # Convert the list to a string

# Extract the declaration rates (drs) available for this samples_ratio in profiles1
drs = [dr for dr, profiles in profiles1[samples_ratio].items()]
for dr in drs:
# Attempt to find matching profiles in both profiles1 and profiles2
matching_profile_1 = next((p for p in profiles1[samples_ratio][dr] if " / ".join(p["path"]) == profile_path), None)
matching_profile_2 = next((p for p in profiles2[samples_ratio][dr] if " / ".join(p["path"]) == profile_path), None)

if profile_path not in combined[samples_ratio]:
combined[samples_ratio][profile_path] = {}
if dr not in combined[samples_ratio][profile_path]:
combined[samples_ratio][profile_path][dr] = {}
combined[samples_ratio][profile_path][dr]['metrics_2'] = profile["metrics"]

combined[samples_ratio][profile_path][dr] = {
'metrics_1': matching_profile_1["metrics"] if matching_profile_1 else None,
'metrics_2': matching_profile_2["metrics"] if matching_profile_2 else None
}

self.profiles_metrics_comparaison = combined
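
After the loop, profiles_metrics_comparaison is keyed by samples_ratio, then by the joined profile path, then by declaration rate; a hypothetical instance of the resulting shape:

profiles_metrics_comparaison = {
    "0": {                                  # samples_ratio
        "age > 50 / bmi <= 30": {           # profile path, joined with " / "
            "100": {                        # declaration rate (dr)
                "metrics_1": {"auc": 0.81, "accuracy": 0.77},
                "metrics_2": {"auc": 0.74, "accuracy": 0.71},
            },
        },
    },
}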



def compare_profiles_detectron_results(self):
"""
Compares Detectron results between two sets of profiles and stores them in a dictionary.
Compares Detectron results between two sets of profiles and stores them in a dictionary,
using only the shared profiles.
"""
combined = {}

profiles_file_1 = os.path.join(self.results1_path, 'test', 'profiles.json')
profiles_file_2 = os.path.join(self.results2_path, 'test', 'profiles.json')

with open(profiles_file_1, 'r') as f1, open(profiles_file_2, 'r') as f2:
profiles1 = json.load(f1)
profiles2 = json.load(f2)

# Determine the smallest positive samples_ratio
smallest_samples_ratio = min([int(k) for k in profiles1.keys() if int(k) >= 0])
smallest_samples_ratio = str(smallest_samples_ratio)
for samples_ratio, profiles_dict in self.shared_profiles.items():
combined[samples_ratio] = {}
for profile_path_list in profiles_dict.values():
profile_path = " / ".join(profile_path_list) # Convert the list to a string

for profiles, key in zip([profiles1, profiles2], ['detectron_results_1', 'detectron_results_2']):
if smallest_samples_ratio not in profiles:
continue
# Attempt to find matching profiles in both profiles1 and profiles2
matching_profile_1 = next((p for p in profiles1[samples_ratio]["100"] if " / ".join(p["path"]) == profile_path), None)
matching_profile_2 = next((p for p in profiles2[samples_ratio]["100"] if " / ".join(p["path"]) == profile_path), None)

dr_dict = profiles[smallest_samples_ratio]
if profile_path not in combined[samples_ratio]:
combined[samples_ratio][profile_path] = {}

if "100" not in dr_dict:
continue

for profile in dr_dict["100"]:
profile_path = " / ".join(profile["path"])
if profile_path not in combined:
combined[profile_path] = {}

combined[profile_path][key] = profile["detectron_results"]
combined[samples_ratio][profile_path]['detectron_results_1'] = matching_profile_1["detectron_results"] if matching_profile_1 else None
combined[samples_ratio][profile_path]['detectron_results_2'] = matching_profile_2["detectron_results"] if matching_profile_2 else None

self.profiles_detectron_comparaison = combined
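
Note that this lookup indexes profiles1[samples_ratio]["100"] directly, so a shared samples_ratio without a "100" declaration-rate bucket would raise a KeyError. A defensive variant (a sketch under that assumption, not the commit's code) could use .get:

def find_profile(profiles: dict, samples_ratio: str, profile_path: str):
    """Return the profile whose joined path matches profile_path, or None if absent."""
    candidates = profiles.get(samples_ratio, {}).get("100", [])
    return next((p for p in candidates if " / ".join(p["path"]) == profile_path), None)

profiles1 = {"0": {"100": [{"path": ["age > 50"], "detectron_results": {"p_value": 0.03}}]}}
print(find_profile(profiles1, "0", "age > 50"))  # the matching profile dict
print(find_profile(profiles1, "5", "age > 50"))  # None rather than a KeyError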


def compare_global_metrics(self):
"""
Compares global metrics between two sets of results and stores them in a dictionary.
@@ -288,9 +329,8 @@ def compare_experiments(self):
raise ValueError("The two experiments cannot be compared based on the provided criteria.")

self.compare_global_metrics()

self.identify_shared_profiles() # Identify shared profiles before comparisons
if self.mode in ['apc', 'mpc']:
self._check_experiment_tree()
if self.compare_profiles:
self.compare_profiles_metrics()
if self.compare_detectron:
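
Taken together, a hypothetical end-to-end use of this comparison class (the class and import names below are assumed for illustration; the diff shows only the methods, not the class declaration):

from MED3pa.med3pa.comparaison import Med3paComparison  # hypothetical class/import name

comparison = Med3paComparison("experiments/run_a", "experiments/run_b")  # hypothetical result paths
comparison.compare_experiments()  # raises ValueError when the two runs are not comparable
print(comparison.global_metrics_comparaison)
print(comparison.profiles_metrics_comparaison)
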
4 changes: 2 additions & 2 deletions MED3pa/med3pa/experiment.py
@@ -491,7 +491,7 @@ def _run_by_set(datasets_manager: DatasetsManager,
else:
APC_model = apc_instance
print("Used a trainde IPC instance.")

# Predict APC values
APC_values = APC_model.predict(x)
print("Aggregated confidence scores calculated.")
@@ -527,7 +527,7 @@ def _run_by_set(datasets_manager: DatasetsManager,
for samples_ratio in range(samples_ratio_min, samples_ratio_max + 1, samples_ratio_step):

# Calculate profiles and their metrics by declaration rate
MDRCalculator.calc_profiles(profiles_manager, tree, mpc_dataset, features, MPC_values, samples_ratio)
MDRCalculator.calc_profiles(profiles_manager, tree, mpc_dataset, features, MPC_values, samples_ratio)
MDRCalculator.calc_metrics_by_profiles(profiles_manager, mpc_dataset, features, MPC_values, samples_ratio, med3pa_metrics)
results.set_profiles_manager(profiles_manager)
print("Results extracted for minimum_samples_ratio = ", samples_ratio)
7 changes: 4 additions & 3 deletions MED3pa/med3pa/mdr.py
@@ -304,7 +304,7 @@ def calc_profiles(profiles_manager: ProfilesManager, tree: TreeRepresentation, d
lost_profiles_all = [] # Saves lost profiles
last_min_confidence_level = -1 # Last min confidence level
min_confidence_levels_dict = {} # Saves the min_confidence_level thresholds

precision = 14
# Go through all declaration rates
for dr in range(100, -1, -1):

@@ -329,10 +329,11 @@
# calculate the samples_ratio (pop%) and mean_confidence_level of this node, if the filtered data isn't empty
if len(filtered_confidence_scores) > 0:
samples_ratio = len(filtered_confidence_scores) / len(confidence_scores) * 100
mean_cconfidence = np.mean(filtered_confidence_scores) if filtered_confidence_scores.size > 0 else 0
mean_cconfidence = np.mean(filtered_confidence_scores)
# if the calculated samples_ratio and mean_confidence meet the conditions, keep this node
if samples_ratio >= min_samples_ratio and mean_cconfidence >= min_confidence_level:
if samples_ratio >= min_samples_ratio and round(mean_cconfidence, precision) >= round(min_confidence_level, precision):
profiles_current.append(node)



# If the last profiles are different from current profiles
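
The new precision-based comparison guards against ordinary floating-point noise: a mean confidence that is mathematically equal to min_confidence_level can otherwise compare as strictly smaller. A minimal sketch, using precision = 14 as in the diff (sample values hypothetical):

import numpy as np

filtered_confidence_scores = np.array([0.1, 0.7])
min_confidence_level = 0.4
precision = 14

mean_confidence = np.mean(filtered_confidence_scores)  # 0.39999999999999997, not 0.4
print(mean_confidence >= min_confidence_level)                                       # False
print(round(mean_confidence, precision) >= round(min_confidence_level, precision))  # True
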
4 changes: 2 additions & 2 deletions MED3pa/med3pa/models.py
@@ -282,7 +282,7 @@ def print_decision_tree_structure(tree_model, feature_names=None):
"""
tree_rules = export_text(tree_model, feature_names=feature_names)
print(tree_rules)

def optimize(self, param_grid: dict, cv: int, x: np.ndarray, error_prob: np.ndarray, sample_weight: np.ndarray = None) -> None:
"""
Optimizes the model parameters using GridSearchCV.
@@ -303,7 +303,7 @@ def optimize(self, param_grid: dict, cv: int, x: np.ndarray, error_prob: np.ndar
self.params.update(grid_search.best_params_)
self.grid_search_params = param_grid
df_X, df_y, df_w = self.dataPreparationStrategy.execute(column_labels=self.features, observations=x, labels=error_prob)
self.treeRepresentation.build_tree(self.model, df_X, error_prob, node_id=0)
self.treeRepresentation.head = self.treeRepresentation.build_tree(self.model, df_X, error_prob, node_id=0)
self.optimized = True


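
This one-line fix matters because build_tree appears to return the constructed root rather than assigning it internally (which is what the change implies), so the previous call built the tree and then discarded it, leaving treeRepresentation.head unset. In miniature, with hypothetical stand-in classes:

class Node:
    def __init__(self, value):
        self.value = value

class TreeRepresentation:
    def __init__(self):
        self.head = None

    def build_tree(self, value):
        # Returns the new root; deliberately does not set self.head itself.
        return Node(value)

tree = TreeRepresentation()
tree.build_tree(42)              # before the fix: return value discarded
assert tree.head is None         # ... so the head is never populated
tree.head = tree.build_tree(42)  # after the fix: the root is stored explicitly
assert tree.head.value == 42
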
2 changes: 1 addition & 1 deletion MED3pa/med3pa/tree.py
@@ -84,7 +84,7 @@ def build_tree(self, dtr: DecisionTreeRegressorModel, X: DataFrame, y: Series, n
node_thresh = dtr.model.tree_.threshold[node_id]
node_feature_id = dtr.model.tree_.feature[node_id]
node_feature = self.features[node_feature_id]

# Check if the split would result in an empty set, if so, stop the recursion
if y[X[node_feature] <= node_thresh].size == 0 or y[X[node_feature] > node_thresh].size == 0:
print("split would results in an empty data section")
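
For context, the guard in build_tree stops recursing when a learned threshold would route every sample to one side; a sketch with hypothetical data:

import pandas as pd

X = pd.DataFrame({"age": [30, 35, 40]})
y = pd.Series([0.12, 0.08, 0.20])
node_thresh = 25.0  # hypothetical threshold below every observed value

left, right = y[X["age"] <= node_thresh], y[X["age"] > node_thresh]
degenerate = left.size == 0 or right.size == 0
print(degenerate)  # True: the split produces an empty side, so recursion stops
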
26 changes: 0 additions & 26 deletions setup.py

This file was deleted.

