
Commit

Merge pull request #39 from mmzdouc/dev_mmz
fermo_core v.0.4.3
mmzdouc authored Jul 22, 2024
2 parents 56ed2a8 + 2164a84 commit 8498197
Showing 10 changed files with 241 additions and 107 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
@@ -8,21 +8,30 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## Unreleased

N/A
## [0.4.3] 22-07-2024

### Fixed

- PhenotypeManager: prevented Pearson calculation on constant or NaN-containing arrays (see the sketch below)
- GeneralParser: fixed error handling on malformed input files.
- MS2DeepScoreNetworker: fixed MS2 spectrum filtering for the ms2deepscore algorithm
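A minimal editorial sketch (not part of the commit) of why the PhenotypeManager guard is needed: scipy.stats.zscore divides by the standard deviation, so a constant input array scales to all-NaN values, and a Pearson correlation on such data is meaningless. The values below are illustrative.

import numpy as np
from scipy.stats import pearsonr, zscore

areas = np.array([1000.0, 1000.0, 1000.0, 1000.0])  # constant feature areas across samples
activs = np.array([0.5, 1.0, 2.0, 4.0])  # illustrative phenotype values

areas_scaled = zscore(areas)  # standard deviation is 0, so every element becomes NaN
activs_scaled = zscore(activs)

if np.isnan(areas_scaled).any() or np.isnan(activs_scaled).any():
    print("constant input - skipping Pearson correlation")  # mirrors the new guard
else:
    score, p_val = pearsonr(areas_scaled, activs_scaled)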

## [0.4.2] 16-06-2024

### Fixed

- Fixed bug in SummaryWriter: a nonexistent function was referenced, leading to premature exit of the module.

## [0.4.1] 16-06-2024

### Fixed

- Versioning

## [0.4.0] 15-06-2024

### Removed

- [Breaking change] Removed MS2Query de novo annotation after observing process instability (unexpected process termination by the system with SIGKILL (9))

## [0.3.3] 06-06-2024
@@ -44,6 +53,7 @@ N/A
- Loosened typing restrictions for Feature and Sample object attributes: area and height (intensity) now accept float values.
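A hedged illustration of the loosened typing; the model below is a stand-in, not the repository's actual Feature class definition.

from typing import Optional

from pydantic import BaseModel


class Feature(BaseModel):  # hypothetical stand-in for the Feature object
    area: Optional[float] = None  # floats now accepted for area
    height: Optional[float] = None  # floats now accepted for height (intensity)


Feature(area=1234.56, height=789.01)  # validates without type errors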

### Removed

- [Breaking change] Removed toggle 'nonbiological' from 'FragmentAnnotator' and from parameters file; 'nonbiological' fragment annotation is now performed automatically

### Security
@@ -1,6 +1,6 @@
"""Runs the ms2deepscore library annotation module.
Copyright (c) 2024 Mitja Maximilian Zdouc, PhD
Copyright (c) 2024 to present Mitja Maximilian Zdouc, PhD
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -154,7 +154,7 @@ def calculate_scores_ms2deepscore(self: Self):
except func_timeout.FunctionTimedOut as e:
logger.warning(
f"'AnnotationManager/Ms2deepscoreAnnotator': timeout of "
f"MS2dDeepScore-based "
f"MS2DeepScore-based "
f"calculation: more than specified '{self.max_time}' seconds."
f"For unlimited runtime, set 'maximum_runtime' to 0 - SKIP"
)
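The timeout handled in the hunk above is typically produced by wrapping the scoring call with func_timeout; a hedged sketch of that pattern follows (the function name and the max_time value are placeholders, not fermo_core internals).

import func_timeout


def run_scoring():  # placeholder for the MS2DeepScore-based calculation
    ...


max_time = 600  # seconds; per the log message, 'maximum_runtime' of 0 means unlimited
try:
    if max_time == 0:
        run_scoring()
    else:
        func_timeout.func_timeout(timeout=max_time, func=run_scoring)
except func_timeout.FunctionTimedOut:
    # log a warning and skip the module, as in the except branch above
    pass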
45 changes: 36 additions & 9 deletions fermo_core/data_analysis/class_analysis_manager.py
@@ -112,7 +112,10 @@ def run_feature_filter(self: Self):
self.stats, self.features, self.samples = feature_filter.return_values()
self.params.FeatureFilteringParameters.module_passed = True
except Exception as e:
logger.warning(str(e))
logger.error(str(e))
logger.error(
"FeatureFilter: an error occurred and the module terminated prematurely - SKIP"
)
return

def run_blank_assignment(self: Self):
@@ -143,7 +146,10 @@ def run_blank_assignment(self: Self):
self.stats, self.features = blank_assigner.return_attrs()
self.params.BlankAssignmentParameters.module_passed = True
except Exception as e:
logger.warning(str(e))
logger.error(str(e))
logger.error(
"BlankAssigner: an error occurred and the module terminated prematurely - SKIP"
)
return

def run_group_assignment(self: Self):
@@ -160,7 +166,10 @@
group_assigner.run_analysis()
self.stats, self.features = group_assigner.return_attrs()
except Exception as e:
logger.warning(str(e))
logger.error(str(e))
logger.error(
"GroupAssigner: an error occurred and the module terminated prematurely - SKIP"
)
return

def run_group_factor_assignment(self: Self):
@@ -187,7 +196,10 @@
self.features = group_fact_ass.return_features()
self.params.GroupFactAssignmentParameters.module_passed = True
except Exception as e:
logger.warning(str(e))
logger.error(str(e))
logger.error(
"GroupFactorAssigner: an error occurred and the module terminated prematurely - SKIP"
)
return

def run_phenotype_manager(self: Self):
@@ -218,7 +230,10 @@
phenotype_manager.run_analysis()
self.stats, self.features, self.params = phenotype_manager.return_attrs()
except Exception as e:
logger.warning(str(e))
logger.error(str(e))
logger.error(
"PhenotypeManager: an error occurred and the module terminated prematurely - SKIP"
)
return

def run_sim_networks_manager(self: Self):
@@ -250,7 +265,10 @@
sim_networks_manager.return_attrs()
)
except Exception as e:
logger.warning(str(e))
logger.error(str(e))
logger.error(
"SimNetworksManager: an error occurred and the module terminated prematurely - SKIP"
)
return

def run_annotation_manager(self: Self):
@@ -268,7 +286,10 @@
annotation_manager.return_attrs()
)
except Exception as e:
logger.warning(str(e))
logger.error(str(e))
logger.error(
"AnnotationManager: an error occurred and the module terminated prematurely - SKIP"
)
return

def run_score_assignment(self: Self):
@@ -283,7 +304,10 @@
score_assigner.run_analysis()
self.features, self.samples = score_assigner.return_attributes()
except Exception as e:
logger.warning(str(e))
logger.error(str(e))
logger.error(
"ScoreAssigner: an error occurred and the module terminated prematurely - SKIP"
)
return

def run_chrom_trace_calculator(self: Self):
@@ -293,5 +317,8 @@
self.samples, self.stats
)
except Exception as e:
logger.warning(str(e))
logger.error(str(e))
logger.error(
"ChromTraceCalculator: an error occurred and the module terminated prematurely - SKIP"
)
return
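Every runner method in class_analysis_manager.py now follows the same escalation pattern; condensed into a generic, self-contained sketch (module and logger names are placeholders, not repository code):

import logging

logger = logging.getLogger(__name__)


def run_module(module) -> None:  # stand-in for the run_* methods above
    try:
        module.run_analysis()
    except Exception as e:
        # previously a single logger.warning(str(e)); now escalated to error level
        logger.error(str(e))
        logger.error(
            "Module: an error occurred and the module terminated prematurely - SKIP"
        )
        return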
@@ -24,6 +24,7 @@
import logging
from typing import Self

import numpy as np
from pydantic import BaseModel
from scipy.stats import pearsonr, zscore

@@ -89,7 +90,7 @@ def find_relevant_f_ids(self: Self):
else:
logger.debug(
f"'PhenQuantConcAssigner': feature id '{f_id}' only detected in "
f"'{len(feature.samples)}' samples: exclude from correlation "
f"'{len(feature.samples)}' samples: excluded from correlation "
f"analysis."
)

@@ -101,8 +102,8 @@ def calculate_correlation(self: Self):
"""
if len(self.relevant_f_ids) == 0:
raise RuntimeError(
"'PhenQuantConcAssigner': No relevant features (detected in >3 "
"samples) detected - SKIP."
"'PhenQuantConcAssigner': No relevant features detected"
"(i.e. found in >3 samples) - SKIP."
)

for f_id in self.relevant_f_ids:
@@ -130,6 +131,19 @@ def calculate_correlation(self: Self):
areas_scaled = zscore(areas)
activs_scaled = zscore(activs_reciprocal)

if np.isnan(areas_scaled).any():
logger.debug(
f"'PhenQuantConcAssigner': feature id '{f_id}' has constant "
f"area values ('{areas[0]}'). Cannot calculate Pearson correlation - SKIP."
)
continue
elif np.isnan(activs_scaled).any():
logger.debug(
f"'PhenQuantConcAssigner': feature id '{f_id}' has constant "
f"phenotype values ('{activs[0]}'). Cannot calculate Pearson correlation - SKIP."
)
continue

pearson_s, p_val = pearsonr(areas_scaled, activs_scaled)

p_val_cor = p_val * len(self.relevant_f_ids)
@@ -148,6 +162,7 @@ def calculate_correlation(self: Self):
score=pearson_s,
p_value=p_val,
p_value_corr=p_val_cor,
descr="Area/phenotype Pearson correlation",
)
)
self.stats.phenotypes[num].f_ids_positive.add(f_id)
Expand All @@ -163,6 +178,7 @@ def calculate_correlation(self: Self):
score=pearson_s,
p_value=p_val,
p_value_corr=p_val_cor,
descr="Area/phenotype Pearson correlation",
)
)
self.stats.phenotypes[num].f_ids_positive.add(f_id)
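The p_value_corr assigned in the hunks above is a Bonferroni-style adjustment: the raw p-value is multiplied by the number of features tested. A small worked example with illustrative numbers:

p_val = 0.004
n_tested = 10  # corresponds to len(self.relevant_f_ids)
p_val_cor = p_val * n_tested  # 0.04, still below a 0.05 significance threshold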
@@ -24,6 +24,7 @@
import logging
from typing import Self

import numpy as np
from pydantic import BaseModel
from scipy.stats import pearsonr, zscore

@@ -89,7 +90,7 @@ def find_relevant_f_ids(self: Self):
else:
logger.debug(
f"'PhenQuantPercAssigner': feature id '{f_id}' only detected in "
f"'{len(feature.samples)}' samples: exclude from correlation "
f"'{len(feature.samples)}' samples: excluded from correlation "
f"analysis."
)

@@ -101,8 +102,8 @@ def calculate_correlation(self: Self):
"""
if len(self.relevant_f_ids) == 0:
raise RuntimeError(
"'PhenQuantPercAssigner': No relevant features (detected in >3 "
"samples) detected - SKIP."
"'PhenQuantPercAssigner': No relevant features detected "
"(i.e. found in >3 samples) - SKIP."
)

for f_id in self.relevant_f_ids:
@@ -128,6 +129,19 @@ def calculate_correlation(self: Self):
areas_scaled = zscore(areas)
activs_scaled = zscore(activs)

if np.isnan(areas_scaled).any():
logger.debug(
f"'PhenQuantPercAssigner': feature id '{f_id}' has constant "
f"area values ('{areas[0]}'). Cannot calculate Pearson correlation - SKIP."
)
continue
elif np.isnan(activs_scaled).any():
logger.debug(
f"'PhenQuantPercAssigner': feature id '{f_id}' has constant "
f"phenotype values ('{activs[0]}'). Cannot calculate Pearson correlation - SKIP."
)
continue

pearson_s, p_val = pearsonr(areas_scaled, activs_scaled)

p_val_cor = p_val * len(self.relevant_f_ids)
@@ -146,6 +160,7 @@ def calculate_correlation(self: Self):
score=pearson_s,
p_value=p_val,
p_value_corr=p_val_cor,
descr="Area/phenotype Pearson correlation",
)
)
self.stats.phenotypes[num].f_ids_positive.add(f_id)
@@ -162,6 +177,7 @@ def calculate_correlation(self: Self):
score=pearson_s,
p_value=p_val,
p_value_corr=p_val_cor,
descr="Area/phenotype Pearson correlation",
)
)
self.stats.phenotypes[num].f_ids_positive.add(f_id)
@@ -129,7 +129,7 @@ def assign_sample_scores(self: Self):
except Exception as e:
logger.warning(str(e))
logger.warning(
"'ScoreAssigner': Could not assign sample score, possibly due "
"'ScoreAssigner': Could not assign sample score, possibly due to "
"lack of spectral networking information - SKIP"
)
return
@@ -27,6 +27,7 @@

import func_timeout
import networkx
import numpy as np
from pydantic import BaseModel

from fermo_core.data_analysis.sim_networks_manager.class_mod_cosine_networker import (
@@ -157,9 +158,10 @@ def run_modified_cosine_alg(self: Self):
logger.info("'SimNetworksManager/ModCosineNetworker': started calculation")

filtered_features = self.filter_input_spectra(
tuple(self.stats.active_features),
self.features,
self.params.SpecSimNetworkCosineParameters.msms_min_frag_nr,
features=tuple(self.stats.active_features),
feature_repo=self.features,
msms_min_frag_nr=self.params.SpecSimNetworkCosineParameters.msms_min_frag_nr,
algorithm="modified_cosine",
)

try:
@@ -205,9 +207,10 @@ def run_ms2deepscore_alg(self: Self):
return

filtered_features = self.filter_input_spectra(
tuple(self.stats.active_features),
self.features,
self.params.SpecSimNetworkDeepscoreParameters.msms_min_frag_nr,
features=tuple(self.stats.active_features),
feature_repo=self.features,
msms_min_frag_nr=self.params.SpecSimNetworkDeepscoreParameters.msms_min_frag_nr,
algorithm="ms2deepscore",
)

try:
@@ -246,18 +249,37 @@ def run_ms2deepscore_alg(self: Self):
self.params.SpecSimNetworkDeepscoreParameters.module_passed = True
logger.info("'SimNetworksManager/Ms2deepscoreNetworker': completed calculation")

@staticmethod
def filter_for_ms2deepscore(mz_array: np.ndarray) -> bool:
"""Filters features that have no peaks between 10 and 1000.
MS2DeepScore v0.5.0 has a function 'bin_number_array_fixed()' in file
'spectrum_binning_fixed.py' that raises an AssertionError if all peaks are
below 10 and over 1000 m/z
Arguments:
mz_array: Numpy array of peak m/z positions
"""
new_array = mz_array[(mz_array >= 10.0) & (mz_array <= 1000.0)]
if len(new_array) == 0:
return True
else:
return False
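An editorial usage sketch of the new filter (not part of the commit); the standalone function below copies the static method's logic so the example is self-contained.

import numpy as np


def filter_for_ms2deepscore(mz_array: np.ndarray) -> bool:  # copy of the static method above
    new_array = mz_array[(mz_array >= 10.0) & (mz_array <= 1000.0)]
    return len(new_array) == 0


filter_for_ms2deepscore(np.array([150.2, 420.7, 999.9]))  # False: at least one peak in 10-1000 m/z, keep
filter_for_ms2deepscore(np.array([5.3, 1050.0, 2300.4]))  # True: all peaks outside the window, exclude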

def filter_input_spectra(
self: Self,
features: tuple,
feature_repo: Repository,
msms_min_frag_nr: int,
algorithm: str,
) -> dict[str, set]:
"""Filter features for spectral similarity analysis based on given restrictions.
Arguments:
features: a tuple of feature IDs
feature_repo: containing GeneralFeature objects with feature info
msms_min_frag_nr: minimum number of fragments per spectrum to be considered
algorithm: a flag indicating the calling algorithm
Returns:
A dictionary with sets of included and excluded feature IDs.
@@ -275,6 +297,11 @@ def filter_input_spectra(
self.log_filtered_feature_nr_fragments(
f_id, len(feature.Spectrum.peaks.mz), msms_min_frag_nr
)
elif algorithm == "ms2deepscore":
if self.filter_for_ms2deepscore(feature.Spectrum.peaks.mz):
excluded.add(f_id)
else:
included.add(f_id)
else:
included.add(f_id)

