Merge pull request #25 from AberystwythSystemsBiology/normalisation

New Release
AberystwythSystemsBiology · Sep 3, 2020 · 057149e · 057149e
2 parents ee08170 + fd4ca48
commit 057149e
Show file tree

Hide file tree

Showing 6 changed files with 209 additions and 80 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,6 +1,5 @@
 language: python
 python:
-    - "3.5"
     - "3.7"
 install:
     - pip install .

diff --git a/dimepy/spectrum.py b/dimepy/spectrum.py
@@ -1,22 +1,7 @@
 # -*- coding: utf-8 -*-
 # encoding: utf-8
 
-"""
-The class :py:class:`Spectrum` has been designed to load data
-from a mzML file and to represetn the data as a python object.
-
-.. note::
-    This class is still being actively developed and will likely change over time.
-"""
-import numpy as np
-from typing import Tuple, List
-from scipy.stats import binned_statistic
-from pymzml.run import Reader as pymzmlReader
-from .scan import Scan
-from .utils import terms, bin_masses_and_intensities
-import itertools
-
-# Copyright (c) 2017-2019 Keiron O'Shea
+# Copyright (c) 2017-2020 Keiron O'Shea
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public
@@ -33,6 +18,26 @@
 # Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 # Boston, MA 02110-1301 USA
 
+
+import numpy as np
+import logging
+from typing import Tuple, List
+from scipy.stats import binned_statistic
+from pymzml.run import Reader as pymzmlReader
+from .scan import Scan
+from .utils import terms, bin_masses_and_intensities
+import itertools
+import os
+import matplotlib.pyplot as plt
+
+"""
+The class :py:class:`Spectrum` has been designed to load data
+from a mzML file.
+
+.. note::
+    This class is still being actively developed and will likely change over time.
+"""
+
 class Spectrum(object):
 
     """ Initialise Spectrum object for a given mzML file.
@@ -41,11 +46,12 @@ class Spectrum(object):
 
     def __init__(self,
                  filepath: str,
-                 identifier: str,
+                 identifier: str = None,
                  injection_order: int = None,
                  stratification: str = None,
                  snr_estimator: str = False,
                  peak_type: str = "raw",
+                 is_qc: bool= False,
                  MS1_precision: float = 5e-6,
                  MSn_precision: float = 20e-6
                  ):
@@ -57,7 +63,8 @@ def __init__(self,
         Arguments:
             filepath (str): Path to the mzML file to parse.
 
-            identifier (str): Unique identifier for the Spectrum object.
+            identifier (str): Unique identifier for the Spectrum object. If none is given,
+                it will default to the filepath.
 
             injection_order (int): The injection number of the Spectrum object.
 
@@ -70,24 +77,31 @@ def __init__(self,
                     * 'mean'
                     * 'mad'
 
-            peak_type (raw): What peak type to load in.
+            peak_type (str): What peak type to load in.
 
                 Currently supported peak types are:
                     * raw (default)
                     * centroided
                     * reprofiled
 
-        MS1_precision (float): Measured precision for the MS level 1.
+            is_qc (bool): Whether the Sample is QC.
 
-        MSn_precision (float): Measured precision for the MS level n.
+            MS1_precision (float): Measured precision for the MS level 1.
+
+            MSn_precision (float): Measured precision for the MS level n.
 
     """
         self.filepath = filepath
         self.identifier = identifier
+
+        if identifier == None:
+            self.identifier = os.path.basename(filepath)
+
         self.injection_order = injection_order
         self.stratification = stratification
         self.snr_estimator = snr_estimator
         self.peak_type = peak_type
+        self.is_qc = is_qc
         self.MS1_precision = MS1_precision
         self.MSn_precision = MSn_precision
 
@@ -99,6 +113,8 @@ def __init__(self,
 
     def _base_load(self) -> Tuple[np.array, np.array]:
 
+        logging.info("Loading %s" % (self.filepath))
+
         extraAccessions = [
             [[y, ["value"]] for y in terms[x]] for x in terms.keys()
         ]
@@ -122,7 +138,7 @@ def _base_load(self) -> Tuple[np.array, np.array]:
 
         return np.array(scans), np.array(to_use)
 
-    def limit_polarity(self, polarity: str) -> None:
+    def limit_polarity(self, polarity: str, verbose: bool = False) -> None:
         """
         Limit the Scans found within the mzML file to whatever polarity is given.
         This should only be called where fast-polarity switching is used.
@@ -134,21 +150,30 @@ def limit_polarity(self, polarity: str) -> None:
                     * 'positive'
                     * 'negative'
 
+            verbose (bool): enable verbose output.
+
         """
 
+        if polarity.upper() not in ["POSITIVE", "NEGATIVE"]:
+            raise AttributeError("%s not a valid option" % (polarity))
+
         def _determine_polarity(scan) -> str:
             scan_polarity = None
             for polarity_acc in terms["polarity"]:
                 if scan.get(polarity_acc) != None:
                     scan_polarity = terms["polarity"][polarity_acc]
-
             return scan_polarity
 
         for index, scan in enumerate(self._scans):
             if _determine_polarity(scan) != polarity.upper():
                 self._to_use[index] = False
+                logging.info("Scan %i is not %s polarity" % (index, polarity))
+
+            if verbose and self._to_use[index]:
+                print("Scan %i is %s polarity" % (index, polarity))
+
 
-    def limit_infusion(self, threshold: int = 3) -> None:
+    def limit_infusion(self, threshold: int = 3, plot = False) -> None:
         """
         This method is a slight extension of Manfred Beckmann's (meb@aber.ac.uk)
         LCT/Q-ToF scan retrieval method in FIEMSpro in which we use the median absolute
@@ -161,7 +186,7 @@ def limit_infusion(self, threshold: int = 3) -> None:
                 / \ 
                /   \_
           ____/       \_________________
-          0     0.5     1     1.5     2 [min]
+          0     0.5     1     1.5     n [scan number]
               |--------| Apex
         
         We are only interested in the scans in which the infusion takes place
@@ -172,7 +197,9 @@ def limit_infusion(self, threshold: int = 3) -> None:
 
             mad_multiplier (int): The multiplier for the median absolute
                 deviation method to take the infusion profile from.
-        
+
+            plot (bool): Plot the results.
+
         """
 
         def _calculate_mad(tics: np.array) -> float:
@@ -190,11 +217,34 @@ def _get_mask(tics: np.array, mad: float) -> np.array:
 
             return modified_z_score >= threshold
 
+        def _plot(tics: np.array, mad: float, ini_scan: int, end_scan: int):
+            plt.figure()
+            plt.title("Apex Selection Plot \n%s" % (self.identifier))
+            plt.plot(tics)
+            plt.xlim(0, len(tics))
+            plt.ylim(0, max(tics * 1.1))
+            plt.axvline(ini_scan, ls="--", c="red")
+            plt.axvline(end_scan, ls="--", c="red")
+            plt.xlabel("Scan Number")
+            plt.ylabel("Total Ion Current (TIC)")
+            plt.tight_layout()
+            if type(plot) == bool:
+                plt.show()
+            else:
+                plt.savefig(plot)
+
         tics = np.array([scan.TIC for scan in self._scans[self._to_use]])
         mad = _calculate_mad(tics)
-
         apex_index = _get_mask(tics, mad)
 
+        logging.info("Mean Absolute Deviation Calculated as %f TIC" % (mad))
+
+        ini_scan = np.min(np.where(apex_index == True)[0])
+        end_scan = np.max(np.where(apex_index == True)[0])
+
+        if plot:
+            _plot(tics, mad, ini_scan, end_scan)
+
         to_use = np.where(self._to_use == True)[0]
 
         for i, j in enumerate(to_use):
@@ -232,8 +282,10 @@ def _load_masses_and_ints_from_scans(self) -> None:
         intensities = []
 
         for scan in self.read_scans:
-            masses.extend(scan.masses)
-            intensities.extend(scan.intensities)
+            for m, i in zip(scan.masses, scan.intensities):
+                if i > 0.0:
+                    masses.append(m)
+                    intensities.append(i)
 
         masses = np.array(masses)
         intensities = np.array(intensities)
@@ -387,6 +439,10 @@ def intensities(self) -> np.array:
             return self._intensities
         else:
             raise ValueError("No intensities generated, run load_scans() first!")
+
+    @property
+    def TIC(self) -> float:
+        return sum(self.intensities)
 
     @property
     def to_use(self) -> List[bool]: