Merge pull request #293 from pymzml/dev

Merge dev into master
pymzml · Apr 11, 2022 · fb0c560 · fb0c560
2 parents a883ff0 + 1d0e0e1
commit fb0c560
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 18 deletions.
diff --git a/pymzml/file_classes/standardMzml.py b/pymzml/file_classes/standardMzml.py
@@ -551,9 +551,12 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):
                     current_position = seeker.tell()
 
             elif len(data) == 0:
-                sorted_keys = sorted(self.offset_dict.keys())
+                sorted_int_keys = {
+                    k: v for k, v in self.offset_dict.items() if isinstance(k, int)
+                }
+                sorted_keys = sorted(sorted_int_keys.keys())
                 pos = (
-                    bisect.bisect_left(sorted_keys, target_index) - 2
+                    bisect.bisect_left(sorted_int_keys, target_index) - 2
                 )  # dat magic number :)
                 try:
                     key = sorted_keys[pos]
@@ -587,20 +590,16 @@ def _read_to_spec_end(self, seeker, chunks_to_read=8):
         start_pos = seeker.tell()
         data_chunk = seeker.read(chunk_size)
         while end_found is False:
-            chunk_offset = seeker.tell()
             data_chunk += seeker.read(chunk_size)
             tag_end, seeker = self._read_until_tag_end(seeker)
             data_chunk += tag_end
             if regex_patterns.SPECTRUM_CLOSE_PATTERN.search(data_chunk):
                 match = regex_patterns.SPECTRUM_CLOSE_PATTERN.search(data_chunk)
-                relative_pos_in_chunk = match.end()
-                end_pos = chunk_offset + relative_pos_in_chunk
                 end_pos = match.end()
                 end_found = True
             elif regex_patterns.CHROMATOGRAM_CLOSE_PATTERN.search(data_chunk):
                 match = regex_patterns.CHROMATOGRAM_CLOSE_PATTERN.search(data_chunk)
-                relative_pos_in_chunk = match.end()
-                end_pos = chunk_offset + relative_pos_in_chunk
+                end_pos = match.end()
                 end_found = True
         return (start_pos, end_pos)
 
@@ -743,7 +742,7 @@ def _search_string_identifier(self, search_string, chunk_size=8):
                 file_pointer = seeker.tell()
 
                 data = seeker.read(total_chunk_size)
-                string, seeker = self._read_until_tag_end(seeker, byte_mode=True)
+                string, seeker = self._read_until_tag_end(seeker)
                 data += string
                 spec_start = regex_string.search(data)
                 chrom_start = regex_patterns.CHROMO_OPEN_PATTERN.search(data)
@@ -769,7 +768,7 @@ def _search_string_identifier(self, search_string, chunk_size=8):
                 elif len(data) == 0:
                     raise Exception("cant find specified string")
 
-    def _read_until_tag_end(self, seeker, max_search_len=12, byte_mode=False):
+    def _read_until_tag_end(self, seeker, max_search_len=12):
         """
         Help make sure no splitted text appear in chunked data, so regex always find
         <spectrum ...>

diff --git a/pymzml/obo.py b/pymzml/obo.py
@@ -84,6 +84,7 @@
 import os
 import re
 import gzip
+import urllib
 
 
 class OboTranslator(object):
@@ -146,6 +147,15 @@ def __normalize_version(version):
 
         return version
 
+    def download_obo(self, version, obo_file):
+        """
+        Download one release of the PSI-MS OBO file and store it gzipped.
+
+        The file is fetched from the ``pymzml/psi-ms-CV`` repository on
+        GitHub, written to ``obo_file``, compressed into
+        ``obo_file + ".gz"`` and the uncompressed download is then removed.
+        Errors raised by the download or by the file operations are not
+        caught here and propagate to the caller.
+
+        Arguments:
+            version: release identifier, interpolated into the URL as the
+                ``v<version>`` tag.
+            obo_file (str): path the uncompressed download is written to;
+                the gzipped copy is placed next to it with a ``.gz`` suffix.
+        """
+        # A bare ``import urllib`` does not load the ``request`` submodule,
+        # so import it explicitly where it is used.
+        import shutil
+        import urllib.request
+
+        uri = f"https://raw.githubusercontent.com/pymzml/psi-ms-CV/v{version}/psi-ms.obo"
+        urllib.request.urlretrieve(uri, obo_file)
+
+        with open(obo_file, "rb") as fin, gzip.open(obo_file + ".gz", "wb") as fout:
+            # Stream in chunks instead of reading every line into memory.
+            shutil.copyfileobj(fin, fout)
+        # Remove the plain copy only after both handles are closed; deleting
+        # a file that is still open fails on Windows.
+        os.remove(obo_file)
+
     def parseOBO(self):
         self.__obo_parsed = True
         """
@@ -172,13 +182,13 @@ def parseOBO(self):
             "obo",
             "psi-ms{0}.obo".format("-" + self.version if self.version else ""),
         )
-
         if os.path.exists(obo_file):
             pass
         elif os.path.exists(obo_file + ".gz"):
             obo_file = obo_file + ".gz"
         else:
-            raise IOError("Could not find obo file {0}".format(obo_file))
+            self.download_obo(self.version, obo_file)
+            obo_file += ".gz"
 
         with open(obo_file, "rb") as fin:
             # never rely on file extensions!
@@ -192,7 +202,7 @@ def parseOBO(self):
                     "The file may be corrupted or not gzipped."
                 )
 
-        with open_func(obo_file, "rt", encoding='utf-8') as obo:
+        with open_func(obo_file, "rt", encoding="utf-8") as obo:
             collections = {}
             collect = False
             for line in obo:

diff --git a/pymzml/run.py b/pymzml/run.py
@@ -43,6 +43,7 @@
 import xml.etree.ElementTree as ElementTree
 from collections import defaultdict as ddict
 from io import BytesIO
+from pathlib import Path
 
 from . import spec
 from . import obo
@@ -105,19 +106,22 @@ def __init__(
             0: 0.0001,
             1: 5e-6,
             2: 20e-6,
+            3: 20e-6,
         }
         self.ms_precisions.update(MS_precisions)
 
         # File info
         self.info = ddict()
         self.path_or_file = path_or_file
-        if isinstance(path_or_file, str):
-            self.info["file_name"] = path_or_file
-            self.info["encoding"] = self._determine_file_encoding(path_or_file)
+        if isinstance(self.path_or_file, Path):
+            self.path_or_file = str(self.path_or_file)
+        if isinstance(self.path_or_file, str):
+            self.info["file_name"] = self.path_or_file
+            self.info["encoding"] = self._determine_file_encoding(self.path_or_file)
         else:
-            self.info["encoding"] = self._guess_encoding(path_or_file)
+            self.info["encoding"] = self._guess_encoding(self.path_or_file)
 
-        self.info["file_object"] = self._open_file(path_or_file)
+        self.info["file_object"] = self._open_file(self.path_or_file)
         self.info["offset_dict"] = self.info["file_object"].offset_dict
         if obo_version:
             self.info["obo_version"] = self._obo_version_validator(obo_version)

diff --git a/pymzml/spec.py b/pymzml/spec.py
@@ -426,6 +426,7 @@ def __init__(self, element=ElementTree.Element(""), measured_precision=5e-6):
         self._t_mass_set = None
         self._t_mz_set = None
         self._TIC = None
+        self._precursors = None
         self._transformed_mass_with_error = None
         self._transformed_mz_with_error = None
         self._transformed_peaks = None
@@ -883,6 +884,8 @@ def scan_time_in_minutes(self):
         """
         if self._scan_time_in_minutes is None:
             self._scan_time, time_unit = self.scan_time
+            if self._scan_time_unit.lower() == "millisecond":
+                self._scan_time_in_minutes = self._scan_time / 1000.0 / 60.0
             if self._scan_time_unit.lower() == "second":
                 self._scan_time_in_minutes = self._scan_time / 60.0
             elif self._scan_time_unit.lower() == "minute":
@@ -959,7 +962,7 @@ def precursors(self):
             precursor(list): list of precursor ids for this spectrum.
         """
         self.deprecation_warning(sys._getframe().f_code.co_name)
-        if self._precursors is None:
+        if not self._precursors:
             precursors = self.element.findall(
                 "./{ns}precursorList/{ns}precursor".format(ns=self.ns)
             )