diff --git a/pymzml/file_classes/standardMzml.py b/pymzml/file_classes/standardMzml.py index 7e413132..a8115410 100755 --- a/pymzml/file_classes/standardMzml.py +++ b/pymzml/file_classes/standardMzml.py @@ -551,9 +551,12 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100): current_position = seeker.tell() elif len(data) == 0: - sorted_keys = sorted(self.offset_dict.keys()) + sorted_int_keys = { + k: v for k, v in self.offset_dict.items() if isinstance(k, int) + } + sorted_keys = sorted(sorted_int_keys.keys()) pos = ( bisect.bisect_left(sorted_keys, target_index) - 2 ) # dat magic number :) try: key = sorted_keys[pos] @@ -587,20 +590,16 @@ def _read_to_spec_end(self, seeker, chunks_to_read=8): start_pos = seeker.tell() data_chunk = seeker.read(chunk_size) while end_found is False: - chunk_offset = seeker.tell() data_chunk += seeker.read(chunk_size) tag_end, seeker = self._read_until_tag_end(seeker) data_chunk += tag_end if regex_patterns.SPECTRUM_CLOSE_PATTERN.search(data_chunk): match = regex_patterns.SPECTRUM_CLOSE_PATTERN.search(data_chunk) - relative_pos_in_chunk = match.end() - end_pos = chunk_offset + relative_pos_in_chunk end_pos = match.end() end_found = True elif regex_patterns.CHROMATOGRAM_CLOSE_PATTERN.search(data_chunk): match = regex_patterns.CHROMATOGRAM_CLOSE_PATTERN.search(data_chunk) - relative_pos_in_chunk = match.end() - end_pos = chunk_offset + relative_pos_in_chunk + end_pos = match.end() end_found = True return (start_pos, end_pos) @@ -743,7 +742,7 @@ def _search_string_identifier(self, search_string, chunk_size=8): file_pointer = seeker.tell() data = seeker.read(total_chunk_size) - string, seeker = self._read_until_tag_end(seeker, byte_mode=True) + string, seeker = self._read_until_tag_end(seeker) data += string spec_start = regex_string.search(data) chrom_start = regex_patterns.CHROMO_OPEN_PATTERN.search(data) @@ -769,7 +768,7 @@ def 
_search_string_identifier(self, search_string, chunk_size=8): elif len(data) == 0: raise Exception("cant find specified string") - def _read_until_tag_end(self, seeker, max_search_len=12, byte_mode=False): + def _read_until_tag_end(self, seeker, max_search_len=12): """ Help make sure no splitted text appear in chunked data, so regex always find diff --git a/pymzml/obo.py b/pymzml/obo.py index a3a9d07f..e8cdd50e 100755 --- a/pymzml/obo.py +++ b/pymzml/obo.py @@ -84,6 +84,7 @@ import os import re import gzip +import urllib.request class OboTranslator(object): @@ -146,6 +147,15 @@ def __normalize_version(version): return version + def download_obo(self, version, obo_file): + uri = f"https://raw.githubusercontent.com/pymzml/psi-ms-CV/v{version}/psi-ms.obo" + urllib.request.urlretrieve(uri, obo_file) + + with open(obo_file, "rb") as fin, gzip.open(obo_file + ".gz", "wb") as fout: + fout.writelines(fin.readlines()) + os.remove(obo_file) + return + def parseOBO(self): self.__obo_parsed = True """ @@ -172,13 +182,13 @@ def parseOBO(self): "obo", "psi-ms{0}.obo".format("-" + self.version if self.version else ""), ) - if os.path.exists(obo_file): pass elif os.path.exists(obo_file + ".gz"): obo_file = obo_file + ".gz" else: - raise IOError("Could not find obo file {0}".format(obo_file)) + self.download_obo(self.version, obo_file) + obo_file += ".gz" with open(obo_file, "rb") as fin: # never rely on file extensions! @@ -192,7 +202,7 @@ def parseOBO(self): "The file may be corrupted or not gzipped." ) - with open_func(obo_file, "rt", encoding='utf-8') as obo: + with open_func(obo_file, "rt", encoding="utf-8") as obo: collections = {} collect = False for line in obo: diff --git a/pymzml/run.py b/pymzml/run.py index 22fc15bb..2f76456a 100755 --- a/pymzml/run.py +++ b/pymzml/run.py @@ -43,6 +43,7 @@ import xml.etree.ElementTree as ElementTree from collections import defaultdict as ddict from io import BytesIO +from pathlib import Path from . import spec from . 
import obo @@ -105,19 +106,22 @@ def __init__( 0: 0.0001, 1: 5e-6, 2: 20e-6, + 3: 20e-6, } self.ms_precisions.update(MS_precisions) # File info self.info = ddict() self.path_or_file = path_or_file - if isinstance(path_or_file, str): - self.info["file_name"] = path_or_file - self.info["encoding"] = self._determine_file_encoding(path_or_file) + if isinstance(self.path_or_file, Path): + self.path_or_file = str(self.path_or_file) + if isinstance(self.path_or_file, str): + self.info["file_name"] = self.path_or_file + self.info["encoding"] = self._determine_file_encoding(self.path_or_file) else: - self.info["encoding"] = self._guess_encoding(path_or_file) + self.info["encoding"] = self._guess_encoding(self.path_or_file) - self.info["file_object"] = self._open_file(path_or_file) + self.info["file_object"] = self._open_file(self.path_or_file) self.info["offset_dict"] = self.info["file_object"].offset_dict if obo_version: self.info["obo_version"] = self._obo_version_validator(obo_version) diff --git a/pymzml/spec.py b/pymzml/spec.py index ce0da30e..8c36cbc2 100755 --- a/pymzml/spec.py +++ b/pymzml/spec.py @@ -426,6 +426,7 @@ def __init__(self, element=ElementTree.Element(""), measured_precision=5e-6): self._t_mass_set = None self._t_mz_set = None self._TIC = None + self._precursors = None self._transformed_mass_with_error = None self._transformed_mz_with_error = None self._transformed_peaks = None @@ -883,6 +884,8 @@ def scan_time_in_minutes(self): """ if self._scan_time_in_minutes is None: self._scan_time, time_unit = self.scan_time + if self._scan_time_unit.lower() == "millisecond": + self._scan_time_in_minutes = self._scan_time / 1000.0 / 60.0 - if self._scan_time_unit.lower() == "second": + elif self._scan_time_unit.lower() == "second": self._scan_time_in_minutes = self._scan_time / 60.0 elif self._scan_time_unit.lower() == "minute": @@ -959,7 +962,7 @@ def precursors(self): precursor(list): list of precursor ids for this spectrum. 
""" self.deprecation_warning(sys._getframe().f_code.co_name) - if self._precursors is None: + if not self._precursors: precursors = self.element.findall( "./{ns}precursorList/{ns}precursor".format(ns=self.ns) )