Skip to content

Commit

Permalink
Merge pull request #293 from pymzml/dev
Browse files Browse the repository at this point in the history
Merge dev into master
  • Loading branch information
MKoesters authored Apr 11, 2022
2 parents a883ff0 + 1d0e0e1 commit fb0c560
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 18 deletions.
17 changes: 8 additions & 9 deletions pymzml/file_classes/standardMzml.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,9 +551,12 @@ def _interpol_search(self, target_index, chunk_size=8, fallback_cutoff=100):
current_position = seeker.tell()

elif len(data) == 0:
sorted_keys = sorted(self.offset_dict.keys())
sorted_int_keys = {
k: v for k, v in self.offset_dict.items() if isinstance(k, int)
}
sorted_keys = sorted(sorted_int_keys.keys())
pos = (
bisect.bisect_left(sorted_keys, target_index) - 2
bisect.bisect_left(sorted_int_keys, target_index) - 2
) # dat magic number :)
try:
key = sorted_keys[pos]
Expand Down Expand Up @@ -587,20 +590,16 @@ def _read_to_spec_end(self, seeker, chunks_to_read=8):
start_pos = seeker.tell()
data_chunk = seeker.read(chunk_size)
while end_found is False:
chunk_offset = seeker.tell()
data_chunk += seeker.read(chunk_size)
tag_end, seeker = self._read_until_tag_end(seeker)
data_chunk += tag_end
if regex_patterns.SPECTRUM_CLOSE_PATTERN.search(data_chunk):
match = regex_patterns.SPECTRUM_CLOSE_PATTERN.search(data_chunk)
relative_pos_in_chunk = match.end()
end_pos = chunk_offset + relative_pos_in_chunk
end_pos = match.end()
end_found = True
elif regex_patterns.CHROMATOGRAM_CLOSE_PATTERN.search(data_chunk):
match = regex_patterns.CHROMATOGRAM_CLOSE_PATTERN.search(data_chunk)
relative_pos_in_chunk = match.end()
end_pos = chunk_offset + relative_pos_in_chunk
end_pos = match.end()
end_found = True
return (start_pos, end_pos)

Expand Down Expand Up @@ -743,7 +742,7 @@ def _search_string_identifier(self, search_string, chunk_size=8):
file_pointer = seeker.tell()

data = seeker.read(total_chunk_size)
string, seeker = self._read_until_tag_end(seeker, byte_mode=True)
string, seeker = self._read_until_tag_end(seeker)
data += string
spec_start = regex_string.search(data)
chrom_start = regex_patterns.CHROMO_OPEN_PATTERN.search(data)
Expand All @@ -769,7 +768,7 @@ def _search_string_identifier(self, search_string, chunk_size=8):
elif len(data) == 0:
raise Exception("cant find specified string")

def _read_until_tag_end(self, seeker, max_search_len=12, byte_mode=False):
def _read_until_tag_end(self, seeker, max_search_len=12):
"""
Help make sure no splitted text appear in chunked data, so regex always find
<spectrum ...>
Expand Down
16 changes: 13 additions & 3 deletions pymzml/obo.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
import os
import re
import gzip
import urllib


class OboTranslator(object):
Expand Down Expand Up @@ -146,6 +147,15 @@ def __normalize_version(version):

return version

def download_obo(self, version, obo_file):
    """
    Download the psi-ms OBO file of one release and store it gzipped.

    The plain OBO file is fetched from the pymzml/psi-ms-CV GitHub
    repository to ``obo_file``, compressed to ``obo_file + ".gz"`` and the
    uncompressed download is removed afterwards.

    Args:
        version (str): release tag (without the leading ``v``) to download.
            Falls back to ``self.version`` when ``None`` is passed.
        obo_file (str): target path of the uncompressed OBO file; the
            gzipped copy is written next to it with a ``.gz`` suffix.

    Returns:
        None
    """
    # A bare "import urllib" does not load the "request" submodule, so
    # import it explicitly; both imports are local so this method does not
    # rely on other modules having imported them first.
    import shutil
    import urllib.request

    if version is None:
        version = self.version
    uri = f"https://raw.githubusercontent.com/pymzml/psi-ms-CV/v{version}/psi-ms.obo"
    urllib.request.urlretrieve(uri, obo_file)

    # Stream the copy in chunks instead of loading every line into memory.
    with open(obo_file, "rb") as fin, gzip.open(obo_file + ".gz", "wb") as fout:
        shutil.copyfileobj(fin, fout)
    os.remove(obo_file)
    return

def parseOBO(self):
self.__obo_parsed = True
"""
Expand All @@ -172,13 +182,13 @@ def parseOBO(self):
"obo",
"psi-ms{0}.obo".format("-" + self.version if self.version else ""),
)

if os.path.exists(obo_file):
pass
elif os.path.exists(obo_file + ".gz"):
obo_file = obo_file + ".gz"
else:
raise IOError("Could not find obo file {0}".format(obo_file))
self.download_obo(self.version, obo_file)
obo_file += ".gz"

with open(obo_file, "rb") as fin:
# never rely on file extensions!
Expand All @@ -192,7 +202,7 @@ def parseOBO(self):
"The file may be corrupted or not gzipped."
)

with open_func(obo_file, "rt", encoding='utf-8') as obo:
with open_func(obo_file, "rt", encoding="utf-8") as obo:
collections = {}
collect = False
for line in obo:
Expand Down
14 changes: 9 additions & 5 deletions pymzml/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
import xml.etree.ElementTree as ElementTree
from collections import defaultdict as ddict
from io import BytesIO
from pathlib import Path

from . import spec
from . import obo
Expand Down Expand Up @@ -105,19 +106,22 @@ def __init__(
0: 0.0001,
1: 5e-6,
2: 20e-6,
3: 20e-6,
}
self.ms_precisions.update(MS_precisions)

# File info
self.info = ddict()
self.path_or_file = path_or_file
if isinstance(path_or_file, str):
self.info["file_name"] = path_or_file
self.info["encoding"] = self._determine_file_encoding(path_or_file)
if isinstance(self.path_or_file, Path):
self.path_or_file = str(self.path_or_file)
if isinstance(self.path_or_file, str):
self.info["file_name"] = self.path_or_file
self.info["encoding"] = self._determine_file_encoding(self.path_or_file)
else:
self.info["encoding"] = self._guess_encoding(path_or_file)
self.info["encoding"] = self._guess_encoding(self.path_or_file)

self.info["file_object"] = self._open_file(path_or_file)
self.info["file_object"] = self._open_file(self.path_or_file)
self.info["offset_dict"] = self.info["file_object"].offset_dict
if obo_version:
self.info["obo_version"] = self._obo_version_validator(obo_version)
Expand Down
5 changes: 4 additions & 1 deletion pymzml/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,7 @@ def __init__(self, element=ElementTree.Element(""), measured_precision=5e-6):
self._t_mass_set = None
self._t_mz_set = None
self._TIC = None
self._precursors = None
self._transformed_mass_with_error = None
self._transformed_mz_with_error = None
self._transformed_peaks = None
Expand Down Expand Up @@ -883,6 +884,8 @@ def scan_time_in_minutes(self):
"""
if self._scan_time_in_minutes is None:
self._scan_time, time_unit = self.scan_time
if self._scan_time_unit.lower() == "millisecond":
self._scan_time_in_minutes = self._scan_time / 1000.0 / 60.0
if self._scan_time_unit.lower() == "second":
self._scan_time_in_minutes = self._scan_time / 60.0
elif self._scan_time_unit.lower() == "minute":
Expand Down Expand Up @@ -959,7 +962,7 @@ def precursors(self):
precursor(list): list of precursor ids for this spectrum.
"""
self.deprecation_warning(sys._getframe().f_code.co_name)
if self._precursors is None:
if not self._precursors:
precursors = self.element.findall(
"./{ns}precursorList/{ns}precursor".format(ns=self.ns)
)
Expand Down

0 comments on commit fb0c560

Please sign in to comment.