Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Development #130

Merged
merged 7 commits into from
Jan 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions alphabase/peptide/precursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ def get_mod_seq_isotope_distribution(
right_most_idx-mono,
)

def calc_precursor_isotope(
def calc_precursor_isotope_info(
precursor_df:pd.DataFrame,
min_right_most_intensity:float=0.2,
):
Expand Down Expand Up @@ -484,7 +484,7 @@ def _count_batchify_df(df_group, mp_batch_size):

# `process_bar` should be replaced by more advanced tqdm wrappers created by Sander
# I will leave it to alphabase.utils
def calc_precursor_isotope_mp(
def calc_precursor_isotope_info_mp(
precursor_df:pd.DataFrame,
processes:int=8,
mp_batch_size:int=10000,
Expand Down Expand Up @@ -521,8 +521,8 @@ def calc_precursor_isotope_mp(
DataFrame with `isotope_*` columns,
    see :meth:`calc_precursor_isotope_info`.
"""
if len(precursor_df) < min_precursor_num_to_run_mp:
return calc_precursor_isotope(
if len(precursor_df) < min_precursor_num_to_run_mp or processes<=1:
return calc_precursor_isotope_info(
precursor_df=precursor_df,
min_right_most_intensity=min_right_most_intensity,
)
Expand All @@ -531,7 +531,7 @@ def calc_precursor_isotope_mp(
with mp.get_context("spawn").Pool(processes) as p:
processing = p.imap(
partial(
calc_precursor_isotope,
calc_precursor_isotope_info,
min_right_most_intensity=min_right_most_intensity
), _batchify_df(df_group, mp_batch_size)
)
Expand Down Expand Up @@ -662,6 +662,14 @@ def calc_precursor_isotope_intensity_mp(

"""

if mp_process_num <= 1:
return calc_precursor_isotope_intensity(
precursor_df=precursor_df,
max_isotope=max_isotope,
min_right_most_intensity=min_right_most_intensity,
normalize=normalize
)

df_list = []
df_group = precursor_df.groupby('nAA')

Expand All @@ -680,4 +688,7 @@ def calc_precursor_isotope_intensity_mp(
else:
df_list = list(processing)

return pd.concat(df_list, ignore_index=True)
return pd.concat(df_list, ignore_index=True)

# Backward-compatibility aliases: keep the old public names working after the
# original implementations were renamed to `calc_precursor_isotope_info` /
# `calc_precursor_isotope_info_mp`. The old names now point at the
# intensity-based implementations.
calc_precursor_isotope = calc_precursor_isotope_intensity
calc_precursor_isotope_mp = calc_precursor_isotope_intensity_mp
10 changes: 7 additions & 3 deletions alphabase/protein/fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,7 @@ def process_from_naked_peptide_seqs(self):
self.add_special_modifications()
self.add_peptide_labeling()
self.add_charge()
self.calc_and_clip_precursor_mz()

def get_peptides_from_fasta(self, fasta_file:Union[str,list]):
"""Load peptide sequences from fasta files.
Expand Down Expand Up @@ -997,7 +998,8 @@ def get_peptides_from_fasta_list(self, fasta_files:list):
protein_dict = load_all_proteins(fasta_files)
self.get_peptides_from_protein_dict(protein_dict)

def _get_peptides_from_protein_df(self):
def get_peptides_from_protein_df(self, protein_df:pd.DataFrame):
self.protein_df = protein_df
if self.I_to_L:
self.protein_df[
'sequence_I2L'
Expand Down Expand Up @@ -1025,10 +1027,10 @@ def get_peptides_from_protein_dict(self, protein_dict:dict):
}
```
"""
self.protein_df = pd.DataFrame.from_dict(
protein_df = pd.DataFrame.from_dict(
protein_dict, orient='index'
).reset_index(drop=True)
self._get_peptides_from_protein_df()
self.get_peptides_from_protein_df(protein_df)

def _cleave_to_peptides(self,
protein_df:pd.DataFrame,
Expand Down Expand Up @@ -1270,6 +1272,8 @@ def add_peptide_labeling(self, labeling_channel_dict:dict=None):
def add_charge(self):
"""Add charge states
"""
if "charge" in self._precursor_df.columns:
return
self._precursor_df['charge'] = [
np.arange(
self.min_precursor_charge,
Expand Down
3 changes: 1 addition & 2 deletions alphabase/protein/protein_level_decoy.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,7 @@ def _decoy_protein_df(self):
def _generate_decoy_sequences(self):
_target_prot_df = self.target_lib.protein_df
_target_pep_df = self.target_lib.precursor_df
self.target_lib.protein_df = self.protein_df
self.target_lib._get_peptides_from_protein_df()
self.target_lib.get_peptides_from_protein_df(self.protein_df)
self._precursor_df = self.target_lib.precursor_df
self.target_lib.protein_df = _target_prot_df
self.target_lib._precursor_df = _target_pep_df
Expand Down
1 change: 1 addition & 0 deletions alphabase/psm_reader/pfind_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,4 @@ def _load_modifications(self, pfind_df):
)

# Register the same reader class under both names so that search results from
# pFind can be loaded whether the engine is referred to as 'pfind' or 'pfind3'.
psm_reader_provider.register_reader('pfind', pFindReader)
psm_reader_provider.register_reader('pfind3', pFindReader)
53 changes: 35 additions & 18 deletions alphabase/spectral_library/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,25 +310,25 @@ def clip_by_precursor_mz_(self):

def calc_precursor_mz(self):
"""
Calculate precursor mz for self._precursor_df,
and clip the self._precursor_df using `self.clip_by_precursor_mz_`
Calculate precursor mz for self._precursor_df
"""
fragment.update_precursor_mz(self._precursor_df)

def update_precursor_mz(self):
def calc_and_clip_precursor_mz(self):
    """Compute ``precursor_mz`` for ``self._precursor_df`` and then remove
    precursors that fall outside the allowed m/z window via
    ``self.clip_by_precursor_mz_``.
    """
    # Order matters: the m/z column must exist before clipping on it.
    self.calc_precursor_mz()
    self.clip_by_precursor_mz_()

def calc_precursor_isotope_intensity(self,
multiprocessing : bool=True,
max_isotope = 6,
min_right_most_intensity = 0.001,
mp_batch_size = 10000,
mp_process_num = 8
):
mp_process_num = 8,
normalize:typing.Literal['mono','sum'] = "sum",
):
"""
Calculate and append the isotope intensity columns into self.precursor_df.
See `alphabase.peptide.precursor.calc_precursor_isotope_intensity` for details.
Expand All @@ -351,53 +351,70 @@ def calc_precursor_isotope_intensity(self,
"""

if 'precursor_mz' not in self._precursor_df.columns:
self.calc_precursor_mz()
self.clip_by_precursor_mz_()
self.calc_and_clip_precursor_mz()

if multiprocessing and len(self.precursor_df)>mp_batch_size:
if mp_process_num>1 and len(self.precursor_df)>mp_batch_size:
(
self._precursor_df
) = precursor.calc_precursor_isotope_intensity_mp(
self.precursor_df,
max_isotope = max_isotope,
min_right_most_intensity = min_right_most_intensity,
normalize=normalize,
mp_process_num = mp_process_num,
mp_batch_size=mp_batch_size,
)
else:
(
self._precursor_df
) = precursor.calc_precursor_isotope_intensity(
self.precursor_df,
max_isotope = max_isotope,
normalize=normalize,
min_right_most_intensity = min_right_most_intensity,
)


def calc_precursor_isotope(self,
    max_isotope = 6,
    min_right_most_intensity = 0.001,
    mp_batch_size = 10000,
    mp_process_num = 8,
    normalize:typing.Literal['mono','sum'] = "sum",
):
    """Backward-compatible wrapper: forwards every argument unchanged to
    :meth:`calc_precursor_isotope_intensity` and returns its result.
    """
    # Collect the forwarded keyword arguments once, then delegate.
    forwarded = dict(
        max_isotope=max_isotope,
        min_right_most_intensity=min_right_most_intensity,
        normalize=normalize,
        mp_batch_size=mp_batch_size,
        mp_process_num=mp_process_num,
    )
    return self.calc_precursor_isotope_intensity(**forwarded)

def calc_precursor_isotope(self,
multiprocessing:bool=True,
def calc_precursor_isotope_info(self,
mp_process_num:int=8,
mp_process_bar=None,
min_precursor_num_to_run_mp:int=1000,
mp_batch_size = 10000,
):
"""
Append isotope columns into self.precursor_df.
        See `alphabase.peptide.precursor.calc_precursor_isotope_info` for details.
"""
if 'precursor_mz' not in self._precursor_df.columns:
self.calc_precursor_mz()
self.clip_by_precursor_mz_()
if multiprocessing and len(self.precursor_df)>min_precursor_num_to_run_mp:
self.calc_and_clip_precursor_mz()
if (
mp_process_num > 1 and
len(self.precursor_df)>mp_batch_size
):
(
self._precursor_df
) = precursor.calc_precursor_isotope_mp(
) = precursor.calc_precursor_isotope_info_mp(
self.precursor_df,
processes=mp_process_num,
process_bar=mp_process_bar,
)
else:
(
self._precursor_df
) = precursor.calc_precursor_isotope(
) = precursor.calc_precursor_isotope_info(
self.precursor_df
)

Expand Down
12 changes: 10 additions & 2 deletions alphabase/spectral_library/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,11 @@ def run(self):
while True:
df, batch = self.task_queue.get()
if df is None: break
df.to_csv(self.tsv, header=(batch==0), sep="\t", mode="a", index=False, lineterminator="\n")
if tuple([int(i) for i in pd.__version__.split(".")[:2]]) >= (1,5):
newline=dict(lineterminator="\n")
else:
newline=dict(line_terminator="\n")
df.to_csv(self.tsv, header=(batch==0), sep="\t", mode="a", index=False, **newline)

def translate_to_tsv(
speclib:SpecLibBase,
Expand Down Expand Up @@ -443,7 +447,11 @@ def translate_to_tsv(
if multiprocessing:
df_head_queue.put((df, i))
else:
df.to_csv(tsv, header=(i==0), sep="\t", mode='a', index=False, lineterminator="\n")
if tuple([int(i) for i in pd.__version__.split(".")[:2]]) >= (1,5):
newline=dict(lineterminator="\n")
else:
newline=dict(line_terminator="\n")
df.to_csv(tsv, header=(i==0), sep="\t", mode='a', index=False, **newline)
if multiprocessing:
df_head_queue.put((None, None))
print("Translation finished, it will take several minutes to export the rest precursors to the tsv file...")
Expand Down
Loading