Merge pull request #28 from MannLabs/development

Development
MannLabs · Jul 27, 2022 · 80aaf73 · 80aaf73
2 parents 5911f35 + 17069b4
commit 80aaf73
Show file tree

Hide file tree

Showing 31 changed files with 13,737 additions and 94 deletions.
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -0,0 +1,27 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Build documentation in the docs/ directory with Sphinx
+sphinx:
+   configuration: docs/conf.py
+
+# Optionally build your docs in additional formats such as PDF
+formats: all
+
+conda:
+  environment: docs/conda_development_environment.yaml
+
+# # Optionally set the version of Python and requirements required to build your docs
+# python:
+#   version: 3.8
+#   install:
+#     - requirements: requirements/requirements.txt
+#     - method: pip
+#       path: .
+#       extra_requirements:
+#         - plotting
+#         - development
diff --git a/HISTORY.md b/HISTORY.md
@@ -1,5 +1,11 @@
 ## Changelog
 
+### 1.1.14
+* FIX: Fix the m/z fragment calculation for the modified peptides.
+* FIX: Fix the extraction of the protein names and sequences for the protein groups in DIA-NN.
+* FEAT: Use deep learning prediction for raw files without having to provide output.
+* STYLE: Update the tutorial and add a notebook demonstrating the use of the predict mode.
+
 ### 1.1.13
 * FEAT: Enable fine-tuning of retention times for predictions.
 * FEAT: Add the predicted retention time and ion mobility values to the "Main tab" of the dashboard.

diff --git a/alphaviz/__init__.py b/alphaviz/__init__.py
@@ -1,7 +1,7 @@
 #!python
 
 __project__ = "alphaviz"
-__version__ = "1.1.13"
+__version__ = "1.1.14"
 __license__ = "Apache"
 __description__ = "A interactive Dashboard to explore mass spectrometry data."
 __author__ = "Eugenia Voytik"

diff --git a/alphaviz/docs/alphaviz_tutorial.docx b/alphaviz/docs/alphaviz_tutorial.docx
diff --git a/alphaviz/docs/alphaviz_tutorial.pdf b/alphaviz/docs/alphaviz_tutorial.pdf
diff --git a/alphaviz/gui.py b/alphaviz/gui.py
diff --git a/alphaviz/io.py b/alphaviz/io.py
@@ -260,7 +260,8 @@ def import_mq_all_peptides(
 
 
 def import_mq_msms(
-    filepath: str
+    filepath: str,
+    experiment: str
 ) -> pd.DataFrame:
     """Read some columns from the output file msms.txt of MaxQuant software.
 
@@ -281,6 +282,7 @@ def import_mq_msms(
     """
     try:
         maxquant_msms_columns = [
+            'Raw file',
             'Scan number',
             'Matches',
             'Masses',
@@ -290,13 +292,15 @@ def import_mq_msms(
         data_common = read_file(filepath, maxquant_msms_columns)
     except ValueError:
         maxquant_msms_columns = [
+            'Raw file',
             'Scan number',
             'Matches',
             'Masses',
             'Mass Deviations [Da]',
             'Mass Deviations [ppm]'
         ]
         data_common = read_file(filepath, maxquant_msms_columns)
+    data_common = data_common[data_common['Raw file'] == experiment]
     data_common.columns = [col.strip().replace('Deviations', 'deviations') for col in data_common.columns]
     data_common['Scan number'] = data_common['Scan number'].astype('int')
     return data_common
@@ -355,7 +359,7 @@ def import_mq_output(
             path_mq_output_folder,
             file
         )
-        if file in ['allPeptides.txt', 'msms.txt', 'summary.txt']:
+        if file in ['allPeptides.txt', 'summary.txt']:
             df = file_func_dict[file](
                 file_path
             )
@@ -489,7 +493,10 @@ def create_diann_peptides_table(
         The output data frame contains information about peptides.
     """
     peptides = diann_df.copy()
-    columns = [col for col in peptides.columns if 'PG' not in col and 'Protein' not in col and 'Genes' not in col and 'GG' not in col]
+    columns = [
+        col for col in peptides.columns if 'PG' not in col
+        and 'Protein' not in col and 'Genes' not in col and 'GG' not in col
+    ]
     columns.extend(['Genes'])
 
     peptides = diann_df[columns[2:]].copy()
@@ -502,11 +509,22 @@ def create_diann_peptides_table(
         'Stripped.Sequence': 'Sequence'
     }, inplace=True)
 
-    peptides['Sequence_AP_mod'] = peptides['Modified.Sequence'].apply(alphaviz.preprocessing.convert_diann_ap_mod)
-    peptides['Modified.Sequence'] = peptides['Modified.Sequence'].apply(alphaviz.preprocessing.convert_diann_mq_mod)
+    peptides['Sequence_AP_mod'] = peptides['Modified.Sequence'].apply(
+        alphaviz.preprocessing.convert_diann_ap_mod
+    )
+    peptides['Modified.Sequence'] = peptides['Modified.Sequence'].apply(
+        alphaviz.preprocessing.convert_diann_mq_mod
+    )
     peptides['m/z'] = 0.0
-    first_columns = ['Modified.Sequence', 'Length', 'm/z', 'RT', 'Predicted.RT', 'Charge', 'IM', 'Predicted.IM']
-    peptides = peptides[first_columns + sorted(list(set(peptides.columns).difference(first_columns)))]
+    first_columns = [
+        'Modified.Sequence', 'Length', 'm/z', 'RT',
+        'Predicted.RT', 'Charge', 'IM', 'Predicted.IM'
+    ]
+    peptides = peptides[
+        first_columns + sorted(list(
+            set(peptides.columns).difference(first_columns))
+        )
+    ]
     return peptides
 
 
@@ -543,3 +561,100 @@ def import_diann_output(
     diann_overview = import_diann_stats(os.path.join(path_diann_output_folder, diann_stats_file), experiment)
 
     return diann_proteins, diann_peptides, diann_overview, diann_output_file
+
+
+def create_ap_proteins_table(
+    ap_df: pd.DataFrame,
+    fasta: object
+):
+    """Build the protein-group table from AlphaPept peptide-level results.
+
+    The columns 'Protein names', 'Protein IDs', 'Gene names' and
+    'Sequence lengths' are written into `ap_df` itself, so the caller's
+    data frame is modified in place. The rows are then aggregated per
+    'index_protein_group'.
+
+    Parameters
+    ----------
+    ap_df : pd.DataFrame
+        Peptide-level data frame. It has to provide the columns
+        'protein_group', 'index_protein_group', 'protein_idx' (a
+        comma-separated string) and 'sequence'.
+    fasta : object
+        Passed unchanged to alphaviz.preprocessing.get_protein_info
+        together with the 'Protein IDs' value of each row.
+
+    Returns
+    -------
+    pd.DataFrame
+        One row per 'index_protein_group'. The leading columns are
+        'Protein IDs', 'Protein names', 'Gene names', '# proteins',
+        '(EXP) # peptides', '# MS/MS' and 'Sequence lengths'; all remaining
+        columns follow in sorted order.
+    """
+    def _fields_from_header(row):
+        return alphaviz.preprocessing.get_protein_info_from_fastaheader(
+            row['protein_group']
+        )
+
+    def _fields_from_fasta(row):
+        return alphaviz.preprocessing.get_protein_info(
+            fasta, row['Protein IDs']
+        )
+
+    ap_df[['Protein names', 'Protein IDs', 'Gene names']] = ap_df.apply(
+        _fields_from_header, axis=1, result_type='expand'
+    )
+    # This second assignment replaces the 'Protein names' values written
+    # just above with the ones returned by get_protein_info.
+    ap_df[['Protein names', 'Sequence lengths']] = ap_df.apply(
+        _fields_from_fasta, axis=1, result_type='expand'
+    )
+
+    protein_columns = [col for col in ap_df.columns if 'protein' in col]
+    columns = protein_columns + [
+        'sequence', 'Protein names', 'Protein IDs',
+        'Gene names', 'Sequence lengths'
+    ]
+
+    # Every column is reduced with 'max', except 'sequence', whose row
+    # count per group becomes the number of peptides.
+    aggregations = {col: 'max' for col in columns}
+    aggregations['sequence'] = 'count'
+    grouped = ap_df.groupby(
+        'index_protein_group',
+        as_index=False
+    )[columns].agg(aggregations)
+    grouped = grouped.rename(columns={'sequence': '(EXP) # peptides'})
+
+    grouped['# proteins'] = grouped['protein_idx'].apply(
+        lambda idx: len(idx.split(','))
+    )
+    grouped['# MS/MS'] = grouped['(EXP) # peptides']
+
+    first_columns = [
+        'Protein IDs', 'Protein names', 'Gene names', '# proteins',
+        '(EXP) # peptides', '# MS/MS', 'Sequence lengths'
+    ]
+    other_columns = sorted(
+        set(grouped.columns).difference(first_columns)
+    )
+    return grouped[first_columns + other_columns]
+
+
+def create_ap_peptides_table(
+    ap_df: pd.DataFrame
+):
+    """Build the peptide table from AlphaPept peptide-level results.
+
+    Columns whose name contains 'protein' or 'Protein', as well as the
+    'Sequence lengths' column, are left out. The remaining AlphaPept column
+    names are renamed and 'Modified.Sequence' is added as a duplicate of
+    'Sequence_AP_mod'. The input data frame is not modified.
+
+    Parameters
+    ----------
+    ap_df : pd.DataFrame
+        Peptide-level data frame. It has to provide the columns 'n_AA',
+        'charge', 'mz', 'mass', 'mobility', 'rt' and 'sequence', because
+        their renamed counterparts are selected by name below.
+
+    Returns
+    -------
+    pd.DataFrame
+        The leading columns are 'Modified.Sequence', 'Length', 'm/z', 'RT',
+        'Charge', 'Mass' and 'IM'; all remaining columns follow in sorted
+        order.
+    """
+    renamed_columns = {
+        'n_AA': 'Length',
+        'charge': 'Charge',
+        'sequence_naked': 'Sequence',
+        'parent': 'MS/MS scan number',
+        'sequence': 'Sequence_AP_mod',
+        'mz': 'm/z',
+        'mass': 'Mass',
+        'mobility': 'IM',
+        'rt': 'RT',
+    }
+    kept_columns = [
+        col for col in ap_df.columns
+        if not ('protein' in col or 'Protein' in col
+                or col == 'Sequence lengths')
+    ]
+    # The non-inplace rename returns a new data frame, so adding a column
+    # below leaves `ap_df` untouched.
+    peptides = ap_df[kept_columns].rename(columns=renamed_columns)
+    peptides['Modified.Sequence'] = peptides['Sequence_AP_mod']
+
+    first_columns = [
+        'Modified.Sequence', 'Length', 'm/z',
+        'RT', 'Charge', 'Mass', 'IM'
+    ]
+    other_columns = sorted(
+        set(peptides.columns).difference(first_columns)
+    )
+    return peptides[first_columns + other_columns]
+
+
+def import_alphapept_output(
+    path_ap_output_folder: str,
+    experiment: str,
+    fasta: object
+):
+    """Load the AlphaPept peptide results of a single experiment.
+
+    Reads 'results_peptides.csv' from the output folder, keeps the rows
+    whose 'shortname' equals `experiment`, removes the 'filename',
+    'shortname' and 'sample_group' columns and derives the protein and
+    peptide tables from the remaining data.
+
+    Parameters
+    ----------
+    path_ap_output_folder : str
+        Folder that contains the file 'results_peptides.csv'.
+    experiment : str
+        Value of the 'shortname' column to keep.
+    fasta : object
+        Forwarded to create_ap_proteins_table.
+
+    Returns
+    -------
+    tuple
+        The protein table and the peptide table, in this order.
+    """
+    results_path = os.path.join(path_ap_output_folder, 'results_peptides.csv')
+    all_results = pd.read_csv(results_path, low_memory=False)
+
+    ap_df = all_results[all_results.shortname == experiment].drop(
+        columns=['filename', 'shortname', 'sample_group']
+    )
+    # The protein table is built first: it writes additional columns into
+    # `ap_df`, and the peptide table is created from that extended frame.
+    ap_proteins = create_ap_proteins_table(ap_df, fasta)
+    ap_peptides = create_ap_peptides_table(ap_df)
+
+    return ap_proteins, ap_peptides
diff --git a/alphaviz/plotting.py b/alphaviz/plotting.py
@@ -1115,6 +1115,7 @@ def plot_elution_profile_heatmap(
     timstof_data,
     peptide_info: dict,
     mass_dict: dict,
+    calculate_fragment_masses: bool = True,
     mz_tol: int = 50,
     rt_tol: int = 30,
     im_tol: int = 0.05,
@@ -1158,14 +1159,15 @@ def plot_elution_profile_heatmap(
         The elution profile heatmap plots in retention time and ion mobility dimensions
         for the specified peptide and all his fragments.
     """
-    # predict the theoretical fragments using the Alphapept get_fragmass() function.
-    frag_masses, frag_type = alphaviz.utils.get_fragmass(
-        parsed_pep=alphaviz.utils.parse(peptide_info['sequence']),
-        mass_dict=mass_dict
-    )
-    peptide_info['fragments'] = {
-        (f"b{key}" if key > 0 else f"y{-key}"): value for key, value in zip(frag_type, frag_masses)
-    }
+    if calculate_fragment_masses:
+        # predict the theoretical fragments using the Alphapept get_fragmass() function.
+        frag_masses, frag_type = alphaviz.utils.get_fragmass(
+            parsed_pep=alphaviz.utils.parse(peptide_info['sequence']),
+            mass_dict=mass_dict
+        )
+        peptide_info['fragments'] = {
+            (f"b{key}" if key > 0 else f"y{-key}"): value for key, value in zip(frag_type, frag_masses)
+        }
 
     # slice the data using the rt_tol, im_tol and mz_tol values
     rt_slice = slice(peptide_info['rt'] - rt_tol, peptide_info['rt'] + rt_tol)

diff --git a/alphaviz/preprocessing.py b/alphaviz/preprocessing.py
@@ -194,7 +194,6 @@ def get_mq_ms2_scan_data(
             data.loc[ion_index, 'ions'] = row.ions
             msms_filtered_df.loc[msms_filtered_df.ions == row.ions, 'mass_dev_ppm'] = mass_dev_ppm_calc
 
-    data.drop_duplicates('mz_values', inplace=True)
     data.sort_values(['ions', 'intensity_values'], ascending=True, inplace=True)
     data_merged = pd.merge(data, msms_filtered_df, on='ions', how='left')
 
@@ -383,7 +382,7 @@ def get_protein_info(
     """
     protein_names = []
     protein_seq_lens = []
-    for protein_id in protein_ids.split():
+    for protein_id in protein_ids.replace(';', ' ').split():
         try:
             protein_names.append(fasta.get_by_id(protein_id).description['name'])
         except KeyError:

diff --git a/alphaviz/style/tables_formatting.json b/alphaviz/style/tables_formatting.json
@@ -1 +1 @@
-{"maxquant": {"peptides_table": {"widths": {"Sequence": 220, "Proteins": 200, "MS/MS scan number": 100}}, "proteins_table": {"formatters": {"Protein names": {"type": "textarea"}}, "widths": {"Protein IDs": 230, "Protein names": 350, "Sequence lengths": 150}}}, "diann": {"peptides_table": {}, "proteins_table": {}}}
+{"maxquant": {"peptides_table": {"widths": {"Sequence": 220, "Proteins": 200, "MS/MS scan number": 100}}, "proteins_table": {"formatters": {"Protein names": {"type": "textarea"}}, "widths": {"Protein IDs": 230, "Protein names": 350, "Sequence lengths": 150}}}, "diann": {"peptides_table": {}, "proteins_table": {"widths": {"Protein IDs": 100}}}}
diff --git a/docs/Makefile b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/alphaviz.gui.rst b/docs/alphaviz.gui.rst
@@ -0,0 +1,7 @@
+alphaviz.gui
+-----------------------
+
+.. automodule:: alphaviz.gui
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/alphaviz.io.rst b/docs/alphaviz.io.rst
@@ -0,0 +1,7 @@
+alphaviz.io
+----------------------
+
+.. automodule:: alphaviz.io
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/alphaviz.plotting.rst b/docs/alphaviz.plotting.rst
@@ -0,0 +1,7 @@
+alphaviz.plotting
+----------------------
+
+.. automodule:: alphaviz.plotting
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/alphaviz.preprocessing.rst b/docs/alphaviz.preprocessing.rst
@@ -0,0 +1,7 @@
+alphaviz.preprocessing
+----------------------
+
+.. automodule:: alphaviz.preprocessing
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/alphaviz.utils.rst b/docs/alphaviz.utils.rst
@@ -0,0 +1,7 @@
+alphaviz.utils
+----------------------
+
+.. automodule:: alphaviz.utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
diff --git a/docs/conda_development_environment.yaml b/docs/conda_development_environment.yaml
@@ -0,0 +1,8 @@
+name: alphaviz
+channels:
+  - defaults
+dependencies:
+  - pip=21.0.1
+  - python=3.8
+  - pip:
+    - -e ../.[development-stable]