Merge pull request #12 from MannLabs/development

Development
MannLabs · Jul 18, 2023 · c425f3e · c425f3e
2 parents b53065b + 4a1f2eb
commit c425f3e
Show file tree

Hide file tree

Showing 18 changed files with 183 additions and 92 deletions.
diff --git a/README.md b/README.md
@@ -195,9 +195,9 @@ In case of issues, check out the following:
 ---
 ## Citations
 
-In the case that directLFQ is useful to you, please consider supporting us by citing the [preprint](https://doi.org/10.1101/2023.02.17.528962)
+In the case that directLFQ is useful to you, please consider supporting us by citing the [paper](https://doi.org/10.1016/j.mcpro.2023.100581)
 
-Ammar, C., Schessner, J.P., Willems, S., Michaelis, A.C., and Mann, M. (2023). Accurate label-free quantification by directLFQ to compare unlimited numbers of proteomes. bioRxiv, 2023.02.17.528962. 10.1101/2023.02.17.528962.
+Ammar, C., Schessner, J.P., Willems, S., Michaelis, A.C., and Mann, M. (2023). Accurate label-free quantification by directLFQ to compare unlimited numbers of proteomes. Molecular & Cellular Proteomics, 100581.
 
 
 
@@ -229,6 +229,7 @@ directLFQ is started internally via the directlfq.lfq_manager.run_lfq() command.
 -  **num_cores**: The number of cores to use (default is to use multiprocessing).
 -  **filename_suffix**: Suffix to append to the output files.
 -  **deactivate_normalization**: Set to true, if no between-sample normalization should be performed before processing.
+-  **filter_dict**: To apply custom filters in addition to the standard filters, provide the path to a YAML file in which the filters are defined (see the example [here](release/examples/filterdict.yaml)). In the Python API, you can also pass the filter dictionary directly instead of the .yaml file.
 
 ---
 

diff --git a/directlfq/__init__.py b/directlfq/__init__.py
@@ -2,7 +2,7 @@
 
 
 __project__ = "directlfq"
-__version__ = "0.2.9"
+__version__ = "0.2.10"
 __license__ = "Apache"
 __description__ = "An open-source Python package of the AlphaPept ecosystem"
 __author__ = "Mann Labs"

diff --git a/directlfq/cli.py b/directlfq/cli.py
@@ -195,7 +195,10 @@ def gui():
 @click.option("--maximum_number_of_quadratic_ions_to_use_per_protein", "-mn", type= int, default = 10,  help="How many ions are used to create the anchor intensity trace (see paper). Increasing might marginally increase performance at the cost of runtime.")
 @click.option("--number_of_quadratic_samples", "-nq", type = int, default = 50, help="How many samples are used to create the anchor intensity trace (see paper). Increasing might marginally increase performance at the cost of runtime.")
 @click.option("--filename_suffix", "-fs", type=str, default="", help="A suffix to add to the output file name.")
+@click.option("--num_cores",  "-nc", type = int, default = None, help="The number of cores to use (default is to use multiprocessing).")
 @click.option("--deactivate_normalization",  "-dn", type = bool, default = False, help="If you want to deactivate the normalization step, you can set this flag to True.")
+# NOTE: --filter_dict takes the path to a yaml file, so it is a string option defaulting to None ("no extra filters"). It needs its own short flag: "-dn" already belongs to --deactivate_normalization.
+@click.option("--filter_dict",  "-fd", type = str, default = None, help="In case you want to define specific filters in addition to the standard filters, you can add a yaml file where the filters are defined (see GitHub docu for example).")
+
 def run_directlfq(**kwargs):
     print("starting directLFQ")
     import directlfq.lfq_manager

diff --git a/directlfq/configs/intable_config.yaml b/directlfq/configs/intable_config.yaml
@@ -347,6 +347,7 @@ diann_precursors_plexDIA:
   use_iontree: True
   ml_level: CHARGE
 
+
 diann_precursors:
   format: longtable
   sample_ID: Run
@@ -355,6 +356,12 @@ diann_precursors:
    - Protein.Group
   ion_cols:
    - Precursor.Id
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
+
   # filters:
   #   shape_quality:
   #     param: Quantity.Quality
@@ -397,6 +404,11 @@ diann_fragion_isotopes_raw:
          - Precursor.Charge
         MS1ISOTOPES:
          - Precursor.Charge
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
   use_iontree: True
   ml_level: CHARGE
 
@@ -435,6 +447,11 @@ diann_fragion_isotopes_raw_charite_dataset:
          - Precursor.Charge
         MS1ISOTOPES:
          - Precursor.Charge
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
   use_iontree: True
   ml_level: CHARGE
 
@@ -473,6 +490,11 @@ diann_fragion_isotopes:
          - Precursor.Charge
         MS1ISOTOPES:
          - Precursor.Charge
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
   use_iontree: True
   ml_level: CHARGE
 
@@ -510,6 +532,11 @@ diann_fragion_isotopes_topn:
          - Precursor.Charge
         MS1ISOTOPES:
          - Precursor.Charge
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
   use_iontree: True
   ml_level: CHARGE
 
@@ -542,6 +569,11 @@ diann_precursor_ms1_and_ms2:
          - Precursor.Charge
         MS1ISOTOPES:
          - Precursor.Charge
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
   use_iontree: True
   ml_level: CHARGE
 
@@ -568,6 +600,11 @@ diann_fragion:
          - Precursor.Charge
         FRGION: 
          - Fragment.Quant.Corrected
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
   use_iontree: True
   ml_level: CHARGE
 
@@ -581,6 +618,11 @@ diann_precursors_ms1:
    - Protein.Group
   ion_cols:
    - Precursor.Id
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
 
 diann_precursor_filename_sampleid:
   format: longtable
@@ -590,6 +632,11 @@ diann_precursor_filename_sampleid:
    - Genes
   ion_cols:
    - Precursor.Id
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
   # filters:
   #   shape_quality:
   #     param: Quantity.Quality
@@ -606,10 +653,10 @@ diann_sequence:
   ion_cols:
    - Stripped.Sequence
   filters:
-    shape_quality:
-      param: Quantity.Quality
-      comparator: ">"
-      value: 0.3
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
 
 
 diann_protein:
@@ -658,6 +705,11 @@ diann_peptide_based_on_precursor_ms1_and_ms2:
          - Precursor.Charge
         MS1ISOTOPES:
          - Precursor.Charge
+  filters:
+    protein_qval:
+      param: Lib.PG.Q.Value
+      comparator: "<="
+      value: 0.01
   use_iontree: True
   ml_level: CHARGE
 

diff --git a/directlfq/dashboard_parts.py b/directlfq/dashboard_parts.py
@@ -176,6 +176,16 @@ def __init__(self):
             margin=(15, 15, 0, 15)
         )
 
+        self.yaml_filt_dict_title = pn.pane.Markdown('* In case you want to define specific filters in addition to the standard filters, you can add a yaml file where the filters are defined (see GitHub docs).')
+        self.yaml_filt_dict_path = pn.widgets.TextInput(
+            name='',
+            default = None,
+            placeholder='(optional) Enter the whole path to the yaml file with the filters',
+            width=900,
+            sizing_mode='stretch_width',
+            margin=(15, 15, 0, 15)
+        )
+
         self.dropdown_menu_for_input_type_title = pn.pane.Markdown('* Specify the type of the input table you want to use from the dropdown menu. Applies only if you want to use non-default settings, for example if you want to use summarized precursor intensities instead of fragment ion intensities for DIA data:')
         self.dropdown_menu_for_input_type = pn.widgets.Select(name = "",
         options = {'detect automatically' : None, 'Alphapept peptides.csv' : 'alphapept_peptides', 'MaxQuant evidence.txt' : "maxquant_evidence", 'MaxQuant peptides.txt' : 'maxquant_peptides',
@@ -185,6 +195,9 @@ def __init__(self):
         self.num_nonan_vals_title = pn.pane.Markdown('* Specify the minimum number of non-nan ion intensities required to derive a protein intensity. The higher this number, the more reliable the protein quantification at the cost of more missing values:')
         self.num_nonan_vals = pn.widgets.IntInput(name='', value=1, step=1, start=0, end=1000)
 
+        self.num_cores_title = pn.pane.Markdown('* Specify the number of cores to use (default of 0 means multiprocessing):')
+        self.num_cores_vals = pn.widgets.IntInput(name='', value=0, step=1, start=0, end=1000)
+
 
 
         # RUN PIPELINE
@@ -231,6 +244,10 @@ def create(self):
                             self.protein_subset_for_normalization_file,
                             self.num_nonan_vals_title,
                             self.num_nonan_vals,
+                            self.num_cores_title,
+                            self.num_cores_vals,
+                            self.yaml_filt_dict_title,
+                            self.yaml_filt_dict_path,
                             ), ), 
                             header='optional configurations',
                             collapsed=True,
@@ -308,10 +325,12 @@ def run_pipeline(self, *args):
         additional_headers = [] if self.additional_headers.value == '' else self.additional_headers.value
         min_nonan = self.num_nonan_vals.value
         file_of_proteins_for_normalization = None if self.protein_subset_for_normalization_file.value == '' else self.protein_subset_for_normalization_file.value
-
+        # The num_cores widget defaults to 0 (its minimum), which the UI documents as "use multiprocessing".
+        # run_lfq expresses that choice as num_cores=None, so map 0, an emptied field, and the legacy -1 sentinel to None.
+        num_cores = self.num_cores_vals.value
+        if not num_cores or num_cores < 0:
+            num_cores = None
+        yaml_filt_dict_path = None if self.yaml_filt_dict_path.value == '' else self.yaml_filt_dict_path.value
 
         lfq_manager.run_lfq(input_file = input_file, input_type_to_use = input_type_to_use, maximum_number_of_quadratic_ions_to_use_per_protein = 10,
-         number_of_quadratic_samples = 50, mq_protein_groups_txt= mq_protein_groups_txt, columns_to_add= additional_headers, selected_proteins_file= file_of_proteins_for_normalization, min_nonan = min_nonan)
+         number_of_quadratic_samples = 50, mq_protein_groups_txt= mq_protein_groups_txt, columns_to_add= additional_headers, selected_proteins_file= file_of_proteins_for_normalization, 
+         min_nonan = min_nonan, num_cores=num_cores, filter_dict=yaml_filt_dict_path)
 
         self.trigger_dependancy()
         self.run_pipeline_progress.active = False

diff --git a/directlfq/lfq_manager.py b/directlfq/lfq_manager.py
@@ -9,6 +9,7 @@
 import directlfq.utils as lfqutils
 import pandas as pd
 import directlfq
+import os
 
 import warnings
 
@@ -17,7 +18,7 @@
 
 
 def run_lfq(input_file,  columns_to_add = [], selected_proteins_file :str = None, mq_protein_groups_txt = None, min_nonan = 1, input_type_to_use = None, maximum_number_of_quadratic_ions_to_use_per_protein = 10, 
-number_of_quadratic_samples = 50, num_cores = None, filename_suffix = "", deactivate_normalization = False
+number_of_quadratic_samples = 50, num_cores = None, filename_suffix = "", deactivate_normalization = False, filter_dict = None
 ):
     """Run the directLFQ pipeline on a given input file. The input file is expected to contain ion intensities. The output is a table containing protein intensities.
 
@@ -34,9 +35,9 @@ def run_lfq(input_file,  columns_to_add = [], selected_proteins_file :str = None
     """
     print("Starting directLFQ analysis.")
     input_file = prepare_input_filename(input_file)
-    print("reformatting input file, for large files this might take a while.")
+    filter_dict = load_filter_dict_if_given_as_yaml(filter_dict)
     input_file = lfqutils.add_mq_protein_group_ids_if_applicable_and_obtain_annotated_file(input_file, input_type_to_use,mq_protein_groups_txt, columns_to_add)
-    input_df = lfqutils.import_data(input_file=input_file, input_type_to_use=input_type_to_use)
+    input_df = lfqutils.import_data(input_file=input_file, input_type_to_use=input_type_to_use, filter_dict=filter_dict)
     input_df = lfqutils.index_and_log_transform_input_df(input_df)
     input_df = lfqutils.remove_allnan_rows_input_df(input_df)
 
@@ -59,6 +60,15 @@ def run_lfq(input_file,  columns_to_add = [], selected_proteins_file :str = None
 
     print("Analysis finished!")
 
+def load_filter_dict_if_given_as_yaml(filter_dict):
+    """Resolve the ``filter_dict`` argument of ``run_lfq``.
+
+    Args:
+        filter_dict: ``None`` (or any other falsy "not set" value such as
+            ``False`` or ``""``), an already constructed filter dictionary,
+            or the path (``str`` / ``os.PathLike``) to a ``.yaml`` file that
+            defines the filters.
+
+    Returns:
+        ``None`` if no filters were given, the object itself if it is not a
+        path, or whatever ``lfqutils.load_config`` returns for the yaml file.
+
+    Raises:
+        FileNotFoundError: if a path was given but no such file exists.
+        ValueError: if the given file does not have a ``.yaml`` extension.
+    """
+    # Normalise every "nothing given" value to None, because downstream code
+    # only checks `filter_dict is not None` before merging the filters.
+    if not filter_dict:
+        return None
+
+    # Anything that is not path-like is handed through unchanged.
+    if not isinstance(filter_dict, (str, os.PathLike)):
+        return filter_dict
+
+    filter_path = str(filter_dict)
+    # Fail early with a clear message instead of passing a plain string on,
+    # which would only break later when it is unpacked like a dictionary.
+    if not os.path.isfile(filter_path):
+        raise FileNotFoundError(f"filter_dict file not found: {filter_path}")
+    if not filter_path.endswith(".yaml"):
+        raise ValueError(f"filter_dict must be a dictionary or a path to a .yaml file, got: {filter_path}")
+    return lfqutils.load_config(filter_path)
+
 def prepare_input_filename(input_file):
     """Return ``input_file`` as a string with shell-escaped spaces un-escaped and trailing whitespace removed."""
     # "\\ " is a literal backslash followed by a space, as left behind when a path is pasted from a shell.
     # The backslash is written explicitly because a bare "\ " is an invalid escape sequence.
     return f"{input_file}".replace("\\ ", " ").rstrip()

diff --git a/directlfq/utils.py b/directlfq/utils.py
@@ -769,7 +769,7 @@ def check_for_processed_runs_in_results_folder(results_folder):
 import os
 import pathlib
 
-def import_data(input_file, input_type_to_use = None, samples_subset = None, results_dir = None):
+def import_data(input_file, input_type_to_use = None, samples_subset = None, filter_dict = None):
     """
     Function to import peptide level data. Depending on available columns in the provided file,
     the function identifies the type of input used (e.g. Spectronaut, MaxQuant, DIA-NN), reformats if necessary
@@ -782,16 +782,19 @@ def import_data(input_file, input_type_to_use = None, samples_subset = None, res
     if ("aq_reformat" in input_file) | (input_type_to_use == "directlfq"):
         file_to_read = input_file
     else:
-        file_to_read = reformat_and_save_input_file(input_file=input_file, input_type_to_use=input_type_to_use)
+        file_to_read = reformat_and_save_input_file(input_file=input_file, input_type_to_use=input_type_to_use, filter_dict=filter_dict)
 
     input_reshaped = pd.read_csv(file_to_read, sep = "\t", encoding = 'latin1', usecols=samples_subset)
     input_reshaped = input_reshaped.drop_duplicates(subset='ion')
     return input_reshaped
 
 
-def reformat_and_save_input_file(input_file, input_type_to_use = None):
+def reformat_and_save_input_file(input_file, input_type_to_use = None, filter_dict = None):
 
     input_type, config_dict_for_type, sep = get_input_type_and_config_dict(input_file, input_type_to_use)
+
+    if filter_dict is not None:
+        config_dict_for_type['filters']=  dict(config_dict_for_type.get('filters', {}),**filter_dict)
     print(f"using input type {input_type}")
     format = config_dict_for_type.get('format')
     outfile_name = f"{input_file}.{input_type}.aq_reformat.tsv"

diff --git a/misc/bumpversion.cfg b/misc/bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.9
+current_version = 0.2.10
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?

diff --git a/release/examples/filterdict.yaml b/release/examples/filterdict.yaml
@@ -0,0 +1,8 @@
+protein_qval:
+  param: Lib.PG.Q.Value
+  comparator: "<="
+  value: 0.01
+peptide_qval:
+  param: Lib.Q.Value
+  comparator: "<="
+  value: 0.01
diff --git a/release/one_click_linux_gui/control b/release/one_click_linux_gui/control
@@ -1,5 +1,5 @@
 Package: directlfq
-Version: 0.2.9
+Version: 0.2.10
 Architecture: all
 Maintainer: Mann Labs <opensource@alphapept.com>
 Description: directlfq

diff --git a/release/one_click_linux_gui/create_installer_linux.sh b/release/one_click_linux_gui/create_installer_linux.sh
@@ -17,7 +17,7 @@ python setup.py sdist bdist_wheel
 # Setting up the local package
 cd release/one_click_linux_gui
 # Make sure you include the required extra packages and always use the stable or very-stable options!
-pip install "../../dist/directlfq-0.2.9-py3-none-any.whl[stable, gui]"
+pip install "../../dist/directlfq-0.2.10-py3-none-any.whl[stable, gui]"
 
 # Creating the stand-alone pyinstaller folder
 pip install pyinstaller==4.10

diff --git a/release/one_click_macos_gui/Info.plist b/release/one_click_macos_gui/Info.plist
@@ -9,9 +9,9 @@
 	<key>CFBundleIconFile</key>
 	<string>alpha_logo.icns</string>
 	<key>CFBundleIdentifier</key>
-	<string>directlfq.0.2.9</string>
+	<string>directlfq.0.2.10</string>
 	<key>CFBundleShortVersionString</key>
-	<string>0.2.9</string>
+	<string>0.2.10</string>
 	<key>CFBundleInfoDictionaryVersion</key>
 	<string>6.0</string>
 	<key>CFBundleName</key>

diff --git a/release/one_click_macos_gui/create_installer_macos.sh b/release/one_click_macos_gui/create_installer_macos.sh
@@ -20,7 +20,7 @@ python setup.py sdist bdist_wheel
 
 # Setting up the local package
 cd release/one_click_macos_gui
-pip install "../../dist/directlfq-0.2.9-py3-none-any.whl[stable, gui]"
+pip install "../../dist/directlfq-0.2.10-py3-none-any.whl[stable, gui]"
 
 # Creating the stand-alone pyinstaller folder
 pip install pyinstaller==4.10
@@ -40,5 +40,5 @@ cp ../../LICENSE Resources/LICENSE
 cp ../logos/alpha_logo.png Resources/alpha_logo.png
 chmod 777 scripts/*
 
-pkgbuild --root dist/directlfq --identifier de.mpg.biochem.directlfq.app --version 0.2.9 --install-location /Applications/directlfq.app --scripts scripts directlfq.pkg
+pkgbuild --root dist/directlfq --identifier de.mpg.biochem.directlfq.app --version 0.2.10 --install-location /Applications/directlfq.app --scripts scripts directlfq.pkg
 productbuild --distribution distribution.xml --resources Resources --package-path directlfq.pkg dist/directlfq_gui_installer_macos.pkg
diff --git a/release/one_click_macos_gui/distribution.xml b/release/one_click_macos_gui/distribution.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8" standalone="no"?>
 <installer-script minSpecVersion="1.000000">
-    <title>directlfq 0.2.9</title>
+    <title>directlfq 0.2.10</title>
     <background mime-type="image/png" file="alpha_logo.png" scaling="proportional"/>
     <welcome file="welcome.html" mime-type="text/html" />
     <conclusion file="conclusion.html" mime-type="text/html" />

diff --git a/release/one_click_windows_gui/create_installer_windows.sh b/release/one_click_windows_gui/create_installer_windows.sh
@@ -17,7 +17,7 @@ python setup.py sdist bdist_wheel
 # Setting up the local package
 cd release/one_click_windows_gui
 # Make sure you include the required extra packages and always use the stable or very-stable options!
-pip install "../../dist/directlfq-0.2.9-py3-none-any.whl[stable, gui]"
+pip install "../../dist/directlfq-0.2.10-py3-none-any.whl[stable, gui]"
 
 # Creating the stand-alone pyinstaller folder
 pip install pyinstaller==4.10

diff --git a/release/one_click_windows_gui/directlfq_innoinstaller.iss b/release/one_click_windows_gui/directlfq_innoinstaller.iss
@@ -2,7 +2,7 @@
 ; SEE THE DOCUMENTATION FOR DETAILS ON CREATING INNO SETUP SCRIPT FILES!
 
 #define MyAppName "directlfq"
-#define MyAppVersion "0.2.9"
+#define MyAppVersion "0.2.10"
 #define MyAppPublisher "Max Planck Institute of Biochemistry and the University of Copenhagen, Mann Labs"
 #define MyAppURL "https://github.com/MannLabs/directlfq"
 #define MyAppExeName "directlfq_gui.exe"