Optimize depmap driver analysis (#37)

# What?

- Add `process.py`, which is used for DepMap driver analysis.
- Add `proxbias/notebooks/DepMap_PB_Driver_Analysis.ipynb` and `proxbias/notebooks/DepMap_PB_Sample.ipynb` to include the analysis performed and a sample of how to use the driver code.
- Add optimized versions of the Monte Carlo simulation code that use numba.
- Add a TSV for the cancer gene list.
- Reorganize the repo to make things clearer.
- Make downloading DepMap RNAi data optional to speed up usage.
- Clean up `.gitignore` for public release.
- Update dependencies.

# Why?

- All code is necessary for the public release or related to reviewer comments.
recursionpharma · Oct 26, 2023 · ce2cddd · ce2cddd
1 parent e6f40d8
commit ce2cddd
Show file tree

Hide file tree

Showing 26 changed files with 2,856 additions and 174 deletions.
diff --git a/.gitignore b/.gitignore
@@ -84,22 +84,9 @@ ipython_config.py
 # pyenv
 .python-version
 
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
 # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 __pypackages__/
 
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
 
 # Environments
 .env
@@ -125,9 +112,6 @@ venv.bak/
 .dmypy.json
 dmypy.json
 
-# Pyre type checker
-.pyre/
-
 # VS Code
 settings.json
 
@@ -137,3 +121,12 @@ settings.json
 # Scratch work
 scratch/
 proxbias/notebooks/plots/
+
+
+# Determined.ai
+.detignore
+notebook_config.yaml
+startup-hook.sh
+
+# Depmap
+depmap/*
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ pip install proxbias
 
 Notebooks to reproduce plots from public datasets are included in the `notebooks` directory. Note that this doesn't include rxrx3 for IP reasons at this time. 
 
-- `cpg0016_loading.ipynb` - Load the JUMP CP data, apply PCA and proximity bias correction and save data.
+- `cpg0016_loading.ipynb` - Load the JUMP CP data, apply PCA and proximity bias correction and save data locally.
 - `cpg0016_plots.ipynb` - Create whole-genome plots from cpg0016, calculate Brunner-Munzel statistics and make bar plots.
 - `shinyDepMap_benchmark.ipynb` - Load DepMap 19Q3 data and create plots showing an enrichment for within-chromosome arm relationships.
 - `DepMap_PB_Drivers.ipynb` - Use DepMap 22Q4 data to look at differential proximity bias when TP53 and other genes are wild-type vs loss/gain of function.

diff --git a/VERSION b/VERSION
@@ -1,2 +1,2 @@
 MAJOR="0"
-MINOR="10"
+MINOR="11"
diff --git a/proxbias/data/cancerGeneList.tsv b/proxbias/data/cancerGeneList.tsv
diff --git a/proxbias/depmap/__init__.py b/proxbias/depmap/__init__.py
diff --git a/proxbias/depmap/constants.py b/proxbias/depmap/constants.py
@@ -0,0 +1,38 @@
+DEPMAP_API_URL = "https://depmap.org/portal/api/download/files"  # DepMap portal download-files API endpoint
+DEPMAP_RELEASE_DEFAULT = "DepMap Public 22Q4"  # default DepMap release name
+
+CNV_FILENAME = "OmicsCNGene.csv"  # copy number variation data
+CRISPR_DEPENDENCY_EFFECT_FILENAME = "CRISPRGeneEffect.csv"  # CRISPR dependency effect data
+MUTATION_FILENAME = "OmicsSomaticMutations.csv"  # somatic mutation data
+
+DEMETER2_RELEASE_DEFAULT = "DEMETER2 Data v6"  # default RNAi release name
+RNAI_DEPENDENCY_EFFECT_FILENAME = "D2_combined_gene_dep_scores.csv"  # RNAi dependency effect data
+
+
+CN_LOSS_CUTOFF = 1.5  # NOTE(review): presumably copy number below this is called a loss -- confirm against the caller
+CN_GAIN_CUTOFF = 2.5  # NOTE(review): presumably copy number above this is called a gain -- confirm against the caller
+
+
+LOF_MUTATION_TYPES = [  # presumably "loss of function" mutation types -- confirm against the caller
+    "MISSENSE",
+    "NONSENSE",
+    "IN_FRAME_DEL",
+    "SPLICE_SITE",
+    "FRAME_SHIFT_INS",
+    "FRAME_SHIFT_DEL",
+    "IN_FRAME_INS",
+]
+
+COMPLETE_LOF_MUTATION_TYPES = [  # subset of LOF_MUTATION_TYPES: nonsense and frameshift entries only
+    "NONSENSE",
+    "FRAME_SHIFT_INS",
+    "FRAME_SHIFT_DEL",
+]
+
+# gain of function mutation types (equals LOF_MUTATION_TYPES minus COMPLETE_LOF_MUTATION_TYPES)
+AMP_MUTATION_TYPES = [
+    "MISSENSE",
+    "IN_FRAME_DEL",
+    "SPLICE_SITE",
+    "IN_FRAME_INS",
+]
diff --git a/proxbias/utils/depmap.py → proxbias/depmap/load.py b/proxbias/utils/depmap.py → proxbias/depmap/load.py
@@ -1,16 +1,18 @@
 import os
-from typing import Tuple
 from functools import partial
-import pandas as pd
+from typing import Tuple
 
+import pandas as pd
 
-DEPMAP_API_URL = "https://depmap.org/portal/api/download/files"
-DEPMAP_RELEASE_DEFAULT = "DepMap Public 22Q4"
-DEMETER2_RELEASE_DEFAULT = "DEMETER2 Data v6"
-CNV_FILENAME = "OmicsCNGene.csv"
-CRISPR_DEPENDENCY_EFFECT_FILENAME = "CRISPRGeneEffect.csv"
-RNAI_DEPENDENCY_EFFECT_FILENAME = "D2_combined_gene_dep_scores.csv"
-MUTATION_FILENAME = "OmicsSomaticMutations.csv"
+from proxbias.depmap.constants import (
+    CNV_FILENAME,
+    CRISPR_DEPENDENCY_EFFECT_FILENAME,
+    DEMETER2_RELEASE_DEFAULT,
+    DEPMAP_API_URL,
+    DEPMAP_RELEASE_DEFAULT,
+    MUTATION_FILENAME,
+    RNAI_DEPENDENCY_EFFECT_FILENAME,
+)
 
 
 def _download_file(_release_files: pd.DataFrame, _filename: str, **read_kwargs):
@@ -38,7 +40,8 @@ def get_depmap_data(
     depmap_release : str, optional
         a depmap release string, by default DEPMAP_RELEASE_DEFAULT
     rnai_release : str, optional
-        a rnai release string, by default DEMETER2_RELEASE_DEFAULT
+        an RNAi release string, by default DEMETER2_RELEASE_DEFAULT. If an empty string,
+        skip loading the RNAi data entirely and return an empty dataframe in its place
     cache : bool, optional
         whether cache the data as csv or not, by default True
     cache_base_dir : str, optional
@@ -67,7 +70,7 @@ def _read_and_cache(release_name, release_files, file_prefix, filename, **read_k
             target_data = pd.read_csv(target_file, **read_kwargs)
             print("Done!")
         else:
-            print(f"Cache is not found. Downloading {filename} from {release_name}...")
+            print(f"Cached file {filename} is not found. Downloading from {release_name}...")
             target_data = _download_file(release_files, filename, **read_kwargs)
             print("Done!")
             if cache:
@@ -101,16 +104,19 @@ def _read_and_cache(release_name, release_files, file_prefix, filename, **read_k
     crispr_effect_data.index = [g.split(" ")[0] for g in crispr_effect_data.index]
 
     # RNAi Dependency Effect
-    rnai_effect_data = _read_cache_rnai(RNAI_DEPENDENCY_EFFECT_FILENAME, index_col=0)
-    rnai_effect_data.columns.name = "ModelID"
-    rnai_effect_data.index = [g.split(" ")[0] for g in rnai_effect_data.index]
-    # remove multi-mapping oligos
-    rnai_effect_data = rnai_effect_data.query("~index.str.contains('&')")
-    # some genes are not available for a majority of cell models.
-    # most are deprecated or low confidence genes. remove them.
-    n_missed_models = rnai_effect_data.isna().sum(axis=1)
-    rnai_models = n_missed_models[n_missed_models < 200].index
-    rnai_effect_data = rnai_effect_data.loc[rnai_models].dropna(axis=1)
+    if rnai_release:
+        rnai_effect_data = _read_cache_rnai(RNAI_DEPENDENCY_EFFECT_FILENAME, index_col=0)
+        rnai_effect_data.columns.name = "ModelID"
+        rnai_effect_data.index = [g.split(" ")[0] for g in rnai_effect_data.index]
+        # remove multi-mapping oligos
+        rnai_effect_data = rnai_effect_data.query("~index.str.contains('&')")
+        # some genes are not available for a majority of cell models.
+        # most are deprecated or low confidence genes. remove them.
+        n_missed_models = rnai_effect_data.isna().sum(axis=1)
+        rnai_models = n_missed_models[n_missed_models < 200].index
+        rnai_effect_data = rnai_effect_data.loc[rnai_models].dropna(axis=1)
+    else:
+        rnai_effect_data = pd.DataFrame()
 
     # Copy Number Variation
     cnv_data = _read_cache_depmap(CNV_FILENAME, index_col=0)