Skip to content

Commit

Permalink
Optimize depmap driver analysis (#37)
Browse files Browse the repository at this point in the history
# What?
- add `process.py` which is used for depmap driver analysis
- add `proxbias/notebooks/DepMap_PB_Driver_Analysis.ipynb` and `proxbias/notebooks/DepMap_PB_Sample.ipynb` to include the analysis performed and a sample of how to use the driver code
- add optimized versions of the Monte Carlo simulation code that utilize Numba
- add tsv for cancer gene list
- reorganize repo to make things more clear
- make downloading DepMap RNAi data optional to speed up usage
-  clean up .gitignore for public release
- update dependencies

# Why?

-   All of this code is either necessary for the public release or addresses reviewer comments.
  • Loading branch information
johnurbanik authored Oct 26, 2023
1 parent e6f40d8 commit ce2cddd
Show file tree
Hide file tree
Showing 26 changed files with 2,856 additions and 174 deletions.
25 changes: 9 additions & 16 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -84,22 +84,9 @@ ipython_config.py
# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
Expand All @@ -125,9 +112,6 @@ venv.bak/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# VS Code
settings.json

Expand All @@ -137,3 +121,12 @@ settings.json
# Scratch work
scratch/
proxbias/notebooks/plots/


# Determined.ai
.detignore
notebook_config.yaml
startup-hook.sh

# Depmap
depmap/*
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pip install proxbias

Notebooks to reproduce plots from public datasets are included in the `notebooks` directory. Note that this doesn't include rxrx3 for IP reasons at this time.

- `cpg0016_loading.ipynb` - Load the JUMP CP data, apply PCA and proximity bias correction and save data.
- `cpg0016_loading.ipynb` - Load the JUMP CP data, apply PCA and proximity bias correction and save data locally.
- `cpg0016_plots.ipynb` - Create whole-genome plots from cpg0016, calculate Brunner-Munzel statistics and make bar plots.
- `shinyDepMap_benchmark.ipynb` - Load DepMap 19Q3 data and create plots showing an enrichment for within-chromosome arm relationships.
- `DepMap_PB_Drivers.ipynb` - Use DepMap 22Q4 data to look at differential proximity bias when TP53 and other genes are wild-type vs loss/gain of function.
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
MAJOR="0"
MINOR="10"
MINOR="11"
1,098 changes: 1,098 additions & 0 deletions proxbias/data/cancerGeneList.tsv

Large diffs are not rendered by default.

Empty file added proxbias/depmap/__init__.py
Empty file.
38 changes: 38 additions & 0 deletions proxbias/depmap/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# ---------------------------------------------------------------------------
# DepMap download settings
# ---------------------------------------------------------------------------

# URL of the DepMap portal's file-download API.
DEPMAP_API_URL = "https://depmap.org/portal/api/download/files"

# Default release names for the DepMap data and the DEMETER2 (RNAi) data.
DEPMAP_RELEASE_DEFAULT = "DepMap Public 22Q4"
DEMETER2_RELEASE_DEFAULT = "DEMETER2 Data v6"

# File names of the individual tables.
# NOTE(review): presumably looked up inside the matching release above --
# confirm against the loader.
CNV_FILENAME = "OmicsCNGene.csv"
CRISPR_DEPENDENCY_EFFECT_FILENAME = "CRISPRGeneEffect.csv"
MUTATION_FILENAME = "OmicsSomaticMutations.csv"
RNAI_DEPENDENCY_EFFECT_FILENAME = "D2_combined_gene_dep_scores.csv"

# ---------------------------------------------------------------------------
# Copy-number thresholds
# ---------------------------------------------------------------------------

# NOTE(review): presumably copy number below CN_LOSS_CUTOFF counts as a loss
# and above CN_GAIN_CUTOFF as a gain -- confirm units and comparison direction
# against the code that consumes these values.
CN_LOSS_CUTOFF = 1.5
CN_GAIN_CUTOFF = 2.5

# ---------------------------------------------------------------------------
# Mutation type groupings
# ---------------------------------------------------------------------------

# NOTE(review): "LOF" is read here as loss-of-function -- confirm.
LOF_MUTATION_TYPES = [
    "MISSENSE",
    "NONSENSE",
    "IN_FRAME_DEL",
    "SPLICE_SITE",
    "FRAME_SHIFT_INS",
    "FRAME_SHIFT_DEL",
    "IN_FRAME_INS",
]

# Subset of LOF_MUTATION_TYPES: nonsense and frame-shift mutations only.
COMPLETE_LOF_MUTATION_TYPES = [
    "NONSENSE",
    "FRAME_SHIFT_INS",
    "FRAME_SHIFT_DEL",
]

# Gain-of-function mutation types. As written, these are exactly the
# LOF_MUTATION_TYPES entries that are not in COMPLETE_LOF_MUTATION_TYPES.
AMP_MUTATION_TYPES = [
    "MISSENSE",
    "IN_FRAME_DEL",
    "SPLICE_SITE",
    "IN_FRAME_INS",
]
48 changes: 27 additions & 21 deletions proxbias/utils/depmap.py → proxbias/depmap/load.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
import os
from typing import Tuple
from functools import partial
import pandas as pd
from typing import Tuple

import pandas as pd

DEPMAP_API_URL = "https://depmap.org/portal/api/download/files"
DEPMAP_RELEASE_DEFAULT = "DepMap Public 22Q4"
DEMETER2_RELEASE_DEFAULT = "DEMETER2 Data v6"
CNV_FILENAME = "OmicsCNGene.csv"
CRISPR_DEPENDENCY_EFFECT_FILENAME = "CRISPRGeneEffect.csv"
RNAI_DEPENDENCY_EFFECT_FILENAME = "D2_combined_gene_dep_scores.csv"
MUTATION_FILENAME = "OmicsSomaticMutations.csv"
from proxbias.depmap.constants import (
CNV_FILENAME,
CRISPR_DEPENDENCY_EFFECT_FILENAME,
DEMETER2_RELEASE_DEFAULT,
DEPMAP_API_URL,
DEPMAP_RELEASE_DEFAULT,
MUTATION_FILENAME,
RNAI_DEPENDENCY_EFFECT_FILENAME,
)


def _download_file(_release_files: pd.DataFrame, _filename: str, **read_kwargs):
Expand Down Expand Up @@ -38,7 +40,8 @@ def get_depmap_data(
depmap_release : str, optional
a depmap release string, by default DEPMAP_RELEASE_DEFAULT
rnai_release : str, optional
a rnai release string, by default DEMETER2_RELEASE_DEFAULT
a rnai release string, by default DEMETER2_RELEASE_DEFAULT. If an empty string,
do not check cache and return an empty dataframe
cache : bool, optional
whether cache the data as csv or not, by default True
cache_base_dir : str, optional
Expand Down Expand Up @@ -67,7 +70,7 @@ def _read_and_cache(release_name, release_files, file_prefix, filename, **read_k
target_data = pd.read_csv(target_file, **read_kwargs)
print("Done!")
else:
print(f"Cache is not found. Downloading {filename} from {release_name}...")
print(f"Cached file {filename} is not found. Downloading from {release_name}...")
target_data = _download_file(release_files, filename, **read_kwargs)
print("Done!")
if cache:
Expand Down Expand Up @@ -101,16 +104,19 @@ def _read_and_cache(release_name, release_files, file_prefix, filename, **read_k
crispr_effect_data.index = [g.split(" ")[0] for g in crispr_effect_data.index]

# RNAi Dependency Effect
rnai_effect_data = _read_cache_rnai(RNAI_DEPENDENCY_EFFECT_FILENAME, index_col=0)
rnai_effect_data.columns.name = "ModelID"
rnai_effect_data.index = [g.split(" ")[0] for g in rnai_effect_data.index]
# remove multi-mapping oligos
rnai_effect_data = rnai_effect_data.query("~index.str.contains('&')")
# some genes are not available for a majority of cell models.
# most are deprecated or low confidence genes. remove them.
n_missed_models = rnai_effect_data.isna().sum(axis=1)
rnai_models = n_missed_models[n_missed_models < 200].index
rnai_effect_data = rnai_effect_data.loc[rnai_models].dropna(axis=1)
if rnai_release:
rnai_effect_data = _read_cache_rnai(RNAI_DEPENDENCY_EFFECT_FILENAME, index_col=0)
rnai_effect_data.columns.name = "ModelID"
rnai_effect_data.index = [g.split(" ")[0] for g in rnai_effect_data.index]
# remove multi-mapping oligos
rnai_effect_data = rnai_effect_data.query("~index.str.contains('&')")
# some genes are not available for a majority of cell models.
# most are deprecated or low confidence genes. remove them.
n_missed_models = rnai_effect_data.isna().sum(axis=1)
rnai_models = n_missed_models[n_missed_models < 200].index
rnai_effect_data = rnai_effect_data.loc[rnai_models].dropna(axis=1)
else:
rnai_effect_data = pd.DataFrame()

# Copy Number Variation
cnv_data = _read_cache_depmap(CNV_FILENAME, index_col=0)
Expand Down
Loading

0 comments on commit ce2cddd

Please sign in to comment.