diff --git a/appyters/Drug_Gene_Budger2/README.md b/appyters/Drug_Gene_Budger2/README.md index cebf1832..c347a7b8 100644 --- a/appyters/Drug_Gene_Budger2/README.md +++ b/appyters/Drug_Gene_Budger2/README.md @@ -1,11 +1,11 @@ -# Dr. Gene Budger (DGB) 2 +# DrugRanger -The Dr. Gene Budger 2 (DGB2) Appyter takes a single human gene as input, and returns ranked up- and down-regulating drugs from three Connectivity Mapping resources that were shown to maximally increase or decrease the mRNA expression of the gene in human cell lines. The three Connectivity Mapping resources are: +The DrugRanger Appyter takes a single human gene or gene set as input, and returns ranked up- and down-regulating drugs from three Connectivity Mapping resources that were shown to maximally increase or decrease the mRNA expression of the gene(s) in human cell lines. The three Connectivity Mapping resources are: - [Ginkgo GDPx1 and GDPx2 datasets](https://huggingface.co/ginkgo-datapoints) - [Novartis DRUG-seq U2OS MoABox dataset](https://zenodo.org/records/14291446) -- [LINCS L1000 Chemical Perturbation dataset](https://maayanlab.cloud/sigcom-lincs/#/Download) +- [Tahoe-100M](https://huggingface.co/datasets/tahoebio/Tahoe-100M) -In addition to producing tables of ranked up- and down-regulating drugs of the input gene, the notebook creates volcano plot visualizations and UpSet plots that identify overlap in regulators across datasets. \ No newline at end of file +In addition to producing tables of ranked up- and down-regulating drugs of the input gene, the notebook creates various visualizations for the single gene and multi-gene analysis, to help users determine the most effective regulators of their input gene(s). \ No newline at end of file diff --git a/appyters/Drug_Gene_Budger2/appyter.json b/appyters/Drug_Gene_Budger2/appyter.json index 86ac03a7..72152930 100644 --- a/appyters/Drug_Gene_Budger2/appyter.json +++ b/appyters/Drug_Gene_Budger2/appyter.json @@ -1,10 +1,10 @@ { "$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json", "name": "Drug_Gene_Budger2", - "title": "Dr. Gene Budger (DGB) 2", - "version": "0.0.8", - "description": "An appyter that retrieves drugs that up-regulate and down-regulate a single input gene across Connectivity Mapping datasets", - "image": "dgb_logo.png", + "title": "DrugRanger", + "version": "0.1.0", + "description": "An appyter that retrieves drugs that up-regulate and down-regulate a single input gene or gene set across Connectivity Mapping datasets", + "image": "DR_logo.png", "authors": [ { "name": "Lily Taub", @@ -13,7 +13,6 @@ ], "url": "https://github.com/MaayanLab/appyter-catalog", "tags": [ - "L1000", "DRUG-seq", "RNA-seq" ], diff --git a/appyters/Drug_Gene_Budger2/cmap_readers.py b/appyters/Drug_Gene_Budger2/cmap_readers.py new file mode 100644 index 00000000..ac0f3f45 --- /dev/null +++ b/appyters/Drug_Gene_Budger2/cmap_readers.py @@ -0,0 +1,136 @@ +import pandas as pd +import numpy as np +import hashlib +import polars as pl + +def prepare_novartis_data(gene, URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/novartis_de'): + ''' + gene: gene symbol to retrieve + URL: Novartis data storage location + + output: results dataframe from Novartis data + ''' + try: + novartis_de = pd.read_feather(f'{URL}/{gene}.f').set_index('index') + except: + # print(f'{gene} not found in Novartis') + return None + # format p-values + novartis_de['log10adj.P.Val'] = novartis_de['P.Adj'].replace(0,1e-323).map(np.log10)*-1 + # rename logFC column for concordance with Ginkgo columns + novartis_de.rename(columns={'LogFC':'logFC', 'P.Adj':'adj.P.Val'}, inplace=True) + return novartis_de + +def prepare_lincs_data(gene, URL='https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_de'): + ''' + gene: gene symbol to retrieve + URL: LINCS data storage location + + output: results dataframe from LINCS data + ''' + try: + lincs_de = pd.read_feather(f'{URL}/{gene}.f') + except: + # print(f'{gene} not found in LINCS') + return None + # format p-values + lincs_de['log10adj.P.Val'] = lincs_de['adj.P.Val'].replace(0,1e-323).map(np.log10)*-1 + # remove CRISPR KO perturbations + lincs_ko_perturbs = pd.read_csv('https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_ko_perturbs.txt', sep='\t') + lincs_de = lincs_de[~lincs_de['Drug'].isin(lincs_ko_perturbs.cmap_name.to_list())] + return lincs_de + +def hash_bucket(gene, num_buckets=512): + ''' + gene: Gene symbol + num_buckets: number of hash buckets to create + + output: integer hash for gene name (between 0-n_buckets) + ''' + return int(hashlib.md5(gene.encode()).hexdigest(),16) % num_buckets + +def prepare_tahoe_data(df, gene): + ''' + df: DataFrame retrieved from Tahoe gene bucket file + gene: gene to filter dataframe + + output: results dataframe from Tahoe data filtered to gene + ''' + tahoe_de = df[df['gene_name']==gene] + if tahoe_de.shape[0] == 0: + # print(f'{gene} not found in Tahoe') + return None + tahoe_de['log10adj.P.Val'] = tahoe_de['padj'].replace(0,1e-323).map(np.log10)*-1 + tahoe_de.rename(columns = {'log2FoldChange':'logFC', 'drug':'Drug', 'padj':'adj.P.Val', 'group':'Perturbation', 'gene_name':'Gene'}, inplace=True) + tahoe_de['GeneDir'] = np.where(tahoe_de['UpReg']==1,'Up','Dn') + return tahoe_de + +def retrieve_tahoe_data(gene_set, URL='https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/tahoe_de'): + ''' + gene_set: list of gene symbols + + output: dictionary of tahoe data for each gene in gene_set + ''' + hash_dict = {} + for g in gene_set: + hash_dict[g] = str(hash_bucket(g)) + hash_dict_rev = {} + for gene, hash in hash_dict.items(): + if hash in hash_dict_rev: + hash_dict_rev[hash].append(gene) + else: + hash_dict_rev[hash]=[gene] + tahoe_results = {} + for hash, genes in hash_dict_rev.items(): + df = pd.read_parquet(f'{URL}/gene_bucket_{hash}.parquet', use_pandas_metadata=False) + for g in genes: + tahoe_data = prepare_tahoe_data(df, g) + tahoe_results[g] = tahoe_data + return tahoe_results + +def prepare_ginkgo_data_dict(gene, cell_types, URL='https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/ginkgo_de'): + ''' + gene: gene symbol to retrieve + cell types: Ginkgo cell types in GDPx1 and GDPx2 + URL: Ginkgo data storage location + + output: dictionary with dataframes for all Ginkgo cell types + ''' + try: + df = pd.read_feather(f'{URL}/{gene}.f') + except: + # print('Gene not found in Ginkgo') + return None + cell_type_results = {} + for k in cell_types: + subset = df[df['Perturbation'].str.contains(k)] + subset['log10adj.P.Val'] = subset['adj.P.Val'].replace(0,1e-323).map(np.log10)*-1 + subset = subset.drop('index', axis=1) + cell_type_results[k] = subset + + return cell_type_results + +def prepare_ginkgo_data_df(gene, cell_types, URL='https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/ginkgo_de'): + ''' + gene: gene symbol to retrieve + cell types: Ginkgo cell types in GDPx1 and GDPx2 + URL: Ginkgo data storage location + + output: results dataframe for all Ginkgo cell types + ''' + try: + + df = pd.read_feather(f'{URL}/{gene}.f') + except: + # print('Gene not found in Ginkgo') + return None + cell_type_results = {} + for k in cell_types: + subset = df[df['Perturbation'].str.contains(k)] + subset['log10adj.P.Val'] = subset['adj.P.Val'].replace(0,1e-323).map(np.log10)*-1 + subset = subset.drop('index', axis=1) + cell_type_results[k] = subset + + all_df = pd.concat(cell_type_results.values(), ignore_index=True) + + return all_df \ No newline at end of file diff --git a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb index 0cd54b0d..5c57af9b 100644 --- a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb +++ b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb @@ -22,36 +22,64 @@ "%%appyter hide_code\n", "\n", "{% do SectionField(\n", - " name='gene_input', \n", - " title = '1. Select an input gene', \n", - " subtitle = 'Enter a human gene of interest'\n", + " name='input', \n", + " title = '1. Gene Upload', \n", + " subtitle = 'Enter a single human gene or a gene set'\n", ") %}\n", "\n", "{% do SectionField(\n", - " name='method_input', \n", - " title = '2. Select a drug ranking method', \n", - " subtitle = \"Select a ranking method by which to identify top up- and down-regulating drugs. Options are to rank drugs by the differential expression (DE) p-value of the target (default) or by the rank of the target in each perturbation's DE signature (Target Rank), where ranking is determined by the adjusted p-value of differential expression. This method prioritizes drug specificity over magnitude of regulation.\"\n", + " name='analysis', \n", + " title = '2. Select a drug ranking method for single gene analysis', \n", + " subtitle = \"Select a ranking method by which to identify top up- and down-regulating drugs. Options are to rank drugs by the differential expression (DE) p-value of the target (default) or by the rank of the target in each perturbation's DE signature (Target Rank), where ranking is determined by the adjusted p-value of differential expression. This method prioritizes drug specificity over magnitude of regulation. THIS PARAMETER IS IGNORED for the multi-gene analysis.\"\n", ") %}" ] }, { "cell_type": "code", "execution_count": null, - "id": "e07b2ead", + "id": "b0abfb33", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter hide code\n", + "{% set gene_input = TabField(\n", + " name = 'gene_input',\n", + " label = 'Query Gene(s)',\n", + " default = 'Single Gene',\n", + " description = 'Enter the gene symbol or gene set of interest.',\n", + " choices = {\n", + " 'Single Gene': [\n", + " AutocompleteField(\n", + " name='single_gene_input',\n", + " label = 'Input Gene',\n", + " default = 'C9ORF72',\n", + " description = 'Enter a single human gene of interest',\n", + " file_path = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/all_genes.json',\n", + " section = 'input'\n", + " )\n", + " ],\n", + " 'Multi-gene': [\n", + " TextField(\n", + " name='multi_gene_input',\n", + " label = 'Input Gene Set',\n", + " default = 'GPR141\\nMS4A6A\\nCR1\\nHLA-DQA1\\nMS4A4A',\n", + " description = 'Enter a list of genes, one per row',\n", + " section = 'input'\n", + " )\n", + " ]\n", + " },\n", + " section='input'\n", + ")%}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15e0d733", "metadata": {}, "outputs": [], "source": [ "%%appyter hide_code\n", - "\n", - "{% set input_gene = AutocompleteField(\n", - " name = 'input_gene',\n", - " label = 'Query Gene',\n", - " default = 'C9ORF72',\n", - " description = 'Enter the gene symbol of interest.',\n", - " file_path = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/all_genes.json',\n", - " section='gene_input'\n", - ")%}\n", - "\n", "{% set ranking_method = ChoiceField(\n", " name = 'ranking_method',\n", " label = 'Ranking Method',\n", @@ -61,19 +89,26 @@ " 'Differential Expression P-Value',\n", " 'Target Rank'\n", " ],\n", - " section='method_input'\n", + " section='analysis'\n", ")%}" ] }, { "cell_type": "code", "execution_count": null, - "id": "733c8208", + "id": "7e3c5250", "metadata": {}, "outputs": [], "source": [ "%%appyter code_exec\n", - "query_gene = \"{{ input_gene.value.upper() }}\"\n", + "\n", + "{%- if gene_input.raw_value == 'Single Gene' %} # single gene input\n", + "input_type = 'single'\n", + "query_gene = \"{{ gene_input.value[0]['args']['value'].upper() }}\"\n", + "{%- else -%} # multi-gene input\n", + "geneset = {{ gene_input.value[0] }}\n", + "input_type = 'multi'\n", + "{%- endif %}\n", "\n", "{%- if ranking_method.raw_value == 'Differential Expression P-Value' %}\n", "ranking_method = 'pval'\n", @@ -82,12 +117,28 @@ "{%- endif %}" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "588e87d5", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter code_exec\n", + "# this cell parses the gene list into a python list\n", + "\n", + "{%- if gene_input.raw_value == 'Multi-gene' %}\n", + "query_geneset = geneset.split('\\n')\n", + "query_geneset = [x.strip().upper() for x in query_geneset]\n", + "{%- endif %}" + ] + }, { "cell_type": "markdown", "id": "123ecde2", "metadata": {}, "source": [ - "# Drug Gene Budger 2" + "# DrugRanger" ] }, { @@ -99,7 +150,6 @@ "\n", "- Ginkgo GDPx1 and GPDx2: Limma-Voom based differential gene expression results for 1,354 drugs.\n", "- Novartis DRUG-seq: Differential: Limma-Trend based differential gene expression results for 4,343 drugs. \n", - "- LINCS L1000 Chemical Perturbations: Limma-Voom based differential gene expression results for a subset of 4,091 drugs from the LINCS L1000 Chemical Perturbation dataset. \n", "- Tahoe 100-M: DESeq based differential gene expression results for 376 drugs tested across 50 different cancer cell lines. \n", "\n", "The Ginkgo dataset includes 4 primary cell types (epithelial melanocytes, smooth aortic muscle cells, skeletal muscle myoblasts and dermal fibroblasts) and one cell line (A549 lung carcinoma cell line). Previous analysis showed distinct transcriptional responses by cell type, so the drug rankings for the Ginkgo dataset are separated by cell type.\n", @@ -122,12 +172,15 @@ "import warnings\n", "import hashlib\n", "\n", + "## Helpers\n", + "from cmap_readers import *\n", + "from multi_gene_utils import *\n", + "\n", "## Tables\n", - "from IPython.display import display, display_markdown, HTML\n", + "from IPython.display import display, display_markdown, HTML, FileLink\n", "\n", "## UpSet Plot\n", "from upsetplot import from_contents, plot\n", - "from matplotlib import pyplot\n", "\n", "## Venn Diagram\n", "from matplotlib_venn import venn3, venn2\n", @@ -135,9 +188,15 @@ "\n", "## Volcano Plot\n", "from bokeh.plotting import figure, show\n", - "from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper\n", - "from bokeh.palettes import RdBu\n", - "from bokeh.io import output_notebook" + "from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper, CategoricalColorMapper\n", + "from bokeh.palettes import RdBu, Category10\n", + "from bokeh.io import output_notebook\n", + "\n", + "## Barplots\n", + "import seaborn as sns\n", + "from matplotlib.colors import TwoSlopeNorm\n", + "from matplotlib.colorbar import ColorbarBase\n", + "import matplotlib.patches as mpatches" ] }, { @@ -150,7 +209,6 @@ "# Storage URLs for DE gene files\n", "ginkgo_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/ginkgo_de'\n", "novartis_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/novartis_de'\n", - "lincs_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_de'\n", "deepcover_moa_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/deepcoverMoa_de'\n", "tahoe_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/tahoe_de'\n", "\n", @@ -165,7 +223,8 @@ "metadata": {}, "outputs": [], "source": [ - "in_ginkgo = in_novartis = in_lincs = in_tahoe = True" + "in_ginkgo = in_novartis = in_tahoe = True\n", + "ginkgo_cell_types = ['human_epithelial_melanocytes', 'human_dermal_fibroblast', 'human_aortic_smooth_muscle_cells', 'human_skeletal_muscle_myoblasts','A549']" ] }, { @@ -175,98 +234,19 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "# get Ginkgo DE results for gene\n", - "gene_file = f'{query_gene}.f'\n", - "try:\n", - " ginkgo_de = pd.read_feather(f'{ginkgo_URL}/{gene_file}')\n", - " ginkgo_cell_types = list(set(p.split('-')[0] for p in ginkgo_de.Perturbation))\n", - "except:\n", + "ginkgo_gene_expr_dict = prepare_ginkgo_data_dict(query_gene, ginkgo_cell_types, ginkgo_URL)\n", + "if ginkgo_gene_expr_dict == None:\n", " in_ginkgo=False\n", - " print('Gene not in Ginkgo dataset')\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b9fb1130", - "metadata": {}, - "outputs": [], - "source": [ - "def prepare_ginkgo_data(df, cell_types):\n", - " '''Create a results dictionary where each cell type\n", - " in the Ginkgo dataset is a key and the value is the DE data\n", - " for the query gene for that cell type.\n", - " '''\n", - " cell_type_results = {}\n", - " for k in cell_types:\n", - " subset = df[df['Perturbation'].str.contains(k)]\n", - " subset['log10adj.P.Val'] = subset['adj.P.Val'].replace(0,1e-323).map(np.log10)*-1\n", - " cell_type_results[k] = subset\n", - " return cell_type_results\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bdc92b50", - "metadata": {}, - "outputs": [], - "source": [ - "if in_ginkgo:\n", - " ginkgo_gene_expr_dict = prepare_ginkgo_data(ginkgo_de, ginkgo_cell_types)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41658110", - "metadata": {}, - "outputs": [], - "source": [ - "# get LINCS DE results for gene\n", - "try:\n", - " lincs_de = pd.read_feather(f'{lincs_URL}/{gene_file}')\n", - " # format p-values\n", - " lincs_de['log10adj.P.Val'] = lincs_de['adj.P.Val'].replace(0,1e-323).map(np.log10)*-1\n", - " # remove CRISPR KO perturbations\n", - " lincs_ko_perturbs = pd.read_csv('https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_ko_perturbs.txt', sep='\\t')\n", - " lincs_de = lincs_de[~lincs_de['Drug'].isin(lincs_ko_perturbs.cmap_name.to_list())]\n", - "except:\n", - " print('Gene not in LINCS L1000 dataset')\n", - " in_lincs=False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d731a2ba", - "metadata": {}, - "outputs": [], - "source": [ + " print('Gene not in Ginkgo dataset') \n", "# get Novartis DE results for gene\n", - "try:\n", - " novartis_de = pd.read_feather(f'{novartis_URL}/{gene_file}').set_index('index')\n", - " # format p-values\n", - " novartis_de['log10adj.P.Val'] = novartis_de['P.Adj'].replace(0,1e-323).map(np.log10)*-1\n", - " # rename logFC column for concordance with Ginkgo columns\n", - " novartis_de.rename(columns={'LogFC':'logFC'}, inplace=True)\n", - "except:\n", + "novartis_de = prepare_novartis_data(query_gene, novartis_URL)\n", + "if novartis_de is None:\n", " print('Gene not in Novartis DRUG-seq dataset')\n", - " in_novartis=False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e7910ada", - "metadata": {}, - "outputs": [], - "source": [ + " in_novartis = False\n", "# get Tahoe DE results for gene\n", - "\n", - "# hash_bucket function used to sort genes into buckets\n", "def hash_bucket(gene, num_buckets=512):\n", " '''\n", " gene: Gene symbol\n", @@ -275,49 +255,26 @@ " Returns integer hash for gene name (between 0-n_buckets)\n", " '''\n", " return int(hashlib.md5(gene.encode()).hexdigest(),16) % num_buckets\n", - "\n", "query_gene_encoded = hash_bucket(query_gene)\n", - "\n", - "try:\n", - " tahoe_de = pd.read_parquet(f'{tahoe_URL}/gene_bucket_{query_gene_encoded}.parquet')\n", - " tahoe_de = tahoe_de[tahoe_de['gene_name']==query_gene]\n", - " tahoe_de['log10adj.P.Val'] = tahoe_de['padj'].replace(0,1e-323).map(np.log10)*-1\n", - " tahoe_de.rename(columns = {'log2FoldChange':'logFC', 'drug':'Drug', 'padj':'adj.P.Val'}, inplace=True)\n", - " tahoe_de['GeneDir'] = np.where(tahoe_de['UpReg']>0,'Up','Dn')\n", - " \n", - "except:\n", + "tahoe_de = pd.read_parquet(f'{tahoe_URL}/gene_bucket_{query_gene_encoded}.parquet')\n", + "tahoe_de = prepare_tahoe_data(tahoe_de, query_gene)\n", + "if tahoe_de is None:\n", " print('Gene not in Tahoe-100M dataset')\n", - " in_tahoe=False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c5607885", - "metadata": {}, - "outputs": [], - "source": [ - "if in_lincs + in_novartis + in_ginkgo + in_tahoe < 1:\n", - " print(f\"LINCS: {in_lincs}\")\n", + " in_tahoe=False\n", + "\n", + "if in_novartis + in_ginkgo + in_tahoe < 1:\n", " print(f\"Novartis: {in_novartis}\")\n", " print(f\"Ginkgo: {in_ginkgo}\")\n", " print(f\"Tahoe-100M: {in_tahoe}\")\n", - " raise Exception(\"Execution stopped, gene not found in any datasets\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5ac47199", - "metadata": {}, - "outputs": [], - "source": [ + " raise Exception(\"Execution stopped, gene not found in any datasets\")\n", "# Get proteomics data\n", "in_deepcover = True\n", "try:\n", - " protein_de = pd.read_feather(f'{deepcover_moa_URL}/{gene_file}').set_index('index')\n", + " protein_de = pd.read_feather(f'{deepcover_moa_URL}/{query_gene}.f').set_index('index')\n", "except:\n", - " in_deepcover=False" + " in_deepcover=False\n", + " \n", + "{% endif %} " ] }, { @@ -332,12 +289,46 @@ "pubchem_ids = pd.read_csv(pubchem_location, dtype = {'Drug':str, 'CID':str})" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a766854", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Multi-gene' %}\n", + "# Retrieve data for multi-gene analysis\n", + "pbc_df = pubchem_ids.copy()\n", + "pbc_df.rename(columns={'Drug':'DrugMatch'}, inplace=True)\n", + "all_genes, not_found = fetch_cmap_data(query_geneset)\n", + "single_gene_drug_ranks = single_gene_rank(all_genes, query_geneset, pbc_df)\n", + "combined_gene_results, multi_gene_drug_ranks = get_multi_gene_ranks(single_gene_drug_ranks, pbc_df)\n", + "\n", + "# genes not found in any dataset\n", + "if len(not_found) == 3:\n", + " none_found = set.intersection(*[set(gl) for gl in not_found.values()])\n", + " if none_found:\n", + " display_markdown(f'Gene(s) {none_found} not found in Novartis DRUG-seq, Tahoe-100M, or Ginkgo. **Excluded from multi-gene analysis.**', raw=True)\n", + "else:\n", + " none_found = {}\n", + " \n", + "# list genes not found in some datasets\n", + "display_markdown('**Genes not found in the following datasets:**', raw=True)\n", + "for ds,gl in not_found.items():\n", + " print(f'{ds}: {gl}') \n", + "if (len(none_found) == len(query_geneset)):\n", + " print('No input genes found in any datasets')\n", + " raise Exception(\"Execution stopped, no input genes found in any datasets\")\n", + "{% endif %}" + ] + }, { "cell_type": "markdown", "id": "dd12adb9", "metadata": {}, "source": [ - "## Query Gene" + "## Query Gene(s)" ] }, { @@ -347,8 +338,15 @@ "metadata": {}, "outputs": [], "source": [ - "display_markdown(f\"This notebook shows results for the input gene **{query_gene}**\", raw=True)\n", - "display_markdown(f\"Drugs that up and down regulate **{query_gene}** are ranked by method **{ranking_method}**\", raw=True)" + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + " display_markdown(\"**Single Gene Input**\", raw=True)\n", + " display_markdown(f\"This notebook shows results for the input gene **{query_gene}**\", raw=True)\n", + " display_markdown(f\"Drugs that up and down regulate **{query_gene}** are ranked by method **{ranking_method}**\", raw=True)\n", + "{% elif gene_input.raw_value == 'Multi-gene' %}\n", + " display_markdown(\"**Multi-gene Input**\", raw=True)\n", + " display_markdown(f\"This notebook shows results for the input gene set **{query_geneset}**\", raw=True)\n", + "{% endif %}" ] }, { @@ -356,14 +354,19 @@ "id": "df830837", "metadata": {}, "source": [ - "## Rank Tables" + "## Drug Tables" ] }, { - "cell_type": "markdown", - "id": "97c523ce", + "cell_type": "code", + "execution_count": null, + "id": "61aeda92", "metadata": {}, + "outputs": [], "source": [ + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "\n", "Within each dataset drugs are ranked by either:\n", "\n", "1. The statistical significance of the regulatory relationship. The pipeline uses the adjusted p-value from the differential expression results.\n", @@ -374,7 +377,9 @@ "\n", "When a dataset contains multiple perturbations for the same drug (i.e. a cell exposed to the drug at different doses), p-values and normalized ranks are averaged across doses to get a single ranking for the drug. \n", "\n", - "The rankings are done separately for up-regulated and down-regulated genes." + "The rankings are done separately for up-regulated and down-regulated genes.\n", + "\n", + "{% endif %}" ] }, { @@ -384,6 +389,8 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "def get_rankings(data:pd.DataFrame, source:str, cell_type:str, direction:str, option:str):\n", " '''\n", " Given a dataframe of logFC and p-values for a gene of interest across perturbations, \n", @@ -393,29 +400,44 @@ " perturbation ranks. \n", " '''\n", " ranked_data = data.copy()\n", + "\n", + " ranked_data['DrugLower'] = ranked_data['Drug'].str.lower()\n", + " ranked_data = ranked_data.merge(pubchem_ids, how='left', left_on ='DrugLower', right_on='Drug')\n", + " ranked_data['DrugGroup'] = ranked_data['CID']\n", + " ranked_data['DrugGroup'] = ranked_data['DrugGroup'].fillna(ranked_data['DrugLower'])\n", " \n", - " if (source == 'Ginkgo') & (cell_type=='A549'):\n", - " ranked_data.loc[ranked_data['Drug']=='Brefeldin A from Penicillium brefeldianum', 'Drug'] = 'Brefeldin A'\n", - " elif (source == 'Ginkgo') & (cell_type != 'A549'):\n", - " ranked_data.loc[ranked_data['Drug']=='Brefeldin-A', 'Drug'] = 'Brefeldin A'\n", - " elif source == 'Novartis':\n", - " ranked_data.loc[ranked_data['Drug']=='Trichostatin A (racemate)', 'Drug'] = 'Trichostatin A'\n", + " # if (source == 'Ginkgo') & (cell_type != 'A549'):\n", + " # ranked_data.loc[ranked_data['Drug']=='Brefeldin-A', 'Drug'] = 'Brefeldin A'\n", + " if source == 'Novartis':\n", + " # ranked_data.loc[ranked_data['Drug']=='Trichostatin A (racemate)', 'Drug'] = 'Trichostatin A'\n", " ranked_data.rename(columns={'P.Adj':'adj.P.Val'}, inplace=True)\n", " if option == 'pval':\n", " # average rank across all drug dosages\n", - " drug_mean_ranks = ranked_data.loc[:,['Drug','logFC','log10adj.P.Val']].groupby('Drug')[['logFC','log10adj.P.Val']].mean().sort_values('log10adj.P.Val', ascending=False)\n", + " drug_mean_ranks = ranked_data.loc[:,['DrugGroup','logFC','log10adj.P.Val']].groupby('DrugGroup')[['logFC','log10adj.P.Val']].mean().sort_values('log10adj.P.Val', ascending=False)\n", " # filter for up or down regulation\n", " if direction == 'up':\n", " drug_mean_ranks = drug_mean_ranks[drug_mean_ranks['logFC'] > 0]\n", " elif direction == 'down':\n", " drug_mean_ranks = drug_mean_ranks[drug_mean_ranks['logFC'] < 0]\n", - " drug_mean_ranks.rename(columns={'logFC':'Avg logFC', 'log10adj.P.Val':'Avg -log10(Adj.PVal)'}, inplace=True)\n", " elif option == 'target_rank':\n", " if direction == 'up':\n", " ranked_data = ranked_data[ranked_data['GeneDir'] == 'Up']\n", " elif direction == 'down':\n", " ranked_data = ranked_data[ranked_data['GeneDir'] == 'Dn']\n", - " drug_mean_ranks = ranked_data.loc[:,['Drug','logFC','adj.P.Val','Rank','PctRank']].groupby('Drug')[['logFC','adj.P.Val','Rank','PctRank']].mean().sort_values('PctRank', ascending=True)\n", + " drug_mean_ranks = ranked_data.loc[:,['DrugGroup','logFC','adj.P.Val','Rank','PctRank']].groupby('DrugGroup')[['logFC','adj.P.Val','Rank','PctRank']].mean().sort_values('PctRank', ascending=True)\n", + " # clean drug mean ranks\n", + " drug_mean_ranks.reset_index(drop=False, names='DrugGroup', inplace=True)\n", + " drug_mean_ranks = drug_mean_ranks.merge(pubchem_ids, how='left', left_on='DrugGroup', right_on='CID')\n", + " drug_mean_ranks['Drug'].fillna(drug_mean_ranks['DrugGroup'], inplace=True)\n", + " if option == 'pval':\n", + " drug_mean_ranks = drug_mean_ranks.drop_duplicates(subset=['CID','logFC','log10adj.P.Val'])\n", + " drug_mean_ranks.rename(columns={'logFC':'Avg logFC', 'log10adj.P.Val':'Avg -log10(Adj.PVal)'}, inplace=True)\n", + " elif option == 'target_rank':\n", + " drug_mean_ranks = drug_mean_ranks.drop_duplicates(subset=['CID','logFC','PctRank'])\n", + " drug_mean_ranks.rename(columns={'logFC':'Avg logFC', 'PctRank':'Avg PctRank'}, inplace=True)\n", + " # clean ranks\n", + " ranked_data = ranked_data[['Gene','Perturbation','Drug_x','CID','DrugGroup','adj.P.Val','logFC','GeneDir','Rank','PctRank','log10adj.P.Val']]\n", + " ranked_data.rename(columns={'Drug_x':'Drug'}, inplace=True)\n", " return drug_mean_ranks, ranked_data\n", "\n", "def get_top(rank_results:pd.DataFrame, n=50):\n", @@ -425,9 +447,10 @@ "\n", " If there are less drugs than N, will return all results.\n", " '''\n", - " top = {d.casefold() for d in set(rank_results.head(n).index)}\n", + " top = {d.casefold() for d in set(rank_results.head(n)['Drug'])}\n", " return top\n", "\n", + "{% endif %}\n", "\n", "def download_link(df, fname, link_header='Download full results'):\n", " '''\n", @@ -439,30 +462,18 @@ " return link" ] }, - { - "cell_type": "markdown", - "id": "07fea912", - "metadata": {}, - "source": [ - "### Ginkgo" - ] - }, - { - "cell_type": "markdown", - "id": "c5073607", - "metadata": {}, - "source": [ - "Drug rankings for the Ginkgo dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. " - ] - }, { "cell_type": "code", "execution_count": null, - "id": "1d05ddc0", + "id": "04a4f8d8", "metadata": {}, "outputs": [], "source": [ - "top_n = 20" + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "### Ginkgo\n", + "Drug rankings for the Ginkgo dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. \n", + "{% endif %}" ] }, { @@ -472,6 +483,9 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "top_n = 20\n", "if in_ginkgo:\n", " ginkgo_drugs_up = {}\n", " ginkgo_drugs_down = {}\n", @@ -491,61 +505,23 @@ " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_DnReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n", " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_DnRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n", "else:\n", - " display_markdown(f'**{query_gene}** not found in Ginkgo datasets', raw=True)" - ] - }, - { - "cell_type": "markdown", - "id": "fc6465b9", - "metadata": {}, - "source": [ - "### L1000" - ] - }, - { - "cell_type": "markdown", - "id": "8777d900", - "metadata": {}, - "source": [ - "Drug rankings for the LINCS L1000 dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. " + " display_markdown(f'**{query_gene}** not found in Ginkgo datasets', raw=True)\n", + "\n", + "{% endif %} " ] }, { "cell_type": "code", "execution_count": null, - "id": "57438b7c", + "id": "e2931451", "metadata": {}, "outputs": [], "source": [ - "if in_lincs:\n", - " lincs_drugs_up = get_rankings(lincs_de, 'LINCS', '', 'up', ranking_method)\n", - " lincs_drugs_down = get_rankings(lincs_de, 'LINCS', '', 'down', ranking_method)\n", - " display_markdown(f'**Top {top_n} up-regulators in L1000**', raw=True)\n", - " display(lincs_drugs_up[0].head(top_n))\n", - " display(HTML(download_link(lincs_drugs_up[0], f\"l1000_drug_ranks_{query_gene}_UpReg.tsv\", 'Download results averaged across drug dosages')))\n", - " display(HTML(download_link(lincs_drugs_up[1], f\"l1000_drug_ranks_{query_gene}_full_UpReg.tsv\", 'Download results for all perturbations')))\n", - " display_markdown(f'**Top {top_n} down-regulators in L1000**', raw=True)\n", - " display(lincs_drugs_down[0].head(top_n))\n", - " display(HTML(download_link(lincs_drugs_down[0], f\"l1000_drug_ranks_{query_gene}_DnReg.tsv\", 'Download results averaged across drug dosages')))\n", - " display(HTML(download_link(lincs_drugs_down[1], f\"l1000_drug_ranks_{query_gene}_full_DnReg.tsv\", 'Download results for all perturbations')))\n", - "else: \n", - " display_markdown(f'**{query_gene}** not found in LINCS L1000 dataset', raw=True)" - ] - }, - { - "cell_type": "markdown", - "id": "e177678e", - "metadata": {}, - "source": [ - "### Novartis DRUG-seq" - ] - }, - { - "cell_type": "markdown", - "id": "e22fef7a", - "metadata": {}, - "source": [ - "Drug rankings for the Novartis DRUG-seq dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. " + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "### Novartis DRUG-seq\n", + "Drug rankings for the Novartis DRUG-seq dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. \n", + "{% endif %}" ] }, { @@ -555,6 +531,8 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "if in_novartis:\n", " novartis_drugs_up = get_rankings(novartis_de, 'Novartis', '', 'up', ranking_method)\n", " novartis_drugs_down = get_rankings(novartis_de, 'Novartis', '', 'down', ranking_method)\n", @@ -568,23 +546,22 @@ " display(HTML(download_link(novartis_drugs_down[0], f'novartis_drug_ranks_{query_gene}_DnReg.tsv', 'Download results averaged across drug dosages')))\n", " display(HTML(download_link(novartis_drugs_down[1], f'novartis_drug_ranks_{query_gene}_full_DnReg.tsv', 'Download results for all perturbations')))\n", "else:\n", - " display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq', raw=True)" + " display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq', raw=True)\n", + "{% endif %}" ] }, { - "cell_type": "markdown", - "id": "c04f416c", - "metadata": {}, - "source": [ - "### Tahoe-100M" - ] - }, - { - "cell_type": "markdown", - "id": "e07834e6", + "cell_type": "code", + "execution_count": null, + "id": "3f6e38ba", "metadata": {}, + "outputs": [], "source": [ - "Drug rankings for the Tahoe-100M dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. " + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "### Tahoe-100M\n", + "Drug rankings for the Tahoe-100M dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. \n", + "{% endif %}" ] }, { @@ -594,6 +571,8 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "if in_tahoe:\n", " tahoe_drugs_up = get_rankings(tahoe_de, 'Tahoe', '', 'up', ranking_method)\n", " tahoe_drugs_down = get_rankings(tahoe_de, 'Tahoe', '', 'down', ranking_method)\n", @@ -607,7 +586,156 @@ " display(HTML(download_link(tahoe_drugs_down[0], f'tahoe_drug_ranks_{query_gene}_DnReg.tsv', 'Download results averaged across drug dosages')))\n", " display(HTML(download_link(tahoe_drugs_down[1], f'tahoe_drug_ranks_{query_gene}_full_DnReg.tsv', 'Download results for all perturbations')))\n", "else:\n", - " display_markdown(f'**{query_gene}** not found in Tahoe-100M', raw=True)" + " display_markdown(f'**{query_gene}** not found in Tahoe-100M', raw=True)\n", + "{% endif %}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a7a6716", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Multi-gene' %}\n", + "\n", + "Drug ranks according to the adjusted p-value method are generated for each input gene. A single multi-gene drug rank for each drug is calculated by averaging the ranks for a drug across all input genes. \n", + "\n", + "Tables below show the top 20 drugs identified by their 0-1 normalized multi-gene drug rank, where lower values correspond to greater regulation, and their multi-gene rank deviation from the expected drug rank. The expected rank for each drug was calculated from random input gene sets. \n", + "{% endif %}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d97287eb", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Multi-gene' %}\n", + "# expected rank information\n", + "null_ranks_dn = pd.read_csv('https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/null_rank_dn.csv').rename(columns={'MultiGeneRank':'NullRank'})\n", + "null_ranks_up = pd.read_csv('https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/null_rank_up.csv').rename(columns={'MultiGeneRank':'NullRank'})\n", + "# Universal regulators\n", + "flagged_drugs = pd.read_csv('https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/flagged_drug_list.csv')\n", + "flagged_drugs['DrugMatch'].fillna(flagged_drugs['CID'], inplace=True)\n", + "# Data on FDA approved drugs from OpenTargets\n", + "fda_approved_toxicity = pd.read_csv('https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/opentargets_fda_approved_toxicity.csv')\n", + "fda_approved_toxicity['synonyms_lower'] = fda_approved_toxicity['synonyms'].str.lower()\n", + "fda_approved_toxicity['name_lower'] = fda_approved_toxicity['name'].str.lower()\n", + "fda_approved_toxicity.synonyms_lower.fillna(fda_approved_toxicity['name_lower'], inplace=True)\n", + "{% endif %}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7d883b2", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Multi-gene' %}\n", + "def clean_ranks_df(direction, method):\n", + " df = multi_gene_drug_ranks[direction]\n", + " df = df.merge(pbc_df, how='left', on='DrugMatch')\n", + " df = df.drop_duplicates(subset=['DrugGroup', 'MultiGeneRank','NGenes'])\n", + " df['DrugMatch'] = df['DrugMatch'].fillna(df['DrugGroup'])\n", + " # expected ranks\n", + " df = df.merge(null_ranks_up[['DrugMatch','NullRank']], how='left', on='DrugMatch')\n", + " # FDA status\n", + " df = df.merge(fda_approved_toxicity[['synonyms_lower','isApproved']], how='left', left_on='DrugMatch', right_on='synonyms_lower')\n", + " df['isApproved'].fillna(False, inplace=True)\n", + "\n", + " if method=='mg_rank':\n", + " df = df.sort_values(['MultiGeneRank', 'NGenes'], ascending=[True, False])\n", + " new_names = {\n", + " 'DrugMatch':'Drug',\n", + " 'CID': 'PubChem CID',\n", + " 'MultiGeneRank': f'Multi-gene {direction} Rank',\n", + " 'NGenes': 'N Genes Regulated',\n", + " 'isApproved':'FDA Approved'\n", + " }\n", + " elif method=='deviation':\n", + " df['RankDiff'] = df['NullRank'] - df['MultiGeneRank']\n", + " df = df.sort_values(['RankDiff','MultiGeneRank', 'NGenes'], ascending=[False, True, False])\n", + " new_names = {\n", + " 'DrugMatch':'Drug',\n", + " 'CID': 'PubChem CID',\n", + " 'MultiGeneRank': f'Multi-gene {direction} Rank',\n", + " 'RankDiff': f'Expected Rank Difference {direction}',\n", + " 'NGenes': 'N Genes Regulated',\n", + " 'isApproved':'FDA Approved'\n", + " }\n", + "\n", + " df = df[list(new_names.keys())]\n", + " df.rename(columns=new_names, inplace=True)\n", + " df = df.drop_duplicates(subset=['PubChem CID',f'Multi-gene {direction} Rank', 'N Genes Regulated'])\n", + "\n", + " return df\n", + "\n", + "def clean_individual_df(direction):\n", + " df = combined_gene_results[direction]\n", + " df = df.merge(pbc_df, how='left', on='DrugMatch')\n", + " df = df.drop_duplicates(subset = ['gene','CID','Drug','MeanRank'])\n", + " new_names = {\n", + " 'gene':'Gene',\n", + " 'Drug':'Drug',\n", + " 'CID':'PubChem CID',\n", + " 'MeanRank':f'Mean Rank {direction}'\n", + " }\n", + " df = df[list(new_names.keys())]\n", + " df.rename(columns=new_names, inplace=True)\n", + " return df\n", + "def filter_genes(df, cutoff):\n", + " return df[df['N Genes Regulated'] > cutoff]\n", + "top_n = 20\n", + "## multi-gene rank method\n", + "# Up Regulation\n", + "display_markdown('**Multi-gene Drug Rank Up**', raw=True)\n", + "mg_ranks_up = clean_ranks_df('Up', 'mg_rank')\n", + "display(filter_genes(mg_ranks_up, 1)[:top_n])\n", + "display(HTML(download_link(mg_ranks_up, f\"multi_gene_drug_ranks_UpReg.tsv\", 'Download table')))\n", + "# Down Regulation\n", + "display_markdown('**Multi-gene Drug Rank Down**', raw=True)\n", + "mg_ranks_dn = clean_ranks_df('Dn', 'mg_rank')\n", + "display(filter_genes(mg_ranks_dn, 1)[:top_n])\n", + "display(HTML(download_link(mg_ranks_dn, f\"multi_gene_drug_ranks_DnReg.tsv\", 'Download table')))\n", + "\n", + "## deviation rank method\n", + "# Up Regulation\n", + "display_markdown('**Deviation Drug Rank Up**', raw=True)\n", + "dev_ranks_up = clean_ranks_df('Up', 'deviation')\n", + "display(filter_genes(dev_ranks_up, 1)[:top_n])\n", + "display(HTML(download_link(dev_ranks_up, f\"multi_gene_drug_ranks_deviation_UpReg.tsv\", 'Download table')))\n", + "# Down Regulation\n", + "display_markdown('**Deviation Drug Rank Down**', raw=True)\n", + "dev_ranks_dn = clean_ranks_df('Dn', 'deviation')\n", + "display(filter_genes(dev_ranks_dn,1)[:top_n])\n", + "display(HTML(download_link(dev_ranks_dn, f\"multi_gene_drug_ranks_deviation_DnReg.tsv\", 'Download table')))\n", + "\n", + "# Individual gene results downloads\n", + "display_markdown(\"**Results per gene available for download**\", raw = True)\n", + "display(HTML(download_link(clean_individual_df('Up'), f\"individual_gene_drug_ranks_UpReg.tsv\", 'Download table')))\n", + "display(HTML(download_link(clean_individual_df('Dn'), f\"individual_gene_drug_ranks_DnReg.tsv\", 'Download table')))\n", + "{% endif %}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "620e428c", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "## UpSet Plot\n", + "\n", + "The UpSet plots show the overlap among top up-regulating or down-regulating drugs in each dataset. If there were more than 50 significant regulators in a dataset for a given input gene, the input was restricted to the top 50 regulators.\n", + " {% endif %}" ] }, { @@ -617,6 +745,8 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "top_up = {}\n", "top_down = {}\n", "# get results from Ginkgo\n", @@ -624,10 +754,6 @@ " for cell_type in ginkgo_drugs_down.keys():\n", " top_up[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_up[cell_type][0], n=50)\n", " top_down[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_down[cell_type][0], n=50)\n", - "# get results from L1000\n", - "if in_lincs:\n", - " top_up['lincs_l1000'] = get_top(lincs_drugs_up[0], n=50)\n", - " top_down['lincs_l1000'] = get_top(lincs_drugs_down[0], n=50)\n", "# get results from novartis\n", "if in_novartis:\n", " top_up['novartis'] = get_top(novartis_drugs_up[0], n=50)\n", @@ -635,23 +761,9 @@ "# get results from Tahoe\n", "if in_tahoe:\n", " top_up['tahoe'] = get_top(tahoe_drugs_up[0], n=50)\n", - " top_down['tahoe'] = get_top(tahoe_drugs_down[0], n=50)" - ] - }, - { - "cell_type": "markdown", - "id": "01b5b49c", - "metadata": {}, - "source": [ - "## UpSet Plot" - ] - }, - { - "cell_type": "markdown", - "id": "fb0246a5", - "metadata": {}, - "source": [ - "The UpSet plots show the overlap among top up-regulating or down-regulating drugs in each dataset. If there were more than 50 significant regulators in a dataset for a given input gene, the input was restricted to the top 50 regulators." + " top_down['tahoe'] = get_top(tahoe_drugs_down[0], n=50)\n", + "\n", + "{% endif %}" ] }, { @@ -665,7 +777,7 @@ "def save_figure(plot_name, **kwargs):\n", " import io\n", " mem = io.BytesIO()\n", - " pyplot.savefig(mem, bbox_inches='tight')\n", + " plt.savefig(mem, bbox_inches='tight')\n", " with open(plot_name, 'wb') as fw:\n", " fw.write(mem.getbuffer())" ] @@ -677,14 +789,16 @@ "metadata": {}, "outputs": [], "source": [ - "def create_upset(top_sets: dict):\n", + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "\n", + "def create_upset(top_sets: dict, direction: str):\n", " '''\n", " Given a dictionary to top up or down regulating genes,\n", " creates an Upset plot. \n", " '''\n", " rename_keys = {\n", " 'ginkgo_A549': 'ginkgo_A549',\n", - " 'lincs_l1000': 'lincs_l1000',\n", " 'novartis': 'novartis',\n", " 'tahoe': 'tahoe',\n", " 'ginkgo_human_epithelial_melanocytes': 'ginkgo_melanocytes',\n", @@ -695,24 +809,19 @@ " top_sets = {rename_keys[k]:v for k,v in top_sets.items()}\n", " upset_data = from_contents(top_sets)\n", " plot(upset_data, orientation = 'horizontal', show_counts = True, element_size = 30)\n", - " pyplot.show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "58fa71a7", - "metadata": {}, - "outputs": [], - "source": [ - "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n", + " save_figure(f'upset_{direction}.png')\n", + " plt.show()\n", + "\n", + "if in_ginkgo + in_novartis + in_tahoe < 2:\n", " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", "else:\n", " display_markdown(f\"**Overlap among top up regulators of {query_gene}**\", raw=True)\n", - " create_upset(top_up)\n", - "\n", + " create_upset(top_up, 'up')\n", + " display(FileLink('upset_up.png', result_html_prefix='Download:'))\n", " display_markdown(f\"**Overlap among top down regulators of {query_gene}**\", raw=True)\n", - " create_upset(top_down)" + " create_upset(top_down, 'down')\n", + " display(FileLink('upset_down.png', result_html_prefix='Download:'))\n", + "{% endif %}" ] }, { @@ -722,6 +831,9 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "\n", "def get_overlapping_sets(top_sets:dict):\n", " '''\n", " Given the dictionary of sets used to created the UpSet plot,\n", @@ -740,31 +852,39 @@ " overlapping_sets = pd.DataFrame(columns=['Members', 'Overlap', 'Length'])\n", " for idx in range(multi_index_df.shape[0]):\n", " ixn_drugs = set_df.loc[tuple(multi_index_df.iloc[idx])].id.to_list()\n", + " # replace PubChem IDs with drug names\n", + " pubchem_dict = dict(zip(pubchem_ids.CID.to_list(), pubchem_ids.Drug.to_list()))\n", + " ixn_drug_names = []\n", + " for d in ixn_drugs:\n", + " if d in pubchem_dict:\n", + " ixn_drug_names.append(pubchem_dict[d])\n", + " else:\n", + " ixn_drug_names.append(d)\n", " # get group members\n", " ixn_name = multi_index_df.iloc[idx][multi_index_df.iloc[idx]].index.to_list()\n", " ixn_name_joined = '-'.join(ixn_name)\n", " # append results\n", - " overlapping_sets = pd.concat([overlapping_sets, pd.DataFrame({'Members':ixn_name_joined, 'Overlap':[ixn_drugs], 'Length':len(ixn_drugs), 'N Datasets':len(ixn_name)})])\n", + " overlapping_sets = pd.concat([overlapping_sets, pd.DataFrame({'Members':ixn_name_joined, 'Overlap':[ixn_drug_names], 'Length':len(ixn_drug_names), 'N Datasets':len(ixn_name)})])\n", " \n", " \n", " overlapping_sets = overlapping_sets.sort_values('N Datasets', ascending=False)\n", - " return overlapping_sets\n" - ] - }, - { - "cell_type": "markdown", - "id": "6247c9b9", - "metadata": {}, - "source": [ - "## Consensus Regulator Tables" + " return overlapping_sets\n", + " {% endif %}" ] }, { - "cell_type": "markdown", - "id": "556a29cc", + "cell_type": "code", + "execution_count": null, + "id": "9307e382", "metadata": {}, + "outputs": [], "source": [ - "Below are tabular representations of the UpSet plots." + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "## Consensus Regulator Tables\n", + "\n", + "Below are tabular representations of the UpSet plots.\n", + "{% endif %}" ] }, { @@ -774,7 +894,9 @@ "metadata": {}, "outputs": [], "source": [ - "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n", + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "if in_ginkgo + in_novartis + in_tahoe < 2:\n", " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", "else:\n", " overlap_down = get_overlapping_sets(top_down)\n", @@ -784,7 +906,8 @@ " display(HTML(download_link(overlap_down, f'overlapping_drugs_{query_gene}_DnReg.tsv')))\n", " display_markdown(\"**Up-regulating drug overlap**\", raw=True)\n", " display(overlap_up)\n", - " display(HTML(download_link(overlap_up, f'overlapping_drugs_{query_gene}_UpReg.tsv')))\n" + " display(HTML(download_link(overlap_up, f'overlapping_drugs_{query_gene}_UpReg.tsv')))\n", + "{% endif %}\n" ] }, { @@ -794,6 +917,8 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "def get_ranking_averages(overlapping_df, data_dict, ranking_method):\n", " # get average, integrating across datasets\n", " average_rank_vals = {}\n", @@ -801,6 +926,8 @@ " average_logfc_vals = {}\n", " average_pvals = {}\n", " n_datasets = list()\n", + " pubchem_dict = dict(zip(pubchem_ids.Drug.to_list(), pubchem_ids.CID.to_list()))\n", + " pubchem_dict_rev = dict(zip(pubchem_ids.CID.to_list(), pubchem_ids.Drug.to_list()))\n", " for _,row in overlapping_df.iterrows():\n", " n_datasets.extend([row['N Datasets']]*len(row['Overlap']))\n", " member_sets = row['Members']\n", @@ -813,7 +940,8 @@ " for source_name,df in data_dict.items():\n", " if not re.search(source_name, member_sets):\n", " continue\n", - " subset = df[df['Drug'].str.lower() == d.lower()]\n", + " d = pubchem_dict[d] if d in pubchem_dict else d\n", + " subset = df[df['DrugGroup'] == d]\n", " n = n + subset.shape[0]\n", " runsum_rank = runsum_rank + subset.Rank.sum()\n", " runsum_pctrank = runsum_pctrank + subset.PctRank.sum()\n", @@ -832,6 +960,7 @@ " 'Avg PctRank': list(average_pctrank_vals.values())\n", " })\n", " res_df['N Datasets'] = n_datasets\n", + " res_df['Drug'] = res_df['Drug'].apply(lambda x: pubchem_dict_rev[x] if x in pubchem_dict_rev else x)\n", " if ranking_method == 'target_rank':\n", " # sort based on N datasets and average percentile rank\n", " res_df = res_df.sort_values(['N Datasets','Avg PctRank'], ascending=[False,True])\n", @@ -851,7 +980,8 @@ " # clean column names\n", " with_proteins.rename(columns = {'logFC':'Protein logFC', 'Pubchem' : 'PubChem CID'}, inplace=True)\n", " with_proteins.drop(columns='CID',inplace=True)\n", - " return with_proteins.sort_values(['N Datasets', 'Avg Adj.P.Val'], ascending=[False,True])" + " return with_proteins.sort_values(['N Datasets', 'Avg Adj.P.Val'], ascending=[False,True])\n", + "{% endif %}" ] }, { @@ -861,24 +991,22 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "# define input data for get_ranking_averages\n", "data_source_present = {'A549': in_ginkgo, \n", - " 'human_dermal_fibroblast':in_ginkgo,\n", - " 'human_aortic_smooth_muscle_cells':in_ginkgo,\n", - " 'human_epithelial_melanocytes': in_ginkgo,\n", - " 'human_skeletal_muscle_myoblasts': in_ginkgo,\n", - " 'novartis': in_novartis,\n", - " 'lincs': in_lincs,\n", - " 'tahoe': in_tahoe}\n", + " 'human_dermal_fibroblast':in_ginkgo,\n", + " 'human_aortic_smooth_muscle_cells':in_ginkgo,\n", + " 'human_epithelial_melanocytes': in_ginkgo,\n", + " 'human_skeletal_muscle_myoblasts': in_ginkgo,\n", + " 'novartis': in_novartis,\n", + " 'tahoe': in_tahoe}\n", "data_dict_down ={}\n", "data_dict_up = {}\n", "for source,present in data_source_present.items():\n", - " if (present) & (not source in ['novartis','lincs','tahoe']):\n", + " if (present) & (not source in ['novartis','tahoe']):\n", " data_dict_down[source] = ginkgo_drugs_down[source][1]\n", " data_dict_up[source] = ginkgo_drugs_up[source][1]\n", - " elif (present) & (source == 'lincs'):\n", - " data_dict_down[source] = lincs_drugs_down[1]\n", - " data_dict_up[source] = lincs_drugs_up[1]\n", " elif (present) & (source == 'novartis'):\n", " data_dict_down[source] = novartis_drugs_down[1]\n", " data_dict_up[source] = novartis_drugs_up[1]\n", @@ -886,17 +1014,23 @@ " data_dict_down[source] = tahoe_drugs_down[1]\n", " data_dict_up[source] = tahoe_drugs_up[1]\n", "\n", - "if in_ginkgo + in_lincs + in_novartis + in_tahoe > 1:\n", + "if in_ginkgo + in_novartis + in_tahoe > 1:\n", " overlapping_up_TargetRank = get_ranking_averages(overlap_up, data_dict_up, ranking_method)\n", - " overlapping_down_TargetRank = get_ranking_averages(overlap_down, data_dict_down, ranking_method)" + " overlapping_down_TargetRank = get_ranking_averages(overlap_down, data_dict_down, ranking_method)\n", + "{% endif %}" ] }, { - "cell_type": "markdown", - "id": "bc739111", + "cell_type": "code", + "execution_count": null, + "id": "073650d0", "metadata": {}, + "outputs": [], "source": [ - "The tables below show average logFC, adjusted p-value, raw rank and normalized rank values across datasets for drugs that were found to be significant regulators in more than one dataset." + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "The tables below show average logFC, adjusted p-value, raw rank and normalized rank values across datasets for drugs that were found to be significant regulators in more than one dataset.\n", + "{% endif %}" ] }, { @@ -906,7 +1040,9 @@ "metadata": {}, "outputs": [], "source": [ - "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n", + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "if in_ginkgo + in_novartis + in_tahoe < 2:\n", " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", "else:\n", " display_markdown(\"**Averages across datasets: Up-regulating drugs**\", raw=True)\n", @@ -914,17 +1050,23 @@ " display(HTML(download_link(overlapping_up_TargetRank, f'overlapping_drugs_averages_{query_gene}_UpReg.tsv')))\n", " display_markdown(\"**Averages across datasets: Down-regulating drugs**\", raw=True)\n", " display(overlapping_down_TargetRank.head(n=top_n))\n", - " display(HTML(download_link(overlapping_down_TargetRank, f'overlapping_drugs_averages_{query_gene}_DnReg.tsv')))" + " display(HTML(download_link(overlapping_down_TargetRank, f'overlapping_drugs_averages_{query_gene}_DnReg.tsv')))\n", + "{% endif %}" ] }, { - "cell_type": "markdown", - "id": "2aaee19a", + "cell_type": "code", + "execution_count": null, + "id": "74e6a8bc", "metadata": {}, + "outputs": [], "source": [ + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "### Protein Regulation\n", "\n", - "Query gene regulation at the protein level is dispalyed in the table below. Proteomics data is from the [Deepcover MoA dataset](https://wren.hms.harvard.edu/DeepCoverMOA/), which exposes cells from the HCT116 cancer cell line to 875 small molecule compounds. " + "Query gene regulation at the protein level is dispalyed in the table below. Proteomics data is from the [Deepcover MoA dataset](https://wren.hms.harvard.edu/DeepCoverMOA/), which exposes cells from the HCT116 cancer cell line to 875 small molecule compounds. \n", + "{% endif %}" ] }, { @@ -934,6 +1076,8 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "if in_deepcover:\n", " up_protein = protein_de[protein_de['logFC'] > 0].loc[:,['UniprotID','Drug','Pubchem','logFC','Zscore','UpRank','PctUpRank']].sort_values('logFC',ascending=False).reset_index().drop(columns='index')\n", " up_protein.rename(columns={'Pubchem':'PubChem CID','Zscore':'Z-score','UpRank':'Up Rank', 'PctUpRank':'Normalized Up Rank'}, inplace=True)\n", @@ -947,15 +1091,21 @@ " display(dn_protein.head(top_n))\n", " display(HTML(download_link(dn_protein, f'DeepcoverMoa_protein_{query_gene}_DnReg.tsv')))\n", "else:\n", - " display_markdown(f\"Protein of {query_gene} not in DeepCover MoA Dataset\", raw=True)" + " display_markdown(f\"Protein of {query_gene} not in DeepCover MoA Dataset\", raw=True)\n", + "{% endif %}" ] }, { - "cell_type": "markdown", - "id": "a0630a08", + "cell_type": "code", + "execution_count": null, + "id": "d8b97a35", "metadata": {}, + "outputs": [], "source": [ - "If the protein associated with the query gene was found in the Deepcover MoA proteomics dataset, the tables below show how the protein was up or down-regulated by the consensus drugs identified in the connectivity mapping resources. The table only includes compounds that were used in the connectivity mapping resources and the Deepcover MoA dataset. " + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "If the protein associated with the query gene was found in the Deepcover MoA proteomics dataset, the tables below show how the protein was up or down-regulated by the consensus drugs identified in the connectivity mapping resources. The table only includes compounds that were used in the connectivity mapping resources and the Deepcover MoA dataset. \n", + "{% endif %}" ] }, { @@ -965,6 +1115,8 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "if in_deepcover:\n", " up_with_cid = join_proteomics(overlapping_up_TargetRank, protein_de)\n", " dn_with_cid = join_proteomics(overlapping_down_TargetRank, protein_de)\n", @@ -977,17 +1129,25 @@ " display(dn_with_cid.head(n=top_n))\n", " display(HTML(download_link(dn_with_cid, f'{query_gene}_mRNA_protein_DnReg.tsv')))\n", "else:\n", - " display_markdown(f\"Protein of {query_gene} not in DeepCover MoA Dataset\", raw=True)" + " display_markdown(f\"Protein of {query_gene} not in DeepCover MoA Dataset\", raw=True)\n", + "\n", + "{% endif %}" ] }, { - "cell_type": "markdown", - "id": "2e7c13dd", + "cell_type": "code", + "execution_count": null, + "id": "9669c9b6", "metadata": {}, + "outputs": [], "source": [ + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "## Venn Diagrams\n", "\n", - "The venn diagrams show the pairwise overlap among either up-regulating or down-regulating drugs across the four Connectivity Mapping datasets Tahoe-100M, Novartis DRUG-seq, LINCS L1000, and Ginkgo (all cell types grouped). " + "The venn diagrams show the pairwise overlap among either up-regulating or down-regulating drugs across the four Connectivity Mapping datasets Tahoe-100M, Novartis DRUG-seq, and Ginkgo (all cell types grouped). \n", + "\n", + "{% endif %}" ] }, { @@ -997,6 +1157,8 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "# combine top up and down drugs across Ginkgo cell types\n", "if in_ginkgo:\n", " all_ginkgo_up = set()\n", @@ -1007,9 +1169,8 @@ " all_ginkgo_down = all_ginkgo_down.union(top_down[k])\n", "# define input data for venn diagrams\n", "data_source_present = {'ginkgo':in_ginkgo,\n", - " 'lincs_l1000':in_lincs,\n", - " 'novartis':in_novartis,\n", - " 'tahoe': in_tahoe}\n", + " 'novartis':in_novartis,\n", + " 'tahoe': in_tahoe}\n", "venn_up = {}\n", "venn_down = {}\n", "for source,present in data_source_present.items():\n", @@ -1018,7 +1179,9 @@ " venn_down[source] = all_ginkgo_down\n", " elif present:\n", " venn_up[source] = top_up[source]\n", - " venn_down[source] = top_down[source]" + " venn_down[source] = top_down[source]\n", + "\n", + "{% endif %}" ] }, { @@ -1028,14 +1191,17 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "\n", "def create_venn(venn_dict):\n", " '''\n", " Given a dictionary of sets of top up or down regulating genes,\n", " creates a 2 or 3 set venn diagram. \n", " '''\n", " if len(venn_dict) == 3:\n", - " venn3(subsets=(venn_dict['novartis'], venn_dict['lincs_l1000'], venn_dict['ginkgo']),\n", - " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));\n", + " venn3(subsets=(venn_dict['novartis'], venn_dict['tahoe'], venn_dict['ginkgo']),\n", + " set_labels=('Novartis', 'Tahoe', 'Ginkgo'));\n", " elif len(venn_dict) == 2:\n", " venn2(subsets = (venn_dict.values()),\n", " set_labels=(venn_dict.keys()));\n", @@ -1070,7 +1236,8 @@ " if len(overlap) == 0:\n", " overlap = ['None']\n", " # print(f\"{', '.join(datasets)}: {', '.join(overlap)}\")\n", - " return f\"{', '.join(datasets)}: {', '.join(overlap)}\"" + " return f\"{', '.join(datasets)}: {', '.join(overlap)}\"\n", + "{% endif %}" ] }, { @@ -1080,15 +1247,22 @@ "metadata": {}, "outputs": [], "source": [ - "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n", + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "if in_ginkgo + in_novartis + in_tahoe < 2:\n", " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", "else:\n", " display_markdown(f'Overlap of top {query_gene} up-regulating drugs across sources', raw=True)\n", " for combo in combinations(list(venn_up.keys()), 2):\n", " combo_venn = {k:venn_up[k] for k in combo if k in venn_up}\n", " create_venn(combo_venn)\n", - " plt.title(print_overlap(combo_venn))\n", - " plt.show()" + " title = print_overlap(combo_venn)\n", + " plt.title(title)\n", + " plt_name = f\"{title.split(':')[0].replace(',','_')}_up.png\"\n", + " save_figure(plt_name)\n", + " plt.show()\n", + " display(FileLink(plt_name, result_html_prefix='Download: '))\n", + "{% endif %}" ] }, { @@ -1098,15 +1272,189 @@ "metadata": {}, "outputs": [], "source": [ - "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n", + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "if in_ginkgo + in_novartis + in_tahoe < 2:\n", " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", "else:\n", " display_markdown(f'Overlap of top {query_gene} down-regulating drugs across sources', raw=True)\n", " for combo in combinations(list(venn_down.keys()), 2):\n", " combo_venn = {k:venn_down[k] for k in combo if k in venn_down}\n", " create_venn(combo_venn)\n", - " plt.title(print_overlap(combo_venn))\n", - " plt.show()" + " title = print_overlap(combo_venn)\n", + " plt.title(title)\n", + " plt_name = f\"{title.split(':')[0].replace(',','_')}_down.png\"\n", + " save_figure(plt_name)\n", + " plt.show()\n", + " display(FileLink(plt_name, result_html_prefix='Download: '))\n", + "{% endif %}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8b09e6ff", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Multi-gene' %}\n", + "## Top 20 Drug Plots\n", + "\n", + "The barplots show the multi-gene drug rank for the top 20 drugs for up and down regulation of the input geneset. The top drugs are identified by two sorting methods.\n", + "\n", + "1. Multi-gene rank: the average ranking for the drug acorss the genes in the input gene set\n", + "\n", + "2. Deviation rank: difference between multi-gene rank and the expected rank of the drug, determined by retrieving drug ranks for random input gene sets. \n", + "\n", + "\n", + "Plots are shown with and without filtering for drugs that regulate a majority of the inpute gene set in the specified direction. \n", + "\n", + "**Drug annotations (Y-axis):**\n", + "* \\*\\* FDA approved drug\n", + "* Red: Universal regulator, determined based on expected drug rank calculated from random input gene sets. \n", + "\n", + "{% endif %}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4d79365", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Multi-gene' %}\n", + "def visualize_top_drugs(direction, max_genes=10, df=None, rank_method='MultiGeneRank', flag_filter = False, filter_genes=False):\n", + " from math import ceil \n", + "\n", + " top_n_drugs = 20\n", + " \n", + " # Create a blue-white-red continuous colormap centered at max_genes / 2\n", + " cmap = sns.color_palette(\"coolwarm\", as_cmap=True)\n", + " norm = TwoSlopeNorm(vmin=1, vcenter=max_genes / 2, vmax=max_genes)\n", + "\n", + " # join with expected rank data\n", + " if direction == 'Down':\n", + " df = df.merge(null_ranks_dn[['DrugMatch','NullRank']], how='left', on='DrugMatch')\n", + " elif direction == 'Up':\n", + " df = df.merge(null_ranks_up[['DrugMatch','NullRank']], how='left', on='DrugMatch')\n", + " \n", + " df['RankDiff'] = df['NullRank']-df['MultiGeneRank']\n", + " \n", + " # remove flagged drugs\n", + " if flag_filter:\n", + " df = df[~df['DrugMatch'].isin(flagged_drugs['DrugMatch'].to_list())]\n", + " if filter_genes:\n", + " df = df[df['NGenes'] >= ceil(max_genes/2)]\n", + "\n", + " # select top n drugs by RankDiff\n", + " if rank_method == 'RankDiff':\n", + " method_title = 'Deviation Rank'\n", + " df = df.sort_values(['RankDiff','MultiGeneRank', 'NGenes'], ascending=[False,True,False])[:top_n_drugs]\n", + " # select top n drugs by MG Rank\n", + " elif rank_method == 'MultiGeneRank':\n", + " method_title = 'Multi-gene Rank'\n", + " df = df.sort_values(['MultiGeneRank', 'NGenes'], ascending=[True,False])[:top_n_drugs]\n", + " \n", + " # FDA status\n", + " fda_approved = df.merge(fda_approved_toxicity, how='left', left_on='DrugMatch',right_on='synonyms_lower')\n", + " fda_approved = list(fda_approved[fda_approved['isApproved']==True].DrugMatch.unique())\n", + " \n", + " # order for plotting\n", + " df = df.sort_values('MultiGeneRank', ascending=True)\n", + " fig, ax = plt.subplots(figsize = (6,8))\n", + " null_bar = sns.barplot(df,\n", + " y='DrugMatch',\n", + " x='NullRank',\n", + " color='darkgray',\n", + " ax=ax)\n", + " mg_bar = sns.barplot(df,\n", + " y='DrugMatch',\n", + " x='MultiGeneRank',\n", + " hue='NGenes',\n", + " palette=cmap,\n", + " hue_norm=norm,\n", + " ax=ax)\n", + " \n", + " plt.ylabel(None)\n", + " plt.xlabel('Multi-gene Drug Rank [0-1]')\n", + " plt.title(f'{method_title} Top {top_n_drugs} Drugs: {direction}')\n", + " # add legend\n", + " top_bar = mpatches.Patch(color='darkgray', label='Null Rank')\n", + " plt.legend(handles=[top_bar], loc='upper center', bbox_to_anchor=(0.5,-0.1))\n", + " # add colorbar\n", + " sm = plt.cm.ScalarMappable(cmap=cmap, norm=norm)\n", + " sm.set_array([]) # needed for fig.colorbar\n", + " cbar = fig.colorbar(sm, ax=ax, orientation='horizontal', fraction=0.03, location='bottom')\n", + " cbar.set_label(f'Number of input targets {direction}-regulated')\n", + " \n", + " # Update specified y-axis labels\n", + " star_set = set([str(x) for x in fda_approved])\n", + "\n", + " # Denote FDA approved drugs\n", + " modified_label = []\n", + " for label in ax.get_yticklabels():\n", + " txt = label.get_text()\n", + " if txt in star_set:\n", + " modified_label.append(f'**{txt}')\n", + " else:\n", + " modified_label.append(txt)\n", + " \n", + " # Redraw the modified labels\n", + " ax.set_yticklabels(modified_label)\n", + "\n", + " # Denotes flagged drugs\n", + " for label in ax.get_yticklabels():\n", + " txt = label.get_text()\n", + " if str.replace(txt,'**','') in flagged_drugs['DrugMatch'].to_list():\n", + " label.set_color('red')\n", + " else:\n", + " label.set_color('black')\n", + " gene_filter = 'majorityGenes' if filter_genes else '' \n", + " plt_name = f\"{direction}_{rank_method}_top{top_n_drugs}_{gene_filter}.png\"\n", + " save_figure(plt_name)\n", + " plt.show()\n", + " return plt_name\n", + " \n", + "# Up\n", + "display_markdown(\" ### Up-Regulating\", raw=True)\n", + "display_markdown(\" #### Multi-gene Rank\", raw=True)\n", + "display(HTML('

Filter for drugs that regulate at least half of the input genes

'))\n", + "file_name = visualize_top_drugs('Up', max_genes=len(query_geneset), df = multi_gene_drug_ranks['Up'], rank_method='MultiGeneRank', filter_genes=True)\n", + "display(FileLink(file_name, result_html_prefix='Download: '))\n", + "display(HTML('

No gene regulation filter

'))\n", + "file_name = visualize_top_drugs('Up', max_genes=len(query_geneset), df = multi_gene_drug_ranks['Up'], rank_method='MultiGeneRank', filter_genes=False)\n", + "display(FileLink(file_name, result_html_prefix='Download: '))\n", + "\n", + "display_markdown(\" #### Deviation Rank\", raw=True)\n", + "display(HTML('

Filter for drugs that regulate at least half of the input genes

'))\n", + "file_name = visualize_top_drugs('Up', max_genes=len(query_geneset), df = multi_gene_drug_ranks['Up'], rank_method='RankDiff', filter_genes=True)\n", + "display(FileLink(file_name, result_html_prefix='Download: '))\n", + "display(HTML('

No gene regulation filter

'))\n", + "file_name = visualize_top_drugs('Up', max_genes=len(query_geneset), df = multi_gene_drug_ranks['Up'], rank_method='RankDiff', filter_genes=False)\n", + "display(FileLink(file_name, result_html_prefix='Download: '))\n", + "\n", + "# Dn\n", + "display_markdown(\" ### Down-Regulating\", raw=True)\n", + "display_markdown(\" #### Multi-gene Rank\", raw=True)\n", + "display(HTML('

Filter for drugs that regulate at least half of the input genes

'))\n", + "file_name=visualize_top_drugs('Down', max_genes=len(query_geneset), df = multi_gene_drug_ranks['Dn'], rank_method='MultiGeneRank', filter_genes=True)\n", + "display(FileLink(file_name, result_html_prefix='Download: '))\n", + "display(HTML('

No gene regulation filter

'))\n", + "file_name=visualize_top_drugs('Down', max_genes=len(query_geneset), df = multi_gene_drug_ranks['Dn'], rank_method='MultiGeneRank', filter_genes=False)\n", + "display(FileLink(file_name, result_html_prefix='Download: '))\n", + "\n", + "display_markdown(\" #### Deviation Rank\", raw=True)\n", + "display(HTML('

Filter for drugs that regulate at least half of the input genes

'))\n", + "file_name=visualize_top_drugs('Down', max_genes=len(query_geneset), df = multi_gene_drug_ranks['Dn'], rank_method='RankDiff', filter_genes=True)\n", + "display(FileLink(file_name, result_html_prefix='Download: '))\n", + "display(HTML('

No gene regulation filter

'))\n", + "file_name=visualize_top_drugs('Down', max_genes=len(query_geneset), df = multi_gene_drug_ranks['Dn'], rank_method='RankDiff', filter_genes=False)\n", + "display(FileLink(file_name, result_html_prefix='Download: '))\n", + "\n", + "{% endif %}" ] }, { @@ -1118,11 +1466,19 @@ ] }, { - "cell_type": "markdown", - "id": "30eb397f", + "cell_type": "code", + "execution_count": null, + "id": "8f2814d8", "metadata": {}, + "outputs": [], "source": [ - "The volcano plots show the strength and statistical significance of the drug perturbation for each signature in the dataset (drug and dose specific). Color of points indicate up (red) or down (blue) regulation. Hover over points in the volcano plot to see the label (with cell line, drug, and dose information), logFC, fold-change, log10-transformed p-value, raw rank and normalized rank. Tools to the right of the plot allow you to manipulate (pan, zoom) and download the figure. " + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "The volcano plots show the strength and statistical significance of the drug perturbation for each signature in the dataset (drug and dose specific). Color of points indicate up (red) or down (blue) regulation. Hover over points in the volcano plot to see the label (with cell line, drug, and dose information), logFC, fold-change, log10-transformed p-value, raw rank and normalized rank. Tools to the right of the plot allow you to manipulate (pan, zoom) and download the figure. \n", + "\n", + "{% elif gene_input.raw_value == 'Multi-gene' %}\n", + "The volcano plots show the strength and statistical significance of the drug perturbation for each signature in the datasets. Color of points indicate the dataset associated with the signature. Hover over points in the volcano plot to see the perturbation information, logFC, fold-change, log10-transformed p-value, raw rank and normalized rank. Tools to the right of the plot allow you to manipulate (pan, zoom) and download the figure. \n", + "{% endif %}" ] }, { @@ -1132,8 +1488,18 @@ "metadata": {}, "outputs": [], "source": [ - "output_notebook()\n", - "\n", + "output_notebook()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fba19084", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "def create_bokeh_volcano_plot(expr_data:pd.DataFrame, gene_id:str, cell_type:str, source:str):\n", " '''\n", " Given the expression data for a given gene, create an interactive\n", @@ -1204,15 +1570,22 @@ " fill_alpha=0.6,\n", " color = {'field':'logFC','transform':color_mapper})\n", " p.add_tools(hover)\n", - " show(p)" + " show(p)\n", + " \n", + "{% endif %}" ] }, { - "cell_type": "markdown", - "id": "e1e29e34", + "cell_type": "code", + "execution_count": null, + "id": "e824946f", "metadata": {}, + "outputs": [], "source": [ - "### Ginkgo" + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "### Ginkgo\n", + "{% endif %}" ] }, { @@ -1222,41 +1595,28 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "if in_ginkgo:\n", " for cell_type, expr_df in ginkgo_gene_expr_dict.items():\n", " cell_name = ' '.join(re.sub('human_','',cell_type).split('_'))\n", - " create_bokeh_volcano_plot(expr_df, query_gene, cell_name, 'Ginkgo')\n", + " create_bokeh_volcano_plot(expr_df, query_gene, cell_name, 'Ginkgo') \n", "else:\n", - " display_markdown(f'**{query_gene}** not found in Ginkgo datasets', raw=True)" - ] - }, - { - "cell_type": "markdown", - "id": "3571fcfb", - "metadata": {}, - "source": [ - "### L1000" + " display_markdown(f'**{query_gene}** not found in Ginkgo datasets', raw=True)\n", + "{% endif %}" ] }, { "cell_type": "code", "execution_count": null, - "id": "36a1f0cc", + "id": "9dab5b97", "metadata": {}, "outputs": [], "source": [ - "if in_lincs:\n", - " create_bokeh_volcano_plot(lincs_de, query_gene, '', 'L1000')\n", - "else:\n", - " display_markdown(f'**{query_gene}** not found in LINCS L1000 dataset', raw=True)" - ] - }, - { - "cell_type": "markdown", - "id": "242d31af", - "metadata": {}, - "source": [ - "### Novartis DRUG-seq" + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "### Novartis DRUG-seq\n", + "{% endif %}" ] }, { @@ -1266,18 +1626,26 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "if in_novartis:\n", - " create_bokeh_volcano_plot(novartis_de, query_gene, '', 'Novartis')\n", + " create_bokeh_volcano_plot(novartis_de, query_gene, '', 'Novartis') \n", "else:\n", - " display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq dataset', raw=True)" + " display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq dataset', raw=True)\n", + "{% endif %}" ] }, { - "cell_type": "markdown", - "id": "7ada5a72", + "cell_type": "code", + "execution_count": null, + "id": "c39118cf", "metadata": {}, + "outputs": [], "source": [ - "### Tahoe-100M" + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "### Tahoe-100M\n", + "{% endif %}" ] }, { @@ -1287,20 +1655,30 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", + "\n", "if in_tahoe:\n", " create_bokeh_volcano_plot(tahoe_de, query_gene, '','Tahoe')\n", "else:\n", - " display_markdown(f'**{query_gene}** not found in Tahoe-100M dataset', raw=True)" + " display_markdown(f'**{query_gene}** not found in Tahoe-100M dataset', raw=True)\n", + "\n", + "{% endif %}" ] }, { - "cell_type": "markdown", - "id": "8c3df5f5", + "cell_type": "code", + "execution_count": null, + "id": "4d129fa4", "metadata": {}, + "outputs": [], "source": [ + "%%appyter markdown\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "### Deepcover MoA\n", "\n", - "The Deepcover MoA proteomics dataset consists of proteome fingerprints for 875 chemical perturbations. This volcano plot of protein expression shows the logFC on the x-axis and the absolute difference in standard deviations between the protein's logFC for a given compound and the protein's mean logFC across all compounds on the y-axis. " + "The Deepcover MoA proteomics dataset consists of proteome fingerprints for 875 chemical perturbations. This volcano plot of protein expression shows the logFC on the x-axis and the absolute difference in standard deviations between the protein's logFC for a given compound and the protein's mean logFC across all compounds on the y-axis. \n", + "{% endif %}" ] }, { @@ -1310,11 +1688,78 @@ "metadata": {}, "outputs": [], "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Single Gene' %}\n", "if in_deepcover:\n", " # check for multiple proteins\n", " uniprot_ids = list(protein_de['UniprotID'].unique())\n", " for uid in uniprot_ids:\n", - " create_bokeh_volcano_plot(protein_de[protein_de['UniprotID']==uid], query_gene, '', 'Deepcover MoA')" + " create_bokeh_volcano_plot(protein_de[protein_de['UniprotID']==uid], query_gene, '', 'Deepcover MoA')\n", + "{% endif %}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4acd6f0", + "metadata": {}, + "outputs": [], + "source": [ + "%%appyter code_exec\n", + "{% if gene_input.raw_value == 'Multi-gene' %}\n", + "\n", + "def create_bokeh_volcano_plot(expr_data:pd.DataFrame, gene_id:str):\n", + " '''\n", + " Given the expression data for a given gene, create an interactive\n", + " volcano plot that shows regulation of gene across all perturbations (drug, dosage, cell line).\n", + " '''\n", + " \n", + " df = expr_data.copy()\n", + " \n", + " # clean columns\n", + " df['FC'] = 2**df['logFC']\n", + "\n", + " # set plot source\n", + " \n", + " plot_source = ColumnDataSource(df.loc[:,['Perturbation','Drug','Dataset','logFC','FC','log10adj.P.Val','PctRank']])\n", + " x,y='logFC','log10adj.P.Val'\n", + " xlabel,ylabel = 'Log2(Fold Change)','-Log10(Adj. p-value)'\n", + " title = f'{gene_id} Regulation'\n", + " hover = HoverTool(tooltips=[(\"Perturbation\", \"@Perturbation\"),\n", + " (\"Drug\", \"@Drug\"),\n", + " (\"Dataset\", \"@Dataset\"),\n", + " (\"Log2(FC)\", \"@logFC\"),\n", + " (\"Fold Change\", \"@FC\"),\n", + " ('-Log10(Adj. p-value)',\"@{log10adj.P.Val}{0.00e}\"),\n", + " (\"Normalized Rank\", \"@PctRank\")])\n", + "\n", + " \n", + " \n", + " # define figure\n", + " p = figure(\n", + " title=title,\n", + " x_axis_label = xlabel,\n", + " y_axis_label = ylabel,\n", + " tools = 'pan,wheel_zoom,box_zoom,reset,save'\n", + " )\n", + "\n", + " # color mapper\n", + " color_mapper = CategoricalColorMapper(palette = Category10[3],\n", + " factors = ['ginkgo', 'tahoe', 'novartis'])\n", + " # plot\n", + " p.scatter(x=x,\n", + " y=y,\n", + " size=8,\n", + " source=plot_source,\n", + " fill_alpha=0.6,\n", + " color = {'field':'Dataset', 'transform': color_mapper})\n", + " p.add_tools(hover)\n", + " show(p)\n", + "\n", + "for g in query_geneset:\n", + " if g in all_genes:\n", + " create_bokeh_volcano_plot(all_genes[g], g)\n", + "{% endif %}" ] }, { @@ -1332,13 +1777,11 @@ "\n", "[4] Subramanian, Aravind, Rajiv Narayan, Steven M. Corsello, David D. Peck, Ted E. Natoli, Xiaodong Lu, Joshua Gould, et al. 2017. “A next Generation Connectivity Map: L1000 Platform and the First 1,000,000 Profiles.” Cell 171 (6): 1437-1452.e17.\n", "\n", - "[5] “LINCS L1000 Reverse Search.” n.d. Accessed September 5, 2025. https://lincs-reverse-search-dashboard.dev.maayanlab.cloud/.\n", - "\n", - "[6] Mitchell, Dylan C., Miljan Kuljanin, Jiaming Li, Jonathan G. Van Vranken, Nathan Bulloch, Devin K. Schweppe, Edward L. Huttlin, and Steven P. Gygi. 2023. “A Proteome-Wide Atlas of Drug Mechanism of Action.” Nature Biotechnology 41 (6): 845–57.\n", + "[5] Mitchell, Dylan C., Miljan Kuljanin, Jiaming Li, Jonathan G. Van Vranken, Nathan Bulloch, Devin K. Schweppe, Edward L. Huttlin, and Steven P. Gygi. 2023. “A Proteome-Wide Atlas of Drug Mechanism of Action.” Nature Biotechnology 41 (6): 845–57.\n", "\n", - "[7] Zhang, Jesse, Airol A. Ubas, Richard de Borja, Valentine Svensson, Nicole Thomas, Neha Thakar, Aidan Winters, et al. 2025. “Tahoe-100M: A Giga-Scale Single-Cell Perturbation Atlas for Context-Dependent Gene Function and Cellular Modeling.” bioRxiv. https://doi.org/10.1101/2025.02.20.639398.\n", + "[6] Zhang, Jesse, Airol A. Ubas, Richard de Borja, Valentine Svensson, Nicole Thomas, Neha Thakar, Aidan Winters, et al. 2025. “Tahoe-100M: A Giga-Scale Single-Cell Perturbation Atlas for Context-Dependent Gene Function and Cellular Modeling.” bioRxiv. https://doi.org/10.1101/2025.02.20.639398.\n", "\n", - "[8] Wang, Zichen, Edward He, Kevin Sani, Kathleen M. Jagodnik, Moshe C. Silverstein, and Avi Ma’ayan. 2019. “Drug Gene Budger (DGB): An Application for Ranking Drugs to Modulate a Specific Gene Based on Transcriptomic Signatures.” Bioinformatics (Oxford, England) 35 (7): 1247–48." + "[7] Wang, Zichen, Edward He, Kevin Sani, Kathleen M. Jagodnik, Moshe C. Silverstein, and Avi Ma’ayan. 2019. “Drug Gene Budger (DGB): An Application for Ranking Drugs to Modulate a Specific Gene Based on Transcriptomic Signatures.” Bioinformatics (Oxford, England) 35 (7): 1247–48." ] } ], diff --git a/appyters/Drug_Gene_Budger2/multi_gene_utils.py b/appyters/Drug_Gene_Budger2/multi_gene_utils.py new file mode 100644 index 00000000..4e21e41b --- /dev/null +++ b/appyters/Drug_Gene_Budger2/multi_gene_utils.py @@ -0,0 +1,169 @@ +import pandas as pd +import numpy as np +import random +from tqdm import tqdm +import warnings +import itertools +import os +import sys + +# utils file +import cmap_readers + +def fetch_cmap_data(input_set): + ''' + input_set: input gene list (iterable) + + output: dictionary of gene-specific dataframes, concatenated across all resources + ''' + ginkgo_cell_types = ['human_epithelial_melanocytes', 'human_dermal_fibroblast', 'human_aortic_smooth_muscle_cells', 'human_skeletal_muscle_myoblasts','A549'] + novartis_perturbs = {g:[] for g in input_set} + # lincs_perturbs = {g:[] for g in input_set} + ginkgo_perturbs = {g:[] for g in input_set} + + not_found = {} + for g in tqdm(input_set): + novartis_perturbs[g] = cmap_readers.prepare_novartis_data(g) + # lincs_perturbs[g] = cmap_readers.prepare_lincs_data(g) + ginkgo_perturbs[g] = cmap_readers.prepare_ginkgo_data_df(g, ginkgo_cell_types) + + tahoe_perturbs = cmap_readers.retrieve_tahoe_data(input_set) + # create single df for all resources + cols_to_keep = ['Gene','Perturbation','Drug','logFC','GeneDir','PctRank', 'log10adj.P.Val'] + dataset_names = ['tahoe','novartis','ginkgo'] #'lincs' + all_genes = {gene:None for gene in input_set} + for g in all_genes.keys(): + all_df = pd.DataFrame() + for i,pdict in enumerate([tahoe_perturbs, novartis_perturbs, ginkgo_perturbs]): #lincs_perturbs + if pdict[g] is None: + # print(f'{g} not found in {dataset_names[i]}') + if dataset_names[i] in not_found: + not_found[dataset_names[i]].append(g) + else: + not_found[dataset_names[i]] = [g] + continue + df = pdict[g][cols_to_keep] + df['Dataset'] = dataset_names[i] + if i == 0: + all_df = df + else: + all_df = pd.concat([all_df, df], ignore_index=True) + all_genes[g] = all_df + return all_genes, not_found + +def clean_drugname(d): + if d == None: + return d + else: + return d.casefold() + +def add_pubchem_col(df, cid_df): + ''' + df: single-gene dataframe + cid_df: pubchem CID dataframe + + output: dataframe with CID column + ''' + df['DrugMatch'] = df['Drug'].apply(clean_drugname) + df = df.merge(cid_df, on='DrugMatch', how='left') + df['DrugGroup'] = df['CID'].fillna(df['DrugMatch']) + df = df.drop(columns=['DrugMatch']) + return df + +def rank_drugs(df, direction, pubchem_ids): + ''' + df: CMAP dataframe + direction: up or down regulation + + output: dataframe ranked by up/down regulating signatures + ''' + tmp = df.copy() + # filter by direction + tmp = tmp[tmp['GeneDir']==direction] + # add drug matching column for downstream multi-gene analysis + tmp = add_pubchem_col(tmp, pubchem_ids) + # rank drugs by adjusted p-value + tmp = tmp.sort_values(['log10adj.P.Val','PctRank'], ascending=[False,True], ignore_index=True) + tmp['IndexRank'] = tmp.index + 1 + tmp['Rank'] = tmp['IndexRank'].rank(pct=True, ascending=True) + return tmp.sort_values('Rank', ascending=True) + +def agg_drug_rankings(df): + ''' + df: single gene dataframe with signature rankings for up or down regulation + + output: dataframe with signature rankings averaged for each drug + ''' + agg_rankings = df.groupby('DrugGroup').agg( + MeanRank=('Rank','mean'), + NSigs=('Perturbation','size'), + NResources = ('Dataset', 'nunique')) \ + .sort_values('MeanRank').reset_index() + + drug_match = df[['Drug','DrugGroup']].drop_duplicates(subset='DrugGroup', keep='first', ignore_index=True) + agg_rankings = agg_rankings.merge(drug_match, on='DrugGroup', how='left') + return agg_rankings + +def single_gene_rank(gene_cmap_dict, input_set, pubchem_ids): + ''' + gene_cmap_dict: dictionary from fetch_cmap + input_set: input gene set + pubchem_ids: dataframe of pubchem IDs + + Rank drugs for all genes in input set individually + + out: dictionary of drug ranks for individual genes + ''' + # prepare drug rank output object + drug_ranks = {g:{dir:None for dir in ['Up','Dn']} for g in input_set} + # genes + remove = [] + for g in drug_ranks.keys(): + # no CMAP resources had data for the gene + if gene_cmap_dict[g].empty: + remove.append(g) + continue + # direction + for dir in drug_ranks[g].keys(): + rank_df = rank_drugs(gene_cmap_dict[g], direction=dir, pubchem_ids=pubchem_ids) + drug_ranks[g][dir]= agg_drug_rankings(rank_df) + # remove genes for which there was no CMAP data + drug_ranks = {g:data for g,data in drug_ranks.items() if g not in remove} + return drug_ranks + +def combine_gene_results (agg_rank_dict, direction): + ''' + agg_rank_dict: dictionary with drug rankings for multiple genes + direction: 'Up' or 'Dn' regulation + + output: dataframe with drug rankings for given direction and method for multiple genes + ''' + results = {'gene':[], + 'DrugGroup':[], + 'Drug':[], + 'MeanRank':[]} + for g in agg_rank_dict.keys(): + results['gene'].extend([g] * agg_rank_dict[g][direction].shape[0]) + results['DrugGroup'].extend(agg_rank_dict[g][direction]['DrugGroup'].to_list()) + results['Drug'].extend(agg_rank_dict[g][direction]['Drug'].to_list()) + results['MeanRank'].extend(agg_rank_dict[g][direction]['MeanRank'].to_list()) + return pd.DataFrame(results) + +def get_multi_gene_ranks (agg_drug_ranks, pubchem_ids): + ''' + combined_df: dataframe from combine_gene_results + + output: dataframe of ranked drugs, scores averaged across genes + ''' + multi_gene_ranks = {direction: None for direction in ['Up','Dn']} + combined_gene_results = {direction: None for direction in ['Up','Dn']} + + for dir in ['Up','Dn']: + combined_df = combine_gene_results(agg_drug_ranks, dir) + mg_ranks = combined_df.groupby(['DrugGroup']).agg(MultiGeneRank = ('MeanRank','mean'), NGenes = ('gene', 'nunique')).reset_index().sort_values('MultiGeneRank', ascending=True) + mg_ranks = mg_ranks.merge(pubchem_ids, left_on='DrugGroup',right_on='CID',how='left').drop(columns='CID') + combined_df = combined_df.merge(pubchem_ids, left_on='DrugGroup',right_on='CID',how='left').drop(columns='CID') + multi_gene_ranks[dir] = mg_ranks + combined_gene_results[dir] = combined_df + + return combined_gene_results, multi_gene_ranks \ No newline at end of file diff --git a/appyters/Drug_Gene_Budger2/requirements.txt b/appyters/Drug_Gene_Budger2/requirements.txt index 32b67753..36e48940 100644 --- a/appyters/Drug_Gene_Budger2/requirements.txt +++ b/appyters/Drug_Gene_Budger2/requirements.txt @@ -2,9 +2,12 @@ appyter @ git+https://github.com/Maayanlab/appyter numpy pandas pyarrow +polars requests IPython upsetplot matplotlib matplotlib_venn -bokeh \ No newline at end of file +bokeh +seaborn +tqdm \ No newline at end of file diff --git a/appyters/Drug_Gene_Budger2/static/DR-logo.png b/appyters/Drug_Gene_Budger2/static/DR-logo.png new file mode 100644 index 00000000..34332e5f Binary files /dev/null and b/appyters/Drug_Gene_Budger2/static/DR-logo.png differ diff --git a/appyters/Drug_Gene_Budger2/static/DR_logo.png b/appyters/Drug_Gene_Budger2/static/DR_logo.png new file mode 100644 index 00000000..04dbd9ed Binary files /dev/null and b/appyters/Drug_Gene_Budger2/static/DR_logo.png differ diff --git a/appyters/Drug_Gene_Budger2/static/dgb_logo.png b/appyters/Drug_Gene_Budger2/static/dgb_logo.png deleted file mode 100644 index 794aff94..00000000 Binary files a/appyters/Drug_Gene_Budger2/static/dgb_logo.png and /dev/null differ diff --git a/appyters/Drug_Gene_Budger2/static/dgb_logo_small.png b/appyters/Drug_Gene_Budger2/static/dgb_logo_small.png deleted file mode 100644 index e7892f3a..00000000 Binary files a/appyters/Drug_Gene_Budger2/static/dgb_logo_small.png and /dev/null differ diff --git a/appyters/Drug_Gene_Budger2/templates/form.j2 b/appyters/Drug_Gene_Budger2/templates/form.j2 index b9eab39f..8a9e647a 100644 --- a/appyters/Drug_Gene_Budger2/templates/form.j2 +++ b/appyters/Drug_Gene_Budger2/templates/form.j2 @@ -18,8 +18,8 @@
-

Dr. Gene Budger 2 (DBG2)

-

Submit a human gene symbol and explore drugs that maximally increase or decrease the mRNA expression of the input gene based on data from a collection of Connectivity Mapping resources

+

DrugRanger

+

Submit a human gene symbol or gene set and explore drugs that maximally increase or decrease the mRNA expression of the input gene(s) based on data from a collection of Connectivity Mapping resources