From 4a4b7c352d62abd89e04e8d5aed48a206f968a44 Mon Sep 17 00:00:00 2001
From: Lily Taub <115661359+lilydtaub@users.noreply.github.com>
Date: Wed, 17 Sep 2025 18:31:08 -0400
Subject: [PATCH] handle gene inputs that are not present in all datasets
---
appyters/Drug_Gene_Budger2/appyter.json | 2 +-
.../drug_gene_budger2_appyter.ipynb | 383 +++++++++++-------
2 files changed, 231 insertions(+), 154 deletions(-)
diff --git a/appyters/Drug_Gene_Budger2/appyter.json b/appyters/Drug_Gene_Budger2/appyter.json
index e15e61b1..dc052fd3 100644
--- a/appyters/Drug_Gene_Budger2/appyter.json
+++ b/appyters/Drug_Gene_Budger2/appyter.json
@@ -2,7 +2,7 @@
"$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json",
"name": "Drug_Gene_Budger2",
"title": "Dr. Gene Budger (DGB) 2",
- "version": "0.0.5",
+ "version": "0.0.6",
"description": "An appyter that retrieves drugs that up-regulate and down-regulate a single input gene across Connectivity Mapping datasets",
"image": "dgb_logo.png",
"authors": [
diff --git a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
index 0357c85c..96403a16 100644
--- a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
+++ b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
@@ -115,11 +115,9 @@
"import pandas as pd\n",
"import numpy as np\n",
"import re\n",
+ "from itertools import combinations\n",
"import warnings\n",
"\n",
- "## HTTP Requests\n",
- "import requests\n",
- "\n",
"## Tables\n",
"from IPython.display import display, display_markdown, HTML\n",
"\n",
@@ -128,7 +126,7 @@
"from matplotlib import pyplot\n",
"\n",
"## Venn Diagram\n",
- "from matplotlib_venn import venn3\n",
+ "from matplotlib_venn import venn3, venn2\n",
"\n",
"## Volcano Plot\n",
"from bokeh.plotting import figure, show\n",
@@ -192,7 +190,6 @@
" in the Ginkgo dataset is a key and the value is the DE data\n",
" for the query gene for that cell type.\n",
" '''\n",
- " # get perturbations with given cell type\n",
" cell_type_results = {}\n",
" for k in cell_types:\n",
" subset = df[df['Perturbation'].str.contains(k)]\n",
@@ -225,6 +222,7 @@
" lincs_de = pd.read_feather(f'{lincs_URL}/{gene_file}')\n",
" # format p-values\n",
" lincs_de['log10adj.P.Val'] = lincs_de['adj.P.Val'].replace(0,1e-323).map(np.log10)*-1\n",
+ " # remove CRISPR KO perturbations\n",
" lincs_ko_perturbs = pd.read_csv('https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_ko_perturbs.txt', sep='\\t')\n",
" lincs_de = lincs_de[~lincs_de['Drug'].isin(lincs_ko_perturbs.cmap_name.to_list())]\n",
"except:\n",
@@ -258,11 +256,11 @@
"metadata": {},
"outputs": [],
"source": [
- "if in_lincs + in_novartis + in_ginkgo < 2:\n",
+ "if in_lincs + in_novartis + in_ginkgo < 1:\n",
" print(f\"LINCS: {in_lincs}\")\n",
" print(f\"Novartis: {in_novartis}\")\n",
" print(f\"Ginkgo: {in_ginkgo}\")\n",
- " raise Exception(\"Execution stopped, gene not found in at least 2 datasets\")"
+ " raise Exception(\"Execution stopped, gene not found in any datasets\")"
]
},
{
@@ -361,8 +359,11 @@
" top = {d.casefold() for d in set(rank_results.head(n).index)}\n",
" return top\n",
"\n",
- "# create download link for table results\n",
+ "\n",
"def download_link(df, fname, link_header='Download full results'):\n",
+ " '''\n",
+ " create download link for table results\n",
+ " '''\n",
" if df.shape[0] == 0: return ''\n",
" csv = df.to_csv(fname, sep='\\t', index=True)\n",
" link = f'
'\n",
@@ -402,23 +403,26 @@
"metadata": {},
"outputs": [],
"source": [
- "ginkgo_drugs_up = {}\n",
- "ginkgo_drugs_down = {}\n",
- "for cell_type, exprdf in ginkgo_gene_expr_dict.items():\n",
- " # rank by level of up-regulation\n",
- " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'up', ranking_method)\n",
- " ginkgo_drugs_up[cell_type] = (mean_ranks, full_ranks)\n",
- " display_markdown(f'**Top {top_n} up-regulators for {cell_type}**', raw=True)\n",
- " display(mean_ranks.head(top_n))\n",
- " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_UpReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n",
- " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_UpRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n",
- " # rank by level of down-regulation\n",
- " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'down', ranking_method)\n",
- " ginkgo_drugs_down[cell_type] = (mean_ranks, full_ranks)\n",
- " display_markdown(f'**Top {top_n} down-regulators for {cell_type}**', raw=True)\n",
- " display(mean_ranks.head(top_n))\n",
- " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_DnReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n",
- " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_DnRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n"
+ "if in_ginkgo:\n",
+ " ginkgo_drugs_up = {}\n",
+ " ginkgo_drugs_down = {}\n",
+ " for cell_type, exprdf in ginkgo_gene_expr_dict.items():\n",
+ " # rank by level of up-regulation\n",
+ " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'up', ranking_method)\n",
+ " ginkgo_drugs_up[cell_type] = (mean_ranks, full_ranks)\n",
+ " display_markdown(f'**Top {top_n} up-regulators for {cell_type}**', raw=True)\n",
+ " display(mean_ranks.head(top_n))\n",
+ " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_UpReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n",
+ " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_UpRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n",
+ " # rank by level of down-regulation\n",
+ " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'down', ranking_method)\n",
+ " ginkgo_drugs_down[cell_type] = (mean_ranks, full_ranks)\n",
+ " display_markdown(f'**Top {top_n} down-regulators for {cell_type}**', raw=True)\n",
+ " display(mean_ranks.head(top_n))\n",
+ " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_DnReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n",
+ " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_DnRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n",
+ "else:\n",
+ " display_markdown(f'**{query_gene}** not found in Ginkgo datasets', raw=True)"
]
},
{
@@ -444,16 +448,19 @@
"metadata": {},
"outputs": [],
"source": [
- "lincs_drugs_up = get_rankings(lincs_de, 'LINCS', '', 'up', ranking_method)\n",
- "lincs_drugs_down = get_rankings(lincs_de, 'LINCS', '', 'down', ranking_method)\n",
- "display_markdown(f'**Top {top_n} up-regulators in L1000**', raw=True)\n",
- "display(lincs_drugs_up[0].head(top_n))\n",
- "display(HTML(download_link(lincs_drugs_up[0], f\"l1000_drug_ranks_{query_gene}_UpReg.tsv\", 'Download results averaged across drug dosages')))\n",
- "display(HTML(download_link(lincs_drugs_up[1], f\"l1000_drug_ranks_{query_gene}_full_UpReg.tsv\", 'Download results for all perturbations')))\n",
- "display_markdown(f'**Top {top_n} down-regulators in L1000**', raw=True)\n",
- "display(lincs_drugs_down[0].head(top_n))\n",
- "display(HTML(download_link(lincs_drugs_down[0], f\"l1000_drug_ranks_{query_gene}_DnReg.tsv\", 'Download results averaged across drug dosages')))\n",
- "display(HTML(download_link(lincs_drugs_down[1], f\"l1000_drug_ranks_{query_gene}_full_DnReg.tsv\", 'Download results for all perturbations')))"
+ "if in_lincs:\n",
+ " lincs_drugs_up = get_rankings(lincs_de, 'LINCS', '', 'up', ranking_method)\n",
+ " lincs_drugs_down = get_rankings(lincs_de, 'LINCS', '', 'down', ranking_method)\n",
+ " display_markdown(f'**Top {top_n} up-regulators in L1000**', raw=True)\n",
+ " display(lincs_drugs_up[0].head(top_n))\n",
+ " display(HTML(download_link(lincs_drugs_up[0], f\"l1000_drug_ranks_{query_gene}_UpReg.tsv\", 'Download results averaged across drug dosages')))\n",
+ " display(HTML(download_link(lincs_drugs_up[1], f\"l1000_drug_ranks_{query_gene}_full_UpReg.tsv\", 'Download results for all perturbations')))\n",
+ " display_markdown(f'**Top {top_n} down-regulators in L1000**', raw=True)\n",
+ " display(lincs_drugs_down[0].head(top_n))\n",
+ " display(HTML(download_link(lincs_drugs_down[0], f\"l1000_drug_ranks_{query_gene}_DnReg.tsv\", 'Download results averaged across drug dosages')))\n",
+ " display(HTML(download_link(lincs_drugs_down[1], f\"l1000_drug_ranks_{query_gene}_full_DnReg.tsv\", 'Download results for all perturbations')))\n",
+ "else: \n",
+ " display_markdown(f'**{query_gene}** not found in LINCS L1000 dataset', raw=True)"
]
},
{
@@ -479,17 +486,20 @@
"metadata": {},
"outputs": [],
"source": [
- "novartis_drugs_up = get_rankings(novartis_de, 'Novartis', '', 'up', ranking_method)\n",
- "novartis_drugs_down = get_rankings(novartis_de, 'Novartis', '', 'down', ranking_method)\n",
+ "if in_novartis:\n",
+ " novartis_drugs_up = get_rankings(novartis_de, 'Novartis', '', 'up', ranking_method)\n",
+ " novartis_drugs_down = get_rankings(novartis_de, 'Novartis', '', 'down', ranking_method)\n",
"\n",
- "display_markdown(f'**Top {top_n} up-regulators in Novartis DRUG-seq**', raw=True)\n",
- "display(novartis_drugs_up[0].head(top_n))\n",
- "display(HTML(download_link(novartis_drugs_up[0], f'novartis_drug_ranks_{query_gene}_UpReg.tsv', 'Download results averaged across drug dosages')))\n",
- "display(HTML(download_link(novartis_drugs_up[1], f'novartis_drug_ranks_{query_gene}_full_UpReg.tsv', 'Download results for all perturbations')))\n",
- "display_markdown(f'**Top {top_n} down-regulators in Novartis DRUG-seq**', raw=True)\n",
- "display(novartis_drugs_down[0].head(top_n))\n",
- "display(HTML(download_link(novartis_drugs_down[0], f'novartis_drug_ranks_{query_gene}_DnReg.tsv', 'Download results averaged across drug dosages')))\n",
- "display(HTML(download_link(novartis_drugs_down[1], f'novartis_drug_ranks_{query_gene}_full_DnReg.tsv', 'Download results for all perturbations')))"
+ " display_markdown(f'**Top {top_n} up-regulators in Novartis DRUG-seq**', raw=True)\n",
+ " display(novartis_drugs_up[0].head(top_n))\n",
+ " display(HTML(download_link(novartis_drugs_up[0], f'novartis_drug_ranks_{query_gene}_UpReg.tsv', 'Download results averaged across drug dosages')))\n",
+ " display(HTML(download_link(novartis_drugs_up[1], f'novartis_drug_ranks_{query_gene}_full_UpReg.tsv', 'Download results for all perturbations')))\n",
+ " display_markdown(f'**Top {top_n} down-regulators in Novartis DRUG-seq**', raw=True)\n",
+ " display(novartis_drugs_down[0].head(top_n))\n",
+ " display(HTML(download_link(novartis_drugs_down[0], f'novartis_drug_ranks_{query_gene}_DnReg.tsv', 'Download results averaged across drug dosages')))\n",
+ " display(HTML(download_link(novartis_drugs_down[1], f'novartis_drug_ranks_{query_gene}_full_DnReg.tsv', 'Download results for all perturbations')))\n",
+ "else:\n",
+ " display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq', raw=True)"
]
},
{
@@ -502,15 +512,18 @@
"top_up = {}\n",
"top_down = {}\n",
"# get results from Ginkgo\n",
- "for cell_type in ginkgo_drugs_down.keys():\n",
- " top_up[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_up[cell_type][0], n=50)\n",
- " top_down[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_down[cell_type][0], n=50)\n",
+ "if in_ginkgo:\n",
+ " for cell_type in ginkgo_drugs_down.keys():\n",
+ " top_up[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_up[cell_type][0], n=50)\n",
+ " top_down[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_down[cell_type][0], n=50)\n",
"# get results from L1000\n",
- "top_up['lincs_l1000'] = {drug.casefold() for drug in set(lincs_drugs_up[0].index)}\n",
- "top_down['lincs_l1000'] = {drug.casefold() for drug in set(lincs_drugs_down[0].index)}\n",
+ "if in_lincs:\n",
+ " top_up['lincs_l1000'] = get_top(lincs_drugs_up[0], n=50)\n",
+ " top_down['lincs_l1000'] = get_top(lincs_drugs_down[0], n=50)\n",
"# get results from novartis\n",
- "top_up['novartis'] = get_top(novartis_drugs_up[0], n=50)\n",
- "top_down['novartis'] = get_top(novartis_drugs_down[0], n=50)"
+ "if in_novartis:\n",
+ " top_up['novartis'] = get_top(novartis_drugs_up[0], n=50)\n",
+ " top_down['novartis'] = get_top(novartis_drugs_down[0], n=50)"
]
},
{
@@ -552,7 +565,11 @@
"metadata": {},
"outputs": [],
"source": [
- "def create_upset(top_sets: dict, ranking_method:str):\n",
+ "def create_upset(top_sets: dict):\n",
+ " '''\n",
+ " Given a dictionary to top up or down regulating genes,\n",
+ " creates an Upset plot. \n",
+ " '''\n",
" rename_keys = {\n",
" 'ginkgo_A549': 'ginkgo_A549',\n",
" 'lincs_l1000': 'lincs_l1000',\n",
@@ -575,19 +592,14 @@
"metadata": {},
"outputs": [],
"source": [
- "display_markdown(f\"**Overlap among top up regulators of {query_gene}**\", raw=True)\n",
- "create_upset(top_up, ranking_method)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "0ecff74c",
- "metadata": {},
- "outputs": [],
- "source": [
- "display_markdown(f\"**Overlap among top down regulators of {query_gene}**\", raw=True)\n",
- "create_upset(top_down, ranking_method)"
+ "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+ " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
+ "else:\n",
+ " display_markdown(f\"**Overlap among top up regulators of {query_gene}**\", raw=True)\n",
+ " create_upset(top_up)\n",
+ "\n",
+ " display_markdown(f\"**Overlap among top down regulators of {query_gene}**\", raw=True)\n",
+ " create_upset(top_down)"
]
},
{
@@ -597,7 +609,7 @@
"metadata": {},
"outputs": [],
"source": [
- "def get_overlapping_sets(top_sets:dict, to_file:str):\n",
+ "def get_overlapping_sets(top_sets:dict):\n",
" '''\n",
" Given the dictionary of sets used to created the UpSet plot,\n",
" return the contents of the overlapping sets. \n",
@@ -649,14 +661,17 @@
"metadata": {},
"outputs": [],
"source": [
- "overlap_down = get_overlapping_sets(top_down, 'overlap_down')\n",
- "overlap_up = get_overlapping_sets(top_up, 'overlap_up')\n",
- "display_markdown(\"**Down-regulating drug overlap**\", raw=True)\n",
- "display(overlap_down)\n",
- "display(HTML(download_link(overlap_down, f'overlapping_drugs_{query_gene}_DnReg.tsv')))\n",
- "display_markdown(\"**Up-regulating drug overlap**\", raw=True)\n",
- "display(overlap_up)\n",
- "display(HTML(download_link(overlap_up, f'overlapping_drugs_{query_gene}_UpReg.tsv')))\n"
+ "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+ " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
+ "else:\n",
+ " overlap_down = get_overlapping_sets(top_down)\n",
+ " overlap_up = get_overlapping_sets(top_up)\n",
+ " display_markdown(\"**Down-regulating drug overlap**\", raw=True)\n",
+ " display(overlap_down)\n",
+ " display(HTML(download_link(overlap_down, f'overlapping_drugs_{query_gene}_DnReg.tsv')))\n",
+ " display_markdown(\"**Up-regulating drug overlap**\", raw=True)\n",
+ " display(overlap_up)\n",
+ " display(HTML(download_link(overlap_up, f'overlapping_drugs_{query_gene}_UpReg.tsv')))\n"
]
},
{
@@ -666,7 +681,7 @@
"metadata": {},
"outputs": [],
"source": [
- "def get_ranking_averages(overlapping_df, direction, ranking_method):\n",
+ "def get_ranking_averages(overlapping_df, data_dict, ranking_method):\n",
" '''\n",
" Retrieve average target ranking across datasets for drugs in overlapping sets. \n",
"\n",
@@ -675,27 +690,6 @@
" Average Rank\n",
" Number of datasets for which drug was a significant regulator of the query gene\n",
" '''\n",
- " # extract up or down data\n",
- " if direction == 'down':\n",
- " data_dict ={\n",
- " 'ginkgo_A549': ginkgo_drugs_down['A549'][1],\n",
- " 'ginkgo_human_dermal_fibroblast': ginkgo_drugs_down['human_dermal_fibroblast'][1],\n",
- " 'ginkgo_human_aortic_smooth_muscle_cells': ginkgo_drugs_down['human_aortic_smooth_muscle_cells'][1],\n",
- " 'ginkgo_human_epithelial_melanocytes':ginkgo_drugs_down['human_epithelial_melanocytes'][1],\n",
- " 'ginkgo_human_skeletal_muscle_myoblasts':ginkgo_drugs_down['human_skeletal_muscle_myoblasts'][1],\n",
- " 'novartis': novartis_drugs_down[1],\n",
- " 'lincs': lincs_drugs_down[1]\n",
- " }\n",
- " elif direction == 'up':\n",
- " data_dict ={\n",
- " 'ginkgo_A549': ginkgo_drugs_up['A549'][1],\n",
- " 'ginkgo_human_dermal_fibroblast': ginkgo_drugs_up['human_dermal_fibroblast'][1],\n",
- " 'ginkgo_human_aortic_smooth_muscle_cells': ginkgo_drugs_up['human_aortic_smooth_muscle_cells'][1],\n",
- " 'ginkgo_human_epithelial_melanocytes':ginkgo_drugs_up['human_epithelial_melanocytes'][1],\n",
- " 'ginkgo_human_skeletal_muscle_myoblasts':ginkgo_drugs_up['human_skeletal_muscle_myoblasts'][1],\n",
- " 'novartis': novartis_drugs_up[1],\n",
- " 'lincs': lincs_drugs_up[1]\n",
- " }\n",
" # get average, integrating across datasets\n",
" average_rank_vals = {}\n",
" average_pctrank_vals = {}\n",
@@ -710,7 +704,7 @@
" runsum_pctrank = 0\n",
" runsum_logFC = 0\n",
" runsum_pval = 0\n",
- " for k,df in data_dict.items():\n",
+ " for _,df in data_dict.items():\n",
" subset = df[df['Drug'].str.lower() == d.lower()]\n",
" n = n + subset.shape[0]\n",
" runsum_rank = runsum_rank + subset.Rank.sum()\n",
@@ -736,10 +730,40 @@
" else:\n",
" # sort based on N datasets and average adjusted p-value\n",
" res_df = res_df.sort_values(['N Datasets','Avg Adj.P.Val'], ascending=[False,True])\n",
- " return res_df\n",
- " \n",
- "overlapping_up_TargetRank = get_ranking_averages(overlap_up, 'up', ranking_method)\n",
- "overlapping_down_TargetRank = get_ranking_averages(overlap_down, 'down', ranking_method)"
+ " return res_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9cc1ea15",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# define input data for get_ranking_averages\n",
+ "data_source_present = {'A549': in_ginkgo, \n",
+ " 'human_dermal_fibroblast':in_ginkgo,\n",
+ " 'human_aortic_smooth_muscle_cells':in_ginkgo,\n",
+ " 'human_epithelial_melanocytes': in_ginkgo,\n",
+ " 'human_skeletal_muscle_myoblasts': in_ginkgo,\n",
+ " 'novartis': in_novartis,\n",
+ " 'lincs': in_lincs}\n",
+ "data_dict_down ={}\n",
+ "data_dict_up = {}\n",
+ "for source,present in data_source_present.items():\n",
+ " if (present) & (not source in ['novartis','lincs']):\n",
+ " data_dict_down[source] = ginkgo_drugs_down[source][1]\n",
+ " data_dict_up[source] = ginkgo_drugs_up[source][1]\n",
+ " elif (present) & (source == 'lincs'):\n",
+ " data_dict_down[source] = lincs_drugs_down[1]\n",
+ " data_dict_up[source] = lincs_drugs_up[1]\n",
+ " elif (present) & (source == 'novartis'):\n",
+ " data_dict_down[source] = novartis_drugs_down[1]\n",
+ " data_dict_up[source] = novartis_drugs_up[1]\n",
+ "\n",
+ "if in_ginkgo + in_lincs + in_novartis > 1:\n",
+ " overlapping_up_TargetRank = get_ranking_averages(overlap_up, data_dict_up, ranking_method)\n",
+ " overlapping_down_TargetRank = get_ranking_averages(overlap_down, data_dict_down, ranking_method)"
]
},
{
@@ -757,12 +781,15 @@
"metadata": {},
"outputs": [],
"source": [
- "display_markdown(\"**Averages across datasets: Up-regulating drugs**\", raw=True)\n",
- "display(overlapping_up_TargetRank.head(n=top_n))\n",
- "display(HTML(download_link(overlapping_up_TargetRank, f'overlapping_drugs_averages_{query_gene}_UpReg.tsv')))\n",
- "display_markdown(\"**Averages across datasets: Down-regulating drugs**\", raw=True)\n",
- "display(overlapping_down_TargetRank.head(n=top_n))\n",
- "display(HTML(download_link(overlapping_down_TargetRank, f'overlapping_drugs_averages_{query_gene}_DnReg.tsv')))"
+ "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+ " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
+ "else:\n",
+ " display_markdown(\"**Averages across datasets: Up-regulating drugs**\", raw=True)\n",
+ " display(overlapping_up_TargetRank.head(n=top_n))\n",
+ " display(HTML(download_link(overlapping_up_TargetRank, f'overlapping_drugs_averages_{query_gene}_UpReg.tsv')))\n",
+ " display_markdown(\"**Averages across datasets: Down-regulating drugs**\", raw=True)\n",
+ " display(overlapping_down_TargetRank.head(n=top_n))\n",
+ " display(HTML(download_link(overlapping_down_TargetRank, f'overlapping_drugs_averages_{query_gene}_DnReg.tsv')))"
]
},
{
@@ -783,38 +810,77 @@
"outputs": [],
"source": [
"# combine top up and down drugs across Ginkgo cell types\n",
- "all_ginkgo_up = set()\n",
- "all_ginkgo_down = set()\n",
- "for k,v in top_up.items():\n",
- " if re.search('ginkgo',k):\n",
- " all_ginkgo_up = all_ginkgo_up.union(v)\n",
- " all_ginkgo_down = all_ginkgo_down.union(top_down[k])\n",
- "\n",
- "venn_up = {\n",
- " 'novartis': top_up['novartis'],\n",
- " 'lincs' : top_up['lincs_l1000'],\n",
- " 'ginkgo' : all_ginkgo_up\n",
- "}\n",
- "venn_down = {\n",
- " 'novartis': top_down['novartis'],\n",
- " 'lincs' : top_down['lincs_l1000'],\n",
- " 'ginkgo' : all_ginkgo_down\n",
- "}\n",
+ "if in_ginkgo:\n",
+ " all_ginkgo_up = set()\n",
+ " all_ginkgo_down = set()\n",
+ " for k,v in top_up.items():\n",
+ " if re.search('ginkgo',k):\n",
+ " all_ginkgo_up = all_ginkgo_up.union(v)\n",
+ " all_ginkgo_down = all_ginkgo_down.union(top_down[k])\n",
+ "# define input data for venn diagrams\n",
+ "data_source_present = {'ginkgo':in_ginkgo,\n",
+ " 'lincs_l1000':in_lincs,\n",
+ " 'novartis':in_novartis}\n",
+ "venn_up = {}\n",
+ "venn_down = {}\n",
+ "for source,present in data_source_present.items():\n",
+ " if (present) & (source == 'ginkgo'):\n",
+ " venn_up[source] = all_ginkgo_up\n",
+ " venn_down[source] = all_ginkgo_down\n",
+ " elif present:\n",
+ " venn_up[source] = top_up[source]\n",
+ " venn_down[source] = top_down[source]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "108bbb28",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_venn(venn_dict):\n",
+ " '''\n",
+ " Given a dictionary of sets of top up or down regulating genes,\n",
+ " creates a 2 or 3 set venn diagram. \n",
+ " '''\n",
+ " if len(venn_dict) == 3:\n",
+ " venn3(subsets=(venn_dict['novartis'], venn_dict['lincs_l1000'], venn_dict['ginkgo']),\n",
+ " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));\n",
+ " elif len(venn_dict) == 2:\n",
+ " venn2(subsets = (venn_dict.values()),\n",
+ " set_labels=(venn_dict.keys()));\n",
+ "\n",
"\n",
"def print_overlap(venn_dict):\n",
- " gnl = venn_dict['ginkgo'].intersection(venn_dict['novartis']).intersection(venn_dict['lincs'])\n",
- " gn = venn_dict['ginkgo'].intersection(venn_dict['novartis']).difference(gnl)\n",
- " gl = venn_dict['ginkgo'].intersection(venn_dict['lincs']).difference(gnl)\n",
- " nl = venn_dict['novartis'].intersection(venn_dict['lincs']).difference(gnl)\n",
- " \n",
- " if len(gn) > 0:\n",
- " display_markdown(f'Novartis and Ginkgo: {[d for d in gn]}', raw=True)\n",
- " if len(nl) > 0:\n",
- " display_markdown(f'Novartis and LINCS L1000: {[d for d in nl]}', raw=True)\n",
- " if len(gl) > 0:\n",
- " display_markdown(f'LINCS L1000 and Ginkgo: {[d for d in gl]}', raw=True)\n",
- " if len(gnl) > 0:\n",
- " display_markdown(f'LINCS L1000, Ginkgo and Novartis: {[d for d in gnl]}', raw=True)"
+ " '''\n",
+ " Given a dictionary of sets of top up or down regulating genes,\n",
+ " prints the unique overlap among all combinations of sets (>= 2). \n",
+ " '''\n",
+ " results = {}\n",
+ " datasets = list(venn_dict.keys())\n",
+ " # Step 1: get all intersections\n",
+ " for r in range(2, len(datasets)+1):\n",
+ " for combo in combinations(datasets, r):\n",
+ " drug_sets = [venn_dict[d] for d in combo]\n",
+ " results[combo] = list(set.intersection(*drug_sets))\n",
+ " # Step 2: subtract intersections of all super sets \n",
+ " for combo, base_set in results.items():\n",
+ " unique_set = set(base_set)\n",
+ " # iterate through all sets from larger combinations\n",
+ " for r in range(len(combo) + 1, len(datasets) + 1):\n",
+ " for sup in combinations(datasets, r):\n",
+ " if set(combo).issubset(sup): # only consider supersets\n",
+ " super_set = set(results[sup])\n",
+ " # get difference between super set and current set, this\n",
+ " # is the new intersection for the combination\n",
+ " new_unique_set = unique_set - super_set\n",
+ " results[combo] = list(new_unique_set)\n",
+ " # Step 3: print results\n",
+ " for datasets, overlap in results.items():\n",
+ " if len(overlap) == 0:\n",
+ " overlap = ['None']\n",
+ " print(f\"{', '.join(datasets)}: {', '.join(overlap)}\")"
]
},
{
@@ -824,11 +890,12 @@
"metadata": {},
"outputs": [],
"source": [
- "display_markdown(f'Overlap of top {query_gene} up-regulating drugs across sources', raw=True)\n",
- "\n",
- "venn3(subsets=(venn_up['novartis'], venn_up['lincs'], venn_up['ginkgo']),\n",
- " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));\n",
- "print_overlap(venn_up)"
+ "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+ " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
+ "else:\n",
+ " display_markdown(f'Overlap of top {query_gene} up-regulating drugs across sources', raw=True)\n",
+ " create_venn(venn_up)\n",
+ " print_overlap(venn_up)"
]
},
{
@@ -838,11 +905,12 @@
"metadata": {},
"outputs": [],
"source": [
- "display_markdown(f'Overlap of top {query_gene} down-regulating drugs across sources', raw=True)\n",
- "\n",
- "venn3(subsets=(venn_down['novartis'], venn_down['lincs'], venn_down['ginkgo']),\n",
- " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));\n",
- "print_overlap(venn_down)"
+ "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+ " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
+ "else:\n",
+ " display_markdown(f'Overlap of top {query_gene} down-regulating drugs across sources', raw=True)\n",
+ " create_venn(venn_down)\n",
+ " print_overlap(venn_down)"
]
},
{
@@ -935,9 +1003,12 @@
"metadata": {},
"outputs": [],
"source": [
- "for cell_type, expr_df in ginkgo_gene_expr_dict.items():\n",
- " cell_name = ' '.join(re.sub('human_','',cell_type).split('_'))\n",
- " create_bokeh_volcano_plot(expr_df, query_gene, cell_name, 'Ginkgo')"
+ "if in_ginkgo:\n",
+ " for cell_type, expr_df in ginkgo_gene_expr_dict.items():\n",
+ " cell_name = ' '.join(re.sub('human_','',cell_type).split('_'))\n",
+ " create_bokeh_volcano_plot(expr_df, query_gene, cell_name, 'Ginkgo')\n",
+ "else:\n",
+ " display_markdown(f'**{query_gene}** not found in Ginkgo datasets', raw=True)"
]
},
{
@@ -955,7 +1026,10 @@
"metadata": {},
"outputs": [],
"source": [
- "create_bokeh_volcano_plot(lincs_de, query_gene, '', 'L1000')"
+ "if in_lincs:\n",
+ " create_bokeh_volcano_plot(lincs_de, query_gene, '', 'L1000')\n",
+ "else:\n",
+ " display_markdown(f'**{query_gene}** not found in LINCS L1000 dataset', raw=True)"
]
},
{
@@ -973,7 +1047,10 @@
"metadata": {},
"outputs": [],
"source": [
- "create_bokeh_volcano_plot(novartis_de, query_gene, '', 'Novartis')"
+ "if in_novartis:\n",
+ " create_bokeh_volcano_plot(novartis_de, query_gene, '', 'Novartis')\n",
+ "else:\n",
+ " display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq dataset', raw=True)"
]
},
{