From 4a4b7c352d62abd89e04e8d5aed48a206f968a44 Mon Sep 17 00:00:00 2001 From: Lily Taub <115661359+lilydtaub@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:31:08 -0400 Subject: [PATCH] handle gene inputs that are not present in all datasets --- appyters/Drug_Gene_Budger2/appyter.json | 2 +- .../drug_gene_budger2_appyter.ipynb | 383 +++++++++++------- 2 files changed, 231 insertions(+), 154 deletions(-) diff --git a/appyters/Drug_Gene_Budger2/appyter.json b/appyters/Drug_Gene_Budger2/appyter.json index e15e61b1..dc052fd3 100644 --- a/appyters/Drug_Gene_Budger2/appyter.json +++ b/appyters/Drug_Gene_Budger2/appyter.json @@ -2,7 +2,7 @@ "$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json", "name": "Drug_Gene_Budger2", "title": "Dr. Gene Budger (DGB) 2", - "version": "0.0.5", + "version": "0.0.6", "description": "An appyter that retrieves drugs that up-regulate and down-regulate a single input gene across Connectivity Mapping datasets", "image": "dgb_logo.png", "authors": [ diff --git a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb index 0357c85c..96403a16 100644 --- a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb +++ b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb @@ -115,11 +115,9 @@ "import pandas as pd\n", "import numpy as np\n", "import re\n", + "from itertools import combinations\n", "import warnings\n", "\n", - "## HTTP Requests\n", - "import requests\n", - "\n", "## Tables\n", "from IPython.display import display, display_markdown, HTML\n", "\n", @@ -128,7 +126,7 @@ "from matplotlib import pyplot\n", "\n", "## Venn Diagram\n", - "from matplotlib_venn import venn3\n", + "from matplotlib_venn import venn3, venn2\n", "\n", "## Volcano Plot\n", "from bokeh.plotting import figure, show\n", @@ -192,7 +190,6 @@ " in the Ginkgo dataset is a key and the value is the DE data\n", " for the query gene for that cell type.\n", " '''\n", - " # get perturbations with given cell type\n", " cell_type_results = {}\n", " for k in cell_types:\n", " subset = df[df['Perturbation'].str.contains(k)]\n", @@ -225,6 +222,7 @@ " lincs_de = pd.read_feather(f'{lincs_URL}/{gene_file}')\n", " # format p-values\n", " lincs_de['log10adj.P.Val'] = lincs_de['adj.P.Val'].replace(0,1e-323).map(np.log10)*-1\n", + " # remove CRISPR KO perturbations\n", " lincs_ko_perturbs = pd.read_csv('https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_ko_perturbs.txt', sep='\\t')\n", " lincs_de = lincs_de[~lincs_de['Drug'].isin(lincs_ko_perturbs.cmap_name.to_list())]\n", "except:\n", @@ -258,11 +256,11 @@ "metadata": {}, "outputs": [], "source": [ - "if in_lincs + in_novartis + in_ginkgo < 2:\n", + "if in_lincs + in_novartis + in_ginkgo < 1:\n", " print(f\"LINCS: {in_lincs}\")\n", " print(f\"Novartis: {in_novartis}\")\n", " print(f\"Ginkgo: {in_ginkgo}\")\n", - " raise Exception(\"Execution stopped, gene not found in at least 2 datasets\")" + " raise Exception(\"Execution stopped, gene not found in any datasets\")" ] }, { @@ -361,8 +359,11 @@ " top = {d.casefold() for d in set(rank_results.head(n).index)}\n", " return top\n", "\n", - "# create download link for table results\n", + "\n", "def download_link(df, fname, link_header='Download full results'):\n", + " '''\n", + " create download link for table results\n", + " '''\n", " if df.shape[0] == 0: return ''\n", " csv = df.to_csv(fname, sep='\\t', index=True)\n", " link = f'
{link_header}: {fname}
'\n", @@ -402,23 +403,26 @@ "metadata": {}, "outputs": [], "source": [ - "ginkgo_drugs_up = {}\n", - "ginkgo_drugs_down = {}\n", - "for cell_type, exprdf in ginkgo_gene_expr_dict.items():\n", - " # rank by level of up-regulation\n", - " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'up', ranking_method)\n", - " ginkgo_drugs_up[cell_type] = (mean_ranks, full_ranks)\n", - " display_markdown(f'**Top {top_n} up-regulators for {cell_type}**', raw=True)\n", - " display(mean_ranks.head(top_n))\n", - " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_UpReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n", - " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_UpRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n", - " # rank by level of down-regulation\n", - " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'down', ranking_method)\n", - " ginkgo_drugs_down[cell_type] = (mean_ranks, full_ranks)\n", - " display_markdown(f'**Top {top_n} down-regulators for {cell_type}**', raw=True)\n", - " display(mean_ranks.head(top_n))\n", - " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_DnReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n", - " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_DnRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n" + "if in_ginkgo:\n", + " ginkgo_drugs_up = {}\n", + " ginkgo_drugs_down = {}\n", + " for cell_type, exprdf in ginkgo_gene_expr_dict.items():\n", + " # rank by level of up-regulation\n", + " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'up', ranking_method)\n", + " ginkgo_drugs_up[cell_type] = (mean_ranks, full_ranks)\n", + " display_markdown(f'**Top {top_n} up-regulators for {cell_type}**', raw=True)\n", + " display(mean_ranks.head(top_n))\n", + " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_UpReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n", + " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_UpRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n", + " # rank by level of down-regulation\n", + " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'down', ranking_method)\n", + " ginkgo_drugs_down[cell_type] = (mean_ranks, full_ranks)\n", + " display_markdown(f'**Top {top_n} down-regulators for {cell_type}**', raw=True)\n", + " display(mean_ranks.head(top_n))\n", + " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_DnReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n", + " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_DnRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n", + "else:\n", + " display_markdown(f'**{query_gene}** not found in Ginkgo datasets', raw=True)" ] }, { @@ -444,16 +448,19 @@ "metadata": {}, "outputs": [], "source": [ - "lincs_drugs_up = get_rankings(lincs_de, 'LINCS', '', 'up', ranking_method)\n", - "lincs_drugs_down = get_rankings(lincs_de, 'LINCS', '', 'down', ranking_method)\n", - "display_markdown(f'**Top {top_n} up-regulators in L1000**', raw=True)\n", - "display(lincs_drugs_up[0].head(top_n))\n", - "display(HTML(download_link(lincs_drugs_up[0], f\"l1000_drug_ranks_{query_gene}_UpReg.tsv\", 'Download results averaged across drug dosages')))\n", - "display(HTML(download_link(lincs_drugs_up[1], f\"l1000_drug_ranks_{query_gene}_full_UpReg.tsv\", 'Download results for all perturbations')))\n", - "display_markdown(f'**Top {top_n} down-regulators in L1000**', raw=True)\n", - "display(lincs_drugs_down[0].head(top_n))\n", - "display(HTML(download_link(lincs_drugs_down[0], f\"l1000_drug_ranks_{query_gene}_DnReg.tsv\", 'Download results averaged across drug dosages')))\n", - "display(HTML(download_link(lincs_drugs_down[1], f\"l1000_drug_ranks_{query_gene}_full_DnReg.tsv\", 'Download results for all perturbations')))" + "if in_lincs:\n", + " lincs_drugs_up = get_rankings(lincs_de, 'LINCS', '', 'up', ranking_method)\n", + " lincs_drugs_down = get_rankings(lincs_de, 'LINCS', '', 'down', ranking_method)\n", + " display_markdown(f'**Top {top_n} up-regulators in L1000**', raw=True)\n", + " display(lincs_drugs_up[0].head(top_n))\n", + " display(HTML(download_link(lincs_drugs_up[0], f\"l1000_drug_ranks_{query_gene}_UpReg.tsv\", 'Download results averaged across drug dosages')))\n", + " display(HTML(download_link(lincs_drugs_up[1], f\"l1000_drug_ranks_{query_gene}_full_UpReg.tsv\", 'Download results for all perturbations')))\n", + " display_markdown(f'**Top {top_n} down-regulators in L1000**', raw=True)\n", + " display(lincs_drugs_down[0].head(top_n))\n", + " display(HTML(download_link(lincs_drugs_down[0], f\"l1000_drug_ranks_{query_gene}_DnReg.tsv\", 'Download results averaged across drug dosages')))\n", + " display(HTML(download_link(lincs_drugs_down[1], f\"l1000_drug_ranks_{query_gene}_full_DnReg.tsv\", 'Download results for all perturbations')))\n", + "else: \n", + " display_markdown(f'**{query_gene}** not found in LINCS L1000 dataset', raw=True)" ] }, { @@ -479,17 +486,20 @@ "metadata": {}, "outputs": [], "source": [ - "novartis_drugs_up = get_rankings(novartis_de, 'Novartis', '', 'up', ranking_method)\n", - "novartis_drugs_down = get_rankings(novartis_de, 'Novartis', '', 'down', ranking_method)\n", + "if in_novartis:\n", + " novartis_drugs_up = get_rankings(novartis_de, 'Novartis', '', 'up', ranking_method)\n", + " novartis_drugs_down = get_rankings(novartis_de, 'Novartis', '', 'down', ranking_method)\n", "\n", - "display_markdown(f'**Top {top_n} up-regulators in Novartis DRUG-seq**', raw=True)\n", - "display(novartis_drugs_up[0].head(top_n))\n", - "display(HTML(download_link(novartis_drugs_up[0], f'novartis_drug_ranks_{query_gene}_UpReg.tsv', 'Download results averaged across drug dosages')))\n", - "display(HTML(download_link(novartis_drugs_up[1], f'novartis_drug_ranks_{query_gene}_full_UpReg.tsv', 'Download results for all perturbations')))\n", - "display_markdown(f'**Top {top_n} down-regulators in Novartis DRUG-seq**', raw=True)\n", - "display(novartis_drugs_down[0].head(top_n))\n", - "display(HTML(download_link(novartis_drugs_down[0], f'novartis_drug_ranks_{query_gene}_DnReg.tsv', 'Download results averaged across drug dosages')))\n", - "display(HTML(download_link(novartis_drugs_down[1], f'novartis_drug_ranks_{query_gene}_full_DnReg.tsv', 'Download results for all perturbations')))" + " display_markdown(f'**Top {top_n} up-regulators in Novartis DRUG-seq**', raw=True)\n", + " display(novartis_drugs_up[0].head(top_n))\n", + " display(HTML(download_link(novartis_drugs_up[0], f'novartis_drug_ranks_{query_gene}_UpReg.tsv', 'Download results averaged across drug dosages')))\n", + " display(HTML(download_link(novartis_drugs_up[1], f'novartis_drug_ranks_{query_gene}_full_UpReg.tsv', 'Download results for all perturbations')))\n", + " display_markdown(f'**Top {top_n} down-regulators in Novartis DRUG-seq**', raw=True)\n", + " display(novartis_drugs_down[0].head(top_n))\n", + " display(HTML(download_link(novartis_drugs_down[0], f'novartis_drug_ranks_{query_gene}_DnReg.tsv', 'Download results averaged across drug dosages')))\n", + " display(HTML(download_link(novartis_drugs_down[1], f'novartis_drug_ranks_{query_gene}_full_DnReg.tsv', 'Download results for all perturbations')))\n", + "else:\n", + " display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq', raw=True)" ] }, { @@ -502,15 +512,18 @@ "top_up = {}\n", "top_down = {}\n", "# get results from Ginkgo\n", - "for cell_type in ginkgo_drugs_down.keys():\n", - " top_up[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_up[cell_type][0], n=50)\n", - " top_down[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_down[cell_type][0], n=50)\n", + "if in_ginkgo:\n", + " for cell_type in ginkgo_drugs_down.keys():\n", + " top_up[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_up[cell_type][0], n=50)\n", + " top_down[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_down[cell_type][0], n=50)\n", "# get results from L1000\n", - "top_up['lincs_l1000'] = {drug.casefold() for drug in set(lincs_drugs_up[0].index)}\n", - "top_down['lincs_l1000'] = {drug.casefold() for drug in set(lincs_drugs_down[0].index)}\n", + "if in_lincs:\n", + " top_up['lincs_l1000'] = get_top(lincs_drugs_up[0], n=50)\n", + " top_down['lincs_l1000'] = get_top(lincs_drugs_down[0], n=50)\n", "# get results from novartis\n", - "top_up['novartis'] = get_top(novartis_drugs_up[0], n=50)\n", - "top_down['novartis'] = get_top(novartis_drugs_down[0], n=50)" + "if in_novartis:\n", + " top_up['novartis'] = get_top(novartis_drugs_up[0], n=50)\n", + " top_down['novartis'] = get_top(novartis_drugs_down[0], n=50)" ] }, { @@ -552,7 +565,11 @@ "metadata": {}, "outputs": [], "source": [ - "def create_upset(top_sets: dict, ranking_method:str):\n", + "def create_upset(top_sets: dict):\n", + " '''\n", + " Given a dictionary to top up or down regulating genes,\n", + " creates an Upset plot. \n", + " '''\n", " rename_keys = {\n", " 'ginkgo_A549': 'ginkgo_A549',\n", " 'lincs_l1000': 'lincs_l1000',\n", @@ -575,19 +592,14 @@ "metadata": {}, "outputs": [], "source": [ - "display_markdown(f\"**Overlap among top up regulators of {query_gene}**\", raw=True)\n", - "create_upset(top_up, ranking_method)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ecff74c", - "metadata": {}, - "outputs": [], - "source": [ - "display_markdown(f\"**Overlap among top down regulators of {query_gene}**\", raw=True)\n", - "create_upset(top_down, ranking_method)" + "if in_ginkgo + in_lincs + in_novartis < 2:\n", + " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", + "else:\n", + " display_markdown(f\"**Overlap among top up regulators of {query_gene}**\", raw=True)\n", + " create_upset(top_up)\n", + "\n", + " display_markdown(f\"**Overlap among top down regulators of {query_gene}**\", raw=True)\n", + " create_upset(top_down)" ] }, { @@ -597,7 +609,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_overlapping_sets(top_sets:dict, to_file:str):\n", + "def get_overlapping_sets(top_sets:dict):\n", " '''\n", " Given the dictionary of sets used to created the UpSet plot,\n", " return the contents of the overlapping sets. \n", @@ -649,14 +661,17 @@ "metadata": {}, "outputs": [], "source": [ - "overlap_down = get_overlapping_sets(top_down, 'overlap_down')\n", - "overlap_up = get_overlapping_sets(top_up, 'overlap_up')\n", - "display_markdown(\"**Down-regulating drug overlap**\", raw=True)\n", - "display(overlap_down)\n", - "display(HTML(download_link(overlap_down, f'overlapping_drugs_{query_gene}_DnReg.tsv')))\n", - "display_markdown(\"**Up-regulating drug overlap**\", raw=True)\n", - "display(overlap_up)\n", - "display(HTML(download_link(overlap_up, f'overlapping_drugs_{query_gene}_UpReg.tsv')))\n" + "if in_ginkgo + in_lincs + in_novartis < 2:\n", + " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", + "else:\n", + " overlap_down = get_overlapping_sets(top_down)\n", + " overlap_up = get_overlapping_sets(top_up)\n", + " display_markdown(\"**Down-regulating drug overlap**\", raw=True)\n", + " display(overlap_down)\n", + " display(HTML(download_link(overlap_down, f'overlapping_drugs_{query_gene}_DnReg.tsv')))\n", + " display_markdown(\"**Up-regulating drug overlap**\", raw=True)\n", + " display(overlap_up)\n", + " display(HTML(download_link(overlap_up, f'overlapping_drugs_{query_gene}_UpReg.tsv')))\n" ] }, { @@ -666,7 +681,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_ranking_averages(overlapping_df, direction, ranking_method):\n", + "def get_ranking_averages(overlapping_df, data_dict, ranking_method):\n", " '''\n", " Retrieve average target ranking across datasets for drugs in overlapping sets. \n", "\n", @@ -675,27 +690,6 @@ " Average Rank\n", " Number of datasets for which drug was a significant regulator of the query gene\n", " '''\n", - " # extract up or down data\n", - " if direction == 'down':\n", - " data_dict ={\n", - " 'ginkgo_A549': ginkgo_drugs_down['A549'][1],\n", - " 'ginkgo_human_dermal_fibroblast': ginkgo_drugs_down['human_dermal_fibroblast'][1],\n", - " 'ginkgo_human_aortic_smooth_muscle_cells': ginkgo_drugs_down['human_aortic_smooth_muscle_cells'][1],\n", - " 'ginkgo_human_epithelial_melanocytes':ginkgo_drugs_down['human_epithelial_melanocytes'][1],\n", - " 'ginkgo_human_skeletal_muscle_myoblasts':ginkgo_drugs_down['human_skeletal_muscle_myoblasts'][1],\n", - " 'novartis': novartis_drugs_down[1],\n", - " 'lincs': lincs_drugs_down[1]\n", - " }\n", - " elif direction == 'up':\n", - " data_dict ={\n", - " 'ginkgo_A549': ginkgo_drugs_up['A549'][1],\n", - " 'ginkgo_human_dermal_fibroblast': ginkgo_drugs_up['human_dermal_fibroblast'][1],\n", - " 'ginkgo_human_aortic_smooth_muscle_cells': ginkgo_drugs_up['human_aortic_smooth_muscle_cells'][1],\n", - " 'ginkgo_human_epithelial_melanocytes':ginkgo_drugs_up['human_epithelial_melanocytes'][1],\n", - " 'ginkgo_human_skeletal_muscle_myoblasts':ginkgo_drugs_up['human_skeletal_muscle_myoblasts'][1],\n", - " 'novartis': novartis_drugs_up[1],\n", - " 'lincs': lincs_drugs_up[1]\n", - " }\n", " # get average, integrating across datasets\n", " average_rank_vals = {}\n", " average_pctrank_vals = {}\n", @@ -710,7 +704,7 @@ " runsum_pctrank = 0\n", " runsum_logFC = 0\n", " runsum_pval = 0\n", - " for k,df in data_dict.items():\n", + " for _,df in data_dict.items():\n", " subset = df[df['Drug'].str.lower() == d.lower()]\n", " n = n + subset.shape[0]\n", " runsum_rank = runsum_rank + subset.Rank.sum()\n", @@ -736,10 +730,40 @@ " else:\n", " # sort based on N datasets and average adjusted p-value\n", " res_df = res_df.sort_values(['N Datasets','Avg Adj.P.Val'], ascending=[False,True])\n", - " return res_df\n", - " \n", - "overlapping_up_TargetRank = get_ranking_averages(overlap_up, 'up', ranking_method)\n", - "overlapping_down_TargetRank = get_ranking_averages(overlap_down, 'down', ranking_method)" + " return res_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cc1ea15", + "metadata": {}, + "outputs": [], + "source": [ + "# define input data for get_ranking_averages\n", + "data_source_present = {'A549': in_ginkgo, \n", + " 'human_dermal_fibroblast':in_ginkgo,\n", + " 'human_aortic_smooth_muscle_cells':in_ginkgo,\n", + " 'human_epithelial_melanocytes': in_ginkgo,\n", + " 'human_skeletal_muscle_myoblasts': in_ginkgo,\n", + " 'novartis': in_novartis,\n", + " 'lincs': in_lincs}\n", + "data_dict_down ={}\n", + "data_dict_up = {}\n", + "for source,present in data_source_present.items():\n", + " if (present) & (not source in ['novartis','lincs']):\n", + " data_dict_down[source] = ginkgo_drugs_down[source][1]\n", + " data_dict_up[source] = ginkgo_drugs_up[source][1]\n", + " elif (present) & (source == 'lincs'):\n", + " data_dict_down[source] = lincs_drugs_down[1]\n", + " data_dict_up[source] = lincs_drugs_up[1]\n", + " elif (present) & (source == 'novartis'):\n", + " data_dict_down[source] = novartis_drugs_down[1]\n", + " data_dict_up[source] = novartis_drugs_up[1]\n", + "\n", + "if in_ginkgo + in_lincs + in_novartis > 1:\n", + " overlapping_up_TargetRank = get_ranking_averages(overlap_up, data_dict_up, ranking_method)\n", + " overlapping_down_TargetRank = get_ranking_averages(overlap_down, data_dict_down, ranking_method)" ] }, { @@ -757,12 +781,15 @@ "metadata": {}, "outputs": [], "source": [ - "display_markdown(\"**Averages across datasets: Up-regulating drugs**\", raw=True)\n", - "display(overlapping_up_TargetRank.head(n=top_n))\n", - "display(HTML(download_link(overlapping_up_TargetRank, f'overlapping_drugs_averages_{query_gene}_UpReg.tsv')))\n", - "display_markdown(\"**Averages across datasets: Down-regulating drugs**\", raw=True)\n", - "display(overlapping_down_TargetRank.head(n=top_n))\n", - "display(HTML(download_link(overlapping_down_TargetRank, f'overlapping_drugs_averages_{query_gene}_DnReg.tsv')))" + "if in_ginkgo + in_lincs + in_novartis < 2:\n", + " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", + "else:\n", + " display_markdown(\"**Averages across datasets: Up-regulating drugs**\", raw=True)\n", + " display(overlapping_up_TargetRank.head(n=top_n))\n", + " display(HTML(download_link(overlapping_up_TargetRank, f'overlapping_drugs_averages_{query_gene}_UpReg.tsv')))\n", + " display_markdown(\"**Averages across datasets: Down-regulating drugs**\", raw=True)\n", + " display(overlapping_down_TargetRank.head(n=top_n))\n", + " display(HTML(download_link(overlapping_down_TargetRank, f'overlapping_drugs_averages_{query_gene}_DnReg.tsv')))" ] }, { @@ -783,38 +810,77 @@ "outputs": [], "source": [ "# combine top up and down drugs across Ginkgo cell types\n", - "all_ginkgo_up = set()\n", - "all_ginkgo_down = set()\n", - "for k,v in top_up.items():\n", - " if re.search('ginkgo',k):\n", - " all_ginkgo_up = all_ginkgo_up.union(v)\n", - " all_ginkgo_down = all_ginkgo_down.union(top_down[k])\n", - "\n", - "venn_up = {\n", - " 'novartis': top_up['novartis'],\n", - " 'lincs' : top_up['lincs_l1000'],\n", - " 'ginkgo' : all_ginkgo_up\n", - "}\n", - "venn_down = {\n", - " 'novartis': top_down['novartis'],\n", - " 'lincs' : top_down['lincs_l1000'],\n", - " 'ginkgo' : all_ginkgo_down\n", - "}\n", + "if in_ginkgo:\n", + " all_ginkgo_up = set()\n", + " all_ginkgo_down = set()\n", + " for k,v in top_up.items():\n", + " if re.search('ginkgo',k):\n", + " all_ginkgo_up = all_ginkgo_up.union(v)\n", + " all_ginkgo_down = all_ginkgo_down.union(top_down[k])\n", + "# define input data for venn diagrams\n", + "data_source_present = {'ginkgo':in_ginkgo,\n", + " 'lincs_l1000':in_lincs,\n", + " 'novartis':in_novartis}\n", + "venn_up = {}\n", + "venn_down = {}\n", + "for source,present in data_source_present.items():\n", + " if (present) & (source == 'ginkgo'):\n", + " venn_up[source] = all_ginkgo_up\n", + " venn_down[source] = all_ginkgo_down\n", + " elif present:\n", + " venn_up[source] = top_up[source]\n", + " venn_down[source] = top_down[source]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "108bbb28", + "metadata": {}, + "outputs": [], + "source": [ + "def create_venn(venn_dict):\n", + " '''\n", + " Given a dictionary of sets of top up or down regulating genes,\n", + " creates a 2 or 3 set venn diagram. \n", + " '''\n", + " if len(venn_dict) == 3:\n", + " venn3(subsets=(venn_dict['novartis'], venn_dict['lincs_l1000'], venn_dict['ginkgo']),\n", + " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));\n", + " elif len(venn_dict) == 2:\n", + " venn2(subsets = (venn_dict.values()),\n", + " set_labels=(venn_dict.keys()));\n", + "\n", "\n", "def print_overlap(venn_dict):\n", - " gnl = venn_dict['ginkgo'].intersection(venn_dict['novartis']).intersection(venn_dict['lincs'])\n", - " gn = venn_dict['ginkgo'].intersection(venn_dict['novartis']).difference(gnl)\n", - " gl = venn_dict['ginkgo'].intersection(venn_dict['lincs']).difference(gnl)\n", - " nl = venn_dict['novartis'].intersection(venn_dict['lincs']).difference(gnl)\n", - " \n", - " if len(gn) > 0:\n", - " display_markdown(f'Novartis and Ginkgo: {[d for d in gn]}', raw=True)\n", - " if len(nl) > 0:\n", - " display_markdown(f'Novartis and LINCS L1000: {[d for d in nl]}', raw=True)\n", - " if len(gl) > 0:\n", - " display_markdown(f'LINCS L1000 and Ginkgo: {[d for d in gl]}', raw=True)\n", - " if len(gnl) > 0:\n", - " display_markdown(f'LINCS L1000, Ginkgo and Novartis: {[d for d in gnl]}', raw=True)" + " '''\n", + " Given a dictionary of sets of top up or down regulating genes,\n", + " prints the unique overlap among all combinations of sets (>= 2). \n", + " '''\n", + " results = {}\n", + " datasets = list(venn_dict.keys())\n", + " # Step 1: get all intersections\n", + " for r in range(2, len(datasets)+1):\n", + " for combo in combinations(datasets, r):\n", + " drug_sets = [venn_dict[d] for d in combo]\n", + " results[combo] = list(set.intersection(*drug_sets))\n", + " # Step 2: subtract intersections of all super sets \n", + " for combo, base_set in results.items():\n", + " unique_set = set(base_set)\n", + " # iterate through all sets from larger combinations\n", + " for r in range(len(combo) + 1, len(datasets) + 1):\n", + " for sup in combinations(datasets, r):\n", + " if set(combo).issubset(sup): # only consider supersets\n", + " super_set = set(results[sup])\n", + " # get difference between super set and current set, this\n", + " # is the new intersection for the combination\n", + " new_unique_set = unique_set - super_set\n", + " results[combo] = list(new_unique_set)\n", + " # Step 3: print results\n", + " for datasets, overlap in results.items():\n", + " if len(overlap) == 0:\n", + " overlap = ['None']\n", + " print(f\"{', '.join(datasets)}: {', '.join(overlap)}\")" ] }, { @@ -824,11 +890,12 @@ "metadata": {}, "outputs": [], "source": [ - "display_markdown(f'Overlap of top {query_gene} up-regulating drugs across sources', raw=True)\n", - "\n", - "venn3(subsets=(venn_up['novartis'], venn_up['lincs'], venn_up['ginkgo']),\n", - " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));\n", - "print_overlap(venn_up)" + "if in_ginkgo + in_lincs + in_novartis < 2:\n", + " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", + "else:\n", + " display_markdown(f'Overlap of top {query_gene} up-regulating drugs across sources', raw=True)\n", + " create_venn(venn_up)\n", + " print_overlap(venn_up)" ] }, { @@ -838,11 +905,12 @@ "metadata": {}, "outputs": [], "source": [ - "display_markdown(f'Overlap of top {query_gene} down-regulating drugs across sources', raw=True)\n", - "\n", - "venn3(subsets=(venn_down['novartis'], venn_down['lincs'], venn_down['ginkgo']),\n", - " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));\n", - "print_overlap(venn_down)" + "if in_ginkgo + in_lincs + in_novartis < 2:\n", + " display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n", + "else:\n", + " display_markdown(f'Overlap of top {query_gene} down-regulating drugs across sources', raw=True)\n", + " create_venn(venn_down)\n", + " print_overlap(venn_down)" ] }, { @@ -935,9 +1003,12 @@ "metadata": {}, "outputs": [], "source": [ - "for cell_type, expr_df in ginkgo_gene_expr_dict.items():\n", - " cell_name = ' '.join(re.sub('human_','',cell_type).split('_'))\n", - " create_bokeh_volcano_plot(expr_df, query_gene, cell_name, 'Ginkgo')" + "if in_ginkgo:\n", + " for cell_type, expr_df in ginkgo_gene_expr_dict.items():\n", + " cell_name = ' '.join(re.sub('human_','',cell_type).split('_'))\n", + " create_bokeh_volcano_plot(expr_df, query_gene, cell_name, 'Ginkgo')\n", + "else:\n", + " display_markdown(f'**{query_gene}** not found in Ginkgo datasets', raw=True)" ] }, { @@ -955,7 +1026,10 @@ "metadata": {}, "outputs": [], "source": [ - "create_bokeh_volcano_plot(lincs_de, query_gene, '', 'L1000')" + "if in_lincs:\n", + " create_bokeh_volcano_plot(lincs_de, query_gene, '', 'L1000')\n", + "else:\n", + " display_markdown(f'**{query_gene}** not found in LINCS L1000 dataset', raw=True)" ] }, { @@ -973,7 +1047,10 @@ "metadata": {}, "outputs": [], "source": [ - "create_bokeh_volcano_plot(novartis_de, query_gene, '', 'Novartis')" + "if in_novartis:\n", + " create_bokeh_volcano_plot(novartis_de, query_gene, '', 'Novartis')\n", + "else:\n", + " display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq dataset', raw=True)" ] }, {