From 8d612502ba6f8018b72a889bce121c7d6d774f7b Mon Sep 17 00:00:00 2001 From: Lily Taub <115661359+lilydtaub@users.noreply.github.com> Date: Tue, 16 Sep 2025 09:50:39 -0400 Subject: [PATCH 1/2] update image specification in form.j2 --- appyters/Drug_Gene_Budger2/templates/form.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appyters/Drug_Gene_Budger2/templates/form.j2 b/appyters/Drug_Gene_Budger2/templates/form.j2 index 1eebf3e0..b9eab39f 100644 --- a/appyters/Drug_Gene_Budger2/templates/form.j2 +++ b/appyters/Drug_Gene_Budger2/templates/form.j2 @@ -18,7 +18,7 @@
-

Dr. Gene Budger 2 (DBG2)

+

Dr. Gene Budger 2 (DBG2)

Submit a human gene symbol and explore drugs that maximally increase or decrease the mRNA expression of the input gene based on data from a collection of Connectivity Mapping resources

From 7b39ce00e2914c9e2e586e280ff78d454c151a2a Mon Sep 17 00:00:00 2001 From: Lily Taub <115661359+lilydtaub@users.noreply.github.com> Date: Tue, 16 Sep 2025 09:51:46 -0400 Subject: [PATCH 2/2] add target rank method for all datasets --- appyters/Drug_Gene_Budger2/appyter.json | 2 +- .../drug_gene_budger2_appyter.ipynb | 374 +++++++++--------- 2 files changed, 196 insertions(+), 180 deletions(-) diff --git a/appyters/Drug_Gene_Budger2/appyter.json b/appyters/Drug_Gene_Budger2/appyter.json index 3f3ee465..e15e61b1 100644 --- a/appyters/Drug_Gene_Budger2/appyter.json +++ b/appyters/Drug_Gene_Budger2/appyter.json @@ -2,7 +2,7 @@ "$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json", "name": "Drug_Gene_Budger2", "title": "Dr. Gene Budger (DGB) 2", - "version": "0.0.4", + "version": "0.0.5", "description": "An appyter that retrieves drugs that up-regulate and down-regulate a single input gene across Connectivity Mapping datasets", "image": "dgb_logo.png", "authors": [ diff --git a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb index 60108ba4..0357c85c 100644 --- a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb +++ b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb @@ -22,9 +22,15 @@ "%%appyter hide_code\n", "\n", "{% do SectionField(\n", - " name='input', \n", - " title = 'Gene of interest', \n", - " subtitle = 'Enter a gene for which you wish to get up and down regulators.'\n", + " name='gene_input', \n", + " title = '1. Select an input gene', \n", + " subtitle = 'Enter a human gene of interest'\n", + ") %}\n", + "\n", + "{% do SectionField(\n", + " name='method_input', \n", + " title = '2. Select a drug ranking method', \n", + " subtitle = \"Select a ranking method by which to identify top up- and down-regulating drugs. Options are to rank drugs by the differential expression (DE) p-value of the target (default) or by the rank of the target in each perturbation's DE signature (Target Rank), where ranking is determined by the adjusted p-value of differential expression. This method prioritizes drug specificity over magnitude of regulation.\"\n", ") %}" ] }, @@ -43,7 +49,19 @@ " default = 'C9ORF72',\n", " description = 'Enter the gene symbol of interest.',\n", " file_path = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/all_genes.json',\n", - " section='input'\n", + " section='gene_input'\n", + ")%}\n", + "\n", + "{% set ranking_method = ChoiceField(\n", + " name = 'ranking_method',\n", + " label = 'Ranking Method',\n", + " default = 'Differential Expression P-Value',\n", + " description = \"Rank drugs by the differential expression (DE) p-value of the target or the rank of the target in each perturbation's DE signature. Ranking is determined by the adjusted p-value of differential expression.\",\n", + " choices = [\n", + " 'Differential Expression P-Value',\n", + " 'Target Rank'\n", + " ],\n", + " section='method_input'\n", ")%}" ] }, @@ -55,7 +73,13 @@ "outputs": [], "source": [ "%%appyter code_exec\n", - "query_gene = \"{{ input_gene.value.upper() }}\"" + "query_gene = \"{{ input_gene.value.upper() }}\"\n", + "\n", + "{%- if ranking_method.raw_value == 'Differential Expression P-Value' %}\n", + "ranking_method = 'pval'\n", + "{%- else %}\n", + "ranking_method = 'target_rank'\n", + "{%- endif %}" ] }, { @@ -73,9 +97,9 @@ "source": [ "This notebook takes a gene as input and identifies drugs that maximally up and down regulate the gene's expression in a collection of chemical perturbation datasets.\n", "\n", - "- Ginkgo GDPx1 and GPDx2: Limma-Voom based differential gene epxression results for 1,354 drugs.\n", - "- Novartis DRUG-seq: Differential: Limma-Trend based differential expression results for 4,343 drugs. \n", - "- LINCS L1000 Chemical Perturbations: Queries the [LINCS Reverse Search Dashboard](https://lincs-reverse-search-dashboard.dev.maayanlab.cloud/) for pre-computed characteristic direction-based differential gene expression signatures from RNA-seq-like LINCS L1000 Expression Profiles covering 33,571 drugs.\n", + "- Ginkgo GDPx1 and GPDx2: Limma-Voom based differential gene expression results for 1,354 drugs.\n", + "- Novartis DRUG-seq: Differential: Limma-Trend based differential gene expression results for 4,343 drugs. \n", + "- LINCS L1000 Chemical Perturbations: Limma-Voom based differential gene expression results for a subset of 4,091 drugs from the LINCS L1000 Chemical Perturbation dataset. \n", "\n", "The Ginkgo dataset includes 4 primary cell types (epithelial melanocytes, smooth aortic muscle cells, skeletal muscle myoblasts and dermal fibroblasts) and one cell line (A549 lung carcinoma cell line). Previous analysis showed distinct transcriptional responses by cell type, so the drug rankings for the Ginkgo dataset are separated by cell type." ] @@ -123,6 +147,7 @@ "# Storage url for Ginkgo and Novartis DE files\n", "ginkgo_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/ginkgo_de'\n", "novartis_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/novartis_de'\n", + "lincs_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_de'\n", "# silence warnings\n", "warnings.filterwarnings('ignore')" ] @@ -191,38 +216,19 @@ { "cell_type": "code", "execution_count": null, - "id": "200a83a1", - "metadata": {}, - "outputs": [], - "source": [ - "def l1000_reverse_search(gene_id:str, direction:str):\n", - " url = f'https://lincs-reverse-search-dashboard.dev.maayanlab.cloud/api/table/cp/{direction}/{gene_id}'\n", - " headers = {\n", - " 'Accept':'application/json',\n", - " 'Content-Type':'application/json'\n", - " }\n", - " try:\n", - " resp = requests.get(url, headers=headers)\n", - " resp.raise_for_status()\n", - " res = resp.json()\n", - " df = pd.DataFrame(res)\n", - " except requests.exceptions.HTTPError as e:\n", - " print(f\"Gene not found in LINCS: {e}\")\n", - " df=pd.DataFrame()\n", - " return df\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eccf1c5c", + "id": "41658110", "metadata": {}, "outputs": [], "source": [ - "l1000_up = l1000_reverse_search(query_gene.upper(), 'up')\n", - "l1000_down = l1000_reverse_search(query_gene.upper(), 'down')\n", - "if l1000_down.empty or l1000_up.empty:\n", + "# get LINCS DE results for gene\n", + "try:\n", + " lincs_de = pd.read_feather(f'{lincs_URL}/{gene_file}')\n", + " # format p-values\n", + " lincs_de['log10adj.P.Val'] = lincs_de['adj.P.Val'].replace(0,1e-323).map(np.log10)*-1\n", + " lincs_ko_perturbs = pd.read_csv('https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_ko_perturbs.txt', sep='\\t')\n", + " lincs_de = lincs_de[~lincs_de['Drug'].isin(lincs_ko_perturbs.cmap_name.to_list())]\n", + "except:\n", + " print('Gene not in LINCS L1000 dataset')\n", " in_lincs=False" ] }, @@ -241,7 +247,7 @@ " # rename logFC column for concordance with Ginkgo columns\n", " novartis_de.rename(columns={'LogFC':'logFC'}, inplace=True)\n", "except:\n", - " print('Gene not in novartis dataset')\n", + " print('Gene not in Novartis DRUG-seq dataset')\n", " in_novartis=False" ] }, @@ -274,7 +280,8 @@ "metadata": {}, "outputs": [], "source": [ - "display_markdown(f\"This notebook shows results for the input gene **{query_gene}**\", raw=True)" + "display_markdown(f\"This notebook shows results for the input gene **{query_gene}**\", raw=True)\n", + "display_markdown(f\"Drugs that up and down regulate **{query_gene}** are ranked by method **{ranking_method}**\", raw=True)" ] }, { @@ -290,9 +297,17 @@ "id": "97c523ce", "metadata": {}, "source": [ - "Within each dataset drugs are ranked by the statistical significance of the regulatory relationship. The pipeline uses the differential expression p-value for the Novartis and Ginkgo data and the characterstic direction coefficient for the LINCS L1000 data. \n", + "Within each dataset drugs are ranked by either:\n", + "\n", + "1. The statistical significance of the regulatory relationship. The pipeline uses the adjusted p-value from the differential expression results.\n", + "\n", + "or\n", + "\n", + "2. The rank of the query gene relative to all other targets. Rank is determined by differential expression adjusted p-value. A ranking of 1 indicates that the gene has the most significant adjusted p-value for differential expression compared to all other genes for that perturbation. \n", "\n", - "When a dataset contains multiple perturbations for the same drug (i.e. a cell exposed to the drug at different doses), p-values are averaged across doses to get a single ranking for the drug. " + "When a dataset contains multiple perturbations for the same drug (i.e. a cell exposed to the drug at different doses), p-values and normalized ranks are averaged across doses to get a single ranking for the drug. \n", + "\n", + "The rankings are done separately for up-regulated and down-regulated genes." ] }, { @@ -302,7 +317,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_rankings(data:pd.DataFrame, source:str, cell_type:str, direction:str):\n", + "def get_rankings(data:pd.DataFrame, source:str, cell_type:str, direction:str, option:str):\n", " '''\n", " Given a dataframe of logFC and p-values for a gene of interest across perturbations, \n", " rank the drugs by how the induce or repress the gene. \n", @@ -318,14 +333,22 @@ " ranked_data.loc[ranked_data['Drug']=='Brefeldin-A', 'Drug'] = 'Brefeldin A'\n", " elif source == 'Novartis':\n", " ranked_data.loc[ranked_data['Drug']=='Trichostatin A (racemate)', 'Drug'] = 'Trichostatin A'\n", - " # average rank across all drug dosages\n", - " drug_mean_ranks = ranked_data.loc[:,['Drug','logFC','log10adj.P.Val']].groupby('Drug')[['logFC','log10adj.P.Val']].mean().sort_values('log10adj.P.Val', ascending=False)\n", - " # filter for up or down regulation\n", - " if direction == 'up':\n", - " drug_mean_ranks = drug_mean_ranks[drug_mean_ranks['logFC'] > 0]\n", - " elif direction == 'down':\n", - " drug_mean_ranks = drug_mean_ranks[drug_mean_ranks['logFC'] < 0]\n", - " drug_mean_ranks.rename(columns={'logFC':'Avg logFC', 'log10adj.P.Val':'Avg -log10(Adj.PVal)'}, inplace=True)\n", + " ranked_data.rename(columns={'P.Adj':'adj.P.Val'}, inplace=True)\n", + " if option == 'pval':\n", + " # average rank across all drug dosages\n", + " drug_mean_ranks = ranked_data.loc[:,['Drug','logFC','log10adj.P.Val']].groupby('Drug')[['logFC','log10adj.P.Val']].mean().sort_values('log10adj.P.Val', ascending=False)\n", + " # filter for up or down regulation\n", + " if direction == 'up':\n", + " drug_mean_ranks = drug_mean_ranks[drug_mean_ranks['logFC'] > 0]\n", + " elif direction == 'down':\n", + " drug_mean_ranks = drug_mean_ranks[drug_mean_ranks['logFC'] < 0]\n", + " drug_mean_ranks.rename(columns={'logFC':'Avg logFC', 'log10adj.P.Val':'Avg -log10(Adj.PVal)'}, inplace=True)\n", + " elif option == 'target_rank':\n", + " if direction == 'up':\n", + " ranked_data = ranked_data[ranked_data['GeneDir'] == 'Up']\n", + " elif direction == 'down':\n", + " ranked_data = ranked_data[ranked_data['GeneDir'] == 'Dn']\n", + " drug_mean_ranks = ranked_data.loc[:,['Drug','logFC','adj.P.Val','Rank','PctRank']].groupby('Drug')[['logFC','adj.P.Val','Rank','PctRank']].mean().sort_values('PctRank', ascending=True)\n", " return drug_mean_ranks, ranked_data\n", "\n", "def get_top(rank_results:pd.DataFrame, n=50):\n", @@ -339,10 +362,10 @@ " return top\n", "\n", "# create download link for table results\n", - "def download_link(df, fname):\n", + "def download_link(df, fname, link_header='Download full results'):\n", " if df.shape[0] == 0: return ''\n", " csv = df.to_csv(fname, sep='\\t', index=True)\n", - " link = f'
Download full results: {fname}
'\n", + " link = f'
{link_header}: {fname}
'\n", " return link" ] }, @@ -359,7 +382,7 @@ "id": "c5073607", "metadata": {}, "source": [ - "Drug rankings for the Ginkgo dataset. Top 20 by p-value are shown and the complete table can be downloaded." + "Drug rankings for the Ginkgo dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. " ] }, { @@ -383,17 +406,19 @@ "ginkgo_drugs_down = {}\n", "for cell_type, exprdf in ginkgo_gene_expr_dict.items():\n", " # rank by level of up-regulation\n", - " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'up')\n", + " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'up', ranking_method)\n", " ginkgo_drugs_up[cell_type] = (mean_ranks, full_ranks)\n", " display_markdown(f'**Top {top_n} up-regulators for {cell_type}**', raw=True)\n", " display(mean_ranks.head(top_n))\n", - " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_UpReg_{cell_type}.tsv\")))\n", + " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_UpReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n", + " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_UpRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n", " # rank by level of down-regulation\n", - " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'down')\n", + " mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, 'down', ranking_method)\n", " ginkgo_drugs_down[cell_type] = (mean_ranks, full_ranks)\n", " display_markdown(f'**Top {top_n} down-regulators for {cell_type}**', raw=True)\n", " display(mean_ranks.head(top_n))\n", - " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_DnReg_{cell_type}.tsv\")))\n" + " display(HTML(download_link(mean_ranks, f\"ginkgo_drug_ranks_{query_gene}_DnReg_{cell_type}.tsv\", 'Download results averaged across drug dosages')))\n", + " display(HTML(download_link(full_ranks, f\"ginkgo_drug_ranks_{query_gene}_full_DnRg_{cell_type}.tsv\", 'Download results for all perturbations')))\n" ] }, { @@ -409,7 +434,7 @@ "id": "8777d900", "metadata": {}, "source": [ - "Drug rankings for the LINCS L1000 dataset. Top 20 by CD-coefficient are shown and the complete table can be downloaded." + "Drug rankings for the LINCS L1000 dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. " ] }, { @@ -419,23 +444,16 @@ "metadata": {}, "outputs": [], "source": [ - "def l1000_sort(result, direction):\n", - " df = result[['Perturbagen','CD Coefficient']]\n", - " df = df.groupby('Perturbagen').mean(['CD Coefficient'])\n", - " if direction == 'up':\n", - " df = df.sort_values('CD Coefficient', ascending=False)\n", - " elif direction == 'down':\n", - " df = df.sort_values('CD Coefficient', ascending=True)\n", - " return df\n", - "\n", - "l1000_top_up = l1000_sort(l1000_up, 'up')\n", - "l1000_top_down = l1000_sort(l1000_down, 'down')\n", + "lincs_drugs_up = get_rankings(lincs_de, 'LINCS', '', 'up', ranking_method)\n", + "lincs_drugs_down = get_rankings(lincs_de, 'LINCS', '', 'down', ranking_method)\n", "display_markdown(f'**Top {top_n} up-regulators in L1000**', raw=True)\n", - "display(l1000_top_up.head(top_n))\n", - "display(HTML(download_link(l1000_top_up, f\"l1000_drug_ranks_UpReg.tsv\")))\n", + "display(lincs_drugs_up[0].head(top_n))\n", + "display(HTML(download_link(lincs_drugs_up[0], f\"l1000_drug_ranks_{query_gene}_UpReg.tsv\", 'Download results averaged across drug dosages')))\n", + "display(HTML(download_link(lincs_drugs_up[1], f\"l1000_drug_ranks_{query_gene}_full_UpReg.tsv\", 'Download results for all perturbations')))\n", "display_markdown(f'**Top {top_n} down-regulators in L1000**', raw=True)\n", - "display(l1000_top_down.head(top_n))\n", - "display(HTML(download_link(l1000_top_down, f\"l1000_drug_ranks_DnReg.tsv\")))" + "display(lincs_drugs_down[0].head(top_n))\n", + "display(HTML(download_link(lincs_drugs_down[0], f\"l1000_drug_ranks_{query_gene}_DnReg.tsv\", 'Download results averaged across drug dosages')))\n", + "display(HTML(download_link(lincs_drugs_down[1], f\"l1000_drug_ranks_{query_gene}_full_DnReg.tsv\", 'Download results for all perturbations')))" ] }, { @@ -451,7 +469,7 @@ "id": "e22fef7a", "metadata": {}, "source": [ - "Drug rankings for the Novartis dataset. Top 20 by p-value are shown and the complete table can be downloaded." + "Drug rankings for the Novartis DRUG-seq dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. " ] }, { @@ -461,15 +479,17 @@ "metadata": {}, "outputs": [], "source": [ - "novartis_drugs_up = get_rankings(novartis_de, 'Novartis', '', 'up')\n", - "novartis_drugs_down = get_rankings(novartis_de, 'Novartis', '', 'down')\n", + "novartis_drugs_up = get_rankings(novartis_de, 'Novartis', '', 'up', ranking_method)\n", + "novartis_drugs_down = get_rankings(novartis_de, 'Novartis', '', 'down', ranking_method)\n", "\n", "display_markdown(f'**Top {top_n} up-regulators in Novartis DRUG-seq**', raw=True)\n", "display(novartis_drugs_up[0].head(top_n))\n", - "display(HTML(download_link(novartis_drugs_up[0], 'novartis_drug_ranks_UpReg.tsv')))\n", + "display(HTML(download_link(novartis_drugs_up[0], f'novartis_drug_ranks_{query_gene}_UpReg.tsv', 'Download results averaged across drug dosages')))\n", + "display(HTML(download_link(novartis_drugs_up[1], f'novartis_drug_ranks_{query_gene}_full_UpReg.tsv', 'Download results for all perturbations')))\n", "display_markdown(f'**Top {top_n} down-regulators in Novartis DRUG-seq**', raw=True)\n", "display(novartis_drugs_down[0].head(top_n))\n", - "display(HTML(download_link(novartis_drugs_down[0], 'novartis_drug_ranks_DnReg.tsv')))" + "display(HTML(download_link(novartis_drugs_down[0], f'novartis_drug_ranks_{query_gene}_DnReg.tsv', 'Download results averaged across drug dosages')))\n", + "display(HTML(download_link(novartis_drugs_down[1], f'novartis_drug_ranks_{query_gene}_full_DnReg.tsv', 'Download results for all perturbations')))" ] }, { @@ -486,8 +506,8 @@ " top_up[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_up[cell_type][0], n=50)\n", " top_down[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_down[cell_type][0], n=50)\n", "# get results from L1000\n", - "top_up['lincs_l1000'] = {drug.casefold() for drug in set(l1000_top_up.index)}\n", - "top_down['lincs_l1000'] = {drug.casefold() for drug in set(l1000_top_down.index)}\n", + "top_up['lincs_l1000'] = {drug.casefold() for drug in set(lincs_drugs_up[0].index)}\n", + "top_down['lincs_l1000'] = {drug.casefold() for drug in set(lincs_drugs_down[0].index)}\n", "# get results from novartis\n", "top_up['novartis'] = get_top(novartis_drugs_up[0], n=50)\n", "top_down['novartis'] = get_top(novartis_drugs_down[0], n=50)" @@ -532,15 +552,15 @@ "metadata": {}, "outputs": [], "source": [ - "def create_upset(top_sets: dict):\n", + "def create_upset(top_sets: dict, ranking_method:str):\n", " rename_keys = {\n", - " 'ginkgo_A549': 'ginkgo_A549',\n", - " 'lincs_l1000': 'lincs_l1000',\n", - " 'novartis': 'novartis',\n", - " 'ginkgo_human_epithelial_melanocytes': 'ginkgo_melanocytes',\n", - " 'ginkgo_human_dermal_fibroblast': 'ginkgo_fibroblasts',\n", - " 'ginkgo_human_aortic_smooth_muscle_cells': 'ginkgo_muscle_cells',\n", - " 'ginkgo_human_skeletal_muscle_myoblasts': 'ginkgo_myoblasts'\n", + " 'ginkgo_A549': 'ginkgo_A549',\n", + " 'lincs_l1000': 'lincs_l1000',\n", + " 'novartis': 'novartis',\n", + " 'ginkgo_human_epithelial_melanocytes': 'ginkgo_melanocytes',\n", + " 'ginkgo_human_dermal_fibroblast': 'ginkgo_fibroblasts',\n", + " 'ginkgo_human_aortic_smooth_muscle_cells': 'ginkgo_muscle_cells',\n", + " 'ginkgo_human_skeletal_muscle_myoblasts': 'ginkgo_myoblasts'\n", " }\n", " top_sets = {rename_keys[k]:v for k,v in top_sets.items()}\n", " upset_data = from_contents(top_sets)\n", @@ -556,7 +576,7 @@ "outputs": [], "source": [ "display_markdown(f\"**Overlap among top up regulators of {query_gene}**\", raw=True)\n", - "create_upset(top_up)" + "create_upset(top_up, ranking_method)" ] }, { @@ -567,7 +587,7 @@ "outputs": [], "source": [ "display_markdown(f\"**Overlap among top down regulators of {query_gene}**\", raw=True)\n", - "create_upset(top_down)" + "create_upset(top_down, ranking_method)" ] }, { @@ -606,6 +626,14 @@ " return overlapping_sets\n" ] }, + { + "cell_type": "markdown", + "id": "6247c9b9", + "metadata": {}, + "source": [ + "## Consensus Regulator Tables" + ] + }, { "cell_type": "markdown", "id": "556a29cc", @@ -625,26 +653,26 @@ "overlap_up = get_overlapping_sets(top_up, 'overlap_up')\n", "display_markdown(\"**Down-regulating drug overlap**\", raw=True)\n", "display(overlap_down)\n", - "display(HTML(download_link(overlap_down, 'overlapping_drugs_DnReg.tsv')))\n", + "display(HTML(download_link(overlap_down, f'overlapping_drugs_{query_gene}_DnReg.tsv')))\n", "display_markdown(\"**Up-regulating drug overlap**\", raw=True)\n", "display(overlap_up)\n", - "display(HTML(download_link(overlap_up, 'overlapping_drugs_UpReg.tsv')))\n" + "display(HTML(download_link(overlap_up, f'overlapping_drugs_{query_gene}_UpReg.tsv')))\n" ] }, { "cell_type": "code", "execution_count": null, - "id": "a4eb5f1d", + "id": "a0f71c19", "metadata": {}, "outputs": [], "source": [ - "def get_logFC_averages(overlapping_df, direction):\n", + "def get_ranking_averages(overlapping_df, direction, ranking_method):\n", " '''\n", - " Retrieve average logFC across datasets for drugs in overlapping sets. \n", + " Retrieve average target ranking across datasets for drugs in overlapping sets. \n", "\n", " Returns dataframe with columns for:\n", " Drug\n", - " Average logFC\n", + " Average Rank\n", " Number of datasets for which drug was a significant regulator of the query gene\n", " '''\n", " # extract up or down data\n", @@ -656,7 +684,7 @@ " 'ginkgo_human_epithelial_melanocytes':ginkgo_drugs_down['human_epithelial_melanocytes'][1],\n", " 'ginkgo_human_skeletal_muscle_myoblasts':ginkgo_drugs_down['human_skeletal_muscle_myoblasts'][1],\n", " 'novartis': novartis_drugs_down[1],\n", - " 'lincs': l1000_down\n", + " 'lincs': lincs_drugs_down[1]\n", " }\n", " elif direction == 'up':\n", " data_dict ={\n", @@ -666,41 +694,52 @@ " 'ginkgo_human_epithelial_melanocytes':ginkgo_drugs_up['human_epithelial_melanocytes'][1],\n", " 'ginkgo_human_skeletal_muscle_myoblasts':ginkgo_drugs_up['human_skeletal_muscle_myoblasts'][1],\n", " 'novartis': novartis_drugs_up[1],\n", - " 'lincs': l1000_up\n", + " 'lincs': lincs_drugs_up[1]\n", " }\n", " # get average, integrating across datasets\n", + " average_rank_vals = {}\n", + " average_pctrank_vals = {}\n", " average_logfc_vals = {}\n", + " average_pvals = {}\n", " n_datasets = list()\n", " for _,row in overlapping_df.iterrows():\n", " n_datasets.extend([row['N Datasets']]*len(row['Overlap']))\n", " for d in row['Overlap']:\n", " n = 0\n", - " runsum = 0\n", + " runsum_rank = 0\n", + " runsum_pctrank = 0\n", + " runsum_logFC = 0\n", + " runsum_pval = 0\n", " for k,df in data_dict.items():\n", - " if k == 'lincs':\n", - " subset = df[df['Perturbagen'].str.lower() == d.lower()]\n", - " subset.rename(columns = {'Log2(Fold Change)':'logFC'}, inplace=True)\n", - " else:\n", - " subset = df[df['Drug'].str.lower() == d.lower()]\n", + " subset = df[df['Drug'].str.lower() == d.lower()]\n", " n = n + subset.shape[0]\n", - " runsum = runsum + subset.logFC.sum()\n", - " average_logfc_vals[d] = round(runsum / n,3)\n", + " runsum_rank = runsum_rank + subset.Rank.sum()\n", + " runsum_pctrank = runsum_pctrank + subset.PctRank.sum()\n", + " runsum_logFC = runsum_logFC + subset.logFC.sum()\n", + " runsum_pval = runsum_pval + subset['adj.P.Val'].sum()\n", + " average_rank_vals[d] = round(runsum_rank / n,3)\n", + " average_pctrank_vals[d] = round(runsum_pctrank/n, 3)\n", + " average_logfc_vals[d] = round(runsum_logFC/n, 3)\n", + " average_pvals[d] = round(runsum_pval/n,3)\n", " # create results dataframe\n", " res_df = pd.DataFrame({\n", - " 'Drug': list(average_logfc_vals.keys()),\n", - " 'Avg(logFC)': list(average_logfc_vals.values())\n", + " 'Drug': list(average_rank_vals.keys()),\n", + " 'Avg LogFC': list(average_logfc_vals.values()),\n", + " 'Avg Adj.P.Val': list(average_pvals.values()),\n", + " 'Avg Rank': list(average_rank_vals.values()),\n", + " 'Avg PctRank': list(average_pctrank_vals.values())\n", " })\n", " res_df['N Datasets'] = n_datasets\n", - " # sort based on N datasets and logFC, direction depending on up or down set\n", - " if direction=='up':\n", - " res_df = res_df.sort_values(['N Datasets','Avg(logFC)'], ascending=[False,False])\n", - " elif direction == 'down':\n", - " res_df = res_df.sort_values(['N Datasets','Avg(logFC)'], ascending=[False,True])\n", + " if ranking_method == 'target_rank':\n", + " # sort based on N datasets and average percentile rank\n", + " res_df = res_df.sort_values(['N Datasets','Avg PctRank'], ascending=[False,True])\n", + " else:\n", + " # sort based on N datasets and average adjusted p-value\n", + " res_df = res_df.sort_values(['N Datasets','Avg Adj.P.Val'], ascending=[False,True])\n", " return res_df\n", - "\n", - " \n", - "overlapping_up_logFC = get_logFC_averages(overlap_up, 'up')\n", - "overlapping_down_logFC = get_logFC_averages(overlap_down, 'down')" + " \n", + "overlapping_up_TargetRank = get_ranking_averages(overlap_up, 'up', ranking_method)\n", + "overlapping_down_TargetRank = get_ranking_averages(overlap_down, 'down', ranking_method)" ] }, { @@ -708,7 +747,7 @@ "id": "bc739111", "metadata": {}, "source": [ - "Tables that show the average logFC value (across datasets) for drugs that were found to be significant regulators in more than one dataset." + "The tables below show average logFC, adjusted p-value, raw rank and normalized rank values across datasets for drugs that were found to be significant regulators in more than one dataset." ] }, { @@ -718,12 +757,12 @@ "metadata": {}, "outputs": [], "source": [ - "display_markdown(\"**Average LogFC values across datasets: Up-regulating drugs**\", raw=True)\n", - "display(overlapping_up_logFC.head(n=top_n))\n", - "display(HTML(download_link(overlapping_up_logFC, 'overlapping_drugs_logfc_UpReg.tsv')))\n", - "display_markdown(\"**Average LogFC values across datasets: Down-regulating drugs**\", raw=True)\n", - "display(overlapping_down_logFC.head(n=top_n))\n", - "display(HTML(download_link(overlapping_down_logFC, 'overlapping_drugs_logfc_DnReg.tsv')))\n" + "display_markdown(\"**Averages across datasets: Up-regulating drugs**\", raw=True)\n", + "display(overlapping_up_TargetRank.head(n=top_n))\n", + "display(HTML(download_link(overlapping_up_TargetRank, f'overlapping_drugs_averages_{query_gene}_UpReg.tsv')))\n", + "display_markdown(\"**Averages across datasets: Down-regulating drugs**\", raw=True)\n", + "display(overlapping_down_TargetRank.head(n=top_n))\n", + "display(HTML(download_link(overlapping_down_TargetRank, f'overlapping_drugs_averages_{query_gene}_DnReg.tsv')))" ] }, { @@ -750,30 +789,32 @@ " if re.search('ginkgo',k):\n", " all_ginkgo_up = all_ginkgo_up.union(v)\n", " all_ginkgo_down = all_ginkgo_down.union(top_down[k])\n", + "\n", "venn_up = {\n", - " 'novartis': top_up['novartis'],\n", - " 'lincs' : top_up['lincs_l1000'],\n", - " 'ginkgo' : all_ginkgo_up\n", + " 'novartis': top_up['novartis'],\n", + " 'lincs' : top_up['lincs_l1000'],\n", + " 'ginkgo' : all_ginkgo_up\n", "}\n", "venn_down = {\n", - " 'novartis': top_down['novartis'],\n", - " 'lincs' : top_down['lincs_l1000'],\n", - " 'ginkgo' : all_ginkgo_down\n", + " 'novartis': top_down['novartis'],\n", + " 'lincs' : top_down['lincs_l1000'],\n", + " 'ginkgo' : all_ginkgo_down\n", "}\n", "\n", "def print_overlap(venn_dict):\n", - " gn = venn_dict['ginkgo'].intersection(venn_dict['novartis'])\n", - " gl = venn_dict['ginkgo'].intersection(venn_dict['lincs'])\n", - " nl = venn_dict['novartis'].intersection(venn_dict['lincs'])\n", " gnl = venn_dict['ginkgo'].intersection(venn_dict['novartis']).intersection(venn_dict['lincs'])\n", + " gn = venn_dict['ginkgo'].intersection(venn_dict['novartis']).difference(gnl)\n", + " gl = venn_dict['ginkgo'].intersection(venn_dict['lincs']).difference(gnl)\n", + " nl = venn_dict['novartis'].intersection(venn_dict['lincs']).difference(gnl)\n", + " \n", " if len(gn) > 0:\n", - " display_markdown(f'Novartis and Ginkgo: {gn.difference(gnl)}', raw=True)\n", + " display_markdown(f'Novartis and Ginkgo: {[d for d in gn]}', raw=True)\n", " if len(nl) > 0:\n", - " display_markdown(f'Novartis and LINCS L1000: {nl.difference(gnl)}', raw=True)\n", + " display_markdown(f'Novartis and LINCS L1000: {[d for d in nl]}', raw=True)\n", " if len(gl) > 0:\n", - " display_markdown(f'LINCS L1000 and Ginkgo: {gl.difference(gnl)}', raw=True)\n", + " display_markdown(f'LINCS L1000 and Ginkgo: {[d for d in gl]}', raw=True)\n", " if len(gnl) > 0:\n", - " display_markdown(f'LINCS L1000, Ginkgo and Novartis: {gnl}', raw=True)" + " display_markdown(f'LINCS L1000, Ginkgo and Novartis: {[d for d in gnl]}', raw=True)" ] }, { @@ -784,17 +825,9 @@ "outputs": [], "source": [ "display_markdown(f'Overlap of top {query_gene} up-regulating drugs across sources', raw=True)\n", + "\n", "venn3(subsets=(venn_up['novartis'], venn_up['lincs'], venn_up['ginkgo']),\n", - " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "951ac257", - "metadata": {}, - "outputs": [], - "source": [ + " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));\n", "print_overlap(venn_up)" ] }, @@ -806,17 +839,9 @@ "outputs": [], "source": [ "display_markdown(f'Overlap of top {query_gene} down-regulating drugs across sources', raw=True)\n", + "\n", "venn3(subsets=(venn_down['novartis'], venn_down['lincs'], venn_down['ginkgo']),\n", - " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c53651f3", - "metadata": {}, - "outputs": [], - "source": [ + " set_labels=('Novartis', 'LINCS L1000', 'Ginkgo'));\n", "print_overlap(venn_down)" ] }, @@ -833,7 +858,7 @@ "id": "30eb397f", "metadata": {}, "source": [ - "The volcano plots show the strength and statistical significance of the drug perturbation for each signature in the dataset (drug and dose specific). Color of points indicate up (red) or down (blue) regulation. Hover over points in the volcano plot to see the label (with cell type, drug, and dose information), logFC, fold-change, and log10-transformed p-value or CD-coefficient. Tools to the right of the plot allow you to manipulate (pan, zoom) and download the figure. " + "The volcano plots show the strength and statistical significance of the drug perturbation for each signature in the dataset (drug and dose specific). Color of points indicate up (red) or down (blue) regulation. Hover over points in the volcano plot to see the label (with cell line, drug, and dose information), logFC, fold-change, log10-transformed p-value, raw rank and normalized rank. Tools to the right of the plot allow you to manipulate (pan, zoom) and download the figure. " ] }, { @@ -854,38 +879,29 @@ " df = expr_data.copy()\n", " \n", " # clean columns\n", + " df['FC'] = 2**df['logFC']\n", " if source == 'Ginkgo':\n", " df['Label'] = df['Perturbation']\n", - " df['FC'] = 2**df['logFC']\n", " elif source == 'Novartis':\n", " df['Label'] = df['Perturbation'] + '_' + df['Drug']\n", - " df['FC'] = 2**df['logFC']\n", " elif source == 'L1000':\n", - " df.rename(columns={'Perturbagen':'Drug', 'Log2(Fold Change)':'logFC','Fold Change':'FC'}, inplace=True)\n", - " df['absCDcoef'] = abs(df['CD Coefficient'])\n", - " df['Label'] = df.index.to_list()\n", + " df['Label'] = df['Perturbation']\n", "\n", " # set plot source\n", - " if source == 'Ginkgo' or source == 'Novartis':\n", - " plot_source = ColumnDataSource(df.loc[:,['Label','logFC','FC','log10adj.P.Val']])\n", - " x,y='logFC','log10adj.P.Val'\n", - " hover = HoverTool(tooltips=[(\"Label\", \"@Label\"),\n", - " (\"Log2(FC)\", \"@logFC\"),\n", - " (\"Fold Change\", \"@FC\"),\n", - " ('-Log10(Adj. P-value)',\"@{log10adj.P.Val}{0.00e}\")])\n", - " elif source=='L1000':\n", - " plot_source = ColumnDataSource(df.loc[:,['Label','logFC','FC','absCDcoef']])\n", - " x,y = 'logFC','absCDcoef'\n", - " hover = HoverTool(tooltips=[(\"Label\", \"@Label\"),\n", - " (\"Log2(FC)\", \"@logFC\"),\n", - " (\"Fold Change\", \"@FC\"),\n", - " ('abs(CD Coefficient)',\"@{absCDcoef}{0.00e}\")])\n", + " plot_source = ColumnDataSource(df.loc[:,['Label','logFC','FC','log10adj.P.Val', 'Rank', 'PctRank']])\n", + " x,y='logFC','log10adj.P.Val'\n", + " hover = HoverTool(tooltips=[(\"Label\", \"@Label\"),\n", + " (\"Log2(FC)\", \"@logFC\"),\n", + " (\"Fold Change\", \"@FC\"),\n", + " ('-Log10(Adj. p-value)',\"@{log10adj.P.Val}{0.00e}\"),\n", + " (\"Raw Rank\", \"@Rank\"),\n", + " (\"Normalized Rank\", \"@PctRank\")])\n", " \n", " # define figure\n", " p = figure(\n", " title=f'{gene_id} Regulation in {source} {cell_type}',\n", " x_axis_label = 'Log2(Fold Change)',\n", - " y_axis_label = 'abs(CD Coefficient)' if source == 'L1000' else '-Log10(Adj. P-value)',\n", + " y_axis_label = '-Log10(Adj. p-value)',\n", " tools = 'pan,wheel_zoom,box_zoom,reset,save'\n", " )\n", "\n", @@ -939,7 +955,7 @@ "metadata": {}, "outputs": [], "source": [ - "create_bokeh_volcano_plot(pd.concat([l1000_up,l1000_down]), query_gene, '', 'L1000')" + "create_bokeh_volcano_plot(lincs_de, query_gene, '', 'L1000')" ] }, {