diff --git a/appyters/Drug_Gene_Budger2/README.md b/appyters/Drug_Gene_Budger2/README.md
new file mode 100644
index 00000000..ad40bc15
--- /dev/null
+++ b/appyters/Drug_Gene_Budger2/README.md
@@ -0,0 +1,11 @@
+# Drug Gene Budger (DGB) 2
+
+This appyter takes a single gene as input and identifies up-regulating and down-regulating drugs from three connectivity mapping resources.
+
+- [Ginkgo GDPx1 and GDPx2 datasets](https://huggingface.co/ginkgo-datapoints)
+
+- [Novartis DRUG-seq U2OS MoABox dataset](https://zenodo.org/records/14291446)
+
+- [LINCS L1000 Chemical Perturbation dataset](https://maayanlab.cloud/sigcom-lincs/#/Download)
+
+In addition to producing tables of ranked up- and down-regulating drugs of the input gene, the notebook creates volcano plot visualizations and UpSet plots that identify overlap in regulators across datasets.
\ No newline at end of file
diff --git a/appyters/Drug_Gene_Budger2/appyter.json b/appyters/Drug_Gene_Budger2/appyter.json
new file mode 100644
index 00000000..a3fcf07f
--- /dev/null
+++ b/appyters/Drug_Gene_Budger2/appyter.json
@@ -0,0 +1,31 @@
+{
+ "$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json",
+ "name": "Drug_Gene_Budger2",
+ "title": "Drug Gene Budger (DGB) 2",
+ "version": "0.0.1",
+ "description": "An appyter that retrieves drugs that up-regulate and down-regulate a single input gene across Connectivity Mapping datasets",
+ "image": "dgb_logo.png",
+ "authors": [
+ {
+ "name": "Lily Taub",
+ "email": "lilydtaub@gmail.com"
+ }
+ ],
+ "url": "https://github.com/MaayanLab/appyter-catalog",
+ "tags": [
+ "L1000",
+ "DRUG-seq",
+ "RNA-seq"
+ ],
+ "license": "CC-BY-NC-SA-4.0",
+ "public": true,
+ "appyter": {
+ "file": "drug_gene_budger2_appyter.ipynb",
+ "profile": "biojupies",
+ "extras": [
+ "toc",
+ "hide-code",
+ "toggle-code"
+ ]
+ }
+}
\ No newline at end of file
diff --git a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
new file mode 100644
index 00000000..65237f47
--- /dev/null
+++ b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
@@ -0,0 +1,888 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "72bb0888",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#%%appyter init\n",
+ "# Initialise appyter's notebook magic, handing it a callable that returns this notebook's globals.\n",
+ "from appyter import magic\n",
+ "magic.init(lambda _=globals: _())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "978b36d3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%appyter hide_code\n",
+ "\n",
+ "{% do SectionField(\n",
+ " name='input', \n",
+ " title = 'Gene of interest', \n",
+ " subtitle = 'Enter a gene for which you wish to get up and down regulators.'\n",
+ ") %}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e07b2ead",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%appyter hide_code\n",
+ "\n",
+ "{% set input_gene = AutocompleteField(\n",
+ " name = 'input_gene',\n",
+ " label = 'Query Gene',\n",
+ " default = 'C9ORF72',\n",
+ " description = 'Enter the gene symbol of interest.',\n",
+ " file_path = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/all_genes.json',\n",
+ " section='input'\n",
+ ")%}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "733c8208",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%appyter code_exec\n",
+ "query_gene = \"{{ input_gene.value.upper() }}\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "123ecde2",
+ "metadata": {},
+ "source": [
+ "# Drug Gene Budger 2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7ea9d01d",
+ "metadata": {},
+ "source": [
+ "This notebook takes a gene as input and identifies drugs that maximally up and down regulate the gene's expression in a collection of chemical perturbation datasets.\n",
+ "\n",
+ "- Ginkgo GDPx1 and GDPx2: Limma-Voom based differential gene expression results for 1,354 drugs.\n",
+ "- Novartis DRUG-seq: Limma-Trend based differential expression results for 4,343 drugs. \n",
+ "- LINCS L1000 Chemical Perturbations: Queries the [LINCS Reverse Search Dashboard](https://lincs-reverse-search-dashboard.dev.maayanlab.cloud/) for pre-computed characteristic direction-based differential gene expression signatures from RNA-seq-like LINCS L1000 Expression Profiles covering 33,571 drugs.\n",
+ "\n",
+ "The Ginkgo dataset includes 4 primary cell types (epithelial melanocytes, aortic smooth muscle cells, skeletal muscle myoblasts and dermal fibroblasts) and one cell line (A549 lung carcinoma cell line). Previous analysis showed distinct transcriptional responses by cell type, so the drug rankings for the Ginkgo dataset are separated by cell type."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "503ae0b1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## General\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import re\n",
+ "import warnings\n",
+ "\n",
+ "## HTTP Requests\n",
+ "import requests\n",
+ "\n",
+ "## Tables\n",
+ "from IPython.display import display, display_markdown, HTML\n",
+ "\n",
+ "## UpSet Plot\n",
+ "from upsetplot import from_contents, plot\n",
+ "from matplotlib import pyplot\n",
+ "\n",
+ "## Volcano Plot\n",
+ "from bokeh.plotting import figure, show\n",
+ "from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper\n",
+ "from bokeh.palettes import RdBu\n",
+ "from bokeh.io import output_notebook"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "39a5abae",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Storage url for Ginkgo and Novartis DE files\n",
+ "ginkgo_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/ginkgo_de'\n",
+ "novartis_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/novartis_de'\n",
+ "# silence warnings\n",
+ "warnings.filterwarnings('ignore')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "95e9e0da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "in_ginkgo = in_novartis = in_lincs = True"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ee3e36e5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# get Ginkgo DE results for gene\n",
+ "gene_file = f'{query_gene}.f'\n",
+ "try:\n",
+ "    ginkgo_de = pd.read_feather(f'{ginkgo_URL}/{gene_file}')\n",
+ "    # the cell type is the text before the first '-' of each perturbation label;\n",
+ "    # sorted so the per-cell-type sections appear in the same order on every run\n",
+ "    ginkgo_cell_types = sorted(set(p.split('-')[0] for p in ginkgo_de.Perturbation))\n",
+ "except Exception as err:\n",
+ "    # any failure to load the per-gene file is treated as 'gene not available',\n",
+ "    # but the cause is reported instead of being silently swallowed\n",
+ "    in_ginkgo=False\n",
+ "    print(f'Gene not in Ginkgo dataset ({type(err).__name__}: {err})')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b9fb1130",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def prepare_ginkgo_data(df, cell_types):\n",
+ "    '''Split the Ginkgo DE table of the query gene by cell type.\n",
+ "\n",
+ "    df: DE results with at least `Perturbation` and `adj.P.Val` columns.\n",
+ "    cell_types: iterable of cell type names to look for in `Perturbation`.\n",
+ "\n",
+ "    Returns a dictionary where each cell type is a key and the value is an\n",
+ "    independent copy of the matching rows with an added `log10adj.P.Val`\n",
+ "    column (-log10 of the adjusted p-value). `df` itself is not modified.\n",
+ "    '''\n",
+ "    cell_type_results = {}\n",
+ "    for k in cell_types:\n",
+ "        # literal (non-regex) substring match; .copy() so the new column is\n",
+ "        # written to an independent frame rather than to a slice of `df`\n",
+ "        subset = df[df['Perturbation'].str.contains(k, regex=False)].copy()\n",
+ "        # exact zeros are replaced by 1e-323 so the logarithm stays finite\n",
+ "        subset['log10adj.P.Val'] = -np.log10(subset['adj.P.Val'].replace(0, 1e-323))\n",
+ "        cell_type_results[k] = subset\n",
+ "    return cell_type_results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bdc92b50",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if in_ginkgo:\n",
+ " ginkgo_gene_expr_dict = prepare_ginkgo_data(ginkgo_de, ginkgo_cell_types)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "200a83a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def l1000_reverse_search(gene_id:str, direction:str, timeout:float=60):\n",
+ "    '''Query the LINCS reverse search dashboard for chemical perturbation signatures of a gene.\n",
+ "\n",
+ "    gene_id: gene symbol placed in the request URL.\n",
+ "    direction: direction segment of the request URL (called with 'up' or 'down').\n",
+ "    timeout: seconds to wait for the server before giving up.\n",
+ "\n",
+ "    Returns the JSON response as a DataFrame, or an empty DataFrame when the\n",
+ "    request fails or the response cannot be turned into a table.\n",
+ "    '''\n",
+ "    url = f'https://lincs-reverse-search-dashboard.dev.maayanlab.cloud/api/table/cp/{direction}/{gene_id}'\n",
+ "    headers = {\n",
+ "        'Accept':'application/json',\n",
+ "        'Content-Type':'application/json'\n",
+ "    }\n",
+ "    try:\n",
+ "        # without a timeout a stalled connection would hang the notebook indefinitely\n",
+ "        resp = requests.get(url, headers=headers, timeout=timeout)\n",
+ "        resp.raise_for_status()\n",
+ "        return pd.DataFrame(resp.json())\n",
+ "    except requests.exceptions.HTTPError as e:\n",
+ "        print(f\"Gene not found in LINCS: {e}\")\n",
+ "    except (requests.exceptions.RequestException, ValueError) as e:\n",
+ "        # connection problems, timeouts, or a body that is not tabular JSON\n",
+ "        print(f\"LINCS query failed: {e}\")\n",
+ "    return pd.DataFrame()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "eccf1c5c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "l1000_up = l1000_reverse_search(query_gene.upper(), 'up')\n",
+ "l1000_down = l1000_reverse_search(query_gene.upper(), 'down')\n",
+ "if l1000_down.empty or l1000_up.empty:\n",
+ " in_lincs=False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d731a2ba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# get Novartis DE results for gene\n",
+ "try:\n",
+ "    novartis_de = pd.read_feather(f'{novartis_URL}/{gene_file}').set_index('index')\n",
+ "    # format p-values (exact zeros are replaced by 1e-323 so the logarithm stays finite)\n",
+ "    novartis_de['log10adj.P.Val'] = novartis_de['P.Adj'].replace(0,1e-323).map(np.log10)*-1\n",
+ "    # rename logFC column for concordance with Ginkgo columns\n",
+ "    novartis_de = novartis_de.rename(columns={'LogFC':'logFC'})\n",
+ "except Exception as err:\n",
+ "    # any failure to load or format the per-gene file is treated as 'gene not available',\n",
+ "    # but the cause is reported instead of being silently swallowed\n",
+ "    print(f'Gene not in novartis dataset ({type(err).__name__}: {err})')\n",
+ "    in_novartis=False"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c5607885",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "if in_lincs + in_novartis + in_ginkgo < 2:\n",
+ " print(f\"LINCS: {in_lincs}\")\n",
+ " print(f\"Novartis: {in_novartis}\")\n",
+ " print(f\"Ginkgo: {in_ginkgo}\")\n",
+ " raise Exception(\"Execution stopped, gene not found in at least 2 datasets\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd12adb9",
+ "metadata": {},
+ "source": [
+ "## Query Gene"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2b75c4da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_markdown(f\"This notebook shows results for the input gene **{query_gene}**\", raw=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "df830837",
+ "metadata": {},
+ "source": [
+ "## Rank Drugs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "97c523ce",
+ "metadata": {},
+ "source": [
+ "Within each dataset drugs are ranked by the statistical significance of the regulatory relationship. The pipeline uses the differential expression p-value for the Novartis and Ginkgo data and the characteristic direction coefficient for the LINCS L1000 data. \n",
+ "\n",
+ "When a dataset contains multiple perturbations for the same drug (i.e. a cell exposed to the drug at different doses), p-values are averaged across doses to get a single ranking for the drug. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32548529",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_rankings(data:pd.DataFrame, source:str, cell_type:str, direction:str):\n",
+ "    '''\n",
+ "    Rank drugs by how strongly they induce or repress the gene of interest.\n",
+ "\n",
+ "    `data` holds one row per perturbation with `Drug`, `logFC` and\n",
+ "    `log10adj.P.Val` columns. A known drug-name alias is first collapsed onto a\n",
+ "    single spelling, then both statistics are averaged per drug and the drugs are\n",
+ "    ordered by mean -log10 adjusted p-value, largest first. `direction` 'up'\n",
+ "    keeps drugs with a positive mean logFC and 'down' those with a negative one;\n",
+ "    any other value leaves the table unfiltered.\n",
+ "\n",
+ "    Returns a tuple of 1) the per-drug averages and 2) the alias-corrected copy\n",
+ "    of the per-perturbation input.\n",
+ "    '''\n",
+ "    ranked_data = data.copy()\n",
+ "\n",
+ "    # (alias, preferred spelling) that applies to this source / cell type, if any\n",
+ "    if source == 'Ginkgo':\n",
+ "        alias = 'Brefeldin A from Penicillium brefeldianum' if cell_type == 'A549' else 'Brefeldin-A'\n",
+ "        preferred = 'Brefeldin A'\n",
+ "    elif source == 'Novartis':\n",
+ "        alias, preferred = 'Trichostatin A (racemate)', 'Trichostatin A'\n",
+ "    else:\n",
+ "        alias = preferred = None\n",
+ "    if alias is not None:\n",
+ "        ranked_data.loc[ranked_data['Drug'] == alias, 'Drug'] = preferred\n",
+ "\n",
+ "    # mean of each statistic over all dosages of a drug, most significant first\n",
+ "    stat_cols = ['logFC', 'log10adj.P.Val']\n",
+ "    per_drug = ranked_data.loc[:, ['Drug'] + stat_cols].groupby('Drug')[stat_cols].mean()\n",
+ "    per_drug = per_drug.sort_values('log10adj.P.Val', ascending=False)\n",
+ "\n",
+ "    # keep only the requested direction of regulation\n",
+ "    if direction == 'up':\n",
+ "        per_drug = per_drug[per_drug['logFC'] > 0]\n",
+ "    elif direction == 'down':\n",
+ "        per_drug = per_drug[per_drug['logFC'] < 0]\n",
+ "\n",
+ "    per_drug = per_drug.rename(columns={'logFC':'Avg logFC', 'log10adj.P.Val':'Avg -log10(Adj.PVal)'})\n",
+ "    return per_drug, ranked_data\n",
+ "\n",
+ "def get_top(rank_results:pd.DataFrame, n=50):\n",
+ "    '''\n",
+ "    Return the case-folded names of the first N drugs of a ranking table.\n",
+ "\n",
+ "    `rank_results` is the drug_mean_ranks result from get_rankings (drug names in\n",
+ "    the index, best-ranked first). If the table has fewer than N drugs, all of\n",
+ "    them are returned.\n",
+ "    '''\n",
+ "    leading_names = rank_results.head(n).index\n",
+ "    return {name.casefold() for name in leading_names}\n",
+ "\n",
+ "# create download link for table results\n",
+ "def download_link(df, fname):\n",
+ "    '''\n",
+ "    Write `df` to `fname` as a tab-separated file (index included) and return an\n",
+ "    HTML snippet linking to that file. Returns an empty string, and writes\n",
+ "    nothing, when `df` has no rows.\n",
+ "    '''\n",
+ "    if df.shape[0] == 0: return ''\n",
+ "    # to_csv returns None when given a path, so its result is not kept\n",
+ "    df.to_csv(fname, sep='\\t', index=True)\n",
+ "    # NOTE(review): link markup reconstructed - confirm the wording against the deployed appyter\n",
+ "    link = f'<div>Download full results: <a href=\"{fname}\" target=\"_blank\">{fname}</a></div>'\n",
+ "    return link"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "07fea912",
+ "metadata": {},
+ "source": [
+ "### Ginkgo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c5073607",
+ "metadata": {},
+ "source": [
+ "Drug rankings for the Ginkgo dataset. Top 20 by p-value are shown and the complete table can be downloaded."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1d05ddc0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "top_n = 20"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "deebf1cc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ginkgo_drugs_up = {}\n",
+ "ginkgo_drugs_down = {}\n",
+ "# (direction, results dictionary, file name tag), handled in this order for every cell type\n",
+ "ginkgo_directions = (('up', ginkgo_drugs_up, 'UpReg'), ('down', ginkgo_drugs_down, 'DnReg'))\n",
+ "# ginkgo_gene_expr_dict only exists when the gene was found in the Ginkgo data\n",
+ "for cell_type, exprdf in (ginkgo_gene_expr_dict.items() if in_ginkgo else ()):\n",
+ "    for direction, results, tag in ginkgo_directions:\n",
+ "        # rank by level of up- or down-regulation\n",
+ "        mean_ranks, full_ranks = get_rankings(exprdf, 'Ginkgo', cell_type, direction)\n",
+ "        results[cell_type] = (mean_ranks, full_ranks)\n",
+ "        display_markdown(f'**Top {top_n} {direction}-regulators for {cell_type}**', raw=True)\n",
+ "        display(mean_ranks.head(top_n))\n",
+ "        display(HTML(download_link(mean_ranks, f'ginkgo_drug_ranks_{tag}_{cell_type}.tsv')))\n",
+ "if not in_ginkgo:\n",
+ "    display_markdown(f'**{query_gene}** was not found in the Ginkgo dataset.', raw=True)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fc6465b9",
+ "metadata": {},
+ "source": [
+ "### L1000"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8777d900",
+ "metadata": {},
+ "source": [
+ "Drug rankings for the LINCS L1000 dataset. Top 20 by CD-coefficient are shown and the complete table can be downloaded."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "57438b7c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def l1000_sort(result, direction):\n",
+ "    '''\n",
+ "    Average the CD coefficient per perturbagen and order the perturbagens.\n",
+ "\n",
+ "    result: L1000 table with `Perturbagen` and `CD Coefficient` columns.\n",
+ "    direction: 'up' sorts from largest to smallest mean coefficient, 'down' from\n",
+ "        smallest to largest; any other value leaves the grouped order unchanged.\n",
+ "\n",
+ "    Returns a DataFrame indexed by perturbagen with a single `CD Coefficient`\n",
+ "    column. An empty input gives an empty table of the same layout.\n",
+ "    '''\n",
+ "    if result.empty:\n",
+ "        # a failed LINCS query yields a frame without columns; selecting them would raise KeyError\n",
+ "        return pd.DataFrame(columns=['CD Coefficient']).rename_axis('Perturbagen')\n",
+ "    # select the column before aggregating instead of passing it to mean(),\n",
+ "    # whose first positional parameter is `numeric_only`\n",
+ "    df = result.groupby('Perturbagen')[['CD Coefficient']].mean()\n",
+ "    if direction == 'up':\n",
+ "        df = df.sort_values('CD Coefficient', ascending=False)\n",
+ "    elif direction == 'down':\n",
+ "        df = df.sort_values('CD Coefficient', ascending=True)\n",
+ "    return df\n",
+ "\n",
+ "l1000_top_up = l1000_sort(l1000_up, 'up')\n",
+ "l1000_top_down = l1000_sort(l1000_down, 'down')\n",
+ "display_markdown(f'**Top {top_n} up-regulators in L1000**', raw=True)\n",
+ "display(l1000_top_up.head(top_n))\n",
+ "display(HTML(download_link(l1000_top_up, f\"l1000_drug_ranks_UpReg.tsv\")))\n",
+ "display_markdown(f'**Top {top_n} down-regulators in L1000**', raw=True)\n",
+ "display(l1000_top_down.head(top_n))\n",
+ "display(HTML(download_link(l1000_top_down, f\"l1000_drug_ranks_DnReg.tsv\")))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e177678e",
+ "metadata": {},
+ "source": [
+ "### Novartis DRUG-seq"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e22fef7a",
+ "metadata": {},
+ "source": [
+ "Drug rankings for the Novartis dataset. Top 20 by p-value are shown and the complete table can be downloaded."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "03c5a39f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "novartis_drugs_up = get_rankings(novartis_de, 'Novartis', '', 'up')\n",
+ "novartis_drugs_down = get_rankings(novartis_de, 'Novartis', '', 'down')\n",
+ "\n",
+ "display_markdown(f'**Top {top_n} up-regulators in Novartis DRUG-seq**', raw=True)\n",
+ "display(novartis_drugs_up[0].head(top_n))\n",
+ "display(HTML(download_link(novartis_drugs_up[0], 'novartis_drug_ranks_UpReg.tsv')))\n",
+ "display_markdown(f'**Top {top_n} down-regulators in Novartis DRUG-seq**', raw=True)\n",
+ "display(novartis_drugs_down[0].head(top_n))\n",
+ "display(HTML(download_link(novartis_drugs_down[0], 'novartis_drug_ranks_DnReg.tsv')))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f60e93fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# number of top-ranked drugs each dataset contributes to the overlap analysis\n",
+ "n_overlap = 50\n",
+ "top_up = {}\n",
+ "top_down = {}\n",
+ "# get results from Ginkgo\n",
+ "for cell_type in ginkgo_drugs_down.keys():\n",
+ "    top_up[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_up[cell_type][0], n=n_overlap)\n",
+ "    top_down[f'ginkgo_{cell_type}'] = get_top(ginkgo_drugs_down[cell_type][0], n=n_overlap)\n",
+ "# get results from L1000, restricted to the top N like every other dataset\n",
+ "# (the tables are already sorted by mean CD coefficient)\n",
+ "top_up['lincs_l1000'] = get_top(l1000_top_up, n=n_overlap)\n",
+ "top_down['lincs_l1000'] = get_top(l1000_top_down, n=n_overlap)\n",
+ "# get results from novartis\n",
+ "top_up['novartis'] = get_top(novartis_drugs_up[0], n=n_overlap)\n",
+ "top_down['novartis'] = get_top(novartis_drugs_down[0], n=n_overlap)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "01b5b49c",
+ "metadata": {},
+ "source": [
+ "## UpSet Plot"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb0246a5",
+ "metadata": {},
+ "source": [
+ "The UpSet plots show the overlap among top up-regulating or down-regulating drugs in each dataset. If there were more than 50 significant regulators in a dataset for a given input gene, the input was restricted to the top 50 regulators."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "71c19377",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Saving Figures\n",
+ "def save_figure(plot_name, **kwargs):\n",
+ "    '''\n",
+ "    Save the current matplotlib figure to the file `plot_name`.\n",
+ "\n",
+ "    Extra keyword arguments are forwarded to `pyplot.savefig`; `bbox_inches`\n",
+ "    defaults to 'tight' unless the caller provides it.\n",
+ "    '''\n",
+ "    import io\n",
+ "    kwargs.setdefault('bbox_inches', 'tight')\n",
+ "    # render into memory first, then write the bytes to disk in one go\n",
+ "    mem = io.BytesIO()\n",
+ "    pyplot.savefig(mem, **kwargs)\n",
+ "    with open(plot_name, 'wb') as fw:\n",
+ "        fw.write(mem.getbuffer())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f7bb61d7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def create_upset(top_sets: dict):\n",
+ "    '''\n",
+ "    Draw an UpSet plot of the overlap between the drug sets in `top_sets`\n",
+ "    (dataset key -> set of drug names). Known dataset keys are shortened for\n",
+ "    display; any other key is shown unchanged.\n",
+ "    '''\n",
+ "    rename_keys = {\n",
+ "        'ginkgo_A549': 'ginkgo_A549',\n",
+ "        'lincs_l1000': 'lincs_l1000',\n",
+ "        'novartis': 'novartis',\n",
+ "        'ginkgo_human_epithelial_melanocytes': 'ginkgo_melanocytes',\n",
+ "        'ginkgo_human_dermal_fibroblast': 'ginkgo_fibroblasts',\n",
+ "        'ginkgo_human_aortic_smooth_muscle_cells': 'ginkgo_muscle_cells',\n",
+ "        'ginkgo_human_skeletal_muscle_myoblasts': 'ginkgo_myoblasts'\n",
+ "    }\n",
+ "    # .get(k, k): a cell type missing from the table keeps its own name instead of raising KeyError\n",
+ "    top_sets = {rename_keys.get(k, k):v for k,v in top_sets.items()}\n",
+ "    upset_data = from_contents(top_sets)\n",
+ "    plot(upset_data, orientation = 'horizontal', show_counts = True, element_size = 30)\n",
+ "    pyplot.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "58fa71a7",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_markdown(f\"**Overlap among top up regulators of {query_gene}**\", raw=True)\n",
+ "create_upset(top_up)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0ecff74c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_markdown(f\"**Overlap among top down regulators of {query_gene}**\", raw=True)\n",
+ "create_upset(top_down)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c1aee4c2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_overlapping_sets(top_sets:dict, to_file:str):\n",
+ "    '''\n",
+ "    Given the dictionary of sets used to create the UpSet plot,\n",
+ "    return the contents of the overlapping sets.\n",
+ "\n",
+ "    top_sets: dataset key -> set of drug names.\n",
+ "    to_file: currently unused; kept so existing calls keep working.\n",
+ "\n",
+ "    Returns a DataFrame with one row per combination of datasets that shares\n",
+ "    drugs: `Members` (dataset names joined by '-'), `Overlap` (list of drugs),\n",
+ "    `Length` (number of drugs) and `N Datasets`, sorted by `N Datasets`, largest first.\n",
+ "    '''\n",
+ "    result_columns = ['Members', 'Overlap', 'Length', 'N Datasets']\n",
+ "    # convert to multi-index dataframe (index levels are read below as per-dataset membership flags)\n",
+ "    set_df = from_contents(top_sets)\n",
+ "    multi_index_df = pd.DataFrame(columns=list(top_sets.keys()))\n",
+ "    for colname in multi_index_df.columns:\n",
+ "        multi_index_df[colname] = set_df.index.get_level_values(colname).to_list()\n",
+ "    # only keep unique sets of intersection contributors\n",
+ "    multi_index_df = multi_index_df.drop_duplicates()\n",
+ "    # sort multi-index for efficient indexing\n",
+ "    set_df = set_df.sort_index()\n",
+ "    # extract drug intersection for each group; rows are collected first and the\n",
+ "    # table is built once instead of being re-concatenated on every iteration\n",
+ "    records = []\n",
+ "    for idx in range(multi_index_df.shape[0]):\n",
+ "        membership = multi_index_df.iloc[idx]\n",
+ "        ixn_drugs = set_df.loc[tuple(membership)].id.to_list()\n",
+ "        # get group members: the datasets whose flag is True for this combination\n",
+ "        ixn_name = membership[membership].index.to_list()\n",
+ "        records.append({\n",
+ "            'Members': '-'.join(ixn_name),\n",
+ "            'Overlap': ixn_drugs,\n",
+ "            'Length': len(ixn_drugs),\n",
+ "            'N Datasets': len(ixn_name)\n",
+ "        })\n",
+ "    # explicit columns keep `N Datasets` present even when no group was found\n",
+ "    overlapping_sets = pd.DataFrame(records, columns=result_columns)\n",
+ "    overlapping_sets = overlapping_sets.sort_values('N Datasets', ascending=False).reset_index(drop=True)\n",
+ "    return overlapping_sets\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "556a29cc",
+ "metadata": {},
+ "source": [
+ "Below are tabular representations of the UpSet plots."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "726129a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "overlap_down = get_overlapping_sets(top_down, 'overlap_down')\n",
+ "overlap_up = get_overlapping_sets(top_up, 'overlap_up')\n",
+ "display_markdown(\"**Down-regulating drug overlap**\", raw=True)\n",
+ "display(overlap_down)\n",
+ "display(HTML(download_link(overlap_down, 'overlapping_drugs_DnReg.tsv')))\n",
+ "display_markdown(\"**Up-regulating drug overlap**\", raw=True)\n",
+ "display(overlap_up)\n",
+ "display(HTML(download_link(overlap_up, 'overlapping_drugs_UpReg.tsv')))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a4eb5f1d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_logFC_averages(overlapping_df, direction):\n",
+ "    '''\n",
+ "    Retrieve average logFC across datasets for drugs in overlapping sets.\n",
+ "\n",
+ "    overlapping_df: table with an `Overlap` column (list of drug names) and an\n",
+ "        `N Datasets` column, as returned by get_overlapping_sets.\n",
+ "    direction: 'up' or 'down'; selects which ranking results are read and the\n",
+ "        sort order of the result. Any other value raises ValueError.\n",
+ "\n",
+ "    For every drug the logFC is averaged over all perturbation rows, in every\n",
+ "    available dataset, whose drug name matches case-insensitively. A drug with\n",
+ "    no matching row gets NaN.\n",
+ "\n",
+ "    Returns dataframe with columns for:\n",
+ "        Drug\n",
+ "        Average logFC\n",
+ "        Number of datasets for which drug was a significant regulator of the query gene\n",
+ "    '''\n",
+ "    # extract up or down data\n",
+ "    if direction == 'down':\n",
+ "        ginkgo_ranks, novartis_ranks, lincs_df = ginkgo_drugs_down, novartis_drugs_down, l1000_down\n",
+ "    elif direction == 'up':\n",
+ "        ginkgo_ranks, novartis_ranks, lincs_df = ginkgo_drugs_up, novartis_drugs_up, l1000_up\n",
+ "    else:\n",
+ "        raise ValueError(f'direction must be up or down, got {direction!r}')\n",
+ "    # element [1] of each ranking result is the full per-perturbation table; the\n",
+ "    # Ginkgo tables are taken from whatever cell types are present rather than a fixed list\n",
+ "    de_tables = [ranks[1] for ranks in ginkgo_ranks.values()]\n",
+ "    de_tables.append(novartis_ranks[1])\n",
+ "    # (lower-cased drug names, logFC values) per dataset, computed once up front\n",
+ "    lookups = [(df['Drug'].str.lower(), df['logFC']) for df in de_tables]\n",
+ "    if not lincs_df.empty:\n",
+ "        lookups.append((lincs_df['Perturbagen'].str.lower(), lincs_df['Log2(Fold Change)']))\n",
+ "    # get average, integrating across datasets\n",
+ "    records = []\n",
+ "    for _,row in overlapping_df.iterrows():\n",
+ "        for d in row['Overlap']:\n",
+ "            key = d.lower()\n",
+ "            n = 0\n",
+ "            runsum = 0.0\n",
+ "            for names, logfc in lookups:\n",
+ "                hits = logfc[names == key]\n",
+ "                n = n + hits.shape[0]\n",
+ "                runsum = runsum + hits.sum()\n",
+ "            # n can be 0 when no table spells the drug this way; avoid dividing by zero\n",
+ "            avg = round(runsum / n, 3) if n else float('nan')\n",
+ "            records.append({'Drug': d, 'Avg(logFC)': avg, 'N Datasets': row['N Datasets']})\n",
+ "    # create results dataframe\n",
+ "    res_df = pd.DataFrame(records, columns=['Drug', 'Avg(logFC)', 'N Datasets'])\n",
+ "    # sort based on N datasets and logFC, direction depending on up or down set\n",
+ "    if direction=='up':\n",
+ "        res_df = res_df.sort_values(['N Datasets','Avg(logFC)'], ascending=[False,False])\n",
+ "    elif direction == 'down':\n",
+ "        res_df = res_df.sort_values(['N Datasets','Avg(logFC)'], ascending=[False,True])\n",
+ "    return res_df\n",
+ "\n",
+ " \n",
+ "overlapping_up_logFC = get_logFC_averages(overlap_up, 'up')\n",
+ "overlapping_down_logFC = get_logFC_averages(overlap_down, 'down')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bc739111",
+ "metadata": {},
+ "source": [
+ "Tables that show the average logFC value (across datasets) for drugs that were found to be significant regulators in more than one dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "10420089",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "display_markdown(\"**Average LogFC values across datasets: Up-regulating drugs**\", raw=True)\n",
+ "display(overlapping_up_logFC.head(n=top_n))\n",
+ "display(HTML(download_link(overlapping_up_logFC, 'overlapping_drugs_logfc_UpReg.tsv')))\n",
+ "display_markdown(\"**Average LogFC values across datasets: Down-regulating drugs**\", raw=True)\n",
+ "display(overlapping_down_logFC.head(n=top_n))\n",
+ "display(HTML(download_link(overlapping_down_logFC, 'overlapping_drugs_logfc_DnReg.tsv')))\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c088c17f",
+ "metadata": {},
+ "source": [
+ "## Volcano Plots"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "30eb397f",
+ "metadata": {},
+ "source": [
+ "The volcano plots show the strength and statistical significance of the drug perturbation for each signature in the dataset (drug and dose specific). The color of each point indicates up (red) or down (blue) regulation. Hover over points in the volcano plot to see the label (with cell type, drug, and dose information), logFC, fold-change, and log10-transformed p-value or CD-coefficient. Tools to the right of the plot allow you to manipulate (pan, zoom) and download the figure. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7d6d36ba",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_notebook()\n",
+ "\n",
+ "def create_bokeh_volcano_plot(expr_data:pd.DataFrame, gene_id:str, cell_type:str, source:str):\n",
+ "    '''\n",
+ "    Given the expression data for a given gene, create an interactive\n",
+ "    volcano plot that shows regulation of gene across all perturbations (drug, dosage, cell line).\n",
+ "\n",
+ "    expr_data: per-perturbation table; the columns read depend on `source`.\n",
+ "    gene_id, cell_type: only used in the plot title.\n",
+ "    source: 'Ginkgo', 'Novartis' or 'L1000'. Ginkgo and Novartis plot the\n",
+ "        -log10 adjusted p-value on the y axis, L1000 the absolute CD coefficient.\n",
+ "        Any other value raises ValueError.\n",
+ "\n",
+ "    The plot is shown with bokeh; nothing is returned and `expr_data` is not modified.\n",
+ "    '''\n",
+ "    if source not in ('Ginkgo', 'Novartis', 'L1000'):\n",
+ "        raise ValueError(f'Unknown source: {source!r}')\n",
+ "\n",
+ "    df = expr_data.copy()\n",
+ "\n",
+ "    # clean columns\n",
+ "    if source == 'Ginkgo':\n",
+ "        df['Label'] = df['Perturbation']\n",
+ "        df['FC'] = 2**df['logFC']\n",
+ "    elif source == 'Novartis':\n",
+ "        df['Label'] = df['Perturbation'] + '_' + df['Drug']\n",
+ "        df['FC'] = 2**df['logFC']\n",
+ "    else:\n",
+ "        df = df.rename(columns={'Perturbagen':'Drug', 'Log2(Fold Change)':'logFC','Fold Change':'FC'})\n",
+ "        df['absCDcoef'] = df['CD Coefficient'].abs()\n",
+ "        df['Label'] = df.index.to_list()\n",
+ "\n",
+ "    # y column, its axis / tooltip title and its tooltip format\n",
+ "    x = 'logFC'\n",
+ "    if source == 'L1000':\n",
+ "        y, y_title, y_tooltip = 'absCDcoef', 'abs(CD Coefficient)', '@{absCDcoef}{0.00e}'\n",
+ "    else:\n",
+ "        y, y_title, y_tooltip = 'log10adj.P.Val', '-Log10(Adj. P-value)', '@{log10adj.P.Val}{0.00e}'\n",
+ "\n",
+ "    # set plot source\n",
+ "    plot_source = ColumnDataSource(df.loc[:,['Label', x, 'FC', y]])\n",
+ "    hover = HoverTool(tooltips=[('Label', '@Label'),\n",
+ "                                ('Log2(FC)', '@logFC'),\n",
+ "                                ('Fold Change', '@FC'),\n",
+ "                                (y_title, y_tooltip)])\n",
+ "\n",
+ "    # define figure\n",
+ "    p = figure(\n",
+ "        title=f'{gene_id} Regulation in {source} {cell_type}',\n",
+ "        x_axis_label = 'Log2(Fold Change)',\n",
+ "        y_axis_label = y_title,\n",
+ "        tools = 'pan,wheel_zoom,box_zoom,reset,save'\n",
+ "    )\n",
+ "\n",
+ "    # color mapper: a range symmetric around 0 puts the palette midpoint at logFC = 0,\n",
+ "    # so the two halves of the palette separate up- from down-regulation;\n",
+ "    # Series.abs().max() also skips missing values\n",
+ "    limit = df['logFC'].abs().max()\n",
+ "    color_mapper = LinearColorMapper(palette = RdBu[10],\n",
+ "                                     low = -limit,\n",
+ "                                     high = limit)\n",
+ "    # plot\n",
+ "    p.scatter(x=x,\n",
+ "              y=y,\n",
+ "              size=8,\n",
+ "              source=plot_source,\n",
+ "              fill_alpha=0.6,\n",
+ "              color = {'field':'logFC','transform':color_mapper})\n",
+ "    p.add_tools(hover)\n",
+ "    show(p)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e1e29e34",
+ "metadata": {},
+ "source": [
+ "### Ginkgo"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3315ba88",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "for cell_type, expr_df in ginkgo_gene_expr_dict.items():\n",
+ " cell_name = ' '.join(re.sub('human_','',cell_type).split('_'))\n",
+ " create_bokeh_volcano_plot(expr_df, query_gene, cell_name, 'Ginkgo')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3571fcfb",
+ "metadata": {},
+ "source": [
+ "### L1000"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "36a1f0cc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "create_bokeh_volcano_plot(pd.concat([l1000_up,l1000_down]), query_gene, '', 'L1000')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "242d31af",
+ "metadata": {},
+ "source": [
+ "### Novartis DRUG-seq"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7f38d013",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "create_bokeh_volcano_plot(novartis_de, query_gene, '', 'Novartis')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "my-bioinfo-env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/appyters/Drug_Gene_Budger2/requirements.txt b/appyters/Drug_Gene_Budger2/requirements.txt
new file mode 100644
index 00000000..d81fe56f
--- /dev/null
+++ b/appyters/Drug_Gene_Budger2/requirements.txt
@@ -0,0 +1,9 @@
+appyter @ git+https://github.com/Maayanlab/appyter
+numpy
+pandas
+pyarrow
+requests
+IPython
+upsetplot
+matplotlib
+bokeh
\ No newline at end of file
diff --git a/appyters/Drug_Gene_Budger2/static/dgb_logo.png b/appyters/Drug_Gene_Budger2/static/dgb_logo.png
new file mode 100644
index 00000000..aaccbe90
Binary files /dev/null and b/appyters/Drug_Gene_Budger2/static/dgb_logo.png differ