From e50f8a13705e987d1dfebbd7bcfc4305158a7aa5 Mon Sep 17 00:00:00 2001
From: Lily Taub <115661359+lilydtaub@users.noreply.github.com>
Date: Fri, 26 Sep 2025 16:47:28 -0400
Subject: [PATCH 1/2] add Deepcover MoA proteomics data

---
 appyters/Drug_Gene_Budger2/appyter.json       |   2 +-
 .../drug_gene_budger2_appyter.ipynb           | 181 ++++++++++++++++--
 2 files changed, 170 insertions(+), 13 deletions(-)

diff --git a/appyters/Drug_Gene_Budger2/appyter.json b/appyters/Drug_Gene_Budger2/appyter.json
index dc052fd3..dbbd7eda 100644
--- a/appyters/Drug_Gene_Budger2/appyter.json
+++ b/appyters/Drug_Gene_Budger2/appyter.json
@@ -2,7 +2,7 @@
     "$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json",
     "name": "Drug_Gene_Budger2",
     "title": "Dr. Gene Budger (DGB) 2",
-    "version": "0.0.6",
+    "version": "0.0.7",
     "description": "An appyter that retrieves drugs that up-regulate and down-regulate a single input gene across Connectivity Mapping datasets",
     "image": "dgb_logo.png",
     "authors": [
diff --git a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
index 96403a16..dadd541d 100644
--- a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
+++ b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
@@ -95,13 +95,15 @@
    "id": "7ea9d01d",
    "metadata": {},
    "source": [
-    "This notebook takes a gene as input and identifies drugs that maximally up and down regulate the gene's expression in a collection of chemical perturbation datasets.\n",
+    "This notebook takes a gene as input and identifies drugs that maximally up and down regulate the gene's mRNA expression in a collection of connectivity mapping resources that measure transcriptional response to chemical perturbations:\n",
     "\n",
     "- Ginkgo GDPx1 and GPDx2: Limma-Voom based differential gene expression results for 1,354 drugs.\n",
     "- Novartis DRUG-seq: Differential: Limma-Trend based differential gene expression results for 4,343 drugs. \n",
     "- LINCS L1000 Chemical Perturbations: Limma-Voom based differential gene expression results for a subset of 4,091 drugs from the LINCS L1000 Chemical Perturbation dataset. \n",
     "\n",
-    "The Ginkgo dataset includes 4 primary cell types (epithelial melanocytes, smooth aortic muscle cells, skeletal muscle myoblasts and dermal fibroblasts) and one cell line (A549 lung carcinoma cell line). Previous analysis showed distinct transcriptional responses by cell type, so the drug rankings for the Ginkgo dataset are separated by cell type."
+    "The Ginkgo dataset includes 4 primary cell types (epithelial melanocytes, smooth aortic muscle cells, skeletal muscle myoblasts and dermal fibroblasts) and one cell line (A549 lung carcinoma cell line). Previous analysis showed distinct transcriptional responses by cell type, so the drug rankings for the Ginkgo dataset are separated by cell type.\n",
+    "\n",
+    "The Deepcover MoA proteomics dataset is used to present protein-level regulation of the query gene. You can compare protein-level and mRNA-level regulation for compounds used in both the Deepcover MoA and connectivity mapping resources."
    ]
   },
   {
@@ -142,10 +144,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Storage url for Ginkgo and Novartis DE files\n",
+    "# Storage URLs for DE gene files\n",
     "ginkgo_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/ginkgo_de'\n",
     "novartis_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/novartis_de'\n",
     "lincs_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_de'\n",
+    "deepcover_moa_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/deepcoverMoa_de'\n",
+    "\n",
     "# silence warnings\n",
     "warnings.filterwarnings('ignore')"
    ]
@@ -263,6 +267,33 @@
     "    raise Exception(\"Execution stopped, gene not found in any datasets\")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ac47199",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get proteomics data; a failed read is treated as the gene being absent from Deepcover MoA\n",
+    "in_deepcover = True\n",
+    "try:\n",
+    "    protein_de = pd.read_feather(f'{deepcover_moa_URL}/{gene_file}').set_index('index')\n",
+    "except Exception:\n",
+    "    in_deepcover=False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cb86094c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Get pubchem ID dataframe\n",
+    "pubchem_location = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/cmap_pubchem_ids.csv'\n",
+    "pubchem_ids = pd.read_csv(pubchem_location, dtype = {'Drug':str, 'CID':str})"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "dd12adb9",
@@ -698,13 +729,16 @@
     "    n_datasets = list()\n",
     "    for _,row in overlapping_df.iterrows():\n",
     "        n_datasets.extend([row['N Datasets']]*len(row['Overlap']))\n",
+    "        member_sets = row['Members']\n",
     "        for d in row['Overlap']:\n",
     "            n = 0\n",
     "            runsum_rank = 0\n",
     "            runsum_pctrank = 0\n",
     "            runsum_logFC = 0\n",
     "            runsum_pval = 0\n",
-    "            for _,df in data_dict.items():\n",
+    "            for source_name,df in data_dict.items():\n",
+    "                if not re.search(source_name, member_sets):\n",
+    "                    continue\n",
     "                subset = df[df['Drug'].str.lower() == d.lower()]\n",
     "                n = n + subset.shape[0]\n",
     "                runsum_rank = runsum_rank + subset.Rank.sum()\n",
@@ -730,7 +764,20 @@
     "    else:\n",
     "        # sort based on N datasets and average adjusted p-value\n",
     "        res_df = res_df.sort_values(['N Datasets','Avg Adj.P.Val'], ascending=[False,True])\n",
-    "    return res_df"
+    "    return res_df\n",
+    "\n",
+    "\n",
+    "def join_proteomics(ranking_table, protein_de):\n",
+    "    # join with PubChem ID table\n",
+    "    with_cids = ranking_table.merge(pubchem_ids, how='left', on='Drug')\n",
+    "    # Drop those drugs that did not have PubChem IDs\n",
+    "    with_cids = with_cids[with_cids['CID'].notna()]\n",
+    "    # join with proteomics data on PubChem IDs\n",
+    "    with_proteins = with_cids.merge(protein_de[['UniprotID','Pubchem','logFC']], how='inner', left_on='CID', right_on='Pubchem')\n",
+    "    # clean column names\n",
+    "    with_proteins.rename(columns = {'logFC':'Protein logFC', 'Pubchem' : 'PubChem CID'}, inplace=True)\n",
+    "    with_proteins.drop(columns='CID',inplace=True)\n",
+    "    return with_proteins"
    ]
   },
   {
@@ -792,6 +839,69 @@
     "    display(HTML(download_link(overlapping_down_TargetRank, f'overlapping_drugs_averages_{query_gene}_DnReg.tsv')))"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "2aaee19a",
+   "metadata": {},
+   "source": [
+    "### Protein Regulation\n",
+    "\n",
+    "Query gene regulation at the protein level is displayed in the tables below. Proteomics data is from the [Deepcover MoA dataset](https://wren.hms.harvard.edu/DeepCoverMOA/), in which cells from the HCT116 cancer cell line were exposed to 875 small molecule compounds. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "53cc7efd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if in_deepcover:\n",
+    "    up_protein = protein_de[protein_de['logFC'] > 0].loc[:,['UniprotID','Drug','Pubchem','logFC','Zscore','UpRank','PctUpRank']].sort_values('logFC',ascending=False).reset_index().drop(columns='index')\n",
+    "    up_protein.rename(columns={'Pubchem':'PubChem CID','Zscore':'Z-score','UpRank':'Up Rank', 'PctUpRank':'Normalized Up Rank'}, inplace=True)\n",
+    "    display_markdown(\"**Up-regulating drugs**\", raw=True)\n",
+    "    display(up_protein.head(top_n))\n",
+    "    display(HTML(download_link(up_protein, f'DeepcoverMoa_protein_{query_gene}_UpReg.tsv')))\n",
+    "\n",
+    "    dn_protein = protein_de[protein_de['logFC'] < 0].loc[:,['UniprotID','Drug','Pubchem','logFC','Zscore','DnRank','PctDnRank']].sort_values('logFC', ascending=True).reset_index().drop(columns='index')\n",
+    "    dn_protein.rename(columns={'Pubchem':'PubChem CID','Zscore':'Z-score','DnRank':'Down Rank', 'PctDnRank':'Normalized Down Rank'}, inplace=True)\n",
+    "    display_markdown(\"**Down-regulating drugs**\", raw=True)\n",
+    "    display(dn_protein.head(top_n))\n",
+    "    display(HTML(download_link(dn_protein, f'DeepcoverMoa_protein_{query_gene}_DnReg.tsv')))\n",
+    "else:\n",
+    "    display_markdown(f\"Protein of {query_gene} not in DeepCover MoA Dataset\", raw=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a0630a08",
+   "metadata": {},
+   "source": [
+    "If the protein associated with the query gene was found in the Deepcover MoA proteomics dataset, the tables below show how the protein was up or down-regulated by the consensus drugs identified in the connectivity mapping resources. The tables only include compounds that were used in both the connectivity mapping resources and the Deepcover MoA dataset. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6b46e327",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if in_deepcover:\n",
+    "    up_with_cid = join_proteomics(overlapping_up_TargetRank, protein_de)\n",
+    "    dn_with_cid = join_proteomics(overlapping_down_TargetRank, protein_de)\n",
+    "    \n",
+    "    display_markdown(\"**Up-regulating drugs with protein expression**\", raw=True)\n",
+    "    display(up_with_cid.head(n=top_n))\n",
+    "    display(HTML(download_link(up_with_cid, f'{query_gene}_mRNA_protein_UpReg.tsv')))\n",
+    "    \n",
+    "    display_markdown(\"**Down-regulating drugs with protein expression**\", raw=True)\n",
+    "    display(dn_with_cid.head(n=top_n))\n",
+    "    display(HTML(download_link(dn_with_cid, f'{query_gene}_mRNA_protein_DnReg.tsv')))\n",
+    "else:\n",
+    "    display_markdown(f\"Protein of {query_gene} not in DeepCover MoA Dataset\", raw=True)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "2e7c13dd",
@@ -954,22 +1064,43 @@
     "        df['Label'] = df['Perturbation'] + '_' + df['Drug']\n",
     "    elif source == 'L1000':\n",
     "        df['Label'] = df['Perturbation']\n",
+    "    elif source == 'Deepcover MoA':\n",
+    "        df['Label'] = df['Drug']\n",
+    "        df['abs_Zscore'] = df['Zscore'].apply(abs)\n",
     "\n",
     "    # set plot source\n",
-    "    plot_source = ColumnDataSource(df.loc[:,['Label','logFC','FC','log10adj.P.Val', 'Rank', 'PctRank']])\n",
-    "    x,y='logFC','log10adj.P.Val'\n",
-    "    hover = HoverTool(tooltips=[(\"Label\", \"@Label\"),\n",
+    "    if source != 'Deepcover MoA':\n",
+    "        plot_source = ColumnDataSource(df.loc[:,['Label','logFC','FC','log10adj.P.Val', 'Rank', 'PctRank']])\n",
+    "        x,y='logFC','log10adj.P.Val'\n",
+    "        xlabel,ylabel = 'Log2(Fold Change)','-Log10(Adj. p-value)'\n",
+    "        title = f'{gene_id} Regulation in {source} {cell_type}'\n",
+    "        hover = HoverTool(tooltips=[(\"Label\", \"@Label\"),\n",
     "                            (\"Log2(FC)\", \"@logFC\"),\n",
     "                            (\"Fold Change\", \"@FC\"),\n",
     "                            ('-Log10(Adj. p-value)',\"@{log10adj.P.Val}{0.00e}\"),\n",
     "                            (\"Raw Rank\", \"@Rank\"),\n",
     "                            (\"Normalized Rank\", \"@PctRank\")])\n",
+    "    else:\n",
+    "        plot_source = ColumnDataSource(df.loc[:,['Label','logFC','FC','abs_Zscore','UpRank','DnRank','PctUpRank','PctDnRank']])\n",
+    "        x,y = 'logFC','abs_Zscore'\n",
+    "        xlabel,ylabel = 'Log2(Fold Change)','Abs(Z-score)'\n",
+    "        title = f'{gene_id}: {df[\"UniprotID\"].iloc[0]} Regulation in {source} {cell_type}'\n",
+    "        hover = HoverTool(tooltips=[(\"Label\", \"@Label\"),\n",
+    "                            (\"Log2(FC)\", \"@logFC\"),\n",
+    "                            (\"Fold Change\", \"@FC\"),\n",
+    "                            ('abs(z-score)',\"@{abs_Zscore}{0.00e}\"),\n",
+    "                            (\"Up Rank\", \"@UpRank\"),\n",
+    "                            (\"Normalized Up Rank\",\"@PctUpRank\"),\n",
+    "                            (\"Down Rank\", \"@DnRank\"),\n",
+    "                            (\"Normalized Down Rank\",\"@PctDnRank\")])\n",
+    "\n",
+    "    \n",
     "        \n",
     "    # define figure\n",
     "    p = figure(\n",
-    "        title=f'{gene_id} Regulation in {source} {cell_type}',\n",
-    "        x_axis_label = 'Log2(Fold Change)',\n",
-    "        y_axis_label = '-Log10(Adj. p-value)',\n",
+    "        title=title,\n",
+    "        x_axis_label = xlabel,\n",
+    "        y_axis_label = ylabel,\n",
     "        tools = 'pan,wheel_zoom,box_zoom,reset,save'\n",
     "    )\n",
     "\n",
@@ -1053,6 +1184,30 @@
     "    display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq dataset', raw=True)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "8c3df5f5",
+   "metadata": {},
+   "source": [
+    "### Deepcover MoA\n",
+    "\n",
+    "The Deepcover MoA proteomics dataset consists of proteome fingerprints for 875 chemical perturbations. The volcano plot of protein expression shows the logFC on the x-axis and the absolute z-score on the y-axis, i.e. the number of standard deviations between the protein's logFC for a given compound and the protein's mean logFC across all compounds. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "213d50f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if in_deepcover:\n",
+    "    # check for multiple proteins\n",
+    "    uniprot_ids = list(protein_de['UniprotID'].unique())\n",
+    "    for uid in uniprot_ids:\n",
+    "        create_bokeh_volcano_plot(protein_de[protein_de['UniprotID']==uid], query_gene, '', 'Deepcover MoA')"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "ba0c6439",
@@ -1070,7 +1225,9 @@
     "\n",
     "[5] “LINCS L1000 Reverse Search.” n.d. Accessed September 5, 2025. https://lincs-reverse-search-dashboard.dev.maayanlab.cloud/.\n",
     "\n",
-    "[6] Wang, Zichen, Edward He, Kevin Sani, Kathleen M. Jagodnik, Moshe C. Silverstein, and Avi Ma’ayan. 2019. “Drug Gene Budger (DGB): An Application for Ranking Drugs to Modulate a Specific Gene Based on Transcriptomic Signatures.” Bioinformatics (Oxford, England) 35 (7): 1247–48."
+    "[6] Wang, Zichen, Edward He, Kevin Sani, Kathleen M. Jagodnik, Moshe C. Silverstein, and Avi Ma’ayan. 2019. “Drug Gene Budger (DGB): An Application for Ranking Drugs to Modulate a Specific Gene Based on Transcriptomic Signatures.” Bioinformatics (Oxford, England) 35 (7): 1247–48.\n",
+    "\n",
+    "[7] Mitchell, Dylan C., Miljan Kuljanin, Jiaming Li, Jonathan G. Van Vranken, Nathan Bulloch, Devin K. Schweppe, Edward L. Huttlin, and Steven P. Gygi. 2023. “A Proteome-Wide Atlas of Drug Mechanism of Action.” Nature Biotechnology 41 (6): 845–57."
    ]
   }
  ],

From f80559075de7a2f8e5ef0887595d71b2277a50c8 Mon Sep 17 00:00:00 2001
From: Lily Taub <115661359+lilydtaub@users.noreply.github.com>
Date: Tue, 7 Oct 2025 17:19:44 -0400
Subject: [PATCH 2/2] add Tahoe data to DGB2

---
 appyters/Drug_Gene_Budger2/appyter.json       |   2 +-
 .../drug_gene_budger2_appyter.ipynb           | 171 +++++++++++++++---
 2 files changed, 142 insertions(+), 31 deletions(-)

diff --git a/appyters/Drug_Gene_Budger2/appyter.json b/appyters/Drug_Gene_Budger2/appyter.json
index dbbd7eda..86ac03a7 100644
--- a/appyters/Drug_Gene_Budger2/appyter.json
+++ b/appyters/Drug_Gene_Budger2/appyter.json
@@ -2,7 +2,7 @@
     "$schema": "https://raw.githubusercontent.com/MaayanLab/appyter-catalog/main/schema/appyter-validator.json",
     "name": "Drug_Gene_Budger2",
     "title": "Dr. Gene Budger (DGB) 2",
-    "version": "0.0.7",
+    "version": "0.0.8",
     "description": "An appyter that retrieves drugs that up-regulate and down-regulate a single input gene across Connectivity Mapping datasets",
     "image": "dgb_logo.png",
     "authors": [
diff --git a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
index dadd541d..0cd54b0d 100644
--- a/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
+++ b/appyters/Drug_Gene_Budger2/drug_gene_budger2_appyter.ipynb
@@ -100,6 +100,7 @@
     "- Ginkgo GDPx1 and GPDx2: Limma-Voom based differential gene expression results for 1,354 drugs.\n",
     "- Novartis DRUG-seq: Differential: Limma-Trend based differential gene expression results for 4,343 drugs. \n",
     "- LINCS L1000 Chemical Perturbations: Limma-Voom based differential gene expression results for a subset of 4,091 drugs from the LINCS L1000 Chemical Perturbation dataset. \n",
+    "- Tahoe-100M: DESeq based differential gene expression results for 376 drugs tested across 50 different cancer cell lines. \n",
     "\n",
     "The Ginkgo dataset includes 4 primary cell types (epithelial melanocytes, smooth aortic muscle cells, skeletal muscle myoblasts and dermal fibroblasts) and one cell line (A549 lung carcinoma cell line). Previous analysis showed distinct transcriptional responses by cell type, so the drug rankings for the Ginkgo dataset are separated by cell type.\n",
     "\n",
@@ -119,6 +120,7 @@
     "import re\n",
     "from itertools import combinations\n",
     "import warnings\n",
+    "import hashlib\n",
     "\n",
     "## Tables\n",
     "from IPython.display import display, display_markdown, HTML\n",
@@ -129,6 +131,7 @@
     "\n",
     "## Venn Diagram\n",
     "from matplotlib_venn import venn3, venn2\n",
+    "import matplotlib.pyplot as plt\n",
     "\n",
     "## Volcano Plot\n",
     "from bokeh.plotting import figure, show\n",
@@ -149,6 +152,7 @@
     "novartis_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/novartis_de'\n",
     "lincs_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/lincs_de'\n",
     "deepcover_moa_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/deepcoverMoa_de'\n",
+    "tahoe_URL = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/tahoe_de'\n",
     "\n",
     "# silence warnings\n",
     "warnings.filterwarnings('ignore')"
@@ -161,7 +165,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "in_ginkgo = in_novartis = in_lincs = True"
+    "in_ginkgo = in_novartis = in_lincs = in_tahoe = True"
    ]
   },
   {
@@ -253,6 +257,39 @@
     "    in_novartis=False"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7910ada",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get Tahoe DE results for gene\n",
+    "\n",
+    "# hash_bucket function used to sort genes into buckets\n",
+    "def hash_bucket(gene, num_buckets=512):\n",
+    "    '''\n",
+    "    gene: Gene symbol\n",
+    "    num_buckets: number of hash buckets to create\n",
+    "\n",
+    "    Returns integer bucket index for the gene name (0 to num_buckets-1), derived from its MD5 digest\n",
+    "    '''\n",
+    "    return int(hashlib.md5(gene.encode()).hexdigest(),16) % num_buckets\n",
+    "\n",
+    "query_gene_encoded = hash_bucket(query_gene)\n",
+    "\n",
+    "try:\n",
+    "    tahoe_de = pd.read_parquet(f'{tahoe_URL}/gene_bucket_{query_gene_encoded}.parquet')\n",
+    "    tahoe_de = tahoe_de[tahoe_de['gene_name']==query_gene]\n",
+    "    if tahoe_de.empty: raise KeyError(query_gene)  # bucket files hold many genes; a successful read does not imply a hit\n",
+    "    tahoe_de['log10adj.P.Val'] = tahoe_de['padj'].replace(0,1e-323).map(np.log10)*-1\n",
+    "    tahoe_de.rename(columns = {'log2FoldChange':'logFC', 'drug':'Drug', 'padj':'adj.P.Val'}, inplace=True)\n",
+    "    tahoe_de['GeneDir'] = np.where(tahoe_de['UpReg']>0,'Up','Dn')\n",
+    "except Exception:\n",
+    "    print('Gene not in Tahoe-100M dataset')\n",
+    "    in_tahoe=False"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -260,10 +297,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_lincs + in_novartis + in_ginkgo < 1:\n",
+    "if in_lincs + in_novartis + in_ginkgo + in_tahoe < 1:\n",
     "    print(f\"LINCS: {in_lincs}\")\n",
     "    print(f\"Novartis: {in_novartis}\")\n",
     "    print(f\"Ginkgo: {in_ginkgo}\")\n",
+    "    print(f\"Tahoe-100M: {in_tahoe}\")\n",
     "    raise Exception(\"Execution stopped, gene not found in any datasets\")"
    ]
   },
@@ -290,7 +328,7 @@
    "outputs": [],
    "source": [
     "# Get pubchem ID dataframe\n",
-    "pubchem_location = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/cmap_pubchem_ids.csv'\n",
+    "pubchem_location = 'https://appyters.maayanlab.cloud/storage/DrugRegulators_Appyter/cmap_pubchem_ids_10062025.csv'\n",
     "pubchem_ids = pd.read_csv(pubchem_location, dtype = {'Drug':str, 'CID':str})"
    ]
   },
@@ -533,6 +571,45 @@
     "    display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq', raw=True)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "c04f416c",
+   "metadata": {},
+   "source": [
+    "### Tahoe-100M"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e07834e6",
+   "metadata": {},
+   "source": [
+    "Drug rankings for the Tahoe-100M dataset. Top 20 by the chosen ranking method are shown, and the full results are available for download. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ca416e30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if in_tahoe:\n",
+    "    tahoe_drugs_up = get_rankings(tahoe_de, 'Tahoe', '', 'up', ranking_method)\n",
+    "    tahoe_drugs_down = get_rankings(tahoe_de, 'Tahoe', '', 'down', ranking_method)\n",
+    "\n",
+    "    display_markdown(f'**Top {top_n} up-regulators in Tahoe-100M**', raw=True)\n",
+    "    display(tahoe_drugs_up[0].head(top_n))\n",
+    "    display(HTML(download_link(tahoe_drugs_up[0], f'tahoe_drug_ranks_{query_gene}_UpReg.tsv', 'Download results averaged across drug dosages')))\n",
+    "    display(HTML(download_link(tahoe_drugs_up[1], f'tahoe_drug_ranks_{query_gene}_full_UpReg.tsv', 'Download results for all perturbations')))\n",
+    "    display_markdown(f'**Top {top_n} down-regulators in Tahoe-100M**', raw=True)\n",
+    "    display(tahoe_drugs_down[0].head(top_n))\n",
+    "    display(HTML(download_link(tahoe_drugs_down[0], f'tahoe_drug_ranks_{query_gene}_DnReg.tsv', 'Download results averaged across drug dosages')))\n",
+    "    display(HTML(download_link(tahoe_drugs_down[1], f'tahoe_drug_ranks_{query_gene}_full_DnReg.tsv', 'Download results for all perturbations')))\n",
+    "else:\n",
+    "    display_markdown(f'**{query_gene}** not found in Tahoe-100M', raw=True)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -554,7 +631,11 @@
     "# get results from novartis\n",
     "if in_novartis:\n",
     "    top_up['novartis'] = get_top(novartis_drugs_up[0], n=50)\n",
-    "    top_down['novartis'] = get_top(novartis_drugs_down[0], n=50)"
+    "    top_down['novartis'] = get_top(novartis_drugs_down[0], n=50)\n",
+    "# get results from Tahoe\n",
+    "if in_tahoe:\n",
+    "    top_up['tahoe'] = get_top(tahoe_drugs_up[0], n=50)\n",
+    "    top_down['tahoe'] = get_top(tahoe_drugs_down[0], n=50)"
    ]
   },
   {
@@ -605,6 +686,7 @@
     "            'ginkgo_A549': 'ginkgo_A549',\n",
     "            'lincs_l1000': 'lincs_l1000',\n",
     "            'novartis': 'novartis',\n",
+    "            'tahoe': 'tahoe',\n",
     "            'ginkgo_human_epithelial_melanocytes': 'ginkgo_melanocytes',\n",
     "            'ginkgo_human_dermal_fibroblast': 'ginkgo_fibroblasts',\n",
     "            'ginkgo_human_aortic_smooth_muscle_cells': 'ginkgo_muscle_cells',\n",
@@ -623,7 +705,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    display_markdown(f\"**Overlap among top up regulators of {query_gene}**\", raw=True)\n",
@@ -692,7 +774,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    overlap_down = get_overlapping_sets(top_down)\n",
@@ -713,14 +795,6 @@
    "outputs": [],
    "source": [
     "def get_ranking_averages(overlapping_df, data_dict, ranking_method):\n",
-    "    '''\n",
-    "    Retrieve average target ranking across datasets for drugs in overlapping sets. \n",
-    "\n",
-    "    Returns dataframe with columns for:\n",
-    "    Drug\n",
-    "    Average Rank\n",
-    "    Number of datasets for which drug was a significant regulator of the query gene\n",
-    "    '''\n",
     "    # get average, integrating across datasets\n",
     "    average_rank_vals = {}\n",
     "    average_pctrank_vals = {}\n",
@@ -777,7 +851,7 @@
     "    # clean column names\n",
     "    with_proteins.rename(columns = {'logFC':'Protein logFC', 'Pubchem' : 'PubChem CID'}, inplace=True)\n",
     "    with_proteins.drop(columns='CID',inplace=True)\n",
-    "    return with_proteins"
+    "    return with_proteins.sort_values(['N Datasets', 'Avg Adj.P.Val'], ascending=[False,True])"
    ]
   },
   {
@@ -794,11 +868,12 @@
     "                       'human_epithelial_melanocytes': in_ginkgo,\n",
     "                       'human_skeletal_muscle_myoblasts': in_ginkgo,\n",
     "                       'novartis': in_novartis,\n",
-    "                       'lincs': in_lincs}\n",
+    "                       'lincs': in_lincs,\n",
+    "                       'tahoe': in_tahoe}\n",
     "data_dict_down ={}\n",
     "data_dict_up = {}\n",
     "for source,present in data_source_present.items():\n",
-    "    if (present) & (not source in ['novartis','lincs']):\n",
+    "    if (present) & (not source in ['novartis','lincs','tahoe']):\n",
     "        data_dict_down[source] = ginkgo_drugs_down[source][1]\n",
     "        data_dict_up[source] = ginkgo_drugs_up[source][1]\n",
     "    elif (present) & (source == 'lincs'):\n",
@@ -807,8 +882,11 @@
     "    elif (present) & (source == 'novartis'):\n",
     "        data_dict_down[source] = novartis_drugs_down[1]\n",
     "        data_dict_up[source] = novartis_drugs_up[1]\n",
+    "    elif (present) & (source == 'tahoe'):\n",
+    "        data_dict_down[source] = tahoe_drugs_down[1]\n",
+    "        data_dict_up[source] = tahoe_drugs_up[1]\n",
     "\n",
-    "if in_ginkgo + in_lincs + in_novartis > 1:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe > 1:\n",
     "    overlapping_up_TargetRank = get_ranking_averages(overlap_up, data_dict_up, ranking_method)\n",
     "    overlapping_down_TargetRank = get_ranking_averages(overlap_down, data_dict_down, ranking_method)"
    ]
@@ -828,7 +906,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    display_markdown(\"**Averages across datasets: Up-regulating drugs**\", raw=True)\n",
@@ -909,7 +987,7 @@
    "source": [
     "## Venn Diagrams\n",
     "\n",
-    "The venn diagrams show the overlap among either up-regulating or down-regulating drugs across the three datasets Novartis DRUG-seq, LINCS L1000, and Ginkgo (all cell types grouped). "
+    "The venn diagrams show the pairwise overlap among either up-regulating or down-regulating drugs across the four Connectivity Mapping datasets Tahoe-100M, Novartis DRUG-seq, LINCS L1000, and Ginkgo (all cell types grouped). "
    ]
   },
   {
@@ -930,7 +1008,8 @@
     "# define input data for venn diagrams\n",
     "data_source_present = {'ginkgo':in_ginkgo,\n",
     "                       'lincs_l1000':in_lincs,\n",
-    "                       'novartis':in_novartis}\n",
+    "                       'novartis':in_novartis,\n",
+    "                       'tahoe': in_tahoe}\n",
     "venn_up = {}\n",
     "venn_down = {}\n",
     "for source,present in data_source_present.items():\n",
@@ -990,7 +1069,8 @@
     "    for datasets, overlap in results.items():\n",
     "        if len(overlap) == 0:\n",
     "            overlap = ['None']\n",
-    "        print(f\"{', '.join(datasets)}: {', '.join(overlap)}\")"
+    "        # print(f\"{', '.join(datasets)}: {', '.join(overlap)}\")\n",
+    "        return f\"{', '.join(datasets)}: {', '.join(overlap)}\""
    ]
   },
   {
@@ -1000,12 +1080,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    display_markdown(f'Overlap of top {query_gene} up-regulating drugs across sources', raw=True)\n",
-    "    create_venn(venn_up)\n",
-    "    print_overlap(venn_up)"
+    "    for combo in combinations(list(venn_up.keys()), 2):\n",
+    "        combo_venn = {k:venn_up[k] for k in combo if k in venn_up}\n",
+    "        create_venn(combo_venn)\n",
+    "        plt.title(print_overlap(combo_venn))\n",
+    "        plt.show()"
    ]
   },
   {
@@ -1015,12 +1098,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "if in_ginkgo + in_lincs + in_novartis < 2:\n",
+    "if in_ginkgo + in_lincs + in_novartis + in_tahoe < 2:\n",
     "    display_markdown(f'**{query_gene}** not found in at least 2 datasets')\n",
     "else:\n",
     "    display_markdown(f'Overlap of top {query_gene} down-regulating drugs across sources', raw=True)\n",
-    "    create_venn(venn_down)\n",
-    "    print_overlap(venn_down)"
+    "    for combo in combinations(list(venn_down.keys()), 2):\n",
+    "        combo_venn = {k:venn_down[k] for k in combo if k in venn_down}\n",
+    "        create_venn(combo_venn)\n",
+    "        plt.title(print_overlap(combo_venn))\n",
+    "        plt.show()"
    ]
   },
   {
@@ -1064,6 +1150,8 @@
     "        df['Label'] = df['Perturbation'] + '_' + df['Drug']\n",
     "    elif source == 'L1000':\n",
     "        df['Label'] = df['Perturbation']\n",
+    "    elif source == 'Tahoe':\n",
+    "        df['Label'] = df['Drug'] + '-' + df['concentration'].astype(str) + '-' + df['Cell_ID_Cellosaur']\n",
     "    elif source == 'Deepcover MoA':\n",
     "        df['Label'] = df['Drug']\n",
     "        df['abs_Zscore'] = df['Zscore'].apply(abs)\n",
@@ -1184,6 +1272,27 @@
     "    display_markdown(f'**{query_gene}** not found in Novartis DRUG-seq dataset', raw=True)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "7ada5a72",
+   "metadata": {},
+   "source": [
+    "### Tahoe-100M"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "679ff8cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if in_tahoe:\n",
+    "    create_bokeh_volcano_plot(tahoe_de, query_gene, '','Tahoe')\n",
+    "else:\n",
+    "    display_markdown(f'**{query_gene}** not found in Tahoe-100M dataset', raw=True)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "8c3df5f5",
@@ -1225,9 +1334,11 @@
     "\n",
     "[5] “LINCS L1000 Reverse Search.” n.d. Accessed September 5, 2025. https://lincs-reverse-search-dashboard.dev.maayanlab.cloud/.\n",
     "\n",
-    "[6] Wang, Zichen, Edward He, Kevin Sani, Kathleen M. Jagodnik, Moshe C. Silverstein, and Avi Ma’ayan. 2019. “Drug Gene Budger (DGB): An Application for Ranking Drugs to Modulate a Specific Gene Based on Transcriptomic Signatures.” Bioinformatics (Oxford, England) 35 (7): 1247–48.\n",
+    "[6] Mitchell, Dylan C., Miljan Kuljanin, Jiaming Li, Jonathan G. Van Vranken, Nathan Bulloch, Devin K. Schweppe, Edward L. Huttlin, and Steven P. Gygi. 2023. “A Proteome-Wide Atlas of Drug Mechanism of Action.” Nature Biotechnology 41 (6): 845–57.\n",
+    "\n",
+    "[7] Zhang, Jesse, Airol A. Ubas, Richard de Borja, Valentine Svensson, Nicole Thomas, Neha Thakar, Aidan Winters, et al. 2025. “Tahoe-100M: A Giga-Scale Single-Cell Perturbation Atlas for Context-Dependent Gene Function and Cellular Modeling.” bioRxiv. https://doi.org/10.1101/2025.02.20.639398.\n",
     "\n",
-    "[7] Mitchell, Dylan C., Miljan Kuljanin, Jiaming Li, Jonathan G. Van Vranken, Nathan Bulloch, Devin K. Schweppe, Edward L. Huttlin, and Steven P. Gygi. 2023. “A Proteome-Wide Atlas of Drug Mechanism of Action.” Nature Biotechnology 41 (6): 845–57."
+    "[8] Wang, Zichen, Edward He, Kevin Sani, Kathleen M. Jagodnik, Moshe C. Silverstein, and Avi Ma’ayan. 2019. “Drug Gene Budger (DGB): An Application for Ranking Drugs to Modulate a Specific Gene Based on Transcriptomic Signatures.” Bioinformatics (Oxford, England) 35 (7): 1247–48."
    ]
   }
  ],