Skip to content

Commit

Permalink
April 2024 Connect notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
ianfore committed Dec 1, 2024
1 parent c9b1095 commit 64d45c9
Show file tree
Hide file tree
Showing 5 changed files with 2,839 additions and 40 deletions.
242 changes: 242 additions & 0 deletions notebooks/Part 1 - FASPNotebook09-CRDC-BDC.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"../fasp/runner/credits/images/FASPNotebook09.jpg\" style=\"float: right;\">\n",
"\n",
"### BioDataCatalyst and CRDC Search-Locate-Compute\n",
"\n",
"* Search:\n",
" * CRDC - GECCO Data\n",
" * BioDataCatalyst - COPDGene\n",
"* Locate:\n",
" * CRDC DRS Service\n",
" * BioDataCatalyst DRS Service\n",
"* Compute: \n",
" * CRDC - Seven Bridges CGC\n",
" * ISB-CGC (Google Cloud)\n",
"\n",
"This copy of the FASPNotebook09 was created for the GA4GH Connect meeting April 2023."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from fasp.runner import FASPRunner\n",
"\n",
"# The implementations we're using\n",
"from fasp.loc import DRSMetaResolver\n",
"from fasp.workflow import GCPLSsamtools, sbcgcWESClient\n",
"from fasp.search import BigQuerySearchClient, DataConnectClient\n",
"\n",
"faspRunner = FASPRunner(program='FASPNotebook09.ipynb')\n",
"runNote = 'Two dbGaP Sources, CRDC and BioDataCatalyst'"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Step 1 - Discovery\n",
"Query for relevant DRS objects"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Searching the GA4GH registry for org.ga4gh:drs services\n",
"Running query\n",
"SELECT sp.dbGaP_Subject_ID,\n",
"'sbcgc:'||sb_drs_id FROM collections.public_datasets.dbgap_scr_gecco_susceptibility_subject_phenotypes_multi sp\n",
"join collections.public_datasets.dbgap_scr_gecco_susceptibility_sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id\n",
"join collections.public_datasets.dbgap_scr_gecco_susceptibility_sb_drs_index di on di.sample_id = sm.sample_id \n",
"where AGE between 45 and 55 and sex = 'Female' and file_type = 'cram' limit 3\n",
"_Retrieving the query_\n",
"____Page1_______________\n",
"____Page2_______________\n",
"____Page3_______________\n",
"____Page4_______________\n",
"____Page5_______________\n",
"____Page6_______________\n",
"subject=2474312, drsID=sbcgc:5baa8cece4b0db63859e6590\n",
"workflow submitted, run:cb4713f5-568e-4019-b86b-a3a27140dc8e\n",
"____________________________________________________________\n",
"subject=2473610, drsID=sbcgc:5baa8d0be4b0db63859e6843\n",
"workflow submitted, run:48d44c09-6d90-4318-81e1-16b5fe5b1d68\n",
"____________________________________________________________\n",
"subject=2474054, drsID=sbcgc:5baa8b9de4b0db63859e5f33\n",
"workflow submitted, run:61b28168-9b21-4f1a-aa3d-3d0a97140cfa\n",
"____________________________________________________________\n"
]
}
],
"source": [
"# TCGA Query - CRDC\n",
"crdcquery = '''SELECT sp.dbGaP_Subject_ID,\n",
"'sbcgc:'||sb_drs_id \n",
"\n",
"FROM dbgap_demo.scr_gecco_susceptibility.subject_phenotypes_multi sp\n",
"\n",
"join dbgap_demo.scr_gecco_susceptibility.sample_multi sm on sm.dbgap_subject_id = sp.dbgap_subject_id\n",
"\n",
"join dbgap_demo.scr_gecco_susceptibility.sb_drs_index di on di.sample_id = sm.sample_id \n",
"\n",
"where AGE between 45 and 55 and sex = 'Female' and file_type = 'cram' limit 3'''\n",
"\n",
"\n",
"bdcquery = '''\n",
" SELECT sp.dbGaP_Subject_ID, 'bdc:'||read_drs_id\n",
"\n",
"FROM `isbcgc-216220.COPDGene.Subject_MULTI` sm\n",
"\n",
"join `isbcgc-216220.COPDGene.Subject_Phenotypes_HMB` sp on sp.dbgap_subject_id = sm.dbgap_subject_id\n",
"\n",
"join `isbcgc-216220.COPDGene.COPD_DRS` drs on drs.su_submitter_id = sm.subject_id\n",
"\n",
"where gender = '2' and Age_Enroll between 45 and 55\n",
"\n",
"LIMIT 3'''\n",
"\n",
"searchClient = DataConnectClient('https://publisher-data.publisher.dnastack.com/data-connect/')\n",
"drsClient = DRSMetaResolver()\n",
"\n",
"# Step 3 - set up a class that runs samtools for us\n",
"# providing the location for the resultssettings = faspRunner.settings\n",
"settings = faspRunner.settings\n",
"wesClient = sbcgcWESClient(settings['SevenBridgesProject'])\n",
"\n",
"faspRunner.configure(searchClient, drsClient, wesClient)\n",
"runList = faspRunner.runQuery(crdcquery, runNote)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### BioDataCatalyst runs\t"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running query\n",
"\n",
" SELECT sp.dbGaP_Subject_ID, 'bdc:'||read_drs_id\n",
" FROM `isbcgc-216220.COPDGene.Subject_MULTI` sm\n",
" join `isbcgc-216220.COPDGene.Subject_Phenotypes_HMB` sp on sp.dbgap_subject_id = sm.dbgap_subject_id\n",
" join `isbcgc-216220.COPDGene.COPD_DRS` drs on drs.su_submitter_id = sm.subject_id\n",
" where gender = '2'\n",
" and Age_Enroll between 45 and 55\n",
" LIMIT 3\n",
"subject=599754, drsID=bdc:dg.4503/22b61280-5d92-413b-bca6-ae674ee16d14\n",
"workflow submitted, run:10796449958580678646\n",
"____________________________________________________________\n",
"subject=600588, drsID=bdc:dg.4503/e5529a06-5022-42f1-bddb-761d34d2f4f1\n",
"workflow submitted, run:2497755560539929518\n",
"____________________________________________________________\n",
"subject=595520, drsID=bdc:dg.4503/35599dbc-e531-4a83-97a4-6a74587d2a78\n",
"workflow submitted, run:17902599479779506025\n",
"____________________________________________________________\n"
]
}
],
"source": [
"searchClient = BigQuerySearchClient()\n",
"gcplocation = 'projects/{}/locations/{}'.format(settings['GCPProject'], settings['GCPPipelineRegion'])\n",
"wesClient = GCPLSsamtools(gcplocation, settings['GCPOutputBucket'])\n",
"\n",
"\n",
"faspRunner.configure(searchClient, drsClient, wesClient)\n",
"runList = faspRunner.runQuery(bdcquery, runNote)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAHgAAAA8CAIAAAAiz+n/AAAAw0lEQVR4nO3bsQ3DMBAEQdL1qP9K1M+7AgdKViA8U8FjAyYH7rVmHWVm3ff99hXPXNf1efuGfyF0ROiI0BGhI0JHhI4IHRE6InRE6IjQEaEjQkeEjggdEToidGTPHDZlHWqfuBnu/fYRD814OipCR4SOCB0ROiJ0ROiI0BGhI0JHhI4IHRE6InRE6IjQEaEjQkdshpEjN0P/DPlJ6IjQEaEjQkeEjggdEToidEToiNARoSNCR4SOCB0ROiJ0ROiIzTDyBR7CGmFwt+geAAAAAElFTkSuQmCC",
"text/plain": [
"<PIL.Image.Image image mode=RGB size=120x60 at 0x11D87E940>"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"faspRunner.getFASPicon()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"DataConnectClient BigQuerySearchClient \n",
"DRSMetaResolver DRSMetaResolver \n",
"sbcgcWESClient GCPLSsamtools \n"
]
}
],
"source": [
"faspRunner.rollCredits()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit 64d45c9

Please sign in to comment.