Skip to content

Commit

Permalink
Extract sample info from PANCAN_clinicalMatrix (#20)
Browse files Browse the repository at this point in the history
* Extract sample info from PANCAN_clinicalMatrix

Keeps only samples with type equal to "Primary Tumor". This filters out multiple
samples from the same patient, which could cause an issue for machine learning
due to dependent observations (discussed in #10). This filter reduced the
number of samples with both expression and mutation data from 7,705 to 7,306.

Closes #10: all variables in PANCAN_clinicalMatrix that could help with sample
selection or serve as covariates are extracted to `data/samples.tsv`.

Relies on documentation of PANCAN_clinicalMatrix variables provided by the
Xena Browser team in #14.

Closes #17: only sample_ids with expression, mutation, and clinical data are
output to `data/`.

* Retain primary blood cancers

Retain samples whose sample type is "Primary Blood Derived Cancer - Peripheral Blood".
See #20 (comment)
  • Loading branch information
dhimmel authored and clairemcleod committed Aug 25, 2016
1 parent 11514bb commit aa66efc
Show file tree
Hide file tree
Showing 9 changed files with 7,730 additions and 1,043 deletions.
207 changes: 172 additions & 35 deletions 2.TCGA-process.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
},
"outputs": [],
"source": [
"import collections\n",
"import os\n",
"\n",
"import pandas"
Expand All @@ -26,7 +27,9 @@
"source": [
"## Read sample information\n",
"\n",
"This file contains sample information. See the [online documentation](https://genome-cancer.soe.ucsc.edu/proj/site/xena/datapages/?dataset=TCGA.PANCAN.sampleMap/PANCAN_clinicalMatrix&host=https://tcga.xenahubs.net) for `PANCAN_clinicalMatrix`."
"This file contains sample information. See the [online documentation](https://genome-cancer.soe.ucsc.edu/proj/site/xena/datapages/?dataset=TCGA.PANCAN.sampleMap/PANCAN_clinicalMatrix&host=https://tcga.xenahubs.net) for `PANCAN_clinicalMatrix`.\n",
"\n",
"See [cognoma/cancer-data#14](https://github.com/cognoma/cancer-data/issues/14#issuecomment-238642439 \"GitHub Issue: Variable documentation for Xena Browser datasets\") for additional variable documentation."
]
},
{
Expand All @@ -39,7 +42,7 @@
{
"data": {
"text/plain": [
"(12811, 40)"
"10793"
]
},
"execution_count": 2,
Expand All @@ -49,13 +52,44 @@
],
"source": [
"path = os.path.join('download', 'PANCAN_clinicalMatrix.tsv.bz2')\n",
"\n",
"# Mapping to rename and filter columns\n",
"renamer = collections.OrderedDict([\n",
" ('sampleID', 'sample_id'),\n",
" ('_PATIENT', 'patient_id'),\n",
" ('sample_type', 'sample_type'),\n",
" ('_primary_disease', 'disease'),\n",
" ('_primary_site', 'organ_of_origin'),\n",
" ('gender', 'gender'),\n",
" ('age_at_initial_pathologic_diagnosis', 'age_diagnosed'),\n",
" ('_OS_IND', 'dead'),\n",
" ('_OS', 'days_survived'),\n",
" ('_RFS_IND', 'recurred'),\n",
" ('_RFS', 'days_recurrence_free'),\n",
"])\n",
"\n",
"# Keep only these sample types\n",
"# filters duplicate samples per patient\n",
"sample_types = {\n",
" 'Primary Tumor',\n",
" 'Primary Blood Derived Cancer - Peripheral Blood',\n",
"}\n",
"\n",
"clinmat_df = (\n",
" pandas.read_table(path)\n",
" .rename(columns={'sampleID': 'sample_id'})\n",
" .rename(columns=renamer)\n",
" [list(renamer.values())]\n",
" .query(\"sample_type in @sample_types\")\n",
" .set_index('sample_id', drop=False)\n",
")\n",
"# Check that no sample_ids are duplicated\n",
"assert not clinmat_df.sample_id.duplicated().any()\n",
"clinmat_df.shape"
"\n",
"# Fix capitalization of gender\n",
"clinmat_df.gender = clinmat_df.gender.str.title()\n",
"\n",
"# Check that no patients are duplicated\n",
"assert not clinmat_df.duplicated('patient_id', keep=False).any()\n",
"\n",
"len(clinmat_df)"
]
},
{
Expand All @@ -67,15 +101,92 @@
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sample_id</th>\n",
" <th>patient_id</th>\n",
" <th>sample_type</th>\n",
" <th>disease</th>\n",
" <th>organ_of_origin</th>\n",
" <th>gender</th>\n",
" <th>age_diagnosed</th>\n",
" <th>dead</th>\n",
" <th>days_survived</th>\n",
" <th>recurred</th>\n",
" <th>days_recurrence_free</th>\n",
" </tr>\n",
" <tr>\n",
" <th>sample_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>TCGA-02-0001-01</th>\n",
" <td>TCGA-02-0001-01</td>\n",
" <td>TCGA-02-0001</td>\n",
" <td>Primary Tumor</td>\n",
" <td>glioblastoma multiforme</td>\n",
" <td>Brain</td>\n",
" <td>Female</td>\n",
" <td>44.0</td>\n",
" <td>1.0</td>\n",
" <td>358.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TCGA-02-0003-01</th>\n",
" <td>TCGA-02-0003-01</td>\n",
" <td>TCGA-02-0003</td>\n",
" <td>Primary Tumor</td>\n",
" <td>glioblastoma multiforme</td>\n",
" <td>Brain</td>\n",
" <td>Male</td>\n",
" <td>50.0</td>\n",
" <td>1.0</td>\n",
" <td>144.0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Primary Tumor 10593\n",
"Solid Tissue Normal 1475\n",
"Metastatic 396\n",
"Primary Blood Derived Cancer - Peripheral Blood 200\n",
"Recurrent Tumor 56\n",
"Additional - New Primary 11\n",
"Additional Metastatic 1\n",
"Name: sample_type, dtype: int64"
" sample_id patient_id sample_type \\\n",
"sample_id \n",
"TCGA-02-0001-01 TCGA-02-0001-01 TCGA-02-0001 Primary Tumor \n",
"TCGA-02-0003-01 TCGA-02-0003-01 TCGA-02-0003 Primary Tumor \n",
"\n",
" disease organ_of_origin gender \\\n",
"sample_id \n",
"TCGA-02-0001-01 glioblastoma multiforme Brain Female \n",
"TCGA-02-0003-01 glioblastoma multiforme Brain Male \n",
"\n",
" age_diagnosed dead days_survived recurred \\\n",
"sample_id \n",
"TCGA-02-0001-01 44.0 1.0 358.0 NaN \n",
"TCGA-02-0003-01 50.0 1.0 144.0 NaN \n",
"\n",
" days_recurrence_free \n",
"sample_id \n",
"TCGA-02-0001-01 NaN \n",
"TCGA-02-0003-01 NaN "
]
},
"execution_count": 3,
Expand All @@ -84,8 +195,7 @@
}
],
"source": [
"# Types of samples\n",
"clinmat_df.sample_type.value_counts()"
"clinmat_df.head(2)"
]
},
{
Expand Down Expand Up @@ -864,6 +974,29 @@
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"9283"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of patients represented in the expression dataset\n",
"clinmat_df.query(\"sample_id in @expr_df.index\").patient_id.nunique()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -943,7 +1076,7 @@
"TCGA-02-2486-01 6.7716 15.3224 6.3377 2.2199 16.7832"
]
},
"execution_count": 19,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -964,36 +1097,37 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"7705"
"7306"
]
},
"execution_count": 20,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_ids = list(gene_mutation_mat_df.index & expr_df.index)\n",
"sample_ids = list(clinmat_df.index & gene_mutation_mat_df.index & expr_df.index)\n",
"len(sample_ids)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Filter expression (x) and mutation (y) matrices for common samples\n",
"sample_df = clinmat_df.loc[sample_ids, :]\n",
"x_df = expr_df.loc[sample_ids, :]\n",
"y_df = gene_mutation_mat_df.loc[sample_ids, :]"
]
Expand All @@ -1009,13 +1143,25 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"path = os.path.join('data', 'samples.tsv')\n",
"sample_df.to_csv(path, sep='\\t', float_format='%.0f', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def sample_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):\n",
"def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):\n",
" \"\"\"Randomly subset a dataframe, preserving row and column order.\"\"\"\n",
" if nrows is None:\n",
" nrows = len(df)\n",
Expand All @@ -1031,7 +1177,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 25,
"metadata": {
"collapsed": false
},
Expand All @@ -1048,17 +1194,8 @@
" # Save subsetted datasets\n",
" for sample, nrows, ncols in ('small', 50, 15), ('all-samples', None, 15), ('all-genes', 50, None):\n",
" path = os.path.join('data', 'subset', '{}-{}.tsv'.format(name, sample))\n",
" sample_df(df, nrows=nrows, ncols=ncols).to_csv(path, **tsv_args)"
" subset_df(df, nrows=nrows, ncols=ncols).to_csv(path, **tsv_args)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
Loading

0 comments on commit aa66efc

Please sign in to comment.