Extract sample info from PANCAN_clinicalMatrix (#20)

* Extract sample info from PANCAN_clinicalMatrix Keeps only samples with type equal to "Primary Tumor". This filters out multiple samples from the same patient, which could cause an issue for machine learning due to dependent observations (discussed in #10). This filter reduced the number of samples with expression and mutation from 7,705 to 7,306. Closes #10: all variables that could help with sample selection or covariates, that are in PANCAN_clinicalMatrix, are extracted to `data/samples.tsv`. Relies on documentation of PANCAN_clinicalMatrix variables provided by the Xena Browser team in #14. Closes #17: only sample_ids with expression, mutation, and clinical data are output to `data/`. * Retain primary blood cancers Retain cancers whose type is "Primary Blood Derived Cancer - Peripheral Blood". See #20 (comment)
cognoma · Aug 25, 2016 · aa66efc · aa66efc
1 parent 11514bb
commit aa66efc
Show file tree

Hide file tree

Showing 9 changed files with 7,730 additions and 1,043 deletions.
diff --git a/2.TCGA-process.ipynb b/2.TCGA-process.ipynb
@@ -15,6 +15,7 @@
    },
    "outputs": [],
    "source": [
+    "import collections\n",
     "import os\n",
     "\n",
     "import pandas"
@@ -26,7 +27,9 @@
    "source": [
     "## Read sample information\n",
     "\n",
-    "This file contains sample information. See the [online documentation](https://genome-cancer.soe.ucsc.edu/proj/site/xena/datapages/?dataset=TCGA.PANCAN.sampleMap/PANCAN_clinicalMatrix&host=https://tcga.xenahubs.net) for `PANCAN_clinicalMatrix`."
+    "This file contains sample information. See the [online documentation](https://genome-cancer.soe.ucsc.edu/proj/site/xena/datapages/?dataset=TCGA.PANCAN.sampleMap/PANCAN_clinicalMatrix&host=https://tcga.xenahubs.net) for `PANCAN_clinicalMatrix`.\n",
+    "\n",
+    "See [cognoma/cancer-data#14](https://github.com/cognoma/cancer-data/issues/14#issuecomment-238642439 \"GitHub Issue: Variable documentation for Xena Browser datasets\") for additional variable documentation."
    ]
   },
   {
@@ -39,7 +42,7 @@
     {
      "data": {
       "text/plain": [
-       "(12811, 40)"
+       "10793"
       ]
      },
      "execution_count": 2,
@@ -49,13 +52,44 @@
    ],
    "source": [
     "path = os.path.join('download', 'PANCAN_clinicalMatrix.tsv.bz2')\n",
+    "\n",
+    "# Mapping to rename and filter columns\n",
+    "renamer = collections.OrderedDict([\n",
+    "    ('sampleID', 'sample_id'),\n",
+    "    ('_PATIENT', 'patient_id'),\n",
+    "    ('sample_type', 'sample_type'),\n",
+    "    ('_primary_disease', 'disease'),\n",
+    "    ('_primary_site', 'organ_of_origin'),\n",
+    "    ('gender', 'gender'),\n",
+    "    ('age_at_initial_pathologic_diagnosis', 'age_diagnosed'),\n",
+    "    ('_OS_IND', 'dead'),\n",
+    "    ('_OS', 'days_survived'),\n",
+    "    ('_RFS_IND', 'recurred'),\n",
+    "    ('_RFS', 'days_recurrence_free'),\n",
+    "])\n",
+    "\n",
+    "# Keep only these sample types\n",
+    "# filters duplicate samples per patient\n",
+    "sample_types = {\n",
+    "    'Primary Tumor',\n",
+    "    'Primary Blood Derived Cancer - Peripheral Blood',\n",
+    "}\n",
+    "\n",
     "clinmat_df = (\n",
     "    pandas.read_table(path)\n",
-    "    .rename(columns={'sampleID': 'sample_id'})\n",
+    "    .rename(columns=renamer)\n",
+    "    [list(renamer.values())]\n",
+    "    .query(\"sample_type in @sample_types\")\n",
+    "    .set_index('sample_id', drop=False)\n",
     ")\n",
-    "# Check that no sample_ids are duplicated\n",
-    "assert not clinmat_df.sample_id.duplicated().any()\n",
-    "clinmat_df.shape"
+    "\n",
+    "# Fix capitalization of gender\n",
+    "clinmat_df.gender = clinmat_df.gender.str.title()\n",
+    "\n",
+    "# Check that no patients are duplicated\n",
+    "assert not clinmat_df.duplicated('patient_id', keep=False).any()\n",
+    "\n",
+    "len(clinmat_df)"
    ]
   },
   {
@@ -67,15 +101,92 @@
    "outputs": [
     {
      "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>sample_id</th>\n",
+       "      <th>patient_id</th>\n",
+       "      <th>sample_type</th>\n",
+       "      <th>disease</th>\n",
+       "      <th>organ_of_origin</th>\n",
+       "      <th>gender</th>\n",
+       "      <th>age_diagnosed</th>\n",
+       "      <th>dead</th>\n",
+       "      <th>days_survived</th>\n",
+       "      <th>recurred</th>\n",
+       "      <th>days_recurrence_free</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>sample_id</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>TCGA-02-0001-01</th>\n",
+       "      <td>TCGA-02-0001-01</td>\n",
+       "      <td>TCGA-02-0001</td>\n",
+       "      <td>Primary Tumor</td>\n",
+       "      <td>glioblastoma multiforme</td>\n",
+       "      <td>Brain</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>44.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>358.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>TCGA-02-0003-01</th>\n",
+       "      <td>TCGA-02-0003-01</td>\n",
+       "      <td>TCGA-02-0003</td>\n",
+       "      <td>Primary Tumor</td>\n",
+       "      <td>glioblastoma multiforme</td>\n",
+       "      <td>Brain</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>50.0</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>144.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
       "text/plain": [
-       "Primary Tumor                                      10593\n",
-       "Solid Tissue Normal                                 1475\n",
-       "Metastatic                                           396\n",
-       "Primary Blood Derived Cancer - Peripheral Blood      200\n",
-       "Recurrent Tumor                                       56\n",
-       "Additional - New Primary                              11\n",
-       "Additional Metastatic                                  1\n",
-       "Name: sample_type, dtype: int64"
+       "                       sample_id    patient_id    sample_type  \\\n",
+       "sample_id                                                       \n",
+       "TCGA-02-0001-01  TCGA-02-0001-01  TCGA-02-0001  Primary Tumor   \n",
+       "TCGA-02-0003-01  TCGA-02-0003-01  TCGA-02-0003  Primary Tumor   \n",
+       "\n",
+       "                                 disease organ_of_origin  gender  \\\n",
+       "sample_id                                                          \n",
+       "TCGA-02-0001-01  glioblastoma multiforme           Brain  Female   \n",
+       "TCGA-02-0003-01  glioblastoma multiforme           Brain    Male   \n",
+       "\n",
+       "                 age_diagnosed  dead  days_survived  recurred  \\\n",
+       "sample_id                                                       \n",
+       "TCGA-02-0001-01           44.0   1.0          358.0       NaN   \n",
+       "TCGA-02-0003-01           50.0   1.0          144.0       NaN   \n",
+       "\n",
+       "                 days_recurrence_free  \n",
+       "sample_id                              \n",
+       "TCGA-02-0001-01                   NaN  \n",
+       "TCGA-02-0003-01                   NaN  "
       ]
      },
      "execution_count": 3,
@@ -84,8 +195,7 @@
     }
    ],
    "source": [
-    "# Types of samples\n",
-    "clinmat_df.sample_type.value_counts()"
+    "clinmat_df.head(2)"
    ]
   },
   {
@@ -864,6 +974,29 @@
    "metadata": {
     "collapsed": false
    },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "9283"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of patients represented in the expression dataset\n",
+    "clinmat_df.query(\"sample_id in @expr_df.index\").patient_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": false
+   },
    "outputs": [
     {
      "data": {
@@ -943,7 +1076,7 @@
        "TCGA-02-2486-01  6.7716  15.3224  6.3377  2.2199  16.7832"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -964,36 +1097,37 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "metadata": {
     "collapsed": false
    },
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "7705"
+       "7306"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "sample_ids = list(gene_mutation_mat_df.index & expr_df.index)\n",
+    "sample_ids = list(clinmat_df.index & gene_mutation_mat_df.index & expr_df.index)\n",
     "len(sample_ids)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "metadata": {
     "collapsed": false
    },
    "outputs": [],
    "source": [
     "# Filter expression (x) and mutation (y) matrices for common samples\n",
+    "sample_df = clinmat_df.loc[sample_ids, :]\n",
     "x_df = expr_df.loc[sample_ids, :]\n",
     "y_df = gene_mutation_mat_df.loc[sample_ids, :]"
    ]
@@ -1009,13 +1143,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "path = os.path.join('data', 'samples.tsv')\n",
+    "sample_df.to_csv(path, sep='\\t', float_format='%.0f', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
    "metadata": {
     "collapsed": false
    },
    "outputs": [],
    "source": [
-    "def sample_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):\n",
+    "def subset_df(df, nrows=None, ncols=None, row_seed=0, col_seed=0):\n",
     "    \"\"\"Randomly subset a dataframe, preserving row and column order.\"\"\"\n",
     "    if nrows is None:\n",
     "        nrows = len(df)\n",
@@ -1031,7 +1177,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 25,
    "metadata": {
     "collapsed": false
    },
@@ -1048,17 +1194,8 @@
     "    # Save subsetted datasets\n",
     "    for sample, nrows, ncols in ('small', 50, 15), ('all-samples', None, 15), ('all-genes', 50, None):\n",
     "        path = os.path.join('data', 'subset', '{}-{}.tsv'.format(name, sample))\n",
-    "        sample_df(df, nrows=nrows, ncols=ncols).to_csv(path, **tsv_args)"
+    "        subset_df(df, nrows=nrows, ncols=ncols).to_csv(path, **tsv_args)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {