From 209c245e6abf44af5de5bb841b9b98a77705afcd Mon Sep 17 00:00:00 2001
From: Haebichan Jung <haebichan.jung@snowflake.com>
Date: Wed, 16 Apr 2025 23:51:04 +0000
Subject: [PATCH] first push

---
 Titanic Survival Analysis.ipynb | 745 +++++++++++++++++++++++---------
 1 file changed, 539 insertions(+), 206 deletions(-)
diff --git a/Titanic Survival Analysis.ipynb b/Titanic Survival Analysis.ipynb
index 87b2e29..5d38352 100644
--- a/Titanic Survival Analysis.ipynb	
+++ b/Titanic Survival Analysis.ipynb	
@@ -1,16 +1,58 @@
 {
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  },
+  "lastEditStatus": {
+   "notebookId": "7gb7mzqm464eijchkhhv",
+   "authorId": "8619036689116",
+   "authorName": "HAEBICHAN",
+   "authorEmail": "Haebichan.jung@snowflake.com",
+   "sessionId": "975dcb98-b50a-46a9-80a0-9307d4737c7a",
+   "lastEditTime": 1744846912042
+  }
+ },
+ "nbformat_minor": 2,
+ "nbformat": 4,
  "cells": [
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell1"
+   },
    "source": [
-    "<h2>Titanic Passanger Survival Analysis</h2>"
+    "<h2>Titanic Passenger Survival Analysis</h2>"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000000"
   },
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {},
+   "metadata": {
+    "name": "cell2",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -29,36 +71,61 @@
    "source": [
     "from IPython.display import Image\n",
     "Image(url= \"https://static1.squarespace.com/static/5006453fe4b09ef2252ba068/5095eabce4b06cb305058603/5095eabce4b02d37bef4c24c/1352002236895/100_anniversary_titanic_sinking_by_esai8mellows-d4xbme8.jpg\")"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000001"
   },
   {
    "cell_type": "code",
    "execution_count": 2,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell3",
+    "language": "python",
+    "codeCollapsed": false
    },
    "outputs": [],
    "source": [
     "import pandas as pd\n",
     "import numpy as np"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000002"
   },
   {
    "cell_type": "code",
    "execution_count": 3,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell4",
+    "language": "python",
+    "codeCollapsed": false
    },
    "outputs": [],
    "source": [
     "train = pd.read_csv(\"input/train.csv\")\n",
     "test = pd.read_csv(\"input/test.csv\")"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000003"
+  },
+  {
+   "cell_type": "code",
+   "id": "c6323e0b-98e2-4139-bb81-e131aa27902a",
+   "metadata": {
+    "language": "python",
+    "name": "cell89",
+    "codeCollapsed": false
+   },
+   "outputs": [],
+   "source": "test.head()",
+   "execution_count": null
   },
   {
    "cell_type": "code",
    "execution_count": 4,
-   "metadata": {},
+   "metadata": {
+    "name": "cell5",
+    "language": "python",
+    "codeCollapsed": false
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -74,12 +141,16 @@
     "print(\"Train Shape:\",train.shape)\n",
     "test.isnull().sum()\n",
     "print(\"Test Shape:\",test.shape)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000004"
   },
   {
    "cell_type": "code",
    "execution_count": 5,
-   "metadata": {},
+   "metadata": {
+    "name": "cell6",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -107,12 +178,16 @@
    ],
    "source": [
     "train.info()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000005"
   },
   {
    "cell_type": "code",
    "execution_count": 6,
-   "metadata": {},
+   "metadata": {
+    "name": "cell7",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -139,11 +214,14 @@
    ],
    "source": [
     "test.info()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000006"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell8"
+   },
    "source": [
     "### Data Dictionary\n",
     "\n",
@@ -158,12 +236,16 @@
     "**Total rows and columns**\n",
     "\n",
     "We can see that there are 891 rows and 12 columns in our training dataset."
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000007"
   },
   {
    "cell_type": "code",
    "execution_count": 7,
-   "metadata": {},
+   "metadata": {
+    "name": "cell9",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -400,12 +482,16 @@
    ],
    "source": [
     "train.head(10)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000008"
   },
   {
    "cell_type": "code",
    "execution_count": 8,
-   "metadata": {},
+   "metadata": {
+    "name": "cell10",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -551,12 +637,16 @@
    ],
    "source": [
     "train.describe()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000009"
   },
   {
    "cell_type": "code",
    "execution_count": 9,
-   "metadata": {},
+   "metadata": {
+    "name": "cell11",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -683,12 +773,16 @@
    ],
    "source": [
     "test.describe()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000010"
   },
   {
    "cell_type": "code",
    "execution_count": 10,
-   "metadata": {},
+   "metadata": {
+    "name": "cell12",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -715,12 +809,16 @@
    ],
    "source": [
     "train.isnull().sum()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000011"
   },
   {
    "cell_type": "code",
    "execution_count": 11,
-   "metadata": {},
+   "metadata": {
+    "name": "cell13",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -862,20 +960,26 @@
     "test.isnull().sum()\n",
     "test[\"Survived\"] = \"\"\n",
     "test.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000012"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell14"
+   },
    "source": [
     "# Data Visualization using Matplotlib and Seaborn packages."
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000013"
   },
   {
    "cell_type": "code",
    "execution_count": 12,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell15",
+    "language": "python"
    },
    "outputs": [],
    "source": [
@@ -883,11 +987,14 @@
     "%matplotlib inline\n",
     "import seaborn as sns\n",
     "sns.set() # setting seaborn default for plots"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000014"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell16"
+   },
    "source": [
     "# Bar Chart for Categorical Features \n",
     "\n",
@@ -897,13 +1004,16 @@
     "* Parch ( # of parents and children)\n",
     "* Embarked\n",
     "* Cabin"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000015"
   },
   {
    "cell_type": "code",
    "execution_count": 13,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell17",
+    "language": "python"
    },
    "outputs": [],
    "source": [
@@ -913,12 +1023,16 @@
     "    df = pd.DataFrame([survived,dead])\n",
     "    df.index = ['Survived','Dead']\n",
     "    df.plot(kind='bar',stacked=True, figsize=(10,5))"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000016"
   },
   {
    "cell_type": "code",
    "execution_count": 14,
-   "metadata": {},
+   "metadata": {
+    "name": "cell18",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -949,19 +1063,26 @@
     "bar_chart('Sex')\n",
     "print(\"Survived :\\n\",train[train['Survived']==1]['Sex'].value_counts())\n",
     "print(\"Dead:\\n\",train[train['Survived']==0]['Sex'].value_counts())"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000017"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell19"
+   },
    "source": [
-    "The Chart confirms **Women more likely survivied than Men**."
+    "The chart confirms that **women were more likely to survive than men**."
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000018"
   },
   {
    "cell_type": "code",
    "execution_count": 15,
-   "metadata": {},
+   "metadata": {
+    "name": "cell20",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -994,20 +1115,27 @@
     "bar_chart('Pclass')\n",
     "print(\"Survived :\\n\",train[train['Survived']==1]['Pclass'].value_counts())\n",
     "print(\"Dead:\\n\",train[train['Survived']==0]['Pclass'].value_counts())"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000019"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell21"
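+   },
    "source": [
-    "The Chart confirms **1st class** more likely survivied than **other classes**.  \n",
-    "The Chart confirms **3rd class** more likely dead than **other classes**"
+    "The chart confirms that **1st class** passengers were more likely to survive than **other classes**.  \n",
+    "The chart confirms that **3rd class** passengers were more likely to die than **other classes**."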
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000020"
   },
   {
    "cell_type": "code",
    "execution_count": 16,
-   "metadata": {},
+   "metadata": {
+    "name": "cell22",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -1046,20 +1174,27 @@
     "bar_chart('SibSp')\n",
     "print(\"Survived :\\n\",train[train['Survived']==1]['SibSp'].value_counts())\n",
     "print(\"Dead:\\n\",train[train['Survived']==0]['SibSp'].value_counts())"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000021"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell23"
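+   },
    "source": [
-    "The Chart confirms a **person aboarded with more than 2 siblings or spouse** more likely survived.  \n",
-    "The Chart confirms a **person aboarded without siblings or spouse** more likely dead"
+    "The chart confirms that a **person who boarded with more than 2 siblings or spouses** was more likely to survive.  \n",
+    "The chart confirms that a **person who boarded without siblings or a spouse** was more likely to die."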
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000022"
   },
   {
    "cell_type": "code",
    "execution_count": 17,
-   "metadata": {},
+   "metadata": {
+    "name": "cell24",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -1098,20 +1233,27 @@
     "bar_chart('Parch')\n",
     "print(\"Survived :\\n\",train[train['Survived']==1]['Parch'].value_counts())\n",
     "print(\"Dead:\\n\",train[train['Survived']==0]['Parch'].value_counts())"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000023"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell25"
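+   },
    "source": [
-    "The Chart confirms a **person aboarded with more than 2 parents or children more likely survived.**  \n",
-    "The Chart confirms a **person aboarded alone more likely dead**"
+    "The chart confirms that a **person who boarded with more than 2 parents or children was more likely to survive.**  \n",
+    "The chart confirms that a **person who boarded alone was more likely to die.**"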
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000024"
   },
   {
    "cell_type": "code",
    "execution_count": 18,
-   "metadata": {},
+   "metadata": {
+    "name": "cell26",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -1144,27 +1286,36 @@
     "bar_chart('Embarked')\n",
     "print(\"Survived :\\n\",train[train['Survived']==1]['Embarked'].value_counts())\n",
     "print(\"Dead:\\n\",train[train['Survived']==0]['Embarked'].value_counts())"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000025"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell27"
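+   },
    "source": [
-    "The Chart confirms a **person aboarded from C** slightly more likely survived.  \n",
-    "The Chart confirms a **person aboarded from Q** more likely dead.  \n",
-    "The Chart confirms a **person aboarded from S** more likely dead.  "
+    "The chart confirms that a **person who boarded at C** was slightly more likely to survive.  \n",
+    "The chart confirms that a **person who boarded at Q** was more likely to die.  \n",
+    "The chart confirms that a **person who boarded at S** was more likely to die.  "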
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000026"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell28"
+   },
    "source": [
     "## 4. Feature engineering"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000027"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell29"
+   },
    "source": [
     "Feature engineering is the process of using domain knowledge of the data\n",
     "to create features (**feature vectors**) that make machine learning algorithms work.  \n",
@@ -1172,12 +1323,16 @@
     "feature vector is an n-dimensional vector of numerical features that represent some object.\n",
     "Many algorithms in machine learning require a numerical representation of objects,\n",
     "since such representations facilitate processing and statistical analysis."
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000028"
   },
   {
    "cell_type": "code",
    "execution_count": 19,
-   "metadata": {},
+   "metadata": {
+    "name": "cell30",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -1324,19 +1479,26 @@
    ],
    "source": [
     "train.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000029"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell31"
+   },
    "source": [
-    "#### 4.1 how titanic sank?"
+    "#### 4.1 How did the Titanic sink?"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000030"
   },
   {
    "cell_type": "code",
    "execution_count": 20,
-   "metadata": {},
+   "metadata": {
+    "name": "cell32",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -1354,12 +1516,16 @@
    ],
    "source": [
     "Image(url= \"https://static1.squarespace.com/static/5006453fe4b09ef2252ba068/t/5090b249e4b047ba54dfd258/1351660113175/TItanic-Survival-Infographic.jpg?format=1500w\")"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000031"
   },
   {
    "cell_type": "code",
    "execution_count": 21,
-   "metadata": {},
+   "metadata": {
+    "name": "cell33",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -1596,13 +1762,16 @@
    ],
    "source": [
     "train.head(10)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000032"
   },
   {
    "cell_type": "code",
    "execution_count": 22,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell34",
+    "language": "python"
    },
    "outputs": [],
    "source": [
@@ -1610,12 +1779,16 @@
     "\n",
     "for dataset in train_test_data:\n",
     "    dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000033"
   },
   {
    "cell_type": "code",
    "execution_count": 23,
-   "metadata": {},
+   "metadata": {
+    "name": "cell35",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -1647,12 +1820,16 @@
    ],
    "source": [
     "train['Title'].value_counts()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000034"
   },
   {
    "cell_type": "code",
    "execution_count": 24,
-   "metadata": {},
+   "metadata": {
+    "name": "cell36",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -1676,11 +1853,14 @@
    ],
    "source": [
     "test['Title'].value_counts()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000035"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell37"
+   },
    "source": [
     "#### Title Map\n",
     "\n",
@@ -1688,13 +1868,16 @@
     "Miss : 1  \n",
     "Mrs: 2  \n",
     "Others: 3  "
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000036"
   },
   {
    "cell_type": "code",
    "execution_count": 25,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell38",
+    "language": "python"
    },
    "outputs": [],
    "source": [
@@ -1704,12 +1887,16 @@
     "\n",
     "for dataset in train_test_data:\n",
     "    dataset['Title'] = dataset[\"Title\"].map(title_mapping)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000037"
   },
   {
    "cell_type": "code",
    "execution_count": 26,
-   "metadata": {},
+   "metadata": {
+    "name": "cell39",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -1855,12 +2042,16 @@
    ],
    "source": [
     "dataset.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000038"
   },
   {
    "cell_type": "code",
    "execution_count": 27,
-   "metadata": {},
+   "metadata": {
+    "name": "cell40",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -2006,12 +2197,16 @@
    ],
    "source": [
     "test.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000039"
   },
   {
    "cell_type": "code",
    "execution_count": 28,
-   "metadata": {},
+   "metadata": {
+    "name": "cell41",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -2026,25 +2221,32 @@
    ],
    "source": [
     "bar_chart('Title')"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000040"
   },
   {
    "cell_type": "code",
    "execution_count": 29,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell42",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "# delete unnecessary feature from dataset\n",
     "train.drop('Name', axis=1, inplace=True)\n",
     "test.drop('Name', axis=1, inplace=True)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000041"
   },
   {
    "cell_type": "code",
    "execution_count": 30,
-   "metadata": {},
+   "metadata": {
+    "name": "cell43",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -2184,25 +2386,32 @@
    ],
    "source": [
     "train.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000042"
   },
   {
    "cell_type": "code",
    "execution_count": 31,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell44",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "sex_mapping = {\"male\": 0, \"female\": 1}\n",
     "for dataset in train_test_data:\n",
     "    dataset['Sex'] = dataset['Sex'].map(sex_mapping)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000043"
   },
   {
    "cell_type": "code",
    "execution_count": 32,
-   "metadata": {},
+   "metadata": {
+    "name": "cell45",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -2217,12 +2426,16 @@
    ],
    "source": [
     "bar_chart('Sex')"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000044"
   },
   {
    "cell_type": "code",
    "execution_count": 33,
-   "metadata": {},
+   "metadata": {
+    "name": "cell46",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -2362,24 +2575,31 @@
    ],
    "source": [
     "test.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000045"
   },
   {
    "cell_type": "code",
    "execution_count": 34,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell47",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "train[\"Age\"].fillna(train.groupby(\"Title\")[\"Age\"].transform(\"median\"), inplace= True)\n",
     "test[\"Age\"].fillna(test.groupby('Title')['Age'].transform(\"median\"), inplace= True)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000046"
   },
   {
    "cell_type": "code",
    "execution_count": 35,
-   "metadata": {},
+   "metadata": {
+    "name": "cell48",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -2945,12 +3165,16 @@
    "source": [
     "train.head(30)\n",
     "#train.groupby(\"Title\")[\"Age\"].transform(\"median\")"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000047"
   },
   {
    "cell_type": "code",
    "execution_count": 36,
-   "metadata": {},
+   "metadata": {
+    "name": "cell49",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -2995,19 +3219,26 @@
     "facet.set(xlim=(0, train['Age'].max()))\n",
     "facet.add_legend() \n",
     "plt.xlim(10,50)\n"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000048"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell50"
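+   },
    "source": [
-    "Those who were **20 to 30 years old** were **more dead and more survived.**"
+    "Passengers who were **20 to 30 years old** made up **more of both the dead and the survivors** than other age groups."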
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000049"
   },
   {
    "cell_type": "code",
    "execution_count": 37,
-   "metadata": {},
+   "metadata": {
+    "name": "cell51",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -3053,11 +3284,14 @@
    "source": [
     "train.info()\n",
     "test.info()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000050"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell52"
+   },
    "source": [
     "**Binning**\n",
     "\n",
@@ -3069,12 +3303,16 @@
     "* adult: 2\n",
     "* mid-age: 3\n",
     "* senior: 4"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000051"
   },
   {
    "cell_type": "code",
    "execution_count": 38,
-   "metadata": {},
+   "metadata": {
+    "name": "cell53",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -3214,13 +3452,16 @@
    ],
    "source": [
     "train.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000052"
   },
   {
    "cell_type": "code",
    "execution_count": 39,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell54",
+    "language": "python"
    },
    "outputs": [],
    "source": [
@@ -3233,12 +3474,16 @@
     "# for dataset in train_test_data:\n",
     "#     dataset.loc[]\n",
     "#train[train['Age'].isin([23])]"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000053"
   },
   {
    "cell_type": "code",
    "execution_count": 40,
-   "metadata": {},
+   "metadata": {
+    "name": "cell55",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -3254,12 +3499,16 @@
    "source": [
     "train.head()\n",
     "bar_chart('Age')"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000054"
   },
   {
    "cell_type": "code",
    "execution_count": 41,
-   "metadata": {},
+   "metadata": {
+    "name": "cell56",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -3304,35 +3553,45 @@
     "print(\"Pclass1:\\n\",Pclass1)\n",
     "print(\"Pclass2:\\n\",Pclass2)\n",
     "print(\"Pclass3:\\n\",Pclass3)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000055"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell57"
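+   },
    "source": [
-    "more than 50 % of 1st class are from S embark.  \n",
-    "more than 50 % of 2st class are from S embark.   \n",
-    "more than 50 % of 3st class are from S embark.  \n",
+    "More than 50% of 1st class passengers embarked at S.  \n",
+    "More than 50% of 2nd class passengers embarked at S.  \n",
+    "More than 50% of 3rd class passengers embarked at S.  \n",
     "\n",
-    "**fill out missing embark with S embark**"
+    "**Fill in missing Embarked values with S.**"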
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000056"
   },
   {
    "cell_type": "code",
    "execution_count": 42,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell58",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "for dataset in train_test_data:\n",
     "    dataset['Embarked'] =  dataset['Embarked'].fillna('S')"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000057"
   },
   {
    "cell_type": "code",
    "execution_count": 43,
-   "metadata": {},
+   "metadata": {
+    "name": "cell59",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -3472,25 +3731,32 @@
    ],
    "source": [
     "train.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000058"
   },
   {
    "cell_type": "code",
    "execution_count": 44,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell60",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "embarked_mapping = {'S':0,'C':1,'Q':2}\n",
     "for dataset in train_test_data:\n",
     "    dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000059"
   },
   {
    "cell_type": "code",
    "execution_count": 45,
-   "metadata": {},
+   "metadata": {
+    "name": "cell61",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -4404,12 +4670,16 @@
     "train[\"Fare\"].fillna(train.groupby(\"Pclass\")[\"Fare\"].transform(\"median\"), inplace=True)\n",
     "test[\"Fare\"].fillna(test.groupby(\"Pclass\")[\"Fare\"].transform(\"median\"), inplace=True)\n",
     "train.head(50)\n"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000060"
   },
   {
    "cell_type": "code",
    "execution_count": 46,
-   "metadata": {},
+   "metadata": {
+    "name": "cell62",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -4428,12 +4698,16 @@
     "facet.set(xlim = (0, train['Fare'].max()))\n",
     "facet.add_legend()\n",
     "plt.show()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000061"
   },
   {
    "cell_type": "code",
    "execution_count": 47,
-   "metadata": {},
+   "metadata": {
+    "name": "cell63",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -4462,13 +4736,16 @@
     "facet.set(xlim=(0, train['Fare'].max()))\n",
     "facet.add_legend()\n",
     "plt.xlim(0, 20)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000062"
   },
   {
    "cell_type": "code",
    "execution_count": 48,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell64",
+    "language": "python"
    },
    "outputs": [],
    "source": [
@@ -4477,12 +4754,16 @@
     "    dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1,\n",
     "    dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2,\n",
     "    dataset.loc[dataset['Fare'] >= 100, 'Fare'] = 3"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000063"
   },
   {
    "cell_type": "code",
    "execution_count": 49,
-   "metadata": {},
+   "metadata": {
+    "name": "cell65",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -4622,12 +4903,16 @@
    ],
    "source": [
     "train.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000064"
   },
   {
    "cell_type": "code",
    "execution_count": 50,
-   "metadata": {},
+   "metadata": {
+    "name": "cell66",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -4703,24 +4988,31 @@
    ],
    "source": [
     "train.Cabin.value_counts()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000065"
   },
   {
    "cell_type": "code",
    "execution_count": 51,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell67",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "for dataset in train_test_data:\n",
     "    dataset['Cabin'] =  dataset['Cabin'].str[:1]"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000066"
   },
   {
    "cell_type": "code",
    "execution_count": 52,
-   "metadata": {},
+   "metadata": {
+    "name": "cell68",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -4750,57 +5042,73 @@
     "df = pd.DataFrame([Pclass1, Pclass2, Pclass3])\n",
     "df.index = ['1st class','2nd class', '3rd class']\n",
     "df.plot(kind='bar',stacked=True, figsize=(10,5))"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000067"
   },
   {
    "cell_type": "code",
    "execution_count": 53,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell69",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "cabin_mapping = {\"A\": 0, \"B\": 0.4, \"C\": 0.8, \"D\": 1.2, \"E\": 1.6, \"F\": 2, \"G\": 2.4, \"T\": 2.8}\n",
     "for dataset in train_test_data:\n",
     "    dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000068"
   },
   {
    "cell_type": "code",
    "execution_count": 54,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell70",
+    "language": "python"
    },
    "outputs": [],
    "source": [
-    "# fill missing Fare with median fare for each Pclass\n",
+    "# fill missing Cabin with the median cabin value for each Pclass\n",
     "train[\"Cabin\"].fillna(train.groupby(\"Pclass\")[\"Cabin\"].transform(\"median\"), inplace=True)\n",
     "test[\"Cabin\"].fillna(test.groupby(\"Pclass\")[\"Cabin\"].transform(\"median\"), inplace=True)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000069"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell71"
+   },
    "source": [
     "**family Size**"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000070"
   },
   {
    "cell_type": "code",
    "execution_count": 55,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell72",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "train[\"FamilySize\"] = train[\"SibSp\"] + train[\"Parch\"] + 1\n",
     "test[\"FamilySize\"] = test[\"SibSp\"] + test[\"Parch\"] + 1"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000071"
   },
   {
    "cell_type": "code",
    "execution_count": 56,
-   "metadata": {},
+   "metadata": {
+    "name": "cell73",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -4829,25 +5137,32 @@
     "facet.set(xlim=(0, train['FamilySize'].max()))\n",
     "facet.add_legend()\n",
     "plt.xlim(0)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000072"
   },
   {
    "cell_type": "code",
    "execution_count": 57,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell74",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}\n",
     "for dataset in train_test_data:\n",
     "    dataset['FamilySize'] = dataset['FamilySize'].map(family_mapping)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000073"
   },
   {
    "cell_type": "code",
    "execution_count": 58,
-   "metadata": {},
+   "metadata": {
+    "name": "cell75",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -4993,13 +5308,16 @@
    ],
    "source": [
     "train.head()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000074"
   },
   {
    "cell_type": "code",
    "execution_count": 59,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell76",
+    "language": "python"
    },
    "outputs": [],
    "source": [
@@ -5007,12 +5325,16 @@
     "train = train.drop(features_drop, axis = 1)\n",
     "test = test.drop(features_drop,axis=1)\n",
     "train = train.drop(['PassengerId'], axis=1)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000075"
   },
   {
    "cell_type": "code",
    "execution_count": 60,
-   "metadata": {},
+   "metadata": {
+    "name": "cell77",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -5029,12 +5351,16 @@
     "train_data = train.drop('Survived', axis = 1)\n",
     "target = train['Survived']\n",
     "train_data.shape, target.shape"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000076"
   },
   {
    "cell_type": "code",
    "execution_count": 61,
-   "metadata": {},
+   "metadata": {
+    "name": "cell78",
+    "language": "python"
+   },
    "outputs": [
     {
      "data": {
@@ -5203,20 +5529,26 @@
    ],
    "source": [
     "train_data.head(10)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000077"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell79"
+   },
    "source": [
     "# 5. Modelling"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000078"
   },
   {
    "cell_type": "code",
    "execution_count": 62,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell80",
+    "language": "python"
    },
    "outputs": [],
    "source": [
@@ -5228,12 +5560,16 @@
     "from sklearn.svm import SVC\n",
     "\n",
     "import numpy as np"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000079"
   },
   {
    "cell_type": "code",
    "execution_count": 63,
-   "metadata": {},
+   "metadata": {
+    "name": "cell81",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -5258,32 +5594,42 @@
    ],
    "source": [
     "train.info()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000080"
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "name": "cell82"
+   },
    "source": [
-    "# 6.Cross Validation(k-fold)"
+    "# 6. Cross Validation (k-fold)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000081"
   },
   {
    "cell_type": "code",
    "execution_count": 64,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell83",
+    "language": "python"
    },
    "outputs": [],
    "source": [
     "from sklearn.model_selection import KFold\n",
     "from sklearn.model_selection import cross_val_score\n",
     "k_fold = KFold(n_splits=10, shuffle=True, random_state=0)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000082"
   },
   {
    "cell_type": "code",
    "execution_count": 65,
-   "metadata": {},
+   "metadata": {
+    "name": "cell84",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -5299,12 +5645,16 @@
     "scoring = 'accuracy'\n",
     "score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)\n",
     "print(score)"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000083"
   },
   {
    "cell_type": "code",
    "execution_count": 66,
-   "metadata": {},
+   "metadata": {
+    "name": "cell85",
+    "language": "python"
+   },
    "outputs": [
     {
      "name": "stdout",
@@ -5335,13 +5685,16 @@
     "#     round(np.mean(score)*100,2)\n",
     "#     print(\"Score of :\\n\",score)\n",
     "model_fit()"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000084"
   },
   {
    "cell_type": "code",
    "execution_count": 67,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell86",
+    "language": "python"
    },
    "outputs": [],
    "source": [
@@ -5351,55 +5704,35 @@
     "test_data = test.drop(['Survived','PassengerId'], axis=1)\n",
     "prediction = clf1.predict(test_data)\n",
     "# test_data\n"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000085"
   },
   {
    "cell_type": "code",
    "execution_count": 73,
-   "metadata": {},
+   "metadata": {
+    "name": "cell87",
+    "language": "python"
+   },
    "outputs": [],
    "source": [
     "test_data['Survived'] = prediction\n",
     "submission = pd.DataFrame(test['PassengerId'],test_data['Survived'])\n",
     "submission.to_csv(\"Submission.csv\")"
-   ]
+   ],
+   "id": "ce110000-1111-2222-3333-ffffff000086"
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {
-    "collapsed": true
+    "collapsed": true,
+    "name": "cell88",
+    "language": "python"
    },
    "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.6.3"
-  },
-  "widgets": {
-   "application/vnd.jupyter.widget-state+json": {
-    "state": {},
-    "version_major": 2,
-    "version_minor": 0
-   }
+   "source": [],
+   "id": "ce110000-1111-2222-3333-ffffff000087"
   }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
+ ]
+}
\ No newline at end of file