From 209c245e6abf44af5de5bb841b9b98a77705afcd Mon Sep 17 00:00:00 2001 From: Haebichan Jung Date: Wed, 16 Apr 2025 23:51:04 +0000 Subject: [PATCH] first push --- Titanic Survival Analysis.ipynb | 745 +++++++++++++++++++++++--------- 1 file changed, 539 insertions(+), 206 deletions(-) diff --git a/Titanic Survival Analysis.ipynb b/Titanic Survival Analysis.ipynb index 87b2e29..5d38352 100644 --- a/Titanic Survival Analysis.ipynb +++ b/Titanic Survival Analysis.ipynb @@ -1,16 +1,58 @@ { + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + }, + "lastEditStatus": { + "notebookId": "7gb7mzqm464eijchkhhv", + "authorId": "8619036689116", + "authorName": "HAEBICHAN", + "authorEmail": "Haebichan.jung@snowflake.com", + "sessionId": "975dcb98-b50a-46a9-80a0-9307d4737c7a", + "lastEditTime": 1744846912042 + } + }, + "nbformat_minor": 2, + "nbformat": 4, "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell1" + }, "source": [ "

Titanic Passanger Survival Analysis

" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000000" }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "name": "cell2", + "language": "python" + }, "outputs": [ { "data": { @@ -29,36 +71,61 @@ "source": [ "from IPython.display import Image\n", "Image(url= \"https://static1.squarespace.com/static/5006453fe4b09ef2252ba068/5095eabce4b06cb305058603/5095eabce4b02d37bef4c24c/1352002236895/100_anniversary_titanic_sinking_by_esai8mellows-d4xbme8.jpg\")" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000001" }, { "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell3", + "language": "python", + "codeCollapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000002" }, { "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell4", + "language": "python", + "codeCollapsed": false }, "outputs": [], "source": [ "train = pd.read_csv(\"input/train.csv\")\n", "test = pd.read_csv(\"input/test.csv\")" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000003" + }, + { + "cell_type": "code", + "id": "c6323e0b-98e2-4139-bb81-e131aa27902a", + "metadata": { + "language": "python", + "name": "cell89", + "codeCollapsed": false + }, + "outputs": [], + "source": "test.head()", + "execution_count": null }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "metadata": { + "name": "cell5", + "language": "python", + "codeCollapsed": false + }, "outputs": [ { "name": "stdout", @@ -74,12 +141,16 @@ "print(\"Train Shape:\",train.shape)\n", "test.isnull().sum()\n", "print(\"Test Shape:\",test.shape)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000004" }, { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "name": "cell6", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -107,12 +178,16 @@ ], "source": [ "train.info()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000005" }, { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "name": "cell7", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -139,11 +214,14 @@ ], "source": [ "test.info()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000006" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell8" + }, "source": [ "### Data Dictionary\n", "\n", @@ -158,12 +236,16 @@ "**Total rows and columns**\n", "\n", "We can see that there are 891 rows and 12 columns in our training dataset." - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000007" }, { "cell_type": "code", "execution_count": 7, - "metadata": {}, + "metadata": { + "name": "cell9", + "language": "python" + }, "outputs": [ { "data": { @@ -400,12 +482,16 @@ ], "source": [ "train.head(10)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000008" }, { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "name": "cell10", + "language": "python" + }, "outputs": [ { "data": { @@ -551,12 +637,16 @@ ], "source": [ "train.describe()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000009" }, { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "name": "cell11", + "language": "python" + }, "outputs": [ { "data": { @@ -683,12 +773,16 @@ ], "source": [ "test.describe()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000010" }, { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "name": "cell12", + "language": "python" + }, "outputs": [ { "data": { @@ -715,12 +809,16 @@ ], "source": [ "train.isnull().sum()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000011" }, { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "metadata": { + "name": "cell13", + "language": "python" + }, "outputs": [ { "data": { @@ -862,20 +960,26 @@ "test.isnull().sum()\n", "test[\"Survived\"] = \"\"\n", "test.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000012" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell14" + }, "source": [ "# Data Visualization using Matplotlib and Seaborn packages." - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000013" }, { "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell15", + "language": "python" }, "outputs": [], "source": [ @@ -883,11 +987,14 @@ "%matplotlib inline\n", "import seaborn as sns\n", "sns.set() # setting seaborn default for plots" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000014" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell16" + }, "source": [ "# Bar Chart for Categorical Features \n", "\n", @@ -897,13 +1004,16 @@ "* Parch ( # of parents and children)\n", "* Embarked\n", "* Cabin" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000015" }, { "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell17", + "language": "python" }, "outputs": [], "source": [ @@ -913,12 +1023,16 @@ " df = pd.DataFrame([survived,dead])\n", " df.index = ['Survived','Dead']\n", " df.plot(kind='bar',stacked=True, figsize=(10,5))" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000016" }, { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "name": "cell18", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -949,19 +1063,26 @@ "bar_chart('Sex')\n", "print(\"Survived :\\n\",train[train['Survived']==1]['Sex'].value_counts())\n", "print(\"Dead:\\n\",train[train['Survived']==0]['Sex'].value_counts())" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000017" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell19" + }, "source": [ "The Chart confirms **Women more likely survivied than Men**." - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000018" }, { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "name": "cell20", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -994,20 +1115,27 @@ "bar_chart('Pclass')\n", "print(\"Survived :\\n\",train[train['Survived']==1]['Pclass'].value_counts())\n", "print(\"Dead:\\n\",train[train['Survived']==0]['Pclass'].value_counts())" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000019" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell21" + }, "source": [ "The Chart confirms **1st class** more likely survivied than **other classes**. \n", "The Chart confirms **3rd class** more likely dead than **other classes**" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000020" }, { "cell_type": "code", "execution_count": 16, - "metadata": {}, + "metadata": { + "name": "cell22", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -1046,20 +1174,27 @@ "bar_chart('SibSp')\n", "print(\"Survived :\\n\",train[train['Survived']==1]['SibSp'].value_counts())\n", "print(\"Dead:\\n\",train[train['Survived']==0]['SibSp'].value_counts())" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000021" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell23" + }, "source": [ "The Chart confirms a **person aboarded with more than 2 siblings or spouse** more likely survived. \n", "The Chart confirms a **person aboarded without siblings or spouse** more likely dead" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000022" }, { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "name": "cell24", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -1098,20 +1233,27 @@ "bar_chart('Parch')\n", "print(\"Survived :\\n\",train[train['Survived']==1]['Parch'].value_counts())\n", "print(\"Dead:\\n\",train[train['Survived']==0]['Parch'].value_counts())" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000023" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell25" + }, "source": [ "The Chart confirms a **person aboarded with more than 2 parents or children more likely survived.** \n", "The Chart confirms a **person aboarded alone more likely dead**" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000024" }, { "cell_type": "code", "execution_count": 18, - "metadata": {}, + "metadata": { + "name": "cell26", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -1144,27 +1286,36 @@ "bar_chart('Embarked')\n", "print(\"Survived :\\n\",train[train['Survived']==1]['Embarked'].value_counts())\n", "print(\"Dead:\\n\",train[train['Survived']==0]['Embarked'].value_counts())" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000025" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell27" + }, "source": [ "The Chart confirms a **person aboarded from C** slightly more likely survived. \n", "The Chart confirms a **person aboarded from Q** more likely dead. \n", "The Chart confirms a **person aboarded from S** more likely dead. " - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000026" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell28" + }, "source": [ "## 4. Feature engineering" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000027" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell29" + }, "source": [ "Feature engineering is the process of using domain knowledge of the data\n", "to create features (**feature vectors**) that make machine learning algorithms work. \n", @@ -1172,12 +1323,16 @@ "feature vector is an n-dimensional vector of numerical features that represent some object.\n", "Many algorithms in machine learning require a numerical representation of objects,\n", "since such representations facilitate processing and statistical analysis." - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000028" }, { "cell_type": "code", "execution_count": 19, - "metadata": {}, + "metadata": { + "name": "cell30", + "language": "python" + }, "outputs": [ { "data": { @@ -1324,19 +1479,26 @@ ], "source": [ "train.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000029" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell31" + }, "source": [ "#### 4.1 how titanic sank?" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000030" }, { "cell_type": "code", "execution_count": 20, - "metadata": {}, + "metadata": { + "name": "cell32", + "language": "python" + }, "outputs": [ { "data": { @@ -1354,12 +1516,16 @@ ], "source": [ "Image(url= \"https://static1.squarespace.com/static/5006453fe4b09ef2252ba068/t/5090b249e4b047ba54dfd258/1351660113175/TItanic-Survival-Infographic.jpg?format=1500w\")" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000031" }, { "cell_type": "code", "execution_count": 21, - "metadata": {}, + "metadata": { + "name": "cell33", + "language": "python" + }, "outputs": [ { "data": { @@ -1596,13 +1762,16 @@ ], "source": [ "train.head(10)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000032" }, { "cell_type": "code", "execution_count": 22, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell34", + "language": "python" }, "outputs": [], "source": [ @@ -1610,12 +1779,16 @@ "\n", "for dataset in train_test_data:\n", " dataset['Title'] = dataset['Name'].str.extract(' ([A-Za-z]+)\\.', expand=False)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000033" }, { "cell_type": "code", "execution_count": 23, - "metadata": {}, + "metadata": { + "name": "cell35", + "language": "python" + }, "outputs": [ { "data": { @@ -1647,12 +1820,16 @@ ], "source": [ "train['Title'].value_counts()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000034" }, { "cell_type": "code", "execution_count": 24, - "metadata": {}, + "metadata": { + "name": "cell36", + "language": "python" + }, "outputs": [ { "data": { @@ -1676,11 +1853,14 @@ ], "source": [ "test['Title'].value_counts()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000035" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell37" + }, "source": [ "#### Title Map\n", "\n", @@ -1688,13 +1868,16 @@ "Miss : 1 \n", "Mrs: 2 \n", "Others: 3 " - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000036" }, { "cell_type": "code", "execution_count": 25, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell38", + "language": "python" }, "outputs": [], "source": [ @@ -1704,12 +1887,16 @@ "\n", "for dataset in train_test_data:\n", " dataset['Title'] = dataset[\"Title\"].map(title_mapping)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000037" }, { "cell_type": "code", "execution_count": 26, - "metadata": {}, + "metadata": { + "name": "cell39", + "language": "python" + }, "outputs": [ { "data": { @@ -1855,12 +2042,16 @@ ], "source": [ "dataset.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000038" }, { "cell_type": "code", "execution_count": 27, - "metadata": {}, + "metadata": { + "name": "cell40", + "language": "python" + }, "outputs": [ { "data": { @@ -2006,12 +2197,16 @@ ], "source": [ "test.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000039" }, { "cell_type": "code", "execution_count": 28, - "metadata": {}, + "metadata": { + "name": "cell41", + "language": "python" + }, "outputs": [ { "data": { @@ -2026,25 +2221,32 @@ ], "source": [ "bar_chart('Title')" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000040" }, { "cell_type": "code", "execution_count": 29, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell42", + "language": "python" }, "outputs": [], "source": [ "# delete unnecessary feature from dataset\n", "train.drop('Name', axis=1, inplace=True)\n", "test.drop('Name', axis=1, inplace=True)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000041" }, { "cell_type": "code", "execution_count": 30, - "metadata": {}, + "metadata": { + "name": "cell43", + "language": "python" + }, "outputs": [ { "data": { @@ -2184,25 +2386,32 @@ ], "source": [ "train.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000042" }, { "cell_type": "code", "execution_count": 31, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell44", + "language": "python" }, "outputs": [], "source": [ "sex_mapping = {\"male\": 0, \"female\": 1}\n", "for dataset in train_test_data:\n", " dataset['Sex'] = dataset['Sex'].map(sex_mapping)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000043" }, { "cell_type": "code", "execution_count": 32, - "metadata": {}, + "metadata": { + "name": "cell45", + "language": "python" + }, "outputs": [ { "data": { @@ -2217,12 +2426,16 @@ ], "source": [ "bar_chart('Sex')" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000044" }, { "cell_type": "code", "execution_count": 33, - "metadata": {}, + "metadata": { + "name": "cell46", + "language": "python" + }, "outputs": [ { "data": { @@ -2362,24 +2575,31 @@ ], "source": [ "test.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000045" }, { "cell_type": "code", "execution_count": 34, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell47", + "language": "python" }, "outputs": [], "source": [ "train[\"Age\"].fillna(train.groupby(\"Title\")[\"Age\"].transform(\"median\"), inplace= True)\n", "test[\"Age\"].fillna(test.groupby('Title')['Age'].transform(\"median\"), inplace= True)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000046" }, { "cell_type": "code", "execution_count": 35, - "metadata": {}, + "metadata": { + "name": "cell48", + "language": "python" + }, "outputs": [ { "data": { @@ -2945,12 +3165,16 @@ "source": [ "train.head(30)\n", "#train.groupby(\"Title\")[\"Age\"].transform(\"median\")" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000047" }, { "cell_type": "code", "execution_count": 36, - "metadata": {}, + "metadata": { + "name": "cell49", + "language": "python" + }, "outputs": [ { "data": { @@ -2995,19 +3219,26 @@ "facet.set(xlim=(0, train['Age'].max()))\n", "facet.add_legend() \n", "plt.xlim(10,50)\n" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000048" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell50" + }, "source": [ "Those who were **20 to 30 years old** were **more dead and more survived.**" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000049" }, { "cell_type": "code", "execution_count": 37, - "metadata": {}, + "metadata": { + "name": "cell51", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -3053,11 +3284,14 @@ "source": [ "train.info()\n", "test.info()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000050" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell52" + }, "source": [ "**Binning**\n", "\n", @@ -3069,12 +3303,16 @@ "* adult: 2\n", "* mid-age: 3\n", "* senior: 4" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000051" }, { "cell_type": "code", "execution_count": 38, - "metadata": {}, + "metadata": { + "name": "cell53", + "language": "python" + }, "outputs": [ { "data": { @@ -3214,13 +3452,16 @@ ], "source": [ "train.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000052" }, { "cell_type": "code", "execution_count": 39, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell54", + "language": "python" }, "outputs": [], "source": [ @@ -3233,12 +3474,16 @@ "# for dataset in train_test_data:\n", "# dataset.loc[]\n", "#train[train['Age'].isin([23])]" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000053" }, { "cell_type": "code", "execution_count": 40, - "metadata": {}, + "metadata": { + "name": "cell55", + "language": "python" + }, "outputs": [ { "data": { @@ -3254,12 +3499,16 @@ "source": [ "train.head()\n", "bar_chart('Age')" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000054" }, { "cell_type": "code", "execution_count": 41, - "metadata": {}, + "metadata": { + "name": "cell56", + "language": "python" + }, "outputs": [ { "data": { @@ -3304,35 +3553,45 @@ "print(\"Pclass1:\\n\",Pclass1)\n", "print(\"Pclass2:\\n\",Pclass2)\n", "print(\"Pclass3:\\n\",Pclass3)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000055" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell57" + }, "source": [ "more than 50 % of 1st class are from S embark. \n", "more than 50 % of 2st class are from S embark. \n", "more than 50 % of 3st class are from S embark. \n", "\n", "**fill out missing embark with S embark**" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000056" }, { "cell_type": "code", "execution_count": 42, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell58", + "language": "python" }, "outputs": [], "source": [ "for dataset in train_test_data:\n", " dataset['Embarked'] = dataset['Embarked'].fillna('S')" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000057" }, { "cell_type": "code", "execution_count": 43, - "metadata": {}, + "metadata": { + "name": "cell59", + "language": "python" + }, "outputs": [ { "data": { @@ -3472,25 +3731,32 @@ ], "source": [ "train.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000058" }, { "cell_type": "code", "execution_count": 44, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell60", + "language": "python" }, "outputs": [], "source": [ "embarked_mapping = {'S':0,'C':1,'Q':2}\n", "for dataset in train_test_data:\n", " dataset['Embarked'] = dataset['Embarked'].map(embarked_mapping)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000059" }, { "cell_type": "code", "execution_count": 45, - "metadata": {}, + "metadata": { + "name": "cell61", + "language": "python" + }, "outputs": [ { "data": { @@ -4404,12 +4670,16 @@ "train[\"Fare\"].fillna(train.groupby(\"Pclass\")[\"Fare\"].transform(\"median\"), inplace=True)\n", "test[\"Fare\"].fillna(test.groupby(\"Pclass\")[\"Fare\"].transform(\"median\"), inplace=True)\n", "train.head(50)\n" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000060" }, { "cell_type": "code", "execution_count": 46, - "metadata": {}, + "metadata": { + "name": "cell62", + "language": "python" + }, "outputs": [ { "data": { @@ -4428,12 +4698,16 @@ "facet.set(xlim = (0, train['Fare'].max()))\n", "facet.add_legend()\n", "plt.show()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000061" }, { "cell_type": "code", "execution_count": 47, - "metadata": {}, + "metadata": { + "name": "cell63", + "language": "python" + }, "outputs": [ { "data": { @@ -4462,13 +4736,16 @@ "facet.set(xlim=(0, train['Fare'].max()))\n", "facet.add_legend()\n", "plt.xlim(0, 20)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000062" }, { "cell_type": "code", "execution_count": 48, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell64", + "language": "python" }, "outputs": [], "source": [ @@ -4477,12 +4754,16 @@ " dataset.loc[(dataset['Fare'] > 17) & (dataset['Fare'] <= 30), 'Fare'] = 1,\n", " dataset.loc[(dataset['Fare'] > 30) & (dataset['Fare'] <= 100), 'Fare'] = 2,\n", " dataset.loc[dataset['Fare'] >= 100, 'Fare'] = 3" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000063" }, { "cell_type": "code", "execution_count": 49, - "metadata": {}, + "metadata": { + "name": "cell65", + "language": "python" + }, "outputs": [ { "data": { @@ -4622,12 +4903,16 @@ ], "source": [ "train.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000064" }, { "cell_type": "code", "execution_count": 50, - "metadata": {}, + "metadata": { + "name": "cell66", + "language": "python" + }, "outputs": [ { "data": { @@ -4703,24 +4988,31 @@ ], "source": [ "train.Cabin.value_counts()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000065" }, { "cell_type": "code", "execution_count": 51, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell67", + "language": "python" }, "outputs": [], "source": [ "for dataset in train_test_data:\n", " dataset['Cabin'] = dataset['Cabin'].str[:1]" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000066" }, { "cell_type": "code", "execution_count": 52, - "metadata": {}, + "metadata": { + "name": "cell68", + "language": "python" + }, "outputs": [ { "data": { @@ -4750,57 +5042,73 @@ "df = pd.DataFrame([Pclass1, Pclass2, Pclass3])\n", "df.index = ['1st class','2nd class', '3rd class']\n", "df.plot(kind='bar',stacked=True, figsize=(10,5))" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000067" }, { "cell_type": "code", "execution_count": 53, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell69", + "language": "python" }, "outputs": [], "source": [ "cabin_mapping = {\"A\": 0, \"B\": 0.4, \"C\": 0.8, \"D\": 1.2, \"E\": 1.6, \"F\": 2, \"G\": 2.4, \"T\": 2.8}\n", "for dataset in train_test_data:\n", " dataset['Cabin'] = dataset['Cabin'].map(cabin_mapping)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000068" }, { "cell_type": "code", "execution_count": 54, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell70", + "language": "python" }, "outputs": [], "source": [ "# fill missing Fare with median fare for each Pclass\n", "train[\"Cabin\"].fillna(train.groupby(\"Pclass\")[\"Cabin\"].transform(\"median\"), inplace=True)\n", "test[\"Cabin\"].fillna(test.groupby(\"Pclass\")[\"Cabin\"].transform(\"median\"), inplace=True)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000069" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell71" + }, "source": [ "**family Size**" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000070" }, { "cell_type": "code", "execution_count": 55, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell72", + "language": "python" }, "outputs": [], "source": [ "train[\"FamilySize\"] = train[\"SibSp\"] + train[\"Parch\"] + 1\n", "test[\"FamilySize\"] = test[\"SibSp\"] + test[\"Parch\"] + 1" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000071" }, { "cell_type": "code", "execution_count": 56, - "metadata": {}, + "metadata": { + "name": "cell73", + "language": "python" + }, "outputs": [ { "data": { @@ -4829,25 +5137,32 @@ "facet.set(xlim=(0, train['FamilySize'].max()))\n", "facet.add_legend()\n", "plt.xlim(0)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000072" }, { "cell_type": "code", "execution_count": 57, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell74", + "language": "python" }, "outputs": [], "source": [ "family_mapping = {1: 0, 2: 0.4, 3: 0.8, 4: 1.2, 5: 1.6, 6: 2, 7: 2.4, 8: 2.8, 9: 3.2, 10: 3.6, 11: 4}\n", "for dataset in train_test_data:\n", " dataset['FamilySize'] = dataset['FamilySize'].map(family_mapping)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000073" }, { "cell_type": "code", "execution_count": 58, - "metadata": {}, + "metadata": { + "name": "cell75", + "language": "python" + }, "outputs": [ { "data": { @@ -4993,13 +5308,16 @@ ], "source": [ "train.head()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000074" }, { "cell_type": "code", "execution_count": 59, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell76", + "language": "python" }, "outputs": [], "source": [ @@ -5007,12 +5325,16 @@ "train = train.drop(features_drop, axis = 1)\n", "test = test.drop(features_drop,axis=1)\n", "train = train.drop(['PassengerId'], axis=1)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000075" }, { "cell_type": "code", "execution_count": 60, - "metadata": {}, + "metadata": { + "name": "cell77", + "language": "python" + }, "outputs": [ { "data": { @@ -5029,12 +5351,16 @@ "train_data = train.drop('Survived', axis = 1)\n", "target = train['Survived']\n", "train_data.shape, target.shape" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000076" }, { "cell_type": "code", "execution_count": 61, - "metadata": {}, + "metadata": { + "name": "cell78", + "language": "python" + }, "outputs": [ { "data": { @@ -5203,20 +5529,26 @@ ], "source": [ "train_data.head(10)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000077" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell79" + }, "source": [ "# 5. Modelling" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000078" }, { "cell_type": "code", "execution_count": 62, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell80", + "language": "python" }, "outputs": [], "source": [ @@ -5228,12 +5560,16 @@ "from sklearn.svm import SVC\n", "\n", "import numpy as np" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000079" }, { "cell_type": "code", "execution_count": 63, - "metadata": {}, + "metadata": { + "name": "cell81", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -5258,32 +5594,42 @@ ], "source": [ "train.info()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000080" }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "name": "cell82" + }, "source": [ "# 6.Cross Validation(k-fold)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000081" }, { "cell_type": "code", "execution_count": 64, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell83", + "language": "python" }, "outputs": [], "source": [ "from sklearn.model_selection import KFold\n", "from sklearn.model_selection import cross_val_score\n", "k_fold = KFold(n_splits=10, shuffle=True, random_state=0)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000082" }, { "cell_type": "code", "execution_count": 65, - "metadata": {}, + "metadata": { + "name": "cell84", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -5299,12 +5645,16 @@ "scoring = 'accuracy'\n", "score = cross_val_score(clf, train_data, target, cv=k_fold, n_jobs=1, scoring=scoring)\n", "print(score)" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000083" }, { "cell_type": "code", "execution_count": 66, - "metadata": {}, + "metadata": { + "name": "cell85", + "language": "python" + }, "outputs": [ { "name": "stdout", @@ -5335,13 +5685,16 @@ "# round(np.mean(score)*100,2)\n", "# print(\"Score of :\\n\",score)\n", "model_fit()" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000084" }, { "cell_type": "code", "execution_count": 67, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell86", + "language": "python" }, "outputs": [], "source": [ @@ -5351,55 +5704,35 @@ "test_data = test.drop(['Survived','PassengerId'], axis=1)\n", "prediction = clf1.predict(test_data)\n", "# test_data\n" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000085" }, { "cell_type": "code", "execution_count": 73, - "metadata": {}, + "metadata": { + "name": "cell87", + "language": "python" + }, "outputs": [], "source": [ "test_data['Survived'] = prediction\n", "submission = pd.DataFrame(test['PassengerId'],test_data['Survived'])\n", "submission.to_csv(\"Submission.csv\")" - ] + ], + "id": "ce110000-1111-2222-3333-ffffff000086" }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "name": "cell88", + "language": "python" }, "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.3" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 2, - "version_minor": 0 - } + "source": [], + "id": "ce110000-1111-2222-3333-ffffff000087" } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + ] +} \ No newline at end of file