diff --git a/.DS_Store b/.DS_Store index 01222e6..b9a5256 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb new file mode 100644 index 0000000..6ab5f7b --- /dev/null +++ b/.ipynb_checkpoints/lab-dw-data-structuring-and-combining-checkpoint.ipynb @@ -0,0 +1,1460 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e", + "metadata": { + "id": "25d7736c-ba17-4aff-b6bb-66eba20fbf4e" + }, + "source": [ + "# Lab | Data Structuring and Combining Data" + ] + }, + { + "cell_type": "markdown", + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986", + "metadata": { + "id": "a2cdfc70-44c8-478c-81e7-2bc43fdf4986" + }, + "source": [ + "## Challenge 1: Combining & Cleaning Data\n", + "\n", + "In this challenge, we will be working with the customer data from an insurance company, as we did in the two previous labs. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv\n", + "\n", + "But this time, we got new data, which can be found in the following 2 CSV files located at the links below.\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2.csv\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3.csv\n", + "\n", + "Note that you'll need to clean and format the new data.\n", + "\n", + "Observation:\n", + "- One option is to first combine the three datasets and then apply the cleaning function to the new combined dataset\n", + "- Another option would be to read the clean file you saved in the previous lab, and just clean the two new files and concatenate the three clean datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2dc256fc-5d53-4481-a68f-fe0a84edf051", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonFMaster779946010000Personal AutoFour-Door Car2
1QZ44356ArizonaFBachelor6979530940Personal AutoFour-Door Car1131
2AI49188NevadaFBachelor1288743487671080Personal AutoTwo-Door Car566
3WW63253CaliforniaMBachelor76458601060Corporate AutoSUV529
4GA49547WashingtonMHigh School or Below53630736357680Personal AutoFour-Door Car17
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington F Master 779946 \n", + "1 QZ44356 Arizona F Bachelor 697953 \n", + "2 AI49188 Nevada F Bachelor 1288743 \n", + "3 WW63253 California M Bachelor 764586 \n", + "4 GA49547 Washington M High School or Below 536307 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0 1000 0 Personal Auto \n", + "1 0 94 0 Personal Auto \n", + "2 48767 108 0 Personal Auto \n", + "3 0 106 0 Corporate Auto \n", + "4 36357 68 0 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2 \n", + "1 Four-Door Car 1131 \n", + "2 Two-Door Car 566 \n", + "3 SUV 529 \n", + "4 Four-Door Car 17 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1_clean.csv\")\n", + "df1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "484c7c3e-f186-4968-8cb4-08d94813646c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2_clean.csv\")\n", + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3570b057-a6f8-4f98-b654-7d06231a276a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3_clean.csv\")\n", + "df3.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "492d06e3-92c7-4105-ac72-536db98d3244", + "metadata": { + "id": "492d06e3-92c7-4105-ac72-536db98d3244" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonFMaster7799460.01000.00Personal AutoFour-Door Car2.0
1QZ44356ArizonaFBachelor6979530.094.00Personal AutoFour-Door Car1131.0
2AI49188NevadaFBachelor128874348767.0108.00Personal AutoTwo-Door Car566.0
3WW63253CaliforniaMBachelor7645860.0106.00Corporate AutoSUV529.0
4GA49547WashingtonMHigh School or Below53630736357.068.00Personal AutoFour-Door Car17.0
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington F Master 779946 \n", + "1 QZ44356 Arizona F Bachelor 697953 \n", + "2 AI49188 Nevada F Bachelor 1288743 \n", + "3 WW63253 California M Bachelor 764586 \n", + "4 GA49547 Washington M High School or Below 536307 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0.0 1000.0 0 Personal Auto \n", + "1 0.0 94.0 0 Personal Auto \n", + "2 48767.0 108.0 0 Personal Auto \n", + "3 0.0 106.0 0 Corporate Auto \n", + "4 36357.0 68.0 0 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2.0 \n", + "1 Four-Door Car 1131.0 \n", + "2 Two-Door Car 566.0 \n", + "3 SUV 529.0 \n", + "4 Four-Door Car 17.0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# First option (clean 3 files together)\n", + "\n", + "df2 = df2.rename(columns={\"ST\": \"state\", \"GENDER\": \"gender\"})\n", + "df3 = df3.rename(columns={\"ST\": \"state\", \"GENDER\": \"gender\"})\n", + "df2.columns = [col.lower().replace(\" \", \"_\") for col in df2.columns] \n", + "df3.columns = [col.lower().replace(\" \", \"_\") for col in df3.columns]\n", + "\n", + "df = pd.concat([df1, df2, df3], ignore_index=True)\n", + "df = df.round(2)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "591d2c33-05a0-4844-bb25-e7391cec5065", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "gender\n", + "F 13845\n", + "M 13143\n", + "Male 80\n", + "female 60\n", + "Femal 34\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.gender.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5e2047db-c55a-4f4a-a55b-3108bc36b27c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "California 9215\n", + "Oregon 7803\n", + "Arizona 4963\n", + "Nevada 2646\n", + "Washington 2331\n", + "Cali 240\n", + "AZ 148\n", + "WA 60\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.state.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "81bdbed7-a4e7-4c06-a070-79628b5fcb99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gender: ['F' 'M' nan]\n", + "education: ['Master' 'Bachelor' 'High School or Below' 'College' 'Doctor' nan]\n", + "state: ['Washington' 'Arizona' 'Nevada' 'California' 'Oregon' nan]\n" + ] + } + ], + "source": [ + "# Define a dictionary for each column to map inconsistent values to consistent ones\n", + "gender_dict ={\"Male\": \"M\", \"female\": \"F\", \"Female\": \"F\", \"Femal\": \"F\"}\n", + "education_dict = {\"Bachelors\": \"Bachelor\"}\n", + "\n", + "# Define a dictionary mapping state abbreviations to state names\n", + "state_dict = {\"AZ\": \"Arizona\", \"Cali\": \"California\", \"WA\": \"Washington\"}\n", + "\n", + "# Create a dictionary to store the column-specific dictionaries\n", + "column_dicts = {\"gender\": gender_dict, \"education\": education_dict, \"state\": state_dict}\n", + "\n", + "# Loop through each column in the dataset and clean the inconsistent values using the corresponding dictionary mapping\n", + "for col in column_dicts:\n", + " df[col] = df[col].replace(column_dicts[col])\n", + "\n", + "# Verify that each column contains only consistent values\n", + "for col in column_dicts:\n", + " print(col + \":\", df[col].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "204b6349-d9c3-4c47-a378-8f0e450149f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "vehicle_class\n", + "Four-Door Car 13920\n", + "Two-Door Car 5687\n", + "SUV 5321\n", + "Luxury 2478\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "luxury = [\"Sports Car\", \"Luxury SUV\", \"Luxury Car\"]\n", + "df[\"vehicle_class\"] = df[\"vehicle_class\"].apply(lambda x: \"Luxury\" if x in luxury else x)\n", + "df.vehicle_class.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57", + "metadata": { + "id": "31b8a9e7-7db9-4604-991b-ef6771603e57" + }, + "source": [ + "# Challenge 2: Structuring Data" + ] + }, + { + "cell_type": "markdown", + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b", + "metadata": { + "id": "a877fd6d-7a0c-46d2-9657-f25036e4ca4b" + }, + "source": [ + "In this challenge, we will continue to work with customer data from an insurance company, but we will use a dataset with more columns, called marketing_customer_analysis.csv, which can be found at the following link:\n", + "\n", + "https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\n", + "\n", + "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by performing data cleaning, formatting, and structuring." + ] + }, + { + "cell_type": "markdown", + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", + "metadata": { + "id": "df35fd0d-513e-4e77-867e-429da10a9cc7" + }, + "source": [ + "1. You work at the marketing department and you want to know which sales channel brought the most sales in terms of total revenue. Using pivot, create a summary table showing the total revenue for each sales channel (branch, call center, web, and mail).\n", + "Round the total revenue to 2 decimal points. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "markdown", + "id": "640993b2-a291-436c-a34d-a551144f8196", + "metadata": { + "id": "640993b2-a291-436c-a34d-a551144f8196" + }, + "source": [ + "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b07f3ebb-eb5a-484f-be59-486ad88e0180", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", + "metadata": { + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_claim_amount
sales_channel
Agent1810226.82
Branch1301204.00
Call Center926600.82
Web706600.04
\n", + "
" + ], + "text/plain": [ + " total_claim_amount\n", + "sales_channel \n", + "Agent 1810226.82\n", + "Branch 1301204.00\n", + "Call Center 926600.82\n", + "Web 706600.04" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.pivot_table(values='total_claim_amount', index='sales_channel', aggfunc='sum').round(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "697997b9-86ea-416f-9982-d49f54321a1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_lifetime_value
gendereducation
FBachelor7874.27
College7748.82
Doctor7328.51
High School or Below8675.22
Master8157.05
MBachelor7703.60
College8052.46
Doctor7415.33
High School or Below8149.69
Master8168.83
\n", + "
" + ], + "text/plain": [ + " customer_lifetime_value\n", + "gender education \n", + "F Bachelor 7874.27\n", + " College 7748.82\n", + " Doctor 7328.51\n", + " High School or Below 8675.22\n", + " Master 8157.05\n", + "M Bachelor 7703.60\n", + " College 8052.46\n", + " Doctor 7415.33\n", + " High School or Below 8149.69\n", + " Master 8168.83" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_table = pd.pivot_table(df, values='customer_lifetime_value', index=['gender', 'education'], aggfunc='mean').round(2)\n", + "\n", + "pivot_table" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "946c0cf9-a602-499c-9a6f-0c08deec85c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_lifetime_value
genderFM
education
Bachelor7874.277703.60
College7748.828052.46
Doctor7328.517415.33
High School or Below8675.228149.69
Master8157.058168.83
\n", + "
" + ], + "text/plain": [ + " customer_lifetime_value \n", + "gender F M\n", + "education \n", + "Bachelor 7874.27 7703.60\n", + "College 7748.82 8052.46\n", + "Doctor 7328.51 7415.33\n", + "High School or Below 8675.22 8149.69\n", + "Master 8157.05 8168.83" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#another way\n", + "pivot_table_2 = round(df.pivot_table(index=\"education\", columns=\"gender\", values=[\"customer_lifetime_value\"]), 2)\n", + "pivot_table_2.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", + "metadata": { + "id": "32c7f2e5-3d90-43e5-be33-9781b6069198" + }, + "source": [ + "## Bonus\n", + "\n", + "You work at the customer service department and you want to know which months had the highest number of complaints by policy type category. Create a summary table showing the number of complaints by policy type and month.\n", + "Show it in a long format table." + ] + }, + { + "cell_type": "markdown", + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291", + "metadata": { + "id": "e3d09a8f-953c-448a-a5f8-2e5a8cca7291" + }, + "source": [ + "*In data analysis, a long format table is a way of structuring data in which each observation or measurement is stored in a separate row of the table. The key characteristic of a long format table is that each column represents a single variable, and each row represents a single observation of that variable.*\n", + "\n", + "*More information about long and wide format tables here: https://www.statology.org/long-vs-wide-data/*" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "3a069e0b-b400-470e-904d-d17582191be4", + "metadata": { + "id": "3a069e0b-b400-470e-904d-d17582191be4" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
monthpolicy_type0
01Corporate Auto1252
11Personal Auto4329
21Special Auto237
32Corporate Auto1089
42Personal Auto3799
52Special Auto204
\n", + "
" + ], + "text/plain": [ + " month policy_type 0\n", + "0 1 Corporate Auto 1252\n", + "1 1 Personal Auto 4329\n", + "2 1 Special Auto 237\n", + "3 2 Corporate Auto 1089\n", + "4 2 Personal Auto 3799\n", + "5 2 Special Auto 204" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a pivot table with the number of open complaints by month and policy type\n", + "pivot_table = pd.pivot_table(df, values='number_of_open_complaints', index=['month'], columns=['policy_type'], aggfunc='count', fill_value=0)\n", + "\n", + "# Stack the pivot table to create a long format table\n", + "pivot_table.stack().reset_index()" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/lab-dw-data-structuring-and-combining.ipynb b/lab-dw-data-structuring-and-combining.ipynb index ec4e3f9..6ab5f7b 100644 --- a/lab-dw-data-structuring-and-combining.ipynb +++ b/lab-dw-data-structuring-and-combining.ipynb @@ -36,14 +36,727 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, + "id": "2dc256fc-5d53-4481-a68f-fe0a84edf051", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonFMaster779946010000Personal AutoFour-Door Car2
1QZ44356ArizonaFBachelor6979530940Personal AutoFour-Door Car1131
2AI49188NevadaFBachelor1288743487671080Personal AutoTwo-Door Car566
3WW63253CaliforniaMBachelor76458601060Corporate AutoSUV529
4GA49547WashingtonMHigh School or Below53630736357680Personal AutoFour-Door Car17
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington F Master 779946 \n", + "1 QZ44356 Arizona F Bachelor 697953 \n", + "2 AI49188 Nevada F Bachelor 1288743 \n", + "3 WW63253 California M Bachelor 764586 \n", + "4 GA49547 Washington M High School or Below 536307 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0 1000 0 Personal Auto \n", + "1 0 94 0 Personal Auto \n", + "2 48767 108 0 Personal Auto \n", + "3 0 106 0 Corporate Auto \n", + "4 36357 68 0 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2 \n", + "1 Four-Door Car 1131 \n", + "2 Two-Door Car 566 \n", + "3 SUV 529 \n", + "4 Four-Door Car 17 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "df1 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1_clean.csv\")\n", + "df1.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "484c7c3e-f186-4968-8cb4-08d94813646c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file2_clean.csv\")\n", + "df2.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3570b057-a6f8-4f98-b654-7d06231a276a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "0 RB50392 Washington NaN Master NaN \n", + "1 QZ44356 Arizona F Bachelor 697953.59% \n", + "2 AI49188 Nevada F Bachelor 1288743.17% \n", + "3 WW63253 California M Bachelor 764586.18% \n", + "4 GA49547 Washington M High School or Below 536307.65% \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "0 0.0 1000.0 1/0/00 Personal Auto \n", + "1 0.0 94.0 1/0/00 Personal Auto \n", + "2 48767.0 108.0 1/0/00 Personal Auto \n", + "3 0.0 106.0 1/0/00 Corporate Auto \n", + "4 36357.0 68.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "0 Four-Door Car 2.704934 \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "3 SUV 529.881344 \n", + "4 Four-Door Car 17.269323 " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3 = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file3_clean.csv\")\n", + "df3.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "id": "492d06e3-92c7-4105-ac72-536db98d3244", "metadata": { "id": "492d06e3-92c7-4105-ac72-536db98d3244" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customerstategendereducationcustomer_lifetime_valueincomemonthly_premium_autonumber_of_open_complaintspolicy_typevehicle_classtotal_claim_amount
0RB50392WashingtonFMaster7799460.01000.00Personal AutoFour-Door Car2.0
1QZ44356ArizonaFBachelor6979530.094.00Personal AutoFour-Door Car1131.0
2AI49188NevadaFBachelor128874348767.0108.00Personal AutoTwo-Door Car566.0
3WW63253CaliforniaMBachelor7645860.0106.00Corporate AutoSUV529.0
4GA49547WashingtonMHigh School or Below53630736357.068.00Personal AutoFour-Door Car17.0
\n", + "
" + ], + "text/plain": [ + " customer state gender education customer_lifetime_value \\\n", + "0 RB50392 Washington F Master 779946 \n", + "1 QZ44356 Arizona F Bachelor 697953 \n", + "2 AI49188 Nevada F Bachelor 1288743 \n", + "3 WW63253 California M Bachelor 764586 \n", + "4 GA49547 Washington M High School or Below 536307 \n", + "\n", + " income monthly_premium_auto number_of_open_complaints policy_type \\\n", + "0 0.0 1000.0 0 Personal Auto \n", + "1 0.0 94.0 0 Personal Auto \n", + "2 48767.0 108.0 0 Personal Auto \n", + "3 0.0 106.0 0 Corporate Auto \n", + "4 36357.0 68.0 0 Personal Auto \n", + "\n", + " vehicle_class total_claim_amount \n", + "0 Four-Door Car 2.0 \n", + "1 Four-Door Car 1131.0 \n", + "2 Two-Door Car 566.0 \n", + "3 SUV 529.0 \n", + "4 Four-Door Car 17.0 " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code goes here" + "# First option (clean 3 files together)\n", + "\n", + "df2 = df2.rename(columns={\"ST\": \"state\", \"GENDER\": \"gender\"})\n", + "df3 = df3.rename(columns={\"ST\": \"state\", \"GENDER\": \"gender\"})\n", + "df2.columns = [col.lower().replace(\" \", \"_\") for col in df2.columns] \n", + "df3.columns = [col.lower().replace(\" \", \"_\") for col in df3.columns]\n", + "\n", + "df = pd.concat([df1, df2, df3], ignore_index=True)\n", + "df = df.round(2)\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "591d2c33-05a0-4844-bb25-e7391cec5065", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "gender\n", + "F 13845\n", + "M 13143\n", + "Male 80\n", + "female 60\n", + "Femal 34\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.gender.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "5e2047db-c55a-4f4a-a55b-3108bc36b27c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "state\n", + "California 9215\n", + "Oregon 7803\n", + "Arizona 4963\n", + "Nevada 2646\n", + "Washington 2331\n", + "Cali 240\n", + "AZ 148\n", + "WA 60\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.state.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "81bdbed7-a4e7-4c06-a070-79628b5fcb99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gender: ['F' 'M' nan]\n", + "education: ['Master' 'Bachelor' 'High School or Below' 'College' 'Doctor' nan]\n", + "state: ['Washington' 'Arizona' 'Nevada' 'California' 'Oregon' nan]\n" + ] + } + ], + "source": [ + "# Define a dictionary for each column to map inconsistent values to consistent ones\n", + "gender_dict ={\"Male\": \"M\", \"female\": \"F\", \"Female\": \"F\", \"Femal\": \"F\"}\n", + "education_dict = {\"Bachelors\": \"Bachelor\"}\n", + "\n", + "# Define a dictionary mapping state abbreviations to state names\n", + "state_dict = {\"AZ\": \"Arizona\", \"Cali\": \"California\", \"WA\": \"Washington\"}\n", + "\n", + "# Create a dictionary to store the column-specific dictionaries\n", + "column_dicts = {\"gender\": gender_dict, \"education\": education_dict, \"state\": state_dict}\n", + "\n", + "# Loop through each column in the dataset and clean the inconsistent values using the corresponding dictionary mapping\n", + "for col in column_dicts:\n", + " df[col] = df[col].replace(column_dicts[col])\n", + "\n", + "# Verify that each column contains only consistent values\n", + "for col in column_dicts:\n", + " print(col + \":\", df[col].unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "204b6349-d9c3-4c47-a378-8f0e450149f5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "vehicle_class\n", + "Four-Door Car 13920\n", + "Two-Door Car 5687\n", + "SUV 5321\n", + "Luxury 2478\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "luxury = [\"Sports Car\", \"Luxury SUV\", \"Luxury Car\"]\n", + "df[\"vehicle_class\"] = df[\"vehicle_class\"].apply(lambda x: \"Luxury\" if x in luxury else x)\n", + "df.vehicle_class.value_counts()" ] }, { @@ -70,18 +783,6 @@ "This dataset contains information such as customer demographics, policy details, vehicle information, and the customer's response to the last marketing campaign. Our goal is to explore and analyze this data by performing data cleaning, formatting, and structuring." ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", - "metadata": { - "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" - }, - "outputs": [], - "source": [ - "# Your code goes here" - ] - }, { "cell_type": "markdown", "id": "df35fd0d-513e-4e77-867e-429da10a9cc7", @@ -103,6 +804,511 @@ "2. Create a pivot table that shows the average customer lifetime value per gender and education level. Analyze the resulting table to draw insights." ] }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b07f3ebb-eb5a-484f-be59-486ad88e0180", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
unnamed:_0customerstatecustomer_lifetime_valueresponsecoverageeducationeffective_to_dateemploymentstatusgender...number_of_policiespolicy_typepolicyrenew_offer_typesales_channeltotal_claim_amountvehicle_classvehicle_sizevehicle_typemonth
00DK49336Arizona4809.216960NoBasicCollege2011-02-18EmployedM...9Corporate AutoCorporate L3Offer3Agent292.800000Four-Door CarMedsizeA2
11KX64629California2228.525238NoBasicCollege2011-01-18UnemployedF...1Personal AutoPersonal L3Offer4Call Center744.924331Four-Door CarMedsizeA1
22LZ68649Washington14947.917300NoBasicBachelor2011-02-10EmployedM...2Personal AutoPersonal L3Offer3Call Center480.000000SUVMedsizeA2
33XL78013Oregon22332.439460YesExtendedCollege2011-01-11EmployedM...2Corporate AutoCorporate L3Offer2Branch484.013411Four-Door CarMedsizeA1
44QA50777Oregon9025.067525NoPremiumBachelor2011-01-17Medical LeaveF...7Personal AutoPersonal L2Offer1Branch707.925645Four-Door CarMedsizeA1
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " unnamed:_0 customer state customer_lifetime_value response \\\n", + "0 0 DK49336 Arizona 4809.216960 No \n", + "1 1 KX64629 California 2228.525238 No \n", + "2 2 LZ68649 Washington 14947.917300 No \n", + "3 3 XL78013 Oregon 22332.439460 Yes \n", + "4 4 QA50777 Oregon 9025.067525 No \n", + "\n", + " coverage education effective_to_date employmentstatus gender ... \\\n", + "0 Basic College 2011-02-18 Employed M ... \n", + "1 Basic College 2011-01-18 Unemployed F ... \n", + "2 Basic Bachelor 2011-02-10 Employed M ... \n", + "3 Extended College 2011-01-11 Employed M ... \n", + "4 Premium Bachelor 2011-01-17 Medical Leave F ... \n", + "\n", + " number_of_policies policy_type policy renew_offer_type \\\n", + "0 9 Corporate Auto Corporate L3 Offer3 \n", + "1 1 Personal Auto Personal L3 Offer4 \n", + "2 2 Personal Auto Personal L3 Offer3 \n", + "3 2 Corporate Auto Corporate L3 Offer2 \n", + "4 7 Personal Auto Personal L2 Offer1 \n", + "\n", + " sales_channel total_claim_amount vehicle_class vehicle_size \\\n", + "0 Agent 292.800000 Four-Door Car Medsize \n", + "1 Call Center 744.924331 Four-Door Car Medsize \n", + "2 Call Center 480.000000 SUV Medsize \n", + "3 Branch 484.013411 Four-Door Car Medsize \n", + "4 Branch 707.925645 Four-Door Car Medsize \n", + "\n", + " vehicle_type month \n", + "0 A 2 \n", + "1 A 1 \n", + "2 A 2 \n", + "3 A 1 \n", + "4 A 1 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/marketing_customer_analysis_clean.csv\")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26", + "metadata": { + "id": "aa10d9b0-1c27-4d3f-a8e4-db6ab73bfd26" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
total_claim_amount
sales_channel
Agent1810226.82
Branch1301204.00
Call Center926600.82
Web706600.04
\n", + "
" + ], + "text/plain": [ + " total_claim_amount\n", + "sales_channel \n", + "Agent 1810226.82\n", + "Branch 1301204.00\n", + "Call Center 926600.82\n", + "Web 706600.04" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.pivot_table(values='total_claim_amount', index='sales_channel', aggfunc='sum').round(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "697997b9-86ea-416f-9982-d49f54321a1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_lifetime_value
gendereducation
FBachelor7874.27
College7748.82
Doctor7328.51
High School or Below8675.22
Master8157.05
MBachelor7703.60
College8052.46
Doctor7415.33
High School or Below8149.69
Master8168.83
\n", + "
" + ], + "text/plain": [ + " customer_lifetime_value\n", + "gender education \n", + "F Bachelor 7874.27\n", + " College 7748.82\n", + " Doctor 7328.51\n", + " High School or Below 8675.22\n", + " Master 8157.05\n", + "M Bachelor 7703.60\n", + " College 8052.46\n", + " Doctor 7415.33\n", + " High School or Below 8149.69\n", + " Master 8168.83" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pivot_table = pd.pivot_table(df, values='customer_lifetime_value', index=['gender', 'education'], aggfunc='mean').round(2)\n", + "\n", + "pivot_table" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "946c0cf9-a602-499c-9a6f-0c08deec85c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_lifetime_value
genderFM
education
Bachelor7874.277703.60
College7748.828052.46
Doctor7328.517415.33
High School or Below8675.228149.69
Master8157.058168.83
\n", + "
" + ], + "text/plain": [ + " customer_lifetime_value \n", + "gender F M\n", + "education \n", + "Bachelor 7874.27 7703.60\n", + "College 7748.82 8052.46\n", + "Doctor 7328.51 7415.33\n", + "High School or Below 8675.22 8149.69\n", + "Master 8157.05 8168.83" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#another way\n", + "pivot_table_2 = round(df.pivot_table(index=\"education\", columns=\"gender\", values=[\"customer_lifetime_value\"]), 2)\n", + "pivot_table_2.head(10)" + ] + }, { "cell_type": "markdown", "id": "32c7f2e5-3d90-43e5-be33-9781b6069198", @@ -130,14 +1336,100 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "3a069e0b-b400-470e-904d-d17582191be4", "metadata": { "id": "3a069e0b-b400-470e-904d-d17582191be4" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
monthpolicy_type0
01Corporate Auto1252
11Personal Auto4329
21Special Auto237
32Corporate Auto1089
42Personal Auto3799
52Special Auto204
\n", + "
" + ], + "text/plain": [ + " month policy_type 0\n", + "0 1 Corporate Auto 1252\n", + "1 1 Personal Auto 4329\n", + "2 1 Special Auto 237\n", + "3 2 Corporate Auto 1089\n", + "4 2 Personal Auto 3799\n", + "5 2 Special Auto 204" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code goes here" + "# Create a pivot table with the number of open complaints by month and policy type\n", + "pivot_table = pd.pivot_table(df, values='number_of_open_complaints', index=['month'], columns=['policy_type'], aggfunc='count', fill_value=0)\n", + "\n", + "# Stack the pivot table to create a long format table\n", + "pivot_table.stack().reset_index()" ] } ], @@ -160,7 +1452,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.12.4" } }, "nbformat": 4,