# cars_price.py

# -*- coding: utf-8 -*-
"""Session Project.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1fmxLuEihlmoeIoJdw0iP2a9cR3R0swP3

# Introduction

## The data belongs to a car insurance company. <br>`symboling` refers to the risk associated with this car relative to its set.<br> `normalized-losses` is the average loss per car per year.

## Questions:


---


What questions do we wish to ask?

> What are the features affecting the price of the car?
"""

# Commented out IPython magic to ensure Python compatibility.
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# %matplotlib inline

# Load the raw automobile records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Automobile_data.csv')

# Notebook-style inspection of the freshly loaded data: the first rows, the
# (rows, columns) shape, and the per-column dtype / non-null summary that
# `info()` prints.
df.head(n=5)
df.shape
df.info()

"""Adding a correlation heat map to check if there are relations between features"""

# Pairwise correlation heat map of the numeric features.
# Only numeric columns are handed to `corr()`: at this point the frame still
# holds text columns (e.g. `make`, and columns using '?' as a placeholder),
# and on pandas >= 2.0 `DataFrame.corr()` no longer drops such columns
# silently but raises instead.
corr = df.select_dtypes(include='number').corr()  # correlation map
fig, ax = plt.subplots(figsize=(20, 20))
# Draw on the axes created above; diverging palette centred on zero so that
# positive and negative correlations get opposite colours.
ax = sns.heatmap(corr, annot=True, cmap='vlag_r', center=0, ax=ax)

"""**From the previous heat map it is shown that there are correlation between some features like :**
>1. wheel-base &
>> a. length\
>> b. width\
>> c. curb-weight
> 2. length &
>> a. width\
>> b. curb-weight
> 3. width & 
>> a. curb-weight  
>> b. engine-size
> 4. engine-size &
>> a. curb-weight  
> 5. highway-mpg &
>> a. city-mpg\
>> b. curb-weight  
>> c. length
> 6. Price &
>> Width\
>> Curb-weight\
>> Engine-size\
>> Horsepower

#Data Cleaning 

---

## Looking for missing data in normalized-losses column
"""

# Count the entries that cannot be parsed as numbers (they coerce to NaN).
losses_raw = df['normalized-losses']
pd.to_numeric(losses_raw, errors='coerce').isna().sum()

# Rows where the placeholder '?' stands in for a value.
df.loc[losses_raw == '?']

# Replace the column with its numeric version; unparsable entries become NaN.
df['normalized-losses'] = pd.to_numeric(losses_raw, errors='coerce')

df['normalized-losses'].value_counts()

"""We Could fill the null values with the column mean"""

# Mean of the observed losses (`mean()` skips NaN entries).
normalized_losses_mean = df['normalized-losses'].mean()
normalized_losses_mean

# Assign the filled column back instead of calling `fillna(..., inplace=True)`
# on `df[...]`: that form is chained assignment, which emits a warning on
# recent pandas and leaves `df` unchanged once Copy-on-Write is enabled.
df['normalized-losses'] = df['normalized-losses'].fillna(normalized_losses_mean)

df['normalized-losses'].isnull().sum()  # Check: no NaN should remain

"""## Cleaning num-of-doors"""

df['num-of-doors'].value_counts()

# Map the spelled-out door counts to numbers. The result is assigned back
# rather than using `replace(..., inplace=True)` on `df[...]`, because that
# chained form does not update `df` once Copy-on-Write is enabled.
df['num-of-doors'] = df['num-of-doors'].replace({'four': 4, 'two': 2})
# Anything that is still not a number is turned into NaN.
df['num-of-doors'] = pd.to_numeric(df['num-of-doors'], errors='coerce')

df['num-of-doors'].value_counts()

df['num-of-doors'].isnull().sum()

# Drop the rows with a missing door count.
# NOTE: `dropna()` without `subset` removes every row that has a NaN in ANY
# column, not only in `num-of-doors`.
df.dropna(inplace=True)

df['num-of-doors'].isnull().sum()

"""##Other numerical columns that should be treated `peak-rpm`, `horsepower`, `bore`, `stroke` and `price`"""

df['peak-rpm'].unique()

# Boolean mask of the rows whose `peak-rpm` cannot be parsed as a number.
rpm_unparsable = pd.to_numeric(df['peak-rpm'], errors='coerce').isna()
rpm_unparsable.sum()

"""Dropping the non numerical"""

# Remove the flagged rows from the frame in place.
df.drop(index=df.index[rpm_unparsable.to_numpy()], inplace=True)

# Boolean mask of the rows whose `price` cannot be parsed as a number.
price_unparsable = pd.to_numeric(df['price'], errors='coerce').isna()
price_unparsable.sum()

"""Dropping the records with missing prices"""

# Remove the flagged rows from the frame in place.
df.drop(index=df.index[price_unparsable.to_numpy()], inplace=True)

# Number of `bore` entries that are not numeric.
pd.to_numeric(df['bore'], errors='coerce').isnull().sum()

# Drop the rows whose `bore` is not numeric.
df.drop(df[pd.to_numeric(df['bore'], errors='coerce').isnull()].index, inplace=True)

# Number of `stroke` entries that are not numeric.
pd.to_numeric(df['stroke'], errors='coerce').isnull().sum()

# `stroke` and `horsepower` are converted below too, so rows where either is
# non-numeric have to go as well; otherwise `astype` raises ValueError on
# placeholder text such as '?'. When no such rows remain this is a no-op.
for column in ('stroke', 'horsepower'):
    non_numeric = pd.to_numeric(df[column], errors='coerce').isnull()
    df.drop(df.index[non_numeric.to_numpy()], inplace=True)

"""Changing data type for quantitative columns"""

# Whole-number quantities become int; measurements and averages become float.
df[['price', 'horsepower', 'peak-rpm', 'num-of-doors']] = df[['price', 'horsepower', 'peak-rpm', 'num-of-doors']].astype(int)
df[['bore', 'stroke', 'normalized-losses']] = df[['bore', 'stroke', 'normalized-losses']].astype(float)

df.info()

"""##Checking outliers for numerical variables"""

# One box plot per numeric column on a 3x2 grid, to eyeball outliers.
# (The original ended with a `;` on a line of its own, which is a SyntaxError
# in a plain Python file; it has been removed.)
plt.figure(figsize=[10, 10])
boxplot_columns = ['city-mpg', 'curb-weight', 'symboling', 'price', 'bore', 'wheel-base']
for position, column in enumerate(boxplot_columns, start=1):
    plt.subplot(3, 2, position)
    df.boxplot(column=[column])

# Summary statistics of the numeric columns.
df.describe()

"""# Exploring Data"""

# Histograms of four numeric features on a 2x2 grid.
plt.figure(figsize=[12, 8])

# (column, first bin edge, last bin edge, bin width) for each panel
histogram_specs = [
    ('price', 5000, 50000, 1000),
    ('city-mpg', 5, 60, 3),
    ('highway-mpg', 5, 60, 3),
    ('horsepower', 30, 300, 10),
]
for panel, (column, low, high, step) in enumerate(histogram_specs, start=1):
    plt.subplot(2, 2, panel)
    # `high + step` so that the last edge itself is included by `arange`.
    bin_edges = np.arange(low, high + step, step)
    plt.hist(data=df, x=column, bins=bin_edges)
    plt.xlabel(column)

"""Price seems to be right skewed with too many outliers."""

# Mean price per make, drawn as a single-colour bar chart.
# (The original ended with a `;` on a line of its own, which is a SyntaxError
# in a plain Python file; it has been removed.)
base_color = sns.color_palette()[0]
plt.figure(figsize=[20, 5])
# Order the makes from the highest to the lowest mean price.
order = df.groupby('make')['price'].mean().sort_values(ascending=False).index
sns.barplot(data=df, x='make', y='price', color=base_color, order=order)
# Tilt the make names so that they do not overlap.
plt.xticks(rotation=30)

df.head()

# Scatter plots with a fitted regression line of price against four numeric
# predictors, laid out on a 2x2 grid.
plt.figure(figsize=[20, 15])
price_predictors = ('horsepower', 'engine-size', 'curb-weight', 'width')
for panel, predictor in enumerate(price_predictors, start=1):
    plt.subplot(2, 2, panel)
    sns.regplot(data=df, x=predictor, y='price')

"""# Encoding some categorical Features

--------

## Encoding the body-style feature
"""

# Work on `df_new` from here on for the encoded feature set.
df_new = df

df_new['body-style'].value_counts()

# One-hot encode three categorical columns. `drop_first=True` leaves out the
# first level of each column so it acts as the baseline category.
# We can select more or less
dummies_body = pd.get_dummies(df_new['body-style'], drop_first=True)
dummies_make = pd.get_dummies(df_new['make'], drop_first=True)
dummies_wheels = pd.get_dummies(df_new['drive-wheels'], drop_first=True)

# Append the indicator columns side by side; all pieces share the same index.
df_new = pd.concat([df_new, dummies_body, dummies_make, dummies_wheels], axis=1)
df_new.head()

"""## Question#1 What are the features affecting the price of the car?

From the exploratory part of the analysis we found that the price is most affected by the following variables:
> Width\
> Curb-weight\
> Engine-size\
> Horsepower  
> city-mpg  
> highway-mpg

### Building a Linear Regression Model using the above variables
"""

import statsmodels.api as sm
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Constant column of ones, used as the intercept term in the OLS fits.
df['intercept'] = 1

# Swap hyphens for underscores in the column names used in the formulas; the
# VIF check did not work with hyphenated names (presumably because the
# formula parser reads `-` as an operator).
formula_safe_names = {
    'curb-weight': 'curb_weight',
    'engine-size': 'engine_size',
    'city-mpg': 'city_mpg',
    'highway-mpg': 'highway_mpg',
}
df.rename(columns=formula_safe_names, inplace=True)
df.head()

"""#### Multicolinearity Check"""

# Build the design matrix for the six candidate predictors and compute one
# variance inflation factor per design-matrix column.
y, X = dmatrices(
    'price~width+curb_weight+engine_size+horsepower+city_mpg+highway_mpg',
    df,
    return_type='dataframe',
)
design_values = X.values
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(design_values, column_pos)
        for column_pos in range(design_values.shape[1])
    ],
    "features": X.columns,
})
vif.round(1)

"""As excepted city_mpg and highway_mpg are dependant varaibles. Deleting `highway_mpg` and try the check again"""

# Repeat the variance inflation factor check with `highway_mpg` left out of
# the formula.
y, X = dmatrices(
    'price~width+curb_weight+engine_size+horsepower+city_mpg',
    df,
    return_type='dataframe',
)
design_values = X.values
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(design_values, column_pos)
        for column_pos in range(design_values.shape[1])
    ],
    "features": X.columns,
})
vif.round(1)

"""All VIFs are below 10, which means that those variables are not correlated to each other and we can use them all."""

# Ordinary least squares of price on the five predictors plus the constant
# `intercept` column.
ols_columns = ['intercept', 'width', 'curb_weight', 'engine_size', 'horsepower', 'city_mpg']
reg_m = sm.OLS(endog=df['price'], exog=df[ols_columns])
res = reg_m.fit()
res.summary()

"""We got a good model with all variables statistically significant except the `curb_weight` and `city_mpg`.  
Trying again without the `curb_weight` and `city_mpg` variables:  

"""

# Refit the OLS model with only the constant `intercept` column and the three
# remaining predictors.
reduced_columns = ['intercept', 'width', 'engine_size', 'horsepower']
reg_m = sm.OLS(df['price'], df[reduced_columns])
res = reg_m.fit()
res.summary()

# Predict from the fitted parameters rather than from hand-copied, rounded
# coefficients, so the predictions always match the model that was just fit.
pred_price = res.predict(df[reduced_columns])

"""### Trying out with modeling with Machine Learning"""

df_new.head()

"""# Sci-Kit Learn Linear Regression using the previous numerical features + categorical features"""

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Feature matrix: numeric measurements plus the one-hot indicator columns for
# body style, make and drive wheels.
numeric_features = ['width', 'engine-size', 'horsepower', 'wheel-base', 'curb-weight', 'bore']
body_style_dummies = ['hardtop', 'hatchback', 'sedan', 'wagon']
make_dummies = [
    'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
    'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury', 'mitsubishi', 'nissan',
    'peugot', 'plymouth', 'porsche', 'saab', 'subaru', 'toyota', 'volkswagen', 'volvo',
]
drive_wheel_dummies = ['fwd', 'rwd']
X = df_new[numeric_features + body_style_dummies + make_dummies + drive_wheel_dummies]
y = df_new['price']

# Hold out 20% of the rows for evaluation. The seed is fixed so that the split
# (and therefore the reported metrics) is reproducible between runs, in line
# with the `random_state=0` used for the random forest.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regr = LinearRegression()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Evaluate on the held-out rows.
print('Root mean squared error: %.2f' % np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2: %.2f' % r2_score(y_test, y_pred))
print('Mean Absolute Error: %.2f' % mean_absolute_error(y_test, y_pred))

"""Using RandomForestRessor with same features"""

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
# Fit a 300-tree random forest on the existing train/test split; the seed is
# fixed so that the forest is the same on every run.
regressor = RandomForestRegressor(n_estimators=300, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

# Evaluate on the held-out rows. The first metric is the square root of the
# mean squared error, so it is labelled as root mean squared error.
print('Root mean squared error: %.2f' % np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2: %.2f' % r2_score(y_test, y_pred))
print('Mean Absolute Error: %.2f' % mean_absolute_error(y_test, y_pred))

"""# Limitation

> Dataset was limited in number of rows, thus the outliers were kept.  
> We dropped 12 records total.  
> We wished to estimate the normalized losses but it had many NAN values
"""