#Import Packages
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import pylab
import scipy.stats as stats
import seaborn as sns
import sklearn as sk
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import *
from sklearn import metrics
import statsmodels.api as sm
import statsmodels.formula.api as smf
#Original Dataset
df = pd.read_csv('kc_house_data.csv')
On inspection of raw data, the following was noted:
- Out of 23 data columns, 3 contained null values
- waterfront, view, and yr.renovated
- the id column contained numerous duplications
- the lat column consisted of inconsistent decimal significant figures
- the sqft_basement column contained a significant numner of "?" in place of numerical data
- Several columns consisted of categorical data
- waterfront, view, condition, and grade
- A few significant outliyers
- One home in the bedroom column had 33 bedrooms
- One home had 7.5 bathroom
df.head() #view of first 5 rows of data
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | date | price | bedrooms | bathrooms | sqft_living | sqft_lot | floors | waterfront | view | ... | grade | sqft_above | sqft_basement | yr_built | yr_renovated | zipcode | lat | long | sqft_living15 | sqft_lot15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7129300520 | 10/13/2014 | 221900.0 | 3 | 1.00 | 1180 | 5650 | 1.0 | NaN | 0.0 | ... | 7 | 1180 | 0.0 | 1955 | 0.0 | 98178 | 47.5112 | -122.257 | 1340 | 5650 |
1 | 6414100192 | 12/9/2014 | 538000.0 | 3 | 2.25 | 2570 | 7242 | 2.0 | 0.0 | 0.0 | ... | 7 | 2170 | 400.0 | 1951 | 1991.0 | 98125 | 47.7210 | -122.319 | 1690 | 7639 |
2 | 5631500400 | 2/25/2015 | 180000.0 | 2 | 1.00 | 770 | 10000 | 1.0 | 0.0 | 0.0 | ... | 6 | 770 | 0.0 | 1933 | NaN | 98028 | 47.7379 | -122.233 | 2720 | 8062 |
3 | 2487200875 | 12/9/2014 | 604000.0 | 4 | 3.00 | 1960 | 5000 | 1.0 | 0.0 | 0.0 | ... | 7 | 1050 | 910.0 | 1965 | 0.0 | 98136 | 47.5208 | -122.393 | 1360 | 5000 |
4 | 1954400510 | 2/18/2015 | 510000.0 | 3 | 2.00 | 1680 | 8080 | 1.0 | 0.0 | 0.0 | ... | 8 | 1680 | 0.0 | 1987 | 0.0 | 98074 | 47.6168 | -122.045 | 1800 | 7503 |
5 rows Ă— 21 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
id 21597 non-null int64
date 21597 non-null object
price 21597 non-null float64
bedrooms 21597 non-null int64
bathrooms 21597 non-null float64
sqft_living 21597 non-null int64
sqft_lot 21597 non-null int64
floors 21597 non-null float64
waterfront 19221 non-null float64
view 21534 non-null float64
condition 21597 non-null int64
grade 21597 non-null int64
sqft_above 21597 non-null int64
sqft_basement 21597 non-null object
yr_built 21597 non-null int64
yr_renovated 17755 non-null float64
zipcode 21597 non-null int64
lat 21597 non-null float64
long 21597 non-null float64
sqft_living15 21597 non-null int64
sqft_lot15 21597 non-null int64
dtypes: float64(8), int64(11), object(2)
memory usage: 3.5+ MB
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
id 21597 non-null int64
date 21597 non-null object
price 21597 non-null float64
bedrooms 21597 non-null int64
bathrooms 21597 non-null float64
sqft_living 21597 non-null int64
sqft_lot 21597 non-null int64
floors 21597 non-null float64
waterfront 19221 non-null float64
view 21534 non-null float64
condition 21597 non-null int64
grade 21597 non-null int64
sqft_above 21597 non-null int64
sqft_basement 21597 non-null object
yr_built 21597 non-null int64
yr_renovated 17755 non-null float64
zipcode 21597 non-null int64
lat 21597 non-null float64
long 21597 non-null float64
sqft_living15 21597 non-null int64
sqft_lot15 21597 non-null int64
dtypes: float64(8), int64(11), object(2)
memory usage: 3.5+ MB
df.columns
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
'lat', 'long', 'sqft_living15', 'sqft_lot15'],
dtype='object')
# All "?" in sqft_basement column replaced with null values
df = df.replace('?' , np.nan)
# A new data column was created by taking each value in
#'yr_built' and subtracting from the current year 2019
df['age'] = 2019-df['yr_built']
#In order to provide more manageable values, 'price was divided by 1000,
# converting 'price' to thousands ($)
df['price1000'] = df['price']/1000
#Due to the large spread of data, and the substantially increased variance
# as 'price' approached millions, the dataset 'price' was capped at
# 1,000,000 ie 'price' would include only homes valued at under 1 million dollars
df_Mill = df.loc[df['price'] < 1000000]
df_Mill.describe()
#Columns of interest for <$1,000,000 Homes
dfX= df_Mill[['sqft_living', 'bathrooms', 'floors', 'grade'
, 'sqft_living15', 'age', 'zipcode', 'view']]
dfY= df_Mill['price1000']
#Dataframe with Dummy Variables
dfXDummy1 = pd.get_dummies(dfX, columns=['view','zipcode'])
dfXDummy1 = dfXDummy1.drop(['view_0.0','zipcode_98001'], axis=1)
dfXDummy1.head()
#####Most relevant zipcodes
relevant_zip = dfXDummy1[['zipcode_98040', 'zipcode_98004', 'zipcode_98039',
'zipcode_98005', 'zipcode_98075', 'zipcode_98112',
'zipcode_98119', 'zipcode_98006', 'zipcode_98109',
'zipcode_98102', 'zipcode_98074', 'zipcode_98105',
'zipcode_98199', 'zipcode_98033', 'zipcode_98077', 'zipcode_98052', 'zipcode_98053']]
relevant_zip.head()
#######Final Data
X= dfXDummy1[['sqft_living', 'bathrooms', 'floors', 'grade',
'sqft_living15', 'age']]
Y= dfY
df.hist(figsize=(14,18))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1a1676a5c0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1a1765d198>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1a17684400>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1a17717668>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1a175f6898>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x1a1761eb00>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1a18161d68>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c1879ffd0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c187a6080>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c187f84e0>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x1c1881f748>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c188489b0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c18870c18>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c1889ae80>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c188cd128>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x1c188f4390>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c1891f5f8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c18947860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c18971ac8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c18999d30>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x1c189c5f98>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c189f4240>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c18a1d4a8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c18a47710>,
<matplotlib.axes._subplots.AxesSubplot object at 0x1c18a6f978>]],
dtype=object)
sns.set(style="whitegrid")
sns.boxplot(x=df["bedrooms"],y=df["price1000"])
plt.ylim(0, 5000)
plt.title("Number of bedrooms in homes sold")
Text(0.5, 1.0, 'Number of bedrooms in homes sold')
corr = new_data.corr()
plt.figure(figsize=(20,10))
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.set_context("paper", font_scale=2)
plt.title('Variables of Interest in our Analysis', fontsize=40)
sns.heatmap(new_data.corr(),annot=True, linewidth=.5, cmap="coolwarm", mask=mask)
<matplotlib.axes._subplots.AxesSubplot at 0x1c1c20fe48>
new_data = df.rename(columns = {'price': "Home Price", 'bedrooms': "Number of Bedrooms", 'bathrooms': "Number of Bathrooms",
'sqft_living': "House Square Footage", 'floors': "Number of Floors", 'view': "View Grade", 'grade' : "King's County Grade", 'sqft_living15' : "Neighbors Home Square Footage", 'age' : "Home Age"})
new_data = new_data.drop(['sqft_lot', 'id', 'waterfront', 'condition', 'sqft_above', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_lot15', 'price1000'], axis=1)
From our regression model, with 'price1000' as the dependent/target variable and explanatory variables of 'sqft_living', 'bedrooms', 'bathrooms', 'floors', 'sqft_lot', 'grade', 'sqft_living15', 'sqft_lot15', 'age', we obtained an R squared value was 0.551.
#Regression Model
X = sm.add_constant(X)# adding a constant
model=sm.OLS(Y, X).fit()
model.summary()
Dep. Variable: | price1000 | R-squared: | 0.548 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.548 |
Method: | Least Squares | F-statistic: | 4058. |
Date: | Fri, 10 May 2019 | Prob (F-statistic): | 0.00 |
Time: | 14:07:09 | Log-Likelihood: | -1.2663e+05 |
No. Observations: | 20107 | AIC: | 2.533e+05 |
Df Residuals: | 20100 | BIC: | 2.533e+05 |
Df Model: | 6 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -663.4939 | 9.899 | -67.026 | 0.000 | -682.897 | -644.091 |
sqft_living | 0.0511 | 0.002 | 22.061 | 0.000 | 0.047 | 0.056 |
bathrooms | 29.7995 | 2.182 | 13.655 | 0.000 | 25.522 | 34.077 |
floors | 39.1622 | 2.166 | 18.081 | 0.000 | 34.917 | 43.408 |
grade | 89.8342 | 1.465 | 61.333 | 0.000 | 86.963 | 92.705 |
sqft_living15 | 0.0541 | 0.002 | 22.374 | 0.000 | 0.049 | 0.059 |
age | 2.8027 | 0.042 | 66.566 | 0.000 | 2.720 | 2.885 |
Omnibus: | 431.653 | Durbin-Watson: | 1.959 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 490.746 |
Skew: | 0.323 | Prob(JB): | 2.73e-107 |
Kurtosis: | 3.411 | Cond. No. | 3.09e+04 |
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.09e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
Note: Our Model ran with all explanatory variables in the X dataframe, and the target variable (House price in thousands of dollars) in the Y dataframe. All explanatory variables are significant.
A Q-Q plot was done, illustrating that our model residuals were normally distributed and the assumptions required for our model to be reliable were met.
#Residual Plot
residuals = model.resid
fig = sm.qqplot(residuals, stats.t, fit=True, line='45')
plt.show()
We performed a 5-fold split of our dataset with 80% of the data in training and 20% of the data in testing.
#Train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
lm = LinearRegression()
model = lm.fit(X_train, Y_train)
predictions = lm.predict(X_test)
lm.score(X_train, Y_train)
0.5478371888662472
Below is a visualization of our prediction line from the training data compared to the testing data.
sns.set(style='ticks')
sns.regplot(Y_test, y=predictions, scatter_kws={'alpha':0.05});
plt.title('Prediction Performance for House Price, < $1 million, 5-folds')
plt.ylabel('Test Data')
plt.xlabel('Trained Regression')
/anaconda3/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
Text(0.5, 0, 'Trained Regression')
We used Root Mean Squared Error to evaluate model performance. On average, our trained regression model mis-estimated the true values of homes in the test data by $131,000.
#On Average our model missed each data point by $131,000
metrics.mean_squared_error(Y_test, predictions)**0.5
130.42073232092025
MSEs = cross_val_score(lm, X, y=Y, scoring='neg_mean_squared_error', cv=10)
RMSE = (MSEs*-1)**0.5
print(RMSE)
[131.86508051 134.14601432 133.83294761 129.96903792 129.35030657
128.62966488 131.03051626 133.22863926 132.86070996 135.89446624]
Our linear regression model was repeated, but this time with the inclusion of categorical variables (view and zipcode) via dummy variables
dfXDummy1.shape
(20107, 79)
X1= dfXDummy1
Y1= dfY
X1 = sm.add_constant(X1)# adding a constant
model1=sm.OLS(Y, X1).fit()
model1.summary()
Dep. Variable: | price1000 | R-squared: | 0.819 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.818 |
Method: | Least Squares | F-statistic: | 1147. |
Date: | Fri, 10 May 2019 | Prob (F-statistic): | 0.00 |
Time: | 14:07:14 | Log-Likelihood: | -1.1742e+05 |
No. Observations: | 20107 | AIC: | 2.350e+05 |
Df Residuals: | 20027 | BIC: | 2.356e+05 |
Df Model: | 79 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | -351.9106 | 8.071 | -43.601 | 0.000 | -367.731 | -336.091 |
sqft_living | 0.0891 | 0.002 | 59.021 | 0.000 | 0.086 | 0.092 |
bathrooms | 10.4835 | 1.406 | 7.456 | 0.000 | 7.727 | 13.240 |
floors | 0.0086 | 1.498 | 0.006 | 0.995 | -2.927 | 2.944 |
grade | 46.4428 | 0.996 | 46.627 | 0.000 | 44.490 | 48.395 |
sqft_living15 | 0.0381 | 0.002 | 22.747 | 0.000 | 0.035 | 0.041 |
age | 0.8152 | 0.033 | 25.070 | 0.000 | 0.751 | 0.879 |
view_1.0 | 47.4121 | 5.334 | 8.888 | 0.000 | 36.956 | 57.868 |
view_2.0 | 49.1039 | 3.200 | 15.345 | 0.000 | 42.832 | 55.376 |
view_3.0 | 71.8240 | 4.838 | 14.845 | 0.000 | 62.341 | 81.307 |
view_4.0 | 173.6434 | 7.622 | 22.782 | 0.000 | 158.704 | 188.583 |
zipcode_98002 | 12.6631 | 7.378 | 1.716 | 0.086 | -1.799 | 27.125 |
zipcode_98003 | -9.1416 | 6.648 | -1.375 | 0.169 | -22.172 | 3.889 |
zipcode_98004 | 476.4774 | 8.501 | 56.049 | 0.000 | 459.815 | 493.140 |
zipcode_98005 | 317.3658 | 8.287 | 38.297 | 0.000 | 301.123 | 333.609 |
zipcode_98006 | 261.1887 | 6.194 | 42.170 | 0.000 | 249.049 | 273.329 |
zipcode_98007 | 247.3902 | 8.458 | 29.248 | 0.000 | 230.811 | 263.970 |
zipcode_98008 | 234.0668 | 6.780 | 34.526 | 0.000 | 220.778 | 247.355 |
zipcode_98010 | 108.7740 | 9.424 | 11.542 | 0.000 | 90.303 | 127.246 |
zipcode_98011 | 137.0527 | 7.440 | 18.422 | 0.000 | 122.471 | 151.635 |
zipcode_98014 | 131.2463 | 8.759 | 14.985 | 0.000 | 114.079 | 148.414 |
zipcode_98019 | 106.2619 | 7.499 | 14.170 | 0.000 | 91.563 | 120.961 |
zipcode_98022 | 28.6355 | 7.037 | 4.069 | 0.000 | 14.842 | 42.429 |
zipcode_98023 | -25.1245 | 5.773 | -4.352 | 0.000 | -36.440 | -13.809 |
zipcode_98024 | 166.8150 | 10.765 | 15.496 | 0.000 | 145.714 | 187.916 |
zipcode_98027 | 191.0343 | 6.121 | 31.210 | 0.000 | 179.037 | 203.032 |
zipcode_98028 | 125.9201 | 6.648 | 18.942 | 0.000 | 112.890 | 138.950 |
zipcode_98029 | 224.3133 | 6.522 | 34.395 | 0.000 | 211.530 | 237.096 |
zipcode_98030 | 4.7181 | 6.813 | 0.692 | 0.489 | -8.636 | 18.073 |
zipcode_98031 | 11.8622 | 6.689 | 1.773 | 0.076 | -1.249 | 24.974 |
zipcode_98032 | -11.1297 | 8.661 | -1.285 | 0.199 | -28.107 | 5.847 |
zipcode_98033 | 305.6161 | 6.307 | 48.458 | 0.000 | 293.254 | 317.978 |
zipcode_98034 | 177.8213 | 5.714 | 31.119 | 0.000 | 166.621 | 189.021 |
zipcode_98038 | 46.6946 | 5.602 | 8.335 | 0.000 | 35.714 | 57.675 |
zipcode_98039 | 630.6376 | 37.552 | 16.794 | 0.000 | 557.032 | 704.243 |
zipcode_98040 | 400.0434 | 8.270 | 48.375 | 0.000 | 383.834 | 416.253 |
zipcode_98042 | 17.9354 | 5.656 | 3.171 | 0.002 | 6.849 | 29.022 |
zipcode_98045 | 116.4758 | 7.185 | 16.211 | 0.000 | 102.393 | 130.559 |
zipcode_98052 | 249.7555 | 5.684 | 43.942 | 0.000 | 238.615 | 260.896 |
zipcode_98053 | 248.7953 | 6.206 | 40.093 | 0.000 | 236.632 | 260.959 |
zipcode_98055 | 34.3812 | 6.729 | 5.109 | 0.000 | 21.191 | 47.571 |
zipcode_98056 | 107.3387 | 6.047 | 17.751 | 0.000 | 95.486 | 119.191 |
zipcode_98058 | 39.4726 | 5.888 | 6.704 | 0.000 | 27.932 | 51.013 |
zipcode_98059 | 105.9001 | 5.939 | 17.832 | 0.000 | 94.260 | 117.541 |
zipcode_98065 | 132.5385 | 6.578 | 20.149 | 0.000 | 119.646 | 145.432 |
zipcode_98070 | 159.7169 | 8.999 | 17.748 | 0.000 | 142.078 | 177.356 |
zipcode_98072 | 174.6230 | 6.819 | 25.607 | 0.000 | 161.256 | 187.990 |
zipcode_98074 | 214.0568 | 6.087 | 35.164 | 0.000 | 202.125 | 225.988 |
zipcode_98075 | 236.3458 | 6.564 | 36.007 | 0.000 | 223.480 | 249.212 |
zipcode_98077 | 188.3671 | 7.669 | 24.562 | 0.000 | 173.335 | 203.399 |
zipcode_98092 | -6.0001 | 6.262 | -0.958 | 0.338 | -18.273 | 6.273 |
zipcode_98102 | 354.6796 | 10.376 | 34.182 | 0.000 | 334.341 | 375.018 |
zipcode_98103 | 290.8573 | 5.780 | 50.323 | 0.000 | 279.528 | 302.186 |
zipcode_98105 | 337.4468 | 7.900 | 42.712 | 0.000 | 321.961 | 352.932 |
zipcode_98106 | 103.7352 | 6.364 | 16.301 | 0.000 | 91.262 | 116.209 |
zipcode_98107 | 295.0762 | 6.908 | 42.718 | 0.000 | 281.537 | 308.616 |
zipcode_98108 | 97.3486 | 7.561 | 12.875 | 0.000 | 82.529 | 112.169 |
zipcode_98109 | 366.5413 | 10.366 | 35.360 | 0.000 | 346.223 | 386.860 |
zipcode_98112 | 369.8744 | 8.256 | 44.803 | 0.000 | 353.693 | 386.056 |
zipcode_98115 | 292.3780 | 5.751 | 50.837 | 0.000 | 281.105 | 303.651 |
zipcode_98116 | 267.8100 | 6.588 | 40.648 | 0.000 | 254.896 | 280.724 |
zipcode_98117 | 288.1988 | 5.798 | 49.707 | 0.000 | 276.834 | 299.563 |
zipcode_98118 | 141.1640 | 5.830 | 24.214 | 0.000 | 129.737 | 152.591 |
zipcode_98119 | 361.5467 | 8.327 | 43.421 | 0.000 | 345.226 | 377.867 |
zipcode_98122 | 276.8193 | 6.883 | 40.221 | 0.000 | 263.329 | 290.310 |
zipcode_98125 | 178.2824 | 6.090 | 29.275 | 0.000 | 166.346 | 190.219 |
zipcode_98126 | 175.7309 | 6.332 | 27.753 | 0.000 | 163.320 | 188.142 |
zipcode_98133 | 140.9221 | 5.808 | 24.263 | 0.000 | 129.538 | 152.306 |
zipcode_98136 | 230.0131 | 6.938 | 33.153 | 0.000 | 216.414 | 243.612 |
zipcode_98144 | 212.5596 | 6.531 | 32.545 | 0.000 | 199.758 | 225.361 |
zipcode_98146 | 97.0609 | 6.673 | 14.546 | 0.000 | 83.981 | 110.140 |
zipcode_98148 | 45.0351 | 11.890 | 3.788 | 0.000 | 21.730 | 68.340 |
zipcode_98155 | 129.8014 | 5.943 | 21.839 | 0.000 | 118.152 | 141.451 |
zipcode_98166 | 97.8556 | 6.953 | 14.074 | 0.000 | 84.227 | 111.484 |
zipcode_98168 | 33.5840 | 6.756 | 4.971 | 0.000 | 20.341 | 46.827 |
zipcode_98177 | 198.6020 | 7.129 | 27.860 | 0.000 | 184.629 | 212.575 |
zipcode_98178 | 37.0373 | 6.824 | 5.428 | 0.000 | 23.662 | 50.413 |
zipcode_98188 | 24.5124 | 8.398 | 2.919 | 0.004 | 8.052 | 40.973 |
zipcode_98198 | 20.0634 | 6.685 | 3.001 | 0.003 | 6.960 | 33.167 |
zipcode_98199 | 322.1598 | 6.916 | 46.582 | 0.000 | 308.604 | 335.716 |
Omnibus: | 1515.035 | Durbin-Watson: | 1.981 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 4926.075 |
Skew: | 0.367 | Prob(JB): | 0.00 |
Kurtosis: | 5.311 | Cond. No. | 1.93e+05 |
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.93e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, test_size=0.2)
lm = LinearRegression()
model = lm.fit(X_train1, Y_train1)
predictions = lm.predict(X_test1)
lm.score(X_train1, Y_train1)
0.8216350456408583
MSEs1 = cross_val_score(lm, X1,Y1, scoring='neg_mean_squared_error', cv=10)
RMSEs1 = (MSEs1*-1)**0.5
print(RMSEs1)
[83.85768923 88.33768146 83.36782907 82.9558431 81.09160222 82.05150065
82.796006 81.50038961 86.81897319 85.10965661]
residuals1 = model1.resid
fig = sm.qqplot(residuals1, stats.t, fit=True, line='45')
plt.title('QQ-plot 2')
plt.show()
We would give the following recommendations to home owners in King's county looking to maximize the final sale value of their home:
-
Inreasing your avialble living space, on average increases home value by $89.10 per sqft
-
Make use of deadspace between walls and under staircases
-
When possible, consider light additions to your home such as sunrooms
-
-
There is a very strong correlation between bathroom number and increased housing sale price
- Observed about $10,483 increase in sale price per bathroom increase
- Increase the number of bathrooms in your home, even if converting 1/4 baths to 1/2 baths and 1/2 baths to full etc.
-
Your neighbors living space size has an effect on your home value
- The increasing of living space by your neighbors in their own homes will increase the return you get for your home
-
View matters!
- Improvements to the view from your home that increase the home's Kings county grade have a significant impact on sale price