# Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.cluster import KMeans
# Use this DataFrame to narrow down users who have written reviews for restaurants
reviews_path = os.path.join('~', 'homework', 'yelp-dataset', 'yelp_review.csv')
reviews_data_df = pd.read_csv(reviews_path)

restaurants_path = os.path.join('restaurants.csv')
restaurant_data_df = pd.read_csv(restaurants_path)
# Rename to avoid a clash with the per-review 'stars' column after the merge
restaurant_data_df = restaurant_data_df.rename(columns={'stars': 'overall_stars'})

# Inner-join restaurants to their reviews on the shared business_id key
business_and_review = restaurant_data_df.merge(reviews_data_df, on='business_id')
business_and_review.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
business_id |
name |
neighborhood |
address |
city |
state |
postal_code |
latitude |
longitude |
overall_stars |
... |
is_open |
categories |
review_id |
user_id |
stars |
date |
text |
useful |
funny |
cool |
0 |
PfOCPjBrlQAnz__NXj9h_w |
"Brick House Tavern + Tap" |
NaN |
"581 Howe Ave" |
Cuyahoga Falls |
OH |
44221 |
41.119535 |
-81.47569 |
3.5 |
... |
1 |
American (New);Nightlife;Bars;Sandwiches;Ameri... |
rnvu74aNIPn93Z9Ue11Epw |
bgtr7jOlALe8ay4g7pnf4A |
4 |
2016-05-05 |
I've always enjoyed my time at brick house foo... |
0 |
0 |
0 |
1 |
PfOCPjBrlQAnz__NXj9h_w |
"Brick House Tavern + Tap" |
NaN |
"581 Howe Ave" |
Cuyahoga Falls |
OH |
44221 |
41.119535 |
-81.47569 |
3.5 |
... |
1 |
American (New);Nightlife;Bars;Sandwiches;Ameri... |
OE5dH664oSSK_-TvIl263w |
ndZOHkG_FWdL4TRd9xRfNQ |
2 |
2017-03-19 |
1st time here. Came w my Unc bc Louies was clo... |
0 |
0 |
0 |
2 |
PfOCPjBrlQAnz__NXj9h_w |
"Brick House Tavern + Tap" |
NaN |
"581 Howe Ave" |
Cuyahoga Falls |
OH |
44221 |
41.119535 |
-81.47569 |
3.5 |
... |
1 |
American (New);Nightlife;Bars;Sandwiches;Ameri... |
SWeHmo8uHHWbbv94VdAMeQ |
UqtE83J-6NMQNNJzpnWwEQ |
1 |
2015-08-31 |
Worse service ever andI use to be a server so ... |
0 |
0 |
0 |
3 |
PfOCPjBrlQAnz__NXj9h_w |
"Brick House Tavern + Tap" |
NaN |
"581 Howe Ave" |
Cuyahoga Falls |
OH |
44221 |
41.119535 |
-81.47569 |
3.5 |
... |
1 |
American (New);Nightlife;Bars;Sandwiches;Ameri... |
fCCL0pAgYGccUs8shHHIUA |
PixAmdRa58XZ3hr8YLS36g |
5 |
2015-10-09 |
I am updating my review to 5-stars because I t... |
2 |
0 |
2 |
4 |
PfOCPjBrlQAnz__NXj9h_w |
"Brick House Tavern + Tap" |
NaN |
"581 Howe Ave" |
Cuyahoga Falls |
OH |
44221 |
41.119535 |
-81.47569 |
3.5 |
... |
1 |
American (New);Nightlife;Bars;Sandwiches;Ameri... |
-VOh0NNnIo1r24hjwSZL4Q |
Hly0NEWXMxbuXa2zm_NJOQ |
4 |
2015-01-15 |
I enjoyed this place. I went the night the Bu... |
0 |
0 |
0 |
5 rows × 21 columns
# Rating distributions normalized by total reviews written
pct_path = os.path.join('restaurant_reviewers_pct.xlsx')
restaurant_reviews_pct = pd.read_excel(pct_path, header=1)
# Missing rating buckets mean "no reviews at that star" -> treat as 0
restaurant_reviews_pct = restaurant_reviews_pct.fillna(0)
restaurant_reviews_pct = restaurant_reviews_pct.set_index(['User Id'])
restaurant_reviews_pct.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
1 |
2 |
3 |
4 |
5 |
User Id |
|
|
|
|
|
0_0-eY7lxC-2DgCATusr-g |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0_0Lg3MMEi9NAGztLdL52Q |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0_0r9eJyfLs87vFLaXIsRg |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0_0u-ggFJhigAwWuvok8Zg |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0_0YL1uj3Ds_ZFdPt9wx4w |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
# Raw counts of Ratings associated with reviews
cnt_path = os.path.join('restaurant_reviewers_cnt.xlsx')
restaurant_reviews_cnt = pd.read_excel(cnt_path, header=1)
# Missing rating buckets mean "no reviews at that star" -> treat as 0
restaurant_reviews_cnt = restaurant_reviews_cnt.fillna(0)
restaurant_reviews_cnt = restaurant_reviews_cnt.set_index(['User Id'])
restaurant_reviews_cnt.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
1 |
2 |
3 |
4 |
5 |
User Id |
|
|
|
|
|
0_0-eY7lxC-2DgCATusr-g |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0_0Lg3MMEi9NAGztLdL52Q |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0_0r9eJyfLs87vFLaXIsRg |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0_0u-ggFJhigAwWuvok8Zg |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0_0YL1uj3Ds_ZFdPt9wx4w |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
# Statistical metrics of rating distributions
desc_path = os.path.join('rating_description_per_user_alt.csv')
rating_desc = pd.read_csv(desc_path)
# 'mode' is not used downstream; drop it before filling gaps
rating_desc = rating_desc.drop(columns='mode')
rating_desc = rating_desc.fillna(0)
rating_desc = rating_desc.set_index('user_id')
rating_desc.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
10th_quantile |
25th_quantile |
75th_quantile |
90th_quantile |
max |
mean_avg |
med |
min |
review_count |
std |
var |
user_id |
|
|
|
|
|
|
|
|
|
|
|
---1lKK3aKOuomHnwAkAow |
1.0 |
3.00 |
5.00 |
5.0 |
5 |
3.861111 |
4.5 |
1 |
72 |
1.466101 |
2.149452 |
---PLwSf5gKdIoVnyRHgBA |
3.0 |
3.00 |
3.00 |
3.0 |
3 |
3.000000 |
3.0 |
3 |
1 |
0.000000 |
0.000000 |
---cu1hq55BP9DWVXXKHZg |
1.3 |
1.75 |
3.25 |
3.7 |
4 |
2.500000 |
2.5 |
1 |
2 |
2.121320 |
4.500000 |
---udAKDsn0yQXmzbWQNSw |
4.1 |
4.25 |
4.75 |
4.9 |
5 |
4.500000 |
4.5 |
4 |
2 |
0.707107 |
0.500000 |
--0RtXvcOIE4XbErYca6Rw |
4.0 |
4.00 |
4.00 |
4.0 |
4 |
4.000000 |
4.0 |
4 |
1 |
0.000000 |
0.000000 |
# Create Derived descriptors DataFrame: align the per-star percentage table
# with the per-user statistical metrics on their shared user-id index.
temp_1 = restaurant_reviews_pct.merge(
    rating_desc, how='inner', left_index=True, right_index=True)
def round_star(metric):
    """Round a fractional star metric to the nearest whole star, as an int."""
    # np.round uses banker's rounding (x.5 goes to the nearest even value)
    nearest = np.round(metric)
    return int(nearest)
# min to med slope
def min_med(x):
    """Slope of a user's rating histogram from their lowest star to the median star.

    Expects a row with per-star percentage columns keyed '1'..'5' plus the
    scalar metrics 'med' and 'min'. Returns 0 when the median and the minimum
    land on the same star (zero run, slope undefined).
    """
    med_star = int(np.floor(x['med']))
    min_star = int(np.round(x['min']))
    run = med_star - min_star
    if run == 0:
        return 0
    rise = x[str(med_star)] - x[str(min_star)]
    return rise / run
# Row-wise derived feature: histogram slope from the lowest star to the median
temp_1['min_med_slope'] = temp_1.apply(min_med,axis=1)
# med to max slope
def med_max(x):
    """Slope of a user's rating histogram from the median star to their highest star.

    Expects a row with per-star percentage columns keyed '1'..'5' plus the
    scalar metrics 'med' and 'max'. Returns 0 when the median and the maximum
    land on the same star (zero run, slope undefined).
    """
    med_star = int(np.ceil(x['med']))
    max_star = int(np.round(x['max']))
    run = max_star - med_star
    if run == 0:
        return 0
    rise = x[str(max_star)] - x[str(med_star)]
    return rise / run
# Row-wise derived feature: histogram slope from the median to the highest star
# (stays an .apply because the column looked up varies per row)
temp_1['med_max_slope'] = temp_1.apply(med_max, axis=1)
# Vectorized replacements for the original row-wise .apply lambdas
# (identical values, computed in a single pass):
# mean rounded to the nearest half-star, minus the median star
temp_1['mean-med'] = np.round(temp_1['mean_avg'] * 2) / 2 - temp_1['med']
# product of the two slopes; a negative value flags a direction change
# (inflection) in the histogram around the median
temp_1['inflection'] = temp_1['min_med_slope'] * temp_1['med_max_slope']
derived = temp_1[['mean-med', 'min_med_slope', 'med_max_slope', 'inflection']]
derived.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
mean-med |
min_med_slope |
med_max_slope |
inflection |
0_0-eY7lxC-2DgCATusr-g |
0.0 |
0.0 |
0.0 |
0.0 |
0_0Lg3MMEi9NAGztLdL52Q |
0.0 |
0.0 |
0.0 |
0.0 |
0_0r9eJyfLs87vFLaXIsRg |
0.0 |
0.0 |
0.0 |
0.0 |
0_0u-ggFJhigAwWuvok8Zg |
0.0 |
0.0 |
0.0 |
0.0 |
0_0YL1uj3Ds_ZFdPt9wx4w |
0.0 |
0.0 |
0.0 |
0.0 |
def plot_dist(user, show=True):
    """Histogram of the star ratings one user has given, binned per whole star.

    Set show=False when composing subplots so the caller controls plt.show().
    """
    stars = business_and_review.loc[business_and_review['user_id'] == user, 'stars']
    bin_edges = [edge + 0.5 for edge in range(6)]  # 0.5..5.5 -> one bin per star 1..5
    plt.hist(list(stars), bins=bin_edges, align='mid', width=0.9)
    plt.title(user)
    if show:
        plt.show()
def normalize(input_list):
    """Min-max scale a numeric array-like into [0, 1].

    Returns the input untouched when it already spans exactly [0, 1].
    A constant input (max == min) is mapped to all zeros; the original
    divided by zero here, producing NaN/inf for constant columns.
    """
    lo = np.min(input_list)
    hi = np.max(input_list)
    if (lo == 0) & (hi == 1):
        return input_list
    span = hi - lo
    if span == 0:
        # No spread to scale by -- return zeros with the input's shape/type
        return (input_list - lo) * 0
    return (input_list - lo) / span
# Build the clustering feature matrix: per-user rating percentages ('1'..'5'),
# plus review_count and the derived histogram-shape descriptors.
restaurant_reviews_cnt.index.nunique()
# Alternative feature sets tried during experimentation (kept for reference):
# source = restaurant_reviews_cnt
# source = restaurant_reviews_pct
# source = rating_desc
# source = restaurant_reviews.merge(rating_desc,left_index=True,right_index=True)
# source = restaurant_reviews_pct.merge(
# restaurant_reviews_cnt,left_index=True,right_index=True).merge(
# rating_desc[[
# 'review_count','10th_quantile','90th_quantile','mean_avg','var','med'
# ]],left_index=True,right_index=True)
source = restaurant_reviews_pct.merge(
    rating_desc[[
        'review_count'
    ]],left_index=True,right_index=True).merge(derived,left_index=True,right_index=True)
# source = derived
source.index.nunique()
if True:  # toggle: keep only reviewers with at least 10 reviews
    # Remove low-count reviewers
    source = source[source['review_count'] >= 10]
source.index.nunique()
if False:  # toggle (currently off): drop reviewers dominated by a single rating
    # Separate Single-Rating reviewers (reviewers who predominantly give out the same rating)
    monotony_threshold = 0.8
    # source = source[source.apply(lambda x: x[['1_x','2_x','3_x','4_x','5_x']].max() < monotony_threshold,axis=1)]
    source = source[source.apply(lambda x: x[['1','2','3','4','5']].max() < monotony_threshold,axis=1)]
source.index.nunique()
# Preview the min-max-normalized feature matrix that will be fed to KMeans
source.apply(normalize,axis=0).head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
1 |
2 |
3 |
4 |
5 |
review_count |
mean-med |
min_med_slope |
med_max_slope |
inflection |
0_6zmGnyHQrls5sJdKqudQ |
0.000000 |
0.000000 |
0.295455 |
0.090909 |
0.636364 |
0.000337 |
0.375 |
0.466999 |
0.640000 |
0.838762 |
0_9HCOdSBSrgeboN_pDcgA |
0.105263 |
0.000000 |
0.057018 |
0.210526 |
0.631579 |
0.003034 |
0.375 |
0.432588 |
0.640000 |
0.838762 |
0_71XZqxNetnJjJIINQCOg |
0.000000 |
0.227273 |
0.196970 |
0.272727 |
0.363636 |
0.000337 |
0.500 |
0.373599 |
0.705455 |
0.843559 |
0_aerl3cE66GEmqiAh9_aA |
0.000000 |
0.125000 |
0.000000 |
0.100000 |
0.800000 |
0.000000 |
0.375 |
0.502283 |
0.640000 |
0.838762 |
0_Cfb4ZEG2mQY9C2ACHnbA |
0.062500 |
0.195312 |
0.270833 |
0.343750 |
0.187500 |
0.007417 |
0.375 |
0.406678 |
0.527500 |
0.821756 |
# Normalize every feature column to [0, 1] so no single feature dominates
# the Euclidean distances used by KMeans.
data = source.apply(normalize, axis=0).values.tolist()
# graphically show Sum of Squares Error (elbow plot) for k = 1..try_k
try_k = 15
sse = []
for k in range(1, try_k + 1):
    model = KMeans(n_clusters=k)
    model.fit(data)
    sse.append(model.inertia_)
plt.plot(range(1, try_k + 1), sse)
plt.title('Sum of Squares Error vs. N Clusters')
plt.xlabel('N Clusters')
plt.ylabel('SSE')
plt.show()
# Automate selection of N Clusters: take the first k at which adding one more
# cluster improves SSE by no more than select_threshold percent.
select_threshold = 10
# Fallback so n_clusters is always defined; the original raised a NameError
# downstream when no percentage gain ever dropped below the threshold.
n_clusters = try_k
gain = []
for i in range(1, try_k):
    this_gain = 100 * (sse[i-1] - sse[i]) / sse[i-1]
    gain.append(this_gain)
    if this_gain <= select_threshold:
        print(f'select k = {i}')
        n_clusters = i
        break
# Fit the final model with the automatically selected cluster count
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(data)
# Predict the clusters
predicted_clusters = kmeans.predict(data)
# One label per row of data, already in source.index order; the original
# rebuilt this list element-by-element with an enumerate/append loop.
prediction = list(predicted_clusters)
k_means_output = pd.DataFrame({'user_id': list(source.index),
                               'prediction': prediction})
# Attach the cluster label to each user's feature row
output_context = source.merge(k_means_output, left_index=True, right_on='user_id')
# Cluster population sizes
output_context.groupby('prediction')['user_id'].count()
prediction
0 12666
1 13619
2 6542
3 18843
4 13316
Name: user_id, dtype: int64
# Spot check predictions: for every cluster, plot the rating histograms of a
# random sample of its members in one subplot grid (rows = clusters).
sample = 10
plt.figure(figsize=[20, 10])
for i in range(n_clusters):
    cluster_users = output_context[output_context['prediction'] == i]['user_id']
    sample_users = np.random.choice(cluster_users, sample)
    for j, this_user in enumerate(sample_users):
        # Subplot index counts left-to-right, top-to-bottom starting at 1
        plt.subplot(n_clusters, sample, i * sample + j + 1)
        plot_dist(this_user, show=False)
        plt.title(f'{i}, {this_user[:5]}')
plt.show()
output_context.groupby('prediction').head().sort_values('prediction')
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
1 |
2 |
3 |
4 |
5 |
review_count |
mean-med |
min_med_slope |
med_max_slope |
inflection |
prediction |
user_id |
12 |
0.000000 |
0.052632 |
0.105263 |
0.526316 |
0.315789 |
19 |
0.0 |
0.236842 |
-0.210526 |
-0.049861 |
0 |
0_NbnQy2AwOUDtDUr4uDTg |
32 |
0.044444 |
0.044444 |
0.133333 |
0.488889 |
0.288889 |
45 |
0.0 |
0.148148 |
-0.200000 |
-0.029630 |
0 |
0A9i7xt77g1PYS7j219XcQ |
10 |
0.037037 |
0.148148 |
0.111111 |
0.592593 |
0.111111 |
27 |
-0.5 |
0.185185 |
-0.481481 |
-0.089163 |
0 |
0_J-edyZgAkDPHjpZKmiMg |
9 |
0.000000 |
0.050633 |
0.240506 |
0.569620 |
0.139241 |
79 |
0.0 |
0.259494 |
-0.430380 |
-0.111681 |
0 |
0_iROv6FL4KCveV916IpTA |
5 |
0.000000 |
0.090909 |
0.090909 |
0.545455 |
0.272727 |
11 |
0.0 |
0.227273 |
-0.272727 |
-0.061983 |
0 |
0_dLB7aYv_yCRg13g3cs0g |
11 |
0.000000 |
0.000000 |
0.090909 |
0.090909 |
0.818182 |
11 |
-0.5 |
0.363636 |
0.000000 |
0.000000 |
1 |
0_m_fXlRXxGFKUMkVo2nUg |
8 |
0.000000 |
0.000000 |
0.100000 |
0.300000 |
0.600000 |
10 |
-0.5 |
0.250000 |
0.000000 |
0.000000 |
1 |
0_ImqV5kJJC01CBX7uk8cQ |
0 |
0.000000 |
0.000000 |
0.272727 |
0.090909 |
0.636364 |
11 |
-0.5 |
0.181818 |
0.000000 |
0.000000 |
1 |
0_6zmGnyHQrls5sJdKqudQ |
1 |
0.105263 |
0.000000 |
0.052632 |
0.210526 |
0.631579 |
19 |
-0.5 |
0.131579 |
0.000000 |
0.000000 |
1 |
0_9HCOdSBSrgeboN_pDcgA |
3 |
0.000000 |
0.100000 |
0.000000 |
0.100000 |
0.800000 |
10 |
-0.5 |
0.233333 |
0.000000 |
0.000000 |
1 |
0_aerl3cE66GEmqiAh9_aA |
47 |
0.400000 |
0.100000 |
0.000000 |
0.000000 |
0.500000 |
10 |
-0.5 |
-0.200000 |
0.500000 |
-0.100000 |
2 |
0AKk4nxIt0rlbyylj7s73Q |
30 |
0.190476 |
0.285714 |
0.095238 |
0.190476 |
0.238095 |
21 |
0.0 |
-0.047619 |
0.071429 |
-0.003401 |
2 |
0A5Bjqb3dLj6NfsBHzDuXA |
44 |
0.333333 |
0.047619 |
0.142857 |
0.190476 |
0.285714 |
21 |
0.0 |
-0.095238 |
0.071429 |
-0.006803 |
2 |
0aJBU9Jv15xA3MA5HZ0gjg |
26 |
0.700000 |
0.300000 |
0.000000 |
0.000000 |
0.000000 |
10 |
0.5 |
0.000000 |
-0.400000 |
-0.000000 |
2 |
0-Ypt2ywfCYQ_HEfOZ0r4g |
24 |
0.083333 |
0.333333 |
0.166667 |
0.208333 |
0.208333 |
24 |
0.0 |
0.041667 |
0.020833 |
0.000868 |
2 |
0-UxxxWLz1muOzPx2F3-kQ |
7 |
0.000000 |
0.090909 |
0.272727 |
0.272727 |
0.363636 |
11 |
0.0 |
0.090909 |
0.090909 |
0.008264 |
3 |
0_G3Cuk88nBAQ1M03g56lA |
16 |
0.090909 |
0.000000 |
0.181818 |
0.363636 |
0.363636 |
11 |
0.0 |
0.090909 |
0.000000 |
0.000000 |
3 |
0-5GNzaQOJuMJeVgECShEw |
14 |
0.000000 |
0.214286 |
0.214286 |
0.214286 |
0.357143 |
14 |
-0.5 |
0.000000 |
0.142857 |
0.000000 |
3 |
0_nrYU_QKIHQ3zc1w5Rm3w |
13 |
0.200000 |
0.133333 |
0.000000 |
0.266667 |
0.400000 |
15 |
-0.5 |
0.022222 |
0.133333 |
0.002963 |
3 |
0_Nf5WBUb3U1j610Yjbg7A |
2 |
0.000000 |
0.181818 |
0.181818 |
0.272727 |
0.363636 |
11 |
0.0 |
0.045455 |
0.090909 |
0.004132 |
3 |
0_71XZqxNetnJjJIINQCOg |
17 |
0.071429 |
0.500000 |
0.071429 |
0.285714 |
0.071429 |
14 |
1.0 |
0.428571 |
-0.142857 |
-0.061224 |
4 |
0-7mDJd9bOGtmq3FHoZB-Q |
19 |
0.000000 |
0.100000 |
0.200000 |
0.400000 |
0.300000 |
10 |
0.0 |
0.150000 |
-0.100000 |
-0.015000 |
4 |
0-BQfUKMcI0dX_OiaGlJjA |
4 |
0.062500 |
0.156250 |
0.250000 |
0.343750 |
0.187500 |
32 |
-0.5 |
0.093750 |
-0.156250 |
-0.014648 |
4 |
0_Cfb4ZEG2mQY9C2ACHnbA |
15 |
0.250000 |
0.083333 |
0.583333 |
0.083333 |
0.000000 |
12 |
-0.5 |
0.166667 |
-0.500000 |
-0.083333 |
4 |
0_RQdCuZGHAtBIZ5ST8TGw |
6 |
0.200000 |
0.100000 |
0.200000 |
0.300000 |
0.200000 |
10 |
-0.5 |
0.000000 |
-0.100000 |
-0.000000 |
4 |
0_FRTEe0XJnPxS4Nsj5tng |