Note: This is a generated markdown export from the Jupyter notebook file clustering_meanshift.ipynb.
You can also view the notebook with the nbviewer from Jupyter.
Clustering with MeanShift
% matplotlib inline
import matplotlib .pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import umap
from sklearn import datasets , cluster
def get_color(i, n_clusters):
    """Return the plotting colour for cluster label ``i``.

    The label ``-1`` is always drawn in ``'gray'``. Every other label is
    looked up in matplotlib's ``jet`` colormap at the fractional position
    ``i / n_clusters``.
    """
    is_unassigned = i == -1
    return 'gray' if is_unassigned else plt.cm.jet(float(i) / n_clusters)
# Load the scikit-learn digits dataset and preview its first images side by
# side; zip() stops at the shortest input, so one image is drawn per axis.
digits = datasets.load_digits()
fig, axes = plt.subplots(nrows=1, ncols=10, figsize=(10, 3))
samples = zip(axes, digits.images, digits.target)
for ax, image, label in samples:
    # Inverted grayscale colormap for the image, titled with its target.
    ax.imshow(image, cmap=plt.cm.gray_r)
    ax.set_title('%i' % label)
    # Hide ticks and frame so only the image and its title remain.
    ax.set_axis_off()
# Feature matrix and ground-truth labels of the digits dataset.
X, y = digits.data, digits.target

# The number of true classes is only used to enumerate the true labels;
# MeanShift is constructed without it.
n_clusters = 10
true_clusters = list(range(n_clusters))

# Cluster with MeanShift using its default parameters.
meanshift = cluster.MeanShift()
label = meanshift.fit_predict(X)
predicted_clusters = np.unique(label)

# Project the features to a two-column UMAP embedding for plotting, and keep
# the true and predicted labels next to the coordinates.
embedding = umap.UMAP().fit_transform(X)
df = pd.DataFrame(embedding, columns=['X1', 'X2']).assign(
    true_cluster=y,
    predicted_cluster=label,
)
df.head()
|   | X1 | X2 | true_cluster | predicted_cluster |
|---|---|---|---|---|
| 0 | 15.643538 | 7.158202 | 0 | 0 |
| 1 | -3.757154 | 9.996881 | 1 | 0 |
| 2 | 0.414931 | 9.605992 | 2 | 0 |
| 3 | 0.273054 | 5.647682 | 3 | 0 |
| 4 | 4.314103 | 18.812122 | 4 | 0 |
# Compare the true labels (top) with the MeanShift assignments (bottom) on the
# two embedding coordinates stored in df. The suptitle reports the number of
# columns of X, i.e. the feature count of the data that was clustered.
fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(10, 10))
fig.suptitle(
    'Clusters in high dimensional data (features = {})'.format(np.shape(X)[1]),
    fontsize=14,
    fontweight='bold',
)

ax1.set_title('True values')
for i in true_clusters:
    # Filter once per cluster instead of once per coordinate.
    members = df[df.true_cluster == i]
    ax1.scatter(members.X1, members.X2, label=i,
                color=get_color(i, len(true_clusters)))

# Show how many clusters MeanShift actually found (the original title was the
# truncated text 'Predicted cluste').
ax2.set_title('Predicted cluster = {}'.format(len(predicted_clusters)))
for i in predicted_clusters:
    members = df[df.predicted_cluster == i]
    ax2.scatter(members.X1, members.X2, label=i,
                color=get_color(i, len(predicted_clusters)))

# Anchor both legends to the right of their axes.
ax1.legend(bbox_to_anchor=(1.1, 1))
ax2.legend(bbox_to_anchor=(1.1, 1))
plt.show()
# Generate 750 two-dimensional points around three centres, each with its own
# spread; the fixed random_state makes the sample reproducible.
blob_centers = [[3, 4], [-2, 6], [3, 12]]
blob_spreads = [1, 0.8, 1.5]
X, y = datasets.make_blobs(
    n_samples=750,
    centers=blob_centers,
    cluster_std=blob_spreads,
    random_state=0,
)

# The number of true classes is only used to enumerate the true labels;
# MeanShift is constructed without it.
n_clusters = 3
true_clusters = list(range(n_clusters))

# Cluster with MeanShift using its default parameters.
meanshift = cluster.MeanShift()
label = meanshift.fit_predict(X)
predicted_clusters = np.unique(label)

# The data is already two-dimensional, so the raw coordinates are used
# directly, with the true and predicted labels alongside.
df = pd.DataFrame(X, columns=['X1', 'X2']).assign(
    true_cluster=y,
    predicted_cluster=label,
)
df.head()
|   | X1 | X2 | true_cluster | predicted_cluster |
|---|---|---|---|---|
| 0 | 2.600551 | 4.370056 | 0 | 1 |
| 1 | -2.309497 | 5.591766 | 1 | 0 |
| 2 | 2.196590 | 3.310450 | 0 | 1 |
| 3 | 0.940436 | 10.398387 | 2 | 2 |
| 4 | 4.230291 | 5.202380 | 0 | 1 |
# Compare the true labels (top) with the MeanShift assignments (bottom) on the
# X1/X2 coordinates stored in df. The suptitle reports the number of columns
# of X (the original title was the truncated text 'Clusters in low
# dimensional').
fig, (ax1, ax2) = plt.subplots(2, 1, sharey=True, figsize=(10, 10))
fig.suptitle(
    'Clusters in low dimensional data (features = {})'.format(np.shape(X)[1]),
    fontsize=14,
    fontweight='bold',
)

ax1.set_title('True values')
for i in true_clusters:
    # Filter once per cluster instead of once per coordinate.
    members = df[df.true_cluster == i]
    ax1.scatter(members.X1, members.X2, label=i,
                color=get_color(i, len(true_clusters)))

# Report how many clusters MeanShift actually found.
ax2.set_title('Predicted cluster = {}'.format(len(predicted_clusters)))
for i in predicted_clusters:
    members = df[df.predicted_cluster == i]
    ax2.scatter(members.X1, members.X2, label=i,
                color=get_color(i, len(predicted_clusters)))

# Anchor both legends to the right of their axes.
ax1.legend(bbox_to_anchor=(1.1, 1))
ax2.legend(bbox_to_anchor=(1.1, 1))
plt.show()