import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()


# Global plot styling: one base font size drives every text element, and the
# default figure is a square.
plot_size = 14                     # base font size
plot_width, plot_height = 5, 5     # default figure width and height

params = {
    'legend.fontsize': 'large',
    'figure.figsize': (plot_width, plot_height),
    # Axis labels and axis titles both use the base size.
    **{f'axes.{item}size': plot_size for item in ('label', 'title')},
    # Tick labels on both axes are drawn at three quarters of the base size.
    **{f'{axis}tick.labelsize': plot_size * 0.75 for axis in ('x', 'y')},
    # Extra gap between the top of the axes and the title.
    'axes.titlepad': 25,
}

# Apply the settings to every figure created from here on.
plt.rcParams.update(params)


from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs


# Number of synthetic customers (one point per customer).
num_customers = 40

# Settings for the synthetic data set: three blobs with unit spread, seeded so
# that the same coordinates are produced on every run.
blob_options = {
    'n_samples': num_customers,
    'centers': 3,
    'cluster_std': 1,
    'random_state': 2,
}

# coord holds the point coordinates; clust_true holds the index of the blob
# each point was drawn from.
coord, clust_true = make_blobs(**blob_options)


coord

array([[ -1.54915892,  -7.25010857],
       [  0.61758013,  -1.36802291],
       [  1.36369409,   0.06608172],
       [  0.35857025,  -0.7851559 ],
       [ -0.89752435,  -5.42677013],
       [ -3.71486953,  -9.36874886],
       [ -4.25209341,  -3.4847562 ],
       [  1.60459034,  -1.24558156],
       [ -0.8748411 ,   0.43763252],
       [ -1.99653623,  -4.77782225],
       [  1.20936556,  -3.15216453],
       [ -1.23856256, -10.59940081],
       [ -2.02797291,  -9.47245011],
       [ -0.74104364, -10.07763506],
       [ -0.77722054, -10.72676345],
       [ -1.91775697, -10.66908765],
       [ -2.27031954,  -4.83274261],
       [ -0.72864791,  -7.18926735],
       [ -3.63296701,  -3.34704806],
       [  0.67974136,  -0.52254041],
       [ -1.97416044,  -3.32681457],
       [ -1.61892392,  -9.71765939],
       [ -0.76794095,  -2.14509066],
       [ -2.15820985,  -9.63790953],
       [ -1.29923245,  -8.30647414],
       [ -2.01196044,  -3.52563248],
       [ -2.33805418, -10.39048298],
       [ -1.06834753,  -2.658024  ],
       [  1.49510676,  -2.13776585],
       [  1.42674589,  -0.01517292],
       [  1.99361544,  -1.67464467],
       [ -2.70131918,  -9.63497056],
       [ -1.02353151, -10.47025441],
       [ -1.6322142 ,  -3.06730015],
       [  2.46092757,  -1.62922949],
       [ -1.78211322,  -3.47052225],
       [ -2.69138291,  -1.80881652],
       [  0.16411427,  -1.20584193],
       [  0.99325932,  -0.75119958],
       [ -2.24589423,  -2.5508473 ]])


clust_true

array([0, 1, 1, 1, 2, 0, 2, 1, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 2, 1, 2, 0,
       2, 0, 0, 2, 0, 2, 1, 1, 1, 0, 0, 2, 1, 2, 2, 1, 1, 2])


# Plot the raw, unlabelled customer coordinates before any clustering.
# No `c=` values are supplied here, so there is nothing for a colormap to map:
# matplotlib would ignore a `cmap` argument and warn about it. It is therefore
# omitted and all points are drawn in the default colour.
# The trailing semicolon suppresses the notebook's echo of the return value.
plt.scatter(coord[:, 0],
            coord[:, 1],
            s=plot_size*2);


# Partition the points into two groups with k-means.
# random_state pins the centroid initialisation, so the labels, centres and
# inertia are identical on every run (the data itself is already seeded).
# n_init is spelled out because its default differs between scikit-learn
# releases; 10 restarts keeps the best of several initialisations.
model = KMeans(n_clusters=2, n_init=10, random_state=2)

# Learn the two centroids from the coordinates.
model.fit(coord)

# Label every point with the index of its nearest centroid.
clust_pred = model.predict(coord)


# Draw every point, coloured by the cluster the model assigned to it.
# Transposing turns the (x, y) rows into one x row and one y row, which are
# unpacked as the first two positional arguments.
plt.scatter(*coord.T, c=clust_pred, s=plot_size * 2, cmap='Accent')

# Overlay the fitted centroids as larger, semi-transparent red markers.
centers = model.cluster_centers_
plt.scatter(*centers.T, c='red', s=plot_size * 10, alpha=0.5);


model.inertia_

172.85989892525026


from yellowbrick.cluster import KElbowVisualizer

# Elbow analysis: refit the estimator for a range of cluster counts and plot
# the distortion score for each, to help choose k.
# A two-integer tuple for `k` is treated as a half-open range, so (2, 12)
# evaluates k = 2 .. 11; timings=False leaves the fit-time curve off the plot.
elbow_options = {'k': (2, 12), 'timings': False}
visualizer = KElbowVisualizer(model, **elbow_options)

# Run the sweep on the customer coordinates.
visualizer.fit(coord)

# Finalize and render the figure.
visualizer.show()

<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>

Clustering with sklearn

Why don't you experiment with different values for `centers` and `cluster_std`?

Clustering with sklearn

Why don't you experiment with different values for `centers` and `cluster_std`?

Why don't you experiment with different values for `centers` and `cluster_std`?