SKLearn Clustering Examples

[1]:
import random
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

# Categorical palette: one RGBA tuple per entry (cmap.N of them) of the
# "tab10" colormap, so a cluster label can be used directly as an index.
cmap = plt.get_cmap('tab10')
colors = list(map(cmap, range(cmap.N)))

# Larger text and thicker lines for every figure in this notebook.
mpl.rcParams.update({"font.size": 24, "lines.linewidth": 2})

# Seed both the stdlib and the NumPy global RNG so sampling is reproducible.
seed = 0
random.seed(seed)
np.random.seed(seed)

# Intended total number of sampled points (the sampling cell hard-codes
# 5 points per cluster rather than reading this value).
n_samples = 10
# Passed as the marker area `s=` to every plt.scatter call.
markersize = 100

Sample data points

[2]:
# Sample 10 points: 5 from each of two 2-D Gaussians, then stack and shuffle
# the rows so the row order does not reveal which Gaussian a point came from.

# Cluster 1: mean (1, 1), identity covariance.
m1 = [1, 1]
cov1 = [[1, 0], [0, 1]]
s1 = np.random.multivariate_normal(m1, cov1, 5)

# Cluster 2: mean (-1, -1), negatively correlated coordinates.
m2 = [-1, -1]
cov2 = [[1, -0.5], [-0.5, 1]]
s2 = np.random.multivariate_normal(m2, cov2, 5)

# Use np.concatenate instead of np.concat: the `concat` alias only exists in
# NumPy >= 2.0, while `concatenate` works on every NumPy version and returns
# the same (10, 2) array.
samples = np.concatenate([s1, s2])
# In-place permutation of the rows (first axis) using the seeded global RNG.
np.random.shuffle(samples)
[3]:
# Scatter plot of the raw, unlabeled samples (all points in black).
plt.figure(figsize=(5, 5))
plt.scatter(samples[:, 0], samples[:, 1], s=markersize, color="k")
# No explicit xlim/ylim before this call: axis("equal") turns autoscaling back
# on and recomputes the limits from the data, so limits set beforehand are
# discarded. The call returns the resulting (xmin, xmax, ymin, ymax).
plt.axis("equal")
[3]:
(np.float64(-2.659691942232599),
 np.float64(3.1307603678824707),
 np.float64(-1.3757641112226862),
 np.float64(3.4607340235073694))
../../_images/examples_cluster_Reference_Ch3_part_1_sklearn_clustering_example_4_1.png

KMeans from sklearn

The clustering result depends on the choice of initial centers. Here we use the same initial centers as in the lecture slides.

[4]:
from sklearn.cluster import k_means

# Use rows 0 and 2 of `samples` as the two starting centroids instead of a
# random initialization.
init_centers = samples[[0, 2], :]
centroid, labels, inertia = k_means(samples, n_clusters=2, init=init_centers)
print(labels)
[0 1 0 0 1 1 1 0 1 0]
[5]:
# Color every point by its k-means cluster label (label -> palette index).
point_colors = [colors[label] for label in labels]

plt.figure(figsize=(5, 5))
plt.scatter(samples[:, 0], samples[:, 1], s=markersize, color=point_colors)

plt.axis("equal")
[5]:
(np.float64(-2.659691942232599),
 np.float64(3.1307603678824707),
 np.float64(-1.3757641112226862),
 np.float64(3.4607340235073694))
../../_images/examples_cluster_Reference_Ch3_part_1_sklearn_clustering_example_7_1.png

DBSCAN from sklearn

[6]:
from sklearn.cluster import DBSCAN

# DBSCAN hyper-parameters: neighborhood radius and the minimum number of
# samples passed to the estimator.
eps = 2.0
min_samples = 3

# fit() returns the estimator itself, so construction and fitting can be chained.
dbscan = DBSCAN(
    eps=eps,
    min_samples=min_samples,
    metric="euclidean",
    algorithm="auto",
).fit(samples)
print(dbscan.labels_)

# Negative labels are drawn in black; non-negative labels index the palette.
point_colors = [
    "k" if label < 0 else colors[label]
    for label in dbscan.labels_
]

plt.figure(figsize=(5, 5))
plt.scatter(samples[:, 0], samples[:, 1], s=markersize, color=point_colors)
plt.axis("equal")
[ 0 -1  0  0  1  1  1  0  1  0]
[6]:
(np.float64(-2.659691942232599),
 np.float64(3.1307603678824707),
 np.float64(-1.3757641112226862),
 np.float64(3.4607340235073694))
../../_images/examples_cluster_Reference_Ch3_part_1_sklearn_clustering_example_9_2.png