Compare K-Means and K-Center on Simulated Data

[1]:
import random
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

# Qualitative palette: one RGBA tuple per entry of matplotlib's "tab10" map,
# indexed later by cluster label.
cmap = plt.get_cmap('tab10')
colors = list(map(cmap, range(cmap.N)))

# Global figure styling shared by every plot in this notebook.
mpl.rcParams.update({"font.size": 24, "lines.linewidth": 2})

# Marker area handed to the scatter plots through their `s=` argument.
markersize = 50

# Seed both the stdlib and the NumPy global generators so the simulated
# data are the same on every fresh run.
seed = 0
random.seed(seed)
np.random.seed(seed)

Simulate Outliers

[2]:
# One dense cluster at the origin plus five tiny clusters placed on a circle
# of radius 3, at multiples of pi/3 measured from the positive y-axis.
ring = [[np.sin(np.pi/3*step), np.cos(np.pi/3*step)] for step in range(1, 6)]
centers = np.array([[0, 0]] + ring)*3

# 300 points for the central cluster, 4 for each of the small outer ones.
n_samples = [300] + [4]*5

# Every cluster shares the same isotropic covariance (variance 0.1 per axis).
cov = [[0.1, 0], [0, 0.1]]

# Draw the clusters in order (so the random stream is consumed exactly as
# before) and stack them once into a single (n_total, 2) array.
samples = np.vstack([
    np.random.multivariate_normal(center, cov, n)
    for center, n in zip(centers, n_samples)
])
[3]:
# Show the raw simulated points before any clustering.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(samples[:, 0], samples[:, 1], s=markersize, color="k", alpha=0.5)
# "equal" gives both axes the same scale and re-enables autoscaling, so the
# view is fitted to the data. Fixed x/y limits set before this call are
# discarded, which is why the former +/-2 limits were dropped (they would
# also have hidden the outer clusters, which sit near radius 3).
ax.axis("equal")
[3]:
(np.float64(-3.172265148537022),
 np.float64(3.0733223233455673),
 np.float64(-3.7468851481628227),
 np.float64(2.2003839005377284))
../../_images/examples_cluster_Reference_Ch3_part_1_compare_kmeans_kcenter_on_simulated_data_4_1.png

K-Means Results

[4]:
from sklearn.cluster import k_means

# Run k-means with k clusters, using the first k rows of `samples` as the
# explicit initial centroids.
k = 6
initial_centroids = samples[:k]
centroid, labels, inertia = k_means(samples, n_clusters=k, init=initial_centroids)

# Colour every point by the cluster it was assigned to.
point_colors = [colors[cluster_id] for cluster_id in labels]

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(samples[:, 0], samples[:, 1], s=markersize, color=point_colors, alpha=0.5)
ax.set_title("KMeans")
ax.axis("equal")
[4]:
(np.float64(-3.172265148537022),
 np.float64(3.0733223233455673),
 np.float64(-3.7468851481628227),
 np.float64(2.2003839005377284))
../../_images/examples_cluster_Reference_Ch3_part_1_compare_kmeans_kcenter_on_simulated_data_6_1.png

K-Center Results

[5]:
def k_center(points, k):
    """Pick ``k`` centers by greedy farthest-first traversal.

    The first center is always the point at index 0. Every subsequent center
    is the point that is currently farthest (Euclidean distance) from its
    nearest already-chosen center.

    Parameters
    ----------
    points : array-like, shape (n_points, ...)
        One observation per row; trailing axes are flattened into a single
        feature vector per point.
    k : int
        Number of centers to select. ``k <= 0`` yields two empty lists.

    Returns
    -------
    centers : list of int
        Row indices into ``points`` of the chosen centers, in selection
        order. If ``k`` exceeds the number of distinct points, index 0 is
        repeated once every point already has distance 0.
    losses : list of float
        ``losses[j]`` is the k-center objective (largest distance from any
        point to its nearest center) after ``j + 1`` centers are chosen.

    Raises
    ------
    ValueError
        If ``points`` is empty while ``k > 0``.
    """
    points = np.asarray(points, dtype=float)
    n_points = points.shape[0]
    centers, losses = [], []
    if k <= 0:
        return centers, losses
    if n_points == 0:
        raise ValueError("k_center requires at least one point")

    # Treat every row as one flat feature vector so the norm is taken per point.
    flat = points.reshape(n_points, -1)
    # Distance from each point to its nearest chosen center (none chosen yet).
    distances = np.full(n_points, np.inf)

    for _ in range(k):
        # Choose first, then refresh the distances, then record the loss, so
        # that the loss always reflects every center selected so far.
        next_center = int(np.argmax(distances)) if centers else 0
        centers.append(next_center)

        to_new_center = np.linalg.norm(flat - flat[next_center], axis=1)
        np.minimum(distances, to_new_center, out=distances)

        losses.append(float(distances.max()))

    return centers, losses
[6]:
# Select k centers (returned as row indices), then look up their coordinates.
k = 6
center_indices, losses = k_center(samples, k)
new_centers = samples[center_indices]
[7]:
# Assign each sample to its nearest center (squared Euclidean distance);
# broadcasting builds the whole (n_samples, n_centers) table in one step.
offsets = samples[:, np.newaxis, :] - new_centers[np.newaxis, :, :]
sq_dists = (offsets**2).sum(axis=2)
labels = list(np.argmin(sq_dists, axis=1))

# Colour every point by the center it was assigned to.
point_colors = [colors[cluster_id] for cluster_id in labels]

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(samples[:, 0], samples[:, 1],
           s=markersize, color=point_colors, alpha=0.5)
ax.set_title("KCenter")
ax.axis("equal")
[7]:
(np.float64(-3.172265148537022),
 np.float64(3.0733223233455673),
 np.float64(-3.7468851481628227),
 np.float64(2.2003839005377284))
../../_images/examples_cluster_Reference_Ch3_part_1_compare_kmeans_kcenter_on_simulated_data_10_1.png