from scipy.io import loadmat
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.metrics.cluster import adjusted_rand_score
from scipy.cluster.hierarchy import dendrogram, ward, single, average

mnist = loadmat('mnist-original')
mnist_data = mnist["data"].T
mnist_label = mnist["label"][0]
import numpy as np
print("Number of datapoints: {}\n".format(mnist_data.shape[0]))
print("Number of features: {}\n".format(mnist_data.shape[1]))
print("List of labels: {}\n".format(np.unique(mnist_label)))

Number of datapoints: 70000

Number of features: 784

List of labels: [0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]


%matplotlib inline
import matplotlib.pyplot as plt
index = 4
print("Value of datapoint no. {}:\n{}\n".format(index,mnist_data[index]))
print("As image:\n")
plt.imshow(mnist_data[index].reshape(28,28),cmap=plt.cm.gray_r)
plt.show()

Value of datapoint no. 4:
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0  46 105 254 254 254 254 255 239  41
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  37
 118 222 254 253 253 253 253 253 253 211  54   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0  14 200 253 253 254 253 253 253 253 253
 253 253 116   0   0   0   0   0   0   0   0   0   0   0   0   0  16 160
 236 253 253 253 254 253 253 246 229 253 253 253 116   0   0   0   0   0
   0   0   0   0   0   0   0   0  99 253 253 253 253 253 254 253 253 213
  99 253 253 253 116   0   0   0   0   0   0   0   0   0   0   0   0  25
 194 253 253 253 253 131  97 169 253  93  99 253 253 253 116   0   0   0
   0   0   0   0   0   0   0   0   0 206 253 253 251 233 127   9   0  18
  38   3  15 171 253 253 116   0   0   0   0   0   0   0   0   0   0   0
  55 240 253 253 233   0   0   0   0   0   0   0  31 186 253 253 116   0
   0   0   0   0   0   0   0   0   0   0 176 253 253 253 127   0   0   0
   0   0   0   0  99 253 253 253 116   0   0   0   0   0   0   0   0   0
   0   0 176 253 253 131   9   0   0   0   0   0   0   0  99 253 253 253
 116   0   0   0   0   0   0   0   0   0   0 119 254 254 232  75   0   0
   0   0   0   0   0   0   0 158 254 254 117   0   0   0   0   0   0   0
   0   0   0 118 253 253 154   0   0   0   0   0   0   0   0   0   0 156
 253 253 116   0   0   0   0   0   0   0   0   0   0 118 253 253 154   0
   0   0   0   0   0   0   0   0   0 156 253 253 116   0   0   0   0   0
   0   0   0   0  46 222 253 253 154   0   0   0   0   0   0   0   0   7
 116 246 253 180   9   0   0   0   0   0   0   0   0   0   0 118 253 253
 154   0   0   0   0   0   0   0   0 116 253 253 253 174   0   0   0   0
   0   0   0   0   0   0   0 118 253 253 154   0   0   0   0   0   0   0
 110 246 253 253 240  67   0   0   0   0   0   0   0   0   0   0   0 118
 253 253 238 215  49  20  20  20  66 215 241 253 245 233  64   0   0   0
   0   0   0   0   0   0   0   0   0  82 229 253 253 253 253 253 253 253
 254 253 253 240 107   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0 176 253 253 253 253 253 253 253 254 253 253 108   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0  40 239 253 253 253 253
 253 253 254 161  57   4   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]

As image:


from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
k = 10

scaler = MinMaxScaler()
scaler.fit(mnist_data)
scaled_data = scaler.transform(mnist_data)

kmeansCluster = KMeans(n_clusters=k, random_state=69, n_init=15, init='k-means++')
kmeansCluster.fit(scaled_data)
print("done")

done


import mglearn

rows=int(k/5)
fig,axes = plt.subplots(rows,5,figsize=(20,4*rows))
for ax,cc,i in zip (axes.ravel(),kmeansCluster.cluster_centers_,np.arange(axes.ravel().size)):
    ax.set_title("idx: {}, size: {}".format(i,len(np.where(kmeansCluster.labels_==i)[0])))
    ax.imshow(cc.reshape(28,28),cmap=plt.cm.gray_r)
plt.show()


from sklearn.metrics.cluster import adjusted_rand_score

fit_pred = kmeansCluster.fit_predict(scaled_data)

adjusted_rand_score(mnist_label, fit_pred)

0.3654675097713884


from sklearn.utils import resample
small_mnist_data, small_mnist_label = resample(mnist_data,
                                               mnist_label,
                                               n_samples=500,
                                               replace='false',
                                               random_state=69)
small_mnist_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)


from scipy.cluster.hierarchy import dendrogram, ward

linkage_array = ward(small_mnist_data)
dendrogram(linkage_array)

plt.show()


small_mnist_data, small_mnist_label = resample(mnist_data,
                                               mnist_label,
                                               n_samples=30,
                                               replace='false',
                                               random_state=69)

print("Ward:")
linkage_array = ward(small_mnist_data)
dendrogram(linkage_array)
plt.show()

print("Single:")
linkage_array = single(small_mnist_data)
dendrogram(linkage_array)
plt.show()

print("Average:")
linkage_array = average(small_mnist_data)
dendrogram(linkage_array)
plt.show()

Ward:

Single:

Average:


from sklearn.utils import resample
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import mglearn

small_mnist_data, small_mnist_label = resample(mnist_data,
                                               mnist_label,
                                               n_samples=1000,
                                               replace='false',
                                               random_state=69)

scaler = MinMaxScaler()
#scaler = StandardScaler()
scaler.fit(small_mnist_data)
scaled_data = scaler.transform(small_mnist_data)

# Estimate best eps and min_sample values
for i in range(1,10):
    for j in range(2,10):
        dbscan = DBSCAN(eps=i,min_samples=j)
        clusters = dbscan.fit_predict(scaled_data)
        # remove all noise (-1)
        cluster_count = [ i for i in clusters if i >= 0]
        if len(cluster_count)/len(clusters) > 0.9:
            percentage = "{:.2f}".format(len(cluster_count)/len(clusters)*100)
            print(f"{percentage}% good clusters with eps={i}, min_samples={j}, cluster_count: {len(np.unique(cluster_count))}")

97.70% good clusters with eps=8, min_samples=2, cluster_count: 1
97.70% good clusters with eps=8, min_samples=3, cluster_count: 1
97.50% good clusters with eps=8, min_samples=4, cluster_count: 1
97.40% good clusters with eps=8, min_samples=5, cluster_count: 1
97.40% good clusters with eps=8, min_samples=6, cluster_count: 1
97.30% good clusters with eps=8, min_samples=7, cluster_count: 1
96.90% good clusters with eps=8, min_samples=8, cluster_count: 1
96.80% good clusters with eps=8, min_samples=9, cluster_count: 1
99.90% good clusters with eps=9, min_samples=2, cluster_count: 1
99.90% good clusters with eps=9, min_samples=3, cluster_count: 1
99.90% good clusters with eps=9, min_samples=4, cluster_count: 1
99.90% good clusters with eps=9, min_samples=5, cluster_count: 1
99.90% good clusters with eps=9, min_samples=6, cluster_count: 1
99.90% good clusters with eps=9, min_samples=7, cluster_count: 1
99.90% good clusters with eps=9, min_samples=8, cluster_count: 1
99.90% good clusters with eps=9, min_samples=9, cluster_count: 1


from sklearn.decomposition import PCA
from sklearn.utils import resample

small_mnist_data, small_mnist_label = resample(mnist_data,
                                               mnist_label,
                                               n_samples=10000,
                                               replace='false',
                                               random_state=69)

pca = PCA(n_components=100, whiten=True, random_state=69)
X_pca = pca.fit_transform(small_mnist_data)
print("done")

done


# apply DBSCAN with default parameters
dbscan = DBSCAN(eps=15, min_samples=3)
labels = dbscan.fit_predict(X_pca)
print("Unique labels: {}".format(np.unique(labels)))

Unique labels: [0]


print("Number of points per cluster: {}".format(np.bincount(labels + 1)))

Number of points per cluster: [    0 10000]


for eps in [1, 3, 5, 7, 9, 11, 13]:
    print("\neps={}".format(eps))
    dbscan = DBSCAN(eps=eps, min_samples=3)
    labels = dbscan.fit_predict(X_pca)
    print("Number of clusters: {}".format(len(np.unique(labels))))
    print("Cluster sizes: {}".format(np.bincount(labels + 1)))

eps=1
Number of clusters: 31
Cluster sizes: [9910    3    3    3    3    3    3    3    3    3    3    3    3    3
    3    3    3    3    3    3    3    3    3    3    3    3    3    3
    3    3    3]

eps=3
Number of clusters: 69
Cluster sizes: [9637   13   14    3    9   13   10    3   33    4    3    3   31    8
    3    3    3    3    4   20    3    3    3    3   12    4    3    8
    9    3    3    5    3    3    3    3    3    3    3    6    3    3
    5    3    3    4    3    4    3    4    3    3    3    3    3    3
    3    5    3    3    3    3    3    3    3    3    3    3    3]

eps=5
Number of clusters: 51
Cluster sizes: [8841 1000    3    3    3    3    3    3    4    3    3    3    3    3
    3    4    3    3    3    3    3    4    3    3    3    3    3    3
    3   10    3    3    3    3    3    3    3    3    3    3    3    5
    3    3    3    3    3    3    3    3    3]

eps=7
Number of clusters: 104
Cluster sizes: [7477 2110   12   18    3    3    3    4    3    3    3    3   10    7
    3    3    4    3    3    3    8    3    4    3    4    3    3    3
    3    5    4    5    4    3    3    6    3    3    3    3    3    4
    3   18    3    6    4    4    5    5    3    3    3    3    3    3
    3    3    3    4    3    3    6    3    3    3    3    3    3    8
    4    3    3    5    3    3    3    3    3    6    6    3    3    3
    3    4    6    3    3    3    3    3    4    5    3    3    5    3
    5    3    3    3    3    3]

eps=9
Number of clusters: 90
Cluster sizes: [3737 5943    7    4    3    3    3    4    3    6    3    3    6    4
    5    5    8    4    6    7    3    4    3    3    3    3    3    3
    5    3    4    3    3    3    4    4    3    3    9    3    4    4
    4    3    3    3    4    3    3    3    3    4    4    3    4    3
    4    3    3    5    3    3    3    3    3    3    3    4    3    3
    3    4    3    3    3    3    3    3    3    3    3    4    4    3
    3    3    3    3    3    3]

eps=11
Number of clusters: 17
Cluster sizes: [ 768 9184    3    3    3    3    4    3    3    3    3    3    3    3
    4    3    4]

eps=13
Number of clusters: 2
Cluster sizes: [  42 9958]


# Have to use an even smaller dataset as plot throws and my kernel stops
small_mnist_data, small_mnist_label = resample(mnist_data,
                                               mnist_label,
                                               n_samples=1000,
                                               replace='false',
                                               random_state=69)

pca = PCA(n_components=100, whiten=True, random_state=69)
X_pca = pca.fit_transform(small_mnist_data)
print("Done")
dbscan = DBSCAN(min_samples=3, eps=7)
labels = dbscan.fit_predict(X_pca)

for cluster in range(max(labels) + 1):
    mask = labels == cluster
    n_images =  np.sum(mask)
    fig, axes = plt.subplots(1, n_images, figsize=(n_images * 1.5, 4),
                             subplot_kw={'xticks': (), 'yticks': ()})
    for image, label, ax in zip(small_mnist_data[mask], small_mnist_label[mask], axes):

        ax.imshow(image.reshape(28,28), vmin=0, vmax=1)
        ax.set_title(label)

Done


small_mnist_data, small_mnist_label = resample(mnist_data,
                                               mnist_label,
                                               n_samples=10000,
                                               replace='false',
                                               random_state=69)

pca = PCA(n_components=100, whiten=True, random_state=69)
X_pca = pca.fit_transform(small_mnist_data)

# extract clusters with k-Means
km = KMeans(n_clusters=10, random_state=0)
labels_km = km.fit_predict(X_pca)
print("Cluster sizes k-means: {}".format(np.bincount(labels_km)))

Cluster sizes k-means: [ 651  901  882  844 1022  443 1017  489 2832  919]


fig, axes = plt.subplots(2, 5, subplot_kw={'xticks': (), 'yticks': ()},
                         figsize=(12, 4))
for center, ax in zip(km.cluster_centers_, axes.ravel()):
    ax.imshow(pca.inverse_transform(center).reshape(28,28),
              vmin=0, vmax=1)


small_mnist_data, small_mnist_label = resample(mnist_data,
                                               mnist_label,
                                               n_samples=10000,
                                               replace='false',
                                               random_state=69)

pca = PCA(n_components=100, whiten=True, random_state=69)
X_pca = pca.fit_transform(small_mnist_data)

# extract clusters with ward agglomerative clustering
agglomerative = AgglomerativeClustering(n_clusters=10)
labels_agg = agglomerative.fit_predict(X_pca)
print("cluster sizes agglomerative clustering: {}".format(
       np.bincount(labels_agg)))

cluster sizes agglomerative clustering: [3077 1486 1256  861  866  513  873  434  376  258]


print("ARI: {:.2f}".format(adjusted_rand_score(labels_agg, labels_km)))

ARI: 0.21


linkage_array = ward(X_pca)
# now we plot the dendrogram for the linkage_array
# containing the distances between clusters
plt.figure(figsize=(20, 5))
dendrogram(linkage_array, p=7, truncate_mode='level', no_labels=True)
plt.xlabel("Sample index")
plt.ylabel("Cluster distance")

Text(0, 0.5, 'Cluster distance')


n_clusters = 10
# Run through clusters and plot the correct hits.
for cluster in range(n_clusters):
    mask = labels_agg == cluster
    fig, axes = plt.subplots(1, 10, subplot_kw={'xticks': (), 'yticks': ()},
                             figsize=(15, 8))
    axes[0].set_ylabel(np.sum(mask))
    for image, label, asdf, ax in zip(small_mnist_data[mask], small_mnist_label[mask],
                                      labels_agg[mask], axes):
        ax.imshow(image.reshape(28,28), vmin=0, vmax=1)
        ax.set_title(label, fontdict={'fontsize': 9})


# extract clusters with ward agglomerative clustering
agglomerative = AgglomerativeClustering(n_clusters=40)
labels_agg = agglomerative.fit_predict(X_pca)
print("cluster sizes agglomerative clustering: {}".format(np.bincount(labels_agg)))

n_clusters = 40
for cluster in [10, 13, 19, 22, 36]: # hand-picked "interesting" clusters
    mask = labels_agg == cluster
    fig, axes = plt.subplots(1, 15, subplot_kw={'xticks': (), 'yticks': ()},
                             figsize=(15, 8))
    cluster_size = np.sum(mask)
    axes[0].set_ylabel("#{}: {}".format(cluster, cluster_size))
    for image, label, asdf, ax in zip(small_mnist_data[mask], small_mnist_label[mask],
                                      labels_agg[mask], axes):
        ax.imshow(image.reshape(28,28), vmin=0, vmax=1)
        ax.set_title(label, fontdict={'fontsize': 9})
    for i in range(cluster_size, 15):
        axes[i].set_visible(False)

cluster sizes agglomerative clustering: [381 325 435 336 165 179 434 469 153 258 507 742 295 232 230 156 279 174
 376 255 208 154 234  99 126  67 293 120 366 345 186 143 112 180 139 148
 160 190 189 160]

3. Clustering of Handwritten Digits¶