"""
K Means Clustering In Infinite Dimensions
By Adam Blumenfeld @nexclap/AdamBlumenfeld
"""
# Imports
from random import randint as random
from matplotlib import pyplot as plt
from matplotlib import style
from math import sqrt
import numpy as np
# Set style of graphs
# Apply the ggplot look to every figure produced below.
style.use("ggplot")

# Two-dimensional sample dataset.
data = [(1, 1), (1, 2), (2, 1), (2, 3), (2, 4), (3, 3), (7, 9), (7, 6), (8, 5), (8, 9), (9, 6), (9, 9), (2, 8), (1, 9), (1, 7), (3, 7), (4, 9)]
# Four-dimensional sample dataset.
_data = [(1, 1, 2, 3), (1, 2, 2, 4), (2, 1, 2, 3), (2, 3, 2, 4), (2, 4, 3, 2), (3, 3, 2, 1), (7, 9, 8, 9), (7, 6, 9, 8), (8, 5, 8, 5), (8, 9, 9, 6), (9, 6, 6, 9), (9, 9, 9, 5), (2, 8, 4, 5), (1, 9, 4, 6), (1, 7, 4, 4), (3, 7, 3, 4), (4, 9, 5, 5)]
# Visualize the 2-D dataset (x coordinates vs. y coordinates).
xs, ys = zip(*data)
plt.scatter(xs, ys)
plt.show()
class KMeansClustering:
    """K-means clustering for points of arbitrary (fixed) dimensionality.

    Points are sequences of ``ndim`` numbers.  ``fit`` returns a dict
    mapping cluster index -> list of points assigned to that cluster.
    """

    def fit(self, data, k=3, ndim=2, tolerance=0.001):
        """Cluster *data* into *k* clusters of *ndim*-dimensional points.

        Repeatedly re-runs k-means from random centroids and stops once an
        extra restart improves the score by no more than *tolerance*.
        Returns the final ``{cluster_index: [points]}`` dict.
        """
        self.k = k
        self.data = data
        self.ndim = ndim  # fixed: original had a stray trailing '.' (SyntaxError)
        self._centroids()
        self.cluster()
        # Forward `tolerance` (the original accepted it but never passed it on).
        self._fit(dev=self.std(), tolerance=tolerance, best=self.clusters)
        return self.clusters

    def _fit(self, dev=float('inf'), tolerance=0.001, best=None):
        """Random-restart loop: keep the best clustering seen so far.

        Rewritten iteratively.  The original recursion never updated
        ``dev``/``best`` after an improvement and was missing a ``return``
        on its final branch, so it could recurse forever without progress.
        """
        while True:
            self._centroids()
            self.cluster()
            new_dev = self.std()
            if new_dev <= dev:
                if dev - new_dev <= tolerance:
                    # Converged: current clustering is at least as good as `best`.
                    return self.clusters
                # Strict improvement: remember it and keep restarting.
                dev, best = new_dev, self.clusters

    def cluster(self, m=None):
        """Lloyd iteration: assign/update until the clustering stops changing."""
        while True:
            self.assign()
            self.reassign_centroids()
            if self.clusters == m:
                return self.clusters
            m = self.clusters

    def euiclid(self, p1, p2):
        """Euclidean distance between two points (misspelled name kept for compatibility)."""
        return sqrt(sum((p2[i] - p1[i]) ** 2 for i in range(len(p1))))

    def mean(self, data):
        """Component-wise mean of *data*; *data* must be non-empty."""
        n = len(data)
        return [sum(point[i] for point in data) / n for i in range(self.ndim)]

    def _centroids(self):
        """Seed k random centroids, each coordinate within that dimension's range.

        Fixed: the original used ``max(self.data)[i]``, which picks the
        lexicographically-largest *point* rather than the per-dimension maximum.
        """
        maxima = [max(point[i] for point in self.data) for i in range(self.ndim)]
        self.centroids = [[random(0, maxima[i]) for i in range(self.ndim)]
                          for _ in range(self.k)]

    def reassign_centroids(self):
        """Move each centroid to its cluster's mean.

        Fixed: an empty cluster keeps its old centroid instead of crashing
        with ZeroDivisionError inside ``mean``.
        """
        self.centroids = [
            self.mean(self.clusters[i]) if self.clusters[i] else self.centroids[i]
            for i in range(len(self.centroids))
        ]

    def assign(self):
        """Assign each point to its single nearest centroid.

        Fixed: the original appended a point to *every* centroid tied for
        the minimum distance, duplicating points on ties.
        """
        self.clusters = {i: [] for i in range(len(self.centroids))}
        for point in self.data:
            distances = [self.euiclid(centroid, point) for centroid in self.centroids]
            nearest = distances.index(min(distances))
            self.clusters[nearest].append(point)

    def std(self):
        """Clustering score: sqrt of the summed point-to-centroid distances."""
        return sqrt(sum(
            self.euiclid(self.centroids[key], point)
            for key, cluster in self.clusters.items()
            for point in cluster
        ))
# Instantiate the clustering model and run it on the 2-D dataset.
model = KMeansClustering()
model.fit(data, ndim=2)
# To cluster the 4-D dataset instead, uncomment:
# model.fit(_data, ndim=4)
# NOTE(review): removed stray non-code text ("smart ass") that made the file a SyntaxError.