
K Means Clustering In Infinite Dimensions (Without Sklearn)
This is K Means Clustering generalized to any number of dimensions, finally complete! K Means Clustering falls under the category of clustering (hence the name), and clustering in turn belongs to a branch of machine learning called unsupervised learning: machine learning on datasets without labels. The algorithm alternates two steps, assigning each point to its nearest centroid and then moving each centroid to the mean of its points, until the clusters stop changing. I have provided a 2D dataset and a 4D dataset to test the model, and both work. A randomly initialized centroid can occasionally end up with no points; reassign_centroids handles this by keeping the previous centroid, so an empty cluster does not crash the run with a ZeroDivisionError. To learn more about K Means Clustering, see this tutorial: https://www.youtube.com/watch?v=4b5d3muPQmA&t=296s
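Before diving into the full class, here is a minimal standalone sketch of the two steps the algorithm repeats (the name kmeans_step is illustrative only and is not part of the class below):

# One K Means iteration in isolation: assign points, then update centroids
def kmeans_step(points, centroids):
    # Assignment step: group each point with its nearest centroid
    groups = [[] for _ in centroids]
    for p in points:
        dists = [sum((c[i] - p[i]) ** 2 for i in range(len(p))) for c in centroids]
        groups[dists.index(min(dists))].append(p)
    # Update step: move each centroid to the mean of its group
    # (an empty group keeps its old centroid)
    return [
        [sum(p[i] for p in g) / len(g) for i in range(len(c))] if g else c
        for g, c in zip(groups, centroids)
    ]

Repeating this step until the groups stop changing is exactly what the cluster method in the class below does.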
"""
K Means Clustering In Infinite Dimensions
By Adam Blumenfeld @nexclap/AdamBlumenfeld
"""
# Imports
from random import randint as random
from matplotlib import pyplot as plt
from matplotlib import style
from math import sqrt
import numpy as np
# Set style of graphs
style.use("ggplot")
# Dataset
data = [(1, 1), (1, 2), (2, 1), (2, 3), (2, 4), (3, 3), (7, 9), (7, 6), (8, 5), (8, 9), (9, 6), (9, 9), (2, 8), (1, 9), (1, 7), (3, 7), (4, 9)]
# Multidimensional dataset
_data = [(1, 1, 2, 3), (1, 2, 2, 4), (2, 1, 2, 3), (2, 3, 2, 4), (2, 4, 3, 2), (3, 3, 2, 1), (7, 9, 8, 9), (7, 6, 9, 8), (8, 5, 8, 5), (8, 9, 9, 6), (9, 6, 6, 9), (9, 9, 9, 5), (2, 8, 4, 5), (1, 9, 4, 6), (1, 7, 4, 4), (3, 7, 3, 4), (4, 9, 5, 5)]
# Plot dataset
plt.scatter([point[0] for point in data], [point[1] for point in data])
plt.show()
class KMeansClustering:
    # Entry point: fit k clusters to ndim-dimensional data
    def fit(self, data, k=3, ndim=2, tolerance=0.001):
        self.k = k
        self.data = data
        self.ndim = ndim
        self._centroids()
        self.cluster()
        # Restart from fresh random centroids until no meaningful improvement
        self.clusters = self._fit(dev=self.std(), tolerance=tolerance, best=self.clusters)
        return self.clusters
    # Recursive helper: rerun clustering from fresh random centroids,
    # tracking the best result, until the improvement is within tolerance
    def _fit(self, dev=float('inf'), tolerance=0.001, best=None):
        self._centroids()
        self.cluster()
        if self.std() <= dev:
            # Better (or equal) run; stop once the gain becomes negligible
            if dev - self.std() <= tolerance:
                return self.clusters
            return self._fit(self.std(), tolerance, self.clusters)
        # Worse run; keep searching from the previous best
        return self._fit(dev, tolerance, best)
    # Recursive function: repeat assignment and centroid updates
    # until the clusters stop changing
    def cluster(self, m=None):
        self.assign()
        self.reassign_centroids()
        if self.clusters == m:
            return self.clusters
        return self.cluster(self.clusters)
    # Euclidean distance between two points of any dimension
    def euclid(self, p1, p2):
        return sqrt(sum((p2[i] - p1[i]) ** 2 for i in range(len(p1))))
    # Component-wise mean of a list of points
    def mean(self, data):
        return [sum(point[i] for point in data) / len(data) for i in range(self.ndim)]
    # Initialize k random centroids within the range of each dimension
    def _centroids(self):
        self.centroids = [[random(0, max(point[i] for point in self.data))
                           for i in range(self.ndim)] for _ in range(self.k)]
    # Move each centroid to the mean of its cluster; an empty cluster
    # keeps its old centroid (avoids a ZeroDivisionError in mean)
    def reassign_centroids(self):
        self.centroids = [self.mean(self.clusters[i]) if self.clusters[i]
                          else self.centroids[i] for i in range(len(self.clusters))]
    # Assign every point to its single nearest centroid
    def assign(self):
        self.clusters = {i: [] for i in range(len(self.centroids))}
        for point in self.data:
            distances = [self.euclid(centroid, point) for centroid in self.centroids]
            self.clusters[distances.index(min(distances))].append(point)
    # Deviation score: square root of the total distance from each point
    # to its centroid (lower means tighter clusters)
    def std(self):
        return sqrt(sum(self.euclid(self.centroids[key], point)
                        for key, cluster in self.clusters.items() for point in cluster))
model = KMeansClustering()
# Fit on two dimensional dataset and keep the resulting clusters
clusters = model.fit(data, ndim=2)
# Fit on four dimensional dataset
# model.fit(_data, ndim=4)
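To see what the model found on the 2D data, each cluster can be plotted in its own color with the final centroids marked. This is a minimal sketch using the matplotlib import from above; it assumes the 2D fit above is the one that ran:

# Plot each cluster in its own color and mark the final centroids
for i, cluster in clusters.items():
    plt.scatter([p[0] for p in cluster], [p[1] for p in cluster], label=f"cluster {i}")
plt.scatter([c[0] for c in model.centroids], [c[1] for c in model.centroids],
            marker="x", s=100, color="black", label="centroids")
plt.legend()
plt.show()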