# K Means Clustering


K Means Clustering In Infinite Dimensions (Without Sklearn)

This is K Means Clustering in infinite dimensions, finally complete! K Means Clustering falls under the category of clustering (hence the name). Clustering, in turn, is a subset of unsupervised machine learning, which is machine learning on datasets without labels. I have provided a 2D dataset and a 4D dataset to test my model, and it worked on both. Sometimes you will get a ZeroDivisionError (this happens when a randomly placed centroid ends up with no points assigned to it, so its mean cannot be computed); if that occurs, just rerun the code until it works. To learn more about K Means Clustering, go to the following tutorial: https://www.youtube.com/watch?v=4b5d3muPQmA&t=296s

"""
K Means Clustering In Infinite Dimensions
By Adam Blumenfeld @nexclap/AdamBlumenfeld
"""

# Imports
from math import sqrt
from random import randint as random  # unused below; kept so the module-level name still exists
from random import sample

import numpy as np  # unused below; kept so the module-level name still exists

# Dataset
data = [
    (1, 1), (1, 2), (2, 1), (2, 3), (2, 4), (3, 3),
    (7, 9), (7, 6), (8, 5), (8, 9), (9, 6), (9, 9),
    (2, 8), (1, 9), (1, 7), (3, 7), (4, 9),
]

# Multidimensional dataset
_data = [
    (1, 1, 2, 3), (1, 2, 2, 4), (2, 1, 2, 3), (2, 3, 2, 4),
    (2, 4, 3, 2), (3, 3, 2, 1), (7, 9, 8, 9), (7, 6, 9, 8),
    (8, 5, 8, 5), (8, 9, 9, 6), (9, 6, 6, 9), (9, 9, 9, 5),
    (2, 8, 4, 5), (1, 9, 4, 6), (1, 7, 4, 4), (3, 7, 3, 4),
    (4, 9, 5, 5),
]


class KMeansClustering:
    """K-means clustering over points with any number of dimensions.

    A single run alternates between assigning every point to its nearest
    centroid and moving every centroid to the mean of its points until the
    assignment stops changing. ``fit`` repeats that run from fresh random
    starting centroids and keeps the run with the lowest dispersion.

    Attributes set by ``fit``:
        k: number of clusters.
        data: list of the input points.
        ndim: number of leading coordinates of each point that are used.
        centroids: list of ``k`` centroids, each a list of ``ndim`` numbers.
        clusters: dict mapping centroid index -> list of assigned points.
    """

    # Upper bound on assign/update rounds in one run, so that ``cluster``
    # always terminates even if the assignment keeps oscillating.
    MAX_ITERATIONS = 300

    # Helper function
    def fit(self, data, k=3, ndim=2, tolerance=0.001, max_restarts=100):
        """Cluster ``data`` into ``k`` groups and return the best clustering.

        Args:
            data: iterable of points; each point is an indexable sequence
                with at least ``ndim`` numeric coordinates.
            k: number of clusters.
            ndim: number of leading coordinates to use from each point.
            tolerance: a restart whose dispersion is no worse than the best
                so far, and better by at most this much, ends the search.
            max_restarts: upper bound on the number of random restarts.

        Returns:
            dict mapping cluster index to the list of points in that cluster
            (the same object as ``self.clusters``).

        Raises:
            ValueError: if ``data`` is empty or ``k`` is not between 1 and
                the number of points.
        """
        points = list(data)
        if not points:
            raise ValueError("data must contain at least one point")
        if not 1 <= k <= len(points):
            raise ValueError("k must be between 1 and the number of points")

        self.k = k
        self.data = points
        self.ndim = ndim

        # First run establishes the baseline that the restarts try to beat.
        self._centroids()
        self.cluster()
        self._fit(
            dev=self.std(),
            tolerance=tolerance,
            best=self.clusters,
            max_restarts=max_restarts,
        )
        return self.clusters

    # Random-restart search (iterative, so it cannot hit the recursion limit)
    def _fit(self, dev=float("inf"), tolerance=0.001, best=None,
             max_restarts=100):
        """Re-run the clustering from new random centroids, keeping the best.

        Args:
            dev: dispersion (``std()``) of the best clustering found so far.
            tolerance: see ``fit``.
            best: the clustering that produced ``dev``; when given, the
                current ``self.centroids`` are taken to belong to it.
            max_restarts: upper bound on the number of restarts.

        Returns:
            The best clustering found (also stored in ``self.clusters``,
            with its centroids in ``self.centroids``), or ``best`` unchanged
            if no restart matched or beat ``dev``.
        """
        best_centroids = getattr(self, "centroids", None)

        for _ in range(max_restarts):
            self._centroids()
            self.cluster()
            current = self.std()
            if current > dev:
                continue  # worse than the best so far: try another start

            # A run that only matches or marginally improves on the best
            # means further restarts are unlikely to help.
            converged = dev - current <= tolerance
            dev, best, best_centroids = current, self.clusters, self.centroids
            if converged:
                break

        # The last restart may have been a worse one, so restore the winner.
        if best is not None:
            self.clusters = best
            if best_centroids is not None:
                self.centroids = best_centroids
        return best

    # One clustering run from the current centroids
    def cluster(self, m=None):
        """Alternate assignment and centroid update until nothing changes.

        Args:
            m: clustering to compare the first round against (normally
                left as ``None``).

        Returns:
            The resulting ``self.clusters`` dict.
        """
        previous = m
        for _ in range(self.MAX_ITERATIONS):
            self.assign()
            self.reassign_centroids()
            if self.clusters == previous:
                break
            previous = self.clusters
        return self.clusters

    # Euclidean distance
    def euiclid(self, p1, p2):
        """Return the Euclidean distance between ``p1`` and ``p2``.

        Only the coordinates present in both points are compared, so a
        centroid of length ``ndim`` can be measured against a longer point.
        """
        return sqrt(sum((b - a) ** 2 for a, b in zip(p1, p2)))

    # Mean
    def mean(self, data):
        """Return the coordinate-wise mean of ``data`` as a list of ``ndim`` floats.

        Raises:
            ZeroDivisionError: if ``data`` is empty.
        """
        count = len(data)
        return [
            sum(point[i] for point in data) / count
            for i in range(self.ndim)
        ]

    # Initialize first centroids
    def _centroids(self):
        """Pick ``k`` distinct data points at random as starting centroids.

        Seeding from real points works for negative and non-integer
        coordinates and keeps every centroid inside the data.
        """
        self.centroids = [
            list(point[:self.ndim]) for point in sample(self.data, self.k)
        ]

    # Reassign centroids
    def reassign_centroids(self):
        """Move each centroid to the mean of the points assigned to it.

        A centroid whose cluster is empty keeps its current position, which
        avoids dividing by zero when computing its mean.
        """
        self.centroids = [
            self.mean(self.clusters[index])
            if self.clusters[index]
            else self.centroids[index]
            for index in range(len(self.clusters))
        ]

    # Assign clusters
    def assign(self):
        """Assign every point to exactly one cluster: its nearest centroid.

        When several centroids are equally near, the lowest index wins, so
        a point is never placed in more than one cluster.
        """
        self.clusters = {i: [] for i in range(len(self.centroids))}
        for point in self.data:
            distances = [
                self.euiclid(centroid, point) for centroid in self.centroids
            ]
            nearest = distances.index(min(distances))
            self.clusters[nearest].append(point)

    # Dispersion measure used to compare runs
    def std(self):
        """Return the square root of the total point-to-centroid distance.

        Despite the name this is not a statistical standard deviation; it
        is only used to rank clusterings (lower is tighter).
        """
        return sqrt(sum(
            sum(self.euiclid(self.centroids[key], point) for point in cluster)
            for key, cluster in self.clusters.items()
        ))


def main():
    """Plot the two dimensional dataset, cluster it, and return the model."""
    # Imported here so that importing this module just for the class does
    # not require matplotlib and does not open a plot window.
    from matplotlib import pyplot as plt
    from matplotlib import style

    # Set style of graphs
    style.use("ggplot")

    # Plot dataset
    plt.scatter([point[0] for point in data], [point[1] for point in data])
    plt.show()

    model = KMeansClustering()

    # Fit on two dimensional dataset
    model.fit(data, ndim=2)

    # To fit on the four dimensional dataset instead, call:
    #     model.fit(_data, ndim=4)
    return model


if __name__ == "__main__":
    model = main()
1