# Machine Learning


Linear Regression Using Gradient Descent

This is Linear Regression Using Gradient Descent, optimized to run in 50 milliseconds (an average over 12 runs).
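For reference, here are the hypothesis and the simultaneous update rules that the `_step` method in the code below implements (the standard gradient descent updates for a fitted line, with α the learning rate and m the number of samples):

```math
h_\theta(x) = \theta_0 x + \theta_1
\qquad
\theta_0 := \theta_0 - \alpha \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x_i) - y_i \right) x_i
\qquad
\theta_1 := \theta_1 - \alpha \frac{1}{m} \sum_{i=1}^{m} \left( h_\theta(x_i) - y_i \right)
```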

""" @nexclap/AdamBlumenfeld """ # Imports import numpy as np from random import randint from matplotlib import pyplot as plt # Define Style Of Matplotlib Graphs plt.style.use("ggplot") # Define data X = np.array([1, 3, 5, 6, 8, 10, 11, 18, 19, 20, 24, 26, 30, 32, 36, 38, 39, 40, 43, 46, 52, 55, 56, 58, 59]) y = np.array([3, 4, 5, 7, 8, 9, 10, 12, 14, 15, 21, 36, 37, 38, 39, 40, 43, 46, 49, 51, 54, 56, 58, 60, 69]) # Plot data plt.scatter(X, y) plt.show() #Regressor Class class Regressor: # Training Function def fit(self, X, y, learning_rate=0.00001, converge=0.001, cst=False): # Cst is weather or not to make a history of cost for further analysis self.cst_b = cst if cst: self.cst = [[], []] # Dataset self.X = X self.y = y # Learning rate, or "a" in the gradient decent formula self.learning_rate = learning_rate # The M and B values in the hypothysis function self.theta = [0, 0] # Cost, which initialtes at infinity self.cost = float('inf') # The iterator of the gradient decent algorithm, mine is recursive (Lol, I just had to add that flex) self.gradient_decent_step(converge) # isub for theta, basically saying theta -= (whatever), only for practical reasons, I had to make it a seprete function def theta_isub(self, i, other): self.theta[i] -= other return self.theta[i] # Calculate and update (or store if cst is True) cost def _cost(self, iteration=None): # Cost function self.cost = (1/(2*len(X))*sum([(self.h(X[index]) - y[index])*X[index] for index in range(len(X))])**2) if self.cst_b: # Update cst self.cst[0].append(self.cost) self.cst[1].append(iteration) # Hypothesis function def h(self, x): # h_θ(x) = θ₁ + θ₀x (Yes, I know that in my hypothysis function is switched around) return x*self.theta[0] + self.theta[1] # Gradient decent iterator def gradient_decent_step(self, converge, iteration=1): # Base case: if the cost is less than the set convergence point than accept current theata values if self.cost <= converge: return None # Do one iteration of gradient decent self._step() # Compute cost self._cost(iteration) return self.gradient_decent_step(converge, iteration+1) # All the math of gradient decent, (Now you know why I made the theta_isub function) def _step(self): return [self.theta_isub(0, self.learning_rate * (1/len(X)*sum([(self.h(X[index]) - y[index])*X[index] for index in range(len(X))]))),self.theta_isub(1, self.learning_rate * (1/len(X)*sum([self.h(X[index]) - y[index] for index in range(len(X))])))] # Define a model model = Regressor() # Train model (With cst = True for graphing) model.fit(X, y, cst=True) # Get the theta (M and B values) and the cst variable (or history of cost to iterations) theta = model.theta cst = model.cst # Nerd plot stuff (Plot linear regression graph) x = np.linspace(0,60,100) y1 = theta[0]*x+theta[1] plt.title("Linear Regression") plt.scatter(X, y, c='teal') plt.plot(x, y1) #plt.savefig("linear_regression.png") (Saves graph to file) plt.show() # More nerd plot stuf (Plot cost graph (cst)) plt.title("Cost") plt.plot(cst[1], cst[0]) #plt.savefig("cost.png") (Saves graph to file) plt.show()

KNN Algorithm

This is a KNN algorithm made using dictionaries and for loops.


```python
# KNN Algorithm by Kanishka Verose
import math

# Data set:
data = [(1, 10, "blue"), (3, 9, "blue"), (4, 8, "red"), (6, 0, "blue"), (6, 7, "red"), (5, 9, "red"),
        (3, 2, "blue"), (4, 6, "blue"), (4, 2, "red"), (3, 0, "blue"), (4, 7, "blue"), (7, 4, "red"),
        (3, 11, "blue"), (11, 9, "blue"), (4, 54, "red"), (1, 14, "blue"), (2, 2, "blue"), (12, 8, "red")]

# Euclidean distance between two points
def coord(x, y):
    distance = math.sqrt((x[1] - y[1]) ** 2 + (x[0] - y[0]) ** 2)
    return distance

# Ask for the number of neighbors and the query point
k = int(input("How many nearest neighbors? "))
xcoord = int(input("What is the x coordinate? "))
ycoord = int(input("What is the y coordinate? "))
data_point = (xcoord, ycoord)

# Compare each piece of data to the point
distances = {}
for a in data:
    distances[a] = coord(data_point, a)
distances = sorted(distances.items(), key=lambda kv: kv[1])
# print(distances)

# Find the nearest neighbors and tally up their colors
nearest_neighbors = distances[0:k]
colors = {}
for x in nearest_neighbors:
    color = x[0][2]
    if color in colors:
        colors[color] += 1
    else:
        colors[color] = 1
colors = sorted(colors.items(), key=lambda kv: kv[1])
print("The nearest neighbors are", nearest_neighbors, "and the color is", colors[-1][0])
```

KNN (K Nearest Neighbors)

This is my final version of K Nearest Neighbors. I still may need to add more points.


```python
from math import sqrt
import operator

data = [(1, 1, "red"), (2, 4, "red"), (6, 3, "red"), (7, 4, "blue"), (4, 2, "blue"), (3, 4, "green")]

x = int(input("What is the x value? "))
y = int(input("What is the y value? "))
k = int(input("How many closest points would you like? "))
point = (x, y)
print(point)

# Euclidean distance from a data point to the query point
def distance(p):
    dist = ((p[0] - x) ** 2) + ((p[1] - y) ** 2)
    return sqrt(dist)

if k > len(data):
    print("Enter a different number of points")

# Map each data point to its distance from the query point
Dictionary = {}
for i in range(len(data)):
    var = distance(data[i])
    print(var)
    Dictionary[data[i]] = var

sorted_Dictionary = sorted(Dictionary.items(), key=operator.itemgetter(1))
print(sorted_Dictionary)

# Collect the colors of the k nearest points
var2 = []
for i in range(k):
    var2.append(sorted_Dictionary[i][0][2])
print(var2)

# Majority vote: the most frequent color among the k nearest points
def most_frequent(colors):
    return max(set(colors), key=colors.count)

print("Your color is:")
print(most_frequent(var2))
```


K Means Clustering

K Means Clustering algorithm with a fixed value for k. Takes in a set of data points and groups them into k effective clusters.
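For reference (a textbook formulation, not from the original post), K Means alternates two steps until nothing changes: assign every point to its nearest centroid, then move every centroid to the mean of the points assigned to it:

```math
c^{(i)} := \arg\min_{j} \left\lVert x^{(i)} - \mu_j \right\rVert^2
\qquad
\mu_j := \frac{1}{|S_j|} \sum_{x^{(i)} \in S_j} x^{(i)}
```

Here μ_j is the j-th centroid and S_j is the set of points currently assigned to it.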

""" K Means Clustering By: Kush Arora (June 4,2019) """ ##Imports import pandas as pd import numpy as np import matplotlib.pyplot as plt from math import sqrt ##data df=pd.DataFrame({ "x":[12,20,28,18,29,33,24], "y":[39,36,30,52,54,46,55] }) np.random.seed(200) # of clusters k=3 #global variables l=[] c=[] red=[] blue=[] green=[] ##distance formula def d_formula(x1,x2,y1,y2): return sqrt((x2-x1)**2 + (y2-y1)**2) ##list of centroids centroids=[] for i in range(0,k): centroids.append((np.random.randint(0,80),np.random.randint(0,80))) #formatting fig= plt.figure(figsize=(5,5)) #plotting all the data points plt.scatter(df['x'], df['y'],color='k') #coloring the centroids colmap=[(1,'r'),(2,'g'),(3,'b')] #plotting the centroids for i in range (0,k): a=centroids[i] b=colmap[i] plt.scatter(a[0], a[1], color=b[1]) def c_point(centroids,data): global l for x in centroids: xCoords= df['x'] yCoords=df['y'] for y in range(0,len(yCoords)): var=d_formula(x[0],xCoords[y],x[1],yCoords[y]) l.append(var) def find_nearest(l,k): #list l --> list of distances from points to centroids three=[] global red, blue, green if int(len(l)/k) != 0: for value in range(len(l)-1,-1,-(int(len(l)/k))): three.append(l[value]) #removing the three values after appending to another list l.pop(value) #separates to closest value if three != [] and l != []: if min(three) == three[0]: red.append(min(three)) if min(three) == three[1]: blue.append(min(three)) else: green.append(min(three)) return find_nearest(l,k) ##establishing length of x&y axis plt.xlim(0,80) plt.ylim(0,80) #shows graph plt.show() #running functions c_point(centroids,df) find_nearest(l,k) print("1rst Cluster Distances:",red) print("2nd Cluster Distances:",blue) print("3rd Cluster Distances:",green)

KNN (K Nearest Neighbors)

This is my version of K Nearest Neighbors in Python. The code takes in a point entered by the user, then finds which color that point belongs to. So far my code is only able to categorize the point when asked for only one nearest point; I will have to work on this in the future (see the majority-vote sketch below). I will also have to add more points to the data set.
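One way to extend this to k > 1, which the later "final version" posts in this collection also do, is a simple majority vote over the neighbors' colors. A minimal sketch (`nearest_colors` is a hypothetical stand-in for the k colors the code below collects):

```python
from collections import Counter

# Hypothetical stand-in for the colors of the k nearest points (here k = 3)
nearest_colors = ["red", "blue", "red"]

# Majority vote: the most common color among the k nearest neighbors wins
winner = Counter(nearest_colors).most_common(1)[0][0]
print(winner)  # -> red
```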


```python
from math import sqrt
import operator

data = [(1, 1, "red"), (2, 4, "red"), (6, 3, "red"), (7, 4, "blue"), (4, 2, "blue"), (3, 4, "green")]

x = int(input("What is the x value? "))
y = int(input("What is the y value? "))
k = int(input("How many closest points would you like? "))
point = (x, y)
print(point)

# Euclidean distance from a data point to the query point
def distance(p):
    dist = ((p[0] - x) ** 2) + ((p[1] - y) ** 2)
    return sqrt(dist)

if k > len(data):
    print("Enter a different number of points")

# Map each data point to its distance from the query point
Dictionary = {}
for i in range(len(data)):
    var = distance(data[i])
    print(var)
    Dictionary[data[i]] = var

sorted_Dictionary = sorted(Dictionary.items(), key=operator.itemgetter(1))
print(sorted_Dictionary)

# Print the color of each of the k nearest points
for i in range(k):
    var1 = sorted_Dictionary[i][0][2]
    print(var1)
```

K Means Clustering In Infinite Dimensions (Without Sklearn)

This is K Means Clustering in infinite dimensions, finally complete! K Means Clustering falls under the category of clustering (hence the name). Clustering belongs to a subset of machine learning called unsupervised learning, which is machine learning on datasets without labels. I have provided a 2D dataset and a 4D dataset to test my model, and both worked. Sometimes you will get a ZeroDivisionError; this happens when a randomly initialized centroid ends up with no points assigned to it, so taking the mean of its empty cluster divides by zero. If that occurs, just run the code again until it works. To learn more about K Means Clustering, see the following tutorial: https://www.youtube.com/watch?v=4b5d3muPQmA&t=296s
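A minimal sketch of one way to avoid that crash (my suggestion, not part of the model below): when a centroid ends up with an empty cluster, re-seed it at a random data point instead of averaging an empty list:

```python
from random import choice

def safe_mean(cluster, data, ndim):
    # A centroid that attracted no points leaves its cluster empty, and
    # averaging an empty list is what raises the ZeroDivisionError.
    # Re-seed such a centroid at a random data point instead.
    if not cluster:
        return list(choice(data))[:ndim]
    return [sum(point[i] for point in cluster) / len(cluster) for i in range(ndim)]
```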

""" K Means Clustering In Infinite Dimensions By Adam Blumenfeld @nexclap/AdamBlumenfeld """ # Imports from random import randint as random from matplotlib import pyplot as plt from matplotlib import style from math import sqrt import numpy as np # Set style of graphs style.use("ggplot") # Dataset data=[(1,1), (1,2), (2,1),(2,3),(2,4),(3,3),(7,9),(7,6),(8,5),(8,9),(9,6),(9,9), (2, 8), (1, 9), (1,7), (3, 7), (4, 9)] # Multidimensional dataset _data=[(1,1, 2, 3), (1,2, 2, 4), (2,1, 2, 3),(2,3, 2, 4),(2,4, 3, 2),(3,3, 2, 1),(7,9, 8, 9),(7,6, 9, 8),(8,5, 8, 5),(8,9, 9, 6),(9,6, 6, 9),(9,9, 9, 5), (2, 8, 4, 5), (1, 9, 4, 6), (1,7, 4, 4), (3, 7, 3, 4), (4, 9, 5, 5)] # Plot dataset plt.scatter([point[0] for point in data], [point[1] for point in data]) plt.show() class KMeansClustering: # Helper function def fit(self, data, k=3, ndim=2, tolerance=0.001): self.k = k self.data = data self.ndim = ndim. self._centroids() self.cluster() self._fit(dev=self.std(), best=self.clusters) return self.clusters # Recursive function def _fit(self, dev=float('inf'), tolerance=0.001, best=None): self._centroids() self.cluster() if self.std() <= dev: if dev - self.std() <= tolerance: return best return self._fit(dev, tolerance, best) self._fit(dev, tolerance, best) # Recursive function for finding one cluster def cluster(self, m=None): self.assign() self.reassign_centroids() if self.clusters == m: return self.clusters return self.cluster(self.clusters) # Euclidean distance def euiclid(self, p1, p2): return sqrt(sum([(p2[i] - p1[i])**2 for i in range(len(p1))])) # Mean def mean(self, data): return [(sum([point[i] for point in data]) / len(data)) for i in range(self.ndim)] # Initialize first centroids: def _centroids(self): self.centroids = [[random(0, max(self.data)[i]) for i in range(self.ndim)] for z in range(self.k)] # Reassign centroids def reassign_centroids(self): self.centroids = [self.mean(self.clusters[i]) for i in range(len(self.clusters))] # Assign clusters: def assign(self): self.clusters = {i: [] for i in range(len(self.centroids))} for point in self.data: distances = [self.euiclid(centroid, point) for centroid in self.centroids] for i in range(len(self.centroids)): if min(distances) == distances[i]: self.clusters[i].append(point) # Standard deviation function def std(self): return sqrt(sum([sum([self.euiclid(self.centroids[key], point) for point in cluster]) for key, cluster in self.clusters.items()])) model = KMeansClustering() # Fit on two dimensional dataset model.fit(data, ndim=2) # Fit on four dimensional dataset # model.fit(_data, ndim=4)

Hierarchical Clustering from Scratch

My progress in coding the hierarchical clustering algorithm so far. I have to troubleshoot it now, and then I will be done.

```python
from math import sqrt

data = [(1, 3, "blue"), (2, 4, "red"), (3, 5, "green"), (3, 4, "red"), (6, 7, "green"), (7, 9, "blue")]

# Split the data into its starting clusters by color
redc = []
bluec = []
greenc = []
for q in range(len(data)):
    ind = data[q][2]
    if ind == "red":
        redc.append(data[q])
    elif ind == "blue":
        bluec.append(data[q])
    else:
        greenc.append(data[q])

# Sum the coordinates of each cluster to find its centroid (mean x, mean y)
redsumx, redsumy = 0, 0
bluesumx, bluesumy = 0, 0
greensumx, greensumy = 0, 0
for p in redc:
    redsumx += p[0]
    redsumy += p[1]
for p in bluec:
    bluesumx += p[0]
    bluesumy += p[1]
for p in greenc:
    greensumx += p[0]
    greensumy += p[1]

centroids = []
redp = (redsumx / len(redc), redsumy / len(redc))
centroids.append(redp)
bluep = (bluesumx / len(bluec), bluesumy / len(bluec))
centroids.append(bluep)
greenp = (greensumx / len(greenc), greensumy / len(greenc))
centroids.append(greenp)
# print(centroids)

# Merge the two clusters whose centroids are closest together
def hierclustering(redcent, bluecent, greencent):
    rb = sqrt((redcent[0] - bluecent[0]) ** 2 + (redcent[1] - bluecent[1]) ** 2)
    bg = sqrt((bluecent[0] - greencent[0]) ** 2 + (bluecent[1] - greencent[1]) ** 2)
    rg = sqrt((redcent[0] - greencent[0]) ** 2 + (redcent[1] - greencent[1]) ** 2)
    if rb < bg and rb < rg:
        print("The second cluster is between red and blue.")
        return redc + bluec
    elif bg < rb and bg < rg:
        print("The second cluster is between blue and green.")
        return bluec + greenc
    else:
        print("The second cluster is between red and green.")
        return redc + greenc

print(hierclustering(centroids[0], centroids[1], centroids[2]))
```

Hierarchical Clustering from Scratch

My progress on the Hierarchical Clustering algorithm so far: I have identified the centroids of the original clusters.

```python
from math import sqrt

data = [(1, 3, "blue"), (2, 4, "red"), (3, 5, "green"), (3, 4, "red"), (6, 7, "green"), (7, 9, "blue")]

# Split the data into its starting clusters by color
redc = []
bluec = []
greenc = []
for q in range(len(data)):
    ind = data[q][2]
    if ind == "red":
        redc.append(data[q])
    elif ind == "blue":
        bluec.append(data[q])
    else:
        greenc.append(data[q])

# Sum the coordinates of each cluster to find its centroid (mean x, mean y)
redsumx, redsumy = 0, 0
bluesumx, bluesumy = 0, 0
greensumx, greensumy = 0, 0
for p in redc:
    redsumx += p[0]
    redsumy += p[1]
for p in bluec:
    bluesumx += p[0]
    bluesumy += p[1]
for p in greenc:
    greensumx += p[0]
    greensumy += p[1]

centroids = []
redp = (redsumx / len(redc), redsumy / len(redc))
centroids.append(redp)
bluep = (bluesumx / len(bluec), bluesumy / len(bluec))
centroids.append(bluep)
greenp = (greensumx / len(greenc), greensumy / len(greenc))
centroids.append(greenp)
# print(centroids)
```

K-Means Clustering from Scratch Completed

My rendition of the K-Means Clustering algorithm without using any machine learning packages. I haven't gotten an output yet, so if anyone could help with that, it would be greatly appreciated.
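For anyone debugging the same symptom: a K-Means loop must recompute the cluster assignments, and whatever quantity its stopping condition checks, on every pass. If the condition's inputs are computed once before the loop, nothing inside the loop can change them, and the program hangs with no output. A common fix is to loop until the centroids stop moving between passes.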

```python
from math import sqrt
from random import randint

data = [(2, 3), (5, 6), (3, 4), (7, 8), (3, 5), (1, 0), (9, 4), (2, 5), (8, 0), (7, 6)]
k = 3

# Random starting centroids (blue, red, green)
centroids = []
for i in range(k):
    centroids.append((randint(1, 9), randint(1, 9)))

def kmeans():
    global centroids
    while True:
        # Assignment step: give each point to its nearest centroid
        bluecluster = []
        redcluster = []
        greencluster = []
        for m in range(len(data)):
            eudblue = sqrt((centroids[0][0] - data[m][0]) ** 2 + (centroids[0][1] - data[m][1]) ** 2)
            eudred = sqrt((centroids[1][0] - data[m][0]) ** 2 + (centroids[1][1] - data[m][1]) ** 2)
            eudgreen = sqrt((centroids[2][0] - data[m][0]) ** 2 + (centroids[2][1] - data[m][1]) ** 2)
            if eudblue < eudred and eudblue < eudgreen:
                bluecluster.append(data[m])
            elif eudred < eudblue and eudred < eudgreen:
                redcluster.append(data[m])
            else:
                greencluster.append(data[m])
        # If a centroid attracted no points, re-seed the centroids and try again
        if not bluecluster or not redcluster or not greencluster:
            centroids = [(randint(1, 9), randint(1, 9)) for _ in range(k)]
            continue
        # Update step: move each centroid to the mean of its cluster
        bsumx = sum(p[0] for p in bluecluster)
        bsumy = sum(p[1] for p in bluecluster)
        rsumx = sum(p[0] for p in redcluster)
        rsumy = sum(p[1] for p in redcluster)
        gsumx = sum(p[0] for p in greencluster)
        gsumy = sum(p[1] for p in greencluster)
        new_centroids = [(bsumx / len(bluecluster), bsumy / len(bluecluster)),
                         (rsumx / len(redcluster), rsumy / len(redcluster)),
                         (gsumx / len(greencluster), gsumy / len(greencluster))]
        # Converged: the centroids stopped moving, so return them
        if new_centroids == centroids:
            return centroids
        centroids = new_centroids

print(kmeans())
```

K-Means Clustering from Scratch

The second check-in on my progress of coding KMC from scratch. I am almost done. I need to find a recursive call that functions properly (see the sketch below), and then I will be finished.
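One pattern for that recursive call (a sketch of the general idea, not the final code): pass the previous pass's result down as an argument, and make the base case "nothing changed since last time". Here `step` is a stand-in for one assign-and-update K-Means pass:

```python
# Generic fixed-point recursion: re-run `step` until its output repeats
def until_stable(step, prev=None):
    new = step()
    if new == prev:  # base case: a full pass changed nothing, so stop
        return new
    return until_stable(step, new)

# Toy usage: a "pass" that moves a centroid to the mean of some points
points = [1.0, 2.0, 3.0]
centroid = 10.0

def step():
    global centroid
    centroid = sum(points) / len(points)  # stand-in for one K-Means pass
    return centroid

print(until_stable(step))  # -> 2.0 (stable on the second call)
```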

```python
from math import sqrt
from random import randint

data = [(2, 3), (5, 6), (3, 4), (7, 8), (3, 5), (1, 0), (9, 4), (2, 5), (8, 0), (7, 6)]
k = 3

centroids = []
for i in range(k):
    centroids.append((randint(1, 9), randint(1, 9)))

# Recursive K-Means: each call does one assign-and-update pass; the previous
# pass's clusters come in as an argument, and the base case fires when a pass
# leaves the assignments unchanged
def kmeans(prev_clusters):
    bluecluster = []
    redcluster = []
    greencluster = []
    # Assignment step: give each point to its nearest centroid
    for m in range(len(data)):
        eudblue = sqrt((centroids[0][0] - data[m][0]) ** 2 + (centroids[0][1] - data[m][1]) ** 2)
        eudred = sqrt((centroids[1][0] - data[m][0]) ** 2 + (centroids[1][1] - data[m][1]) ** 2)
        eudgreen = sqrt((centroids[2][0] - data[m][0]) ** 2 + (centroids[2][1] - data[m][1]) ** 2)
        if eudblue < eudred and eudblue < eudgreen:
            bluecluster.append(data[m])
        elif eudred < eudblue and eudred < eudgreen:
            redcluster.append(data[m])
        else:
            greencluster.append(data[m])
    # If a centroid attracted no points, re-seed the centroids and start over
    if not bluecluster or not redcluster or not greencluster:
        for i in range(k):
            centroids[i] = (randint(1, 9), randint(1, 9))
        return kmeans(None)
    # Base case: the assignments did not change since the last pass
    if (bluecluster, redcluster, greencluster) == prev_clusters:
        print(bluecluster)
        print(redcluster)
        print(greencluster)
        return
    # Update step: move each centroid to the mean of its cluster
    bsumx = sum(p[0] for p in bluecluster)
    bsumy = sum(p[1] for p in bluecluster)
    rsumx = sum(p[0] for p in redcluster)
    rsumy = sum(p[1] for p in redcluster)
    gsumx = sum(p[0] for p in greencluster)
    gsumy = sum(p[1] for p in greencluster)
    centroids[0] = (bsumx / len(bluecluster), bsumy / len(bluecluster))
    centroids[1] = (rsumx / len(redcluster), rsumy / len(redcluster))
    centroids[2] = (gsumx / len(greencluster), gsumy / len(greencluster))
    kmeans((bluecluster, redcluster, greencluster))

kmeans(None)

"""
point = input("Enter desired point (x, y): ")
X = point[0]
y = point[1]
blueeud = sqrt((centroids[0][0]-X)**2+(centroids[0][1]-y)**2)
redeud = sqrt((centroids[1][0]-X)**2+(centroids[1][1]-y)**2)
greeneud = sqrt((centroids[2][0]-X)**2+(centroids[2][1]-y)**2)
#if
"""
```

K Nearest Neighbor (KNN)


```python
import operator
from math import sqrt

data = [(1, 10, "blue"), (3, 9, "blue"), (4, 8, "red"), (6, 0, "blue"), (6, 7, "red"), (5, 9, "red"),
        (3, 2, "blue"), (4, 6, "blue"), (4, 2, "red"), (3, 0, "blue"), (4, 7, "blue"), (7, 4, "red"),
        (3, 11, "blue"), (11, 9, "blue"), (1, 14, "blue"), (2, 2, "blue"), (12, 8, "red")]

mydictionary = {}
xcoordinate = int(input("Enter an x-coordinate: "))
ycoordinate = int(input("Enter a y-coordinate: "))
k = int(input("Enter value for K: "))

# Distance from the query point to every point in the data set
for y in data:
    distance = sqrt((xcoordinate - y[0]) ** 2 + (ycoordinate - y[1]) ** 2)
    mydictionary[y] = distance

sorted_x = sorted(mydictionary.items(), key=operator.itemgetter(1))

# Print the k smallest distances and collect the matching colors
colors = []
for i in range(k):
    print(sorted_x[i][1])
    colors.append(sorted_x[i][0][2])

# Majority vote: the most frequent color among the k nearest points
def most_frequent(colors):
    return max(set(colors), key=colors.count)

print(most_frequent(colors))
```