#machinelearning


Logistic Regression

This is logistic regression written from scratch in Python.

import math

import numpy as np
import pandas as pd

# Training data. The prompt further down asks for a tumor size in cm and the
# result is reported as the chance of being malignant, so x is read as a size
# and y as a 0/1 "malignant" label.
x = [0, 1, 2, 3, 5, 6, 7, 8]
y = [0, 0, 0, 0, 1, 1, 1, 1]


def slopeDerivative(x, y, slope, yInt):
    """Return the partial derivative of the mean squared error w.r.t. the slope.

    The model is ``slope * x[i] + yInt``; ``x`` and ``y`` are equally long
    sequences of numbers. Raises ZeroDivisionError when ``x`` is empty.
    """
    length = len(x)
    total = sum((yi - ((slope * xi) + yInt)) * xi for xi, yi in zip(x, y))
    return (-2 / length) * total


def interceptDerivative(x, y, slope, yInt):
    """Return the partial derivative of the mean squared error w.r.t. the intercept.

    Same model and arguments as ``slopeDerivative``. Raises ZeroDivisionError
    when ``x`` is empty.
    """
    length = len(x)
    total = sum(yi - ((slope * xi) + yInt) for xi, yi in zip(x, y))
    return (-2 / length) * total


def sigmoid(value):
    """Map ``value`` into the open interval (0, 1) with the logistic function."""
    # math.exp replaces the truncated constant 2.71828 used previously.
    return 1 / (1 + math.exp(-value))


if __name__ == "__main__":
    # Imported here so the functions above can be imported without a plotting
    # backend being installed.
    import matplotlib.pyplot as plt

    plt.plot(x, y)
    plt.savefig('plot.png')

    # NOTE(review): the weights are fitted by gradient descent on the squared
    # error of a straight line; the sigmoid is only applied afterwards. This is
    # not the usual log-loss formulation of logistic regression.
    m = 0
    c = 0
    l = 0.0001  # learning rate
    iterations = 100000
    for i in range(iterations):
        derivativeSlope = slopeDerivative(x, y, m, c)
        derivativeIntercept = interceptDerivative(x, y, m, c)
        m = m - (l * derivativeSlope)
        c = c - (l * derivativeIntercept)
    m = round(m, 3)
    c = round(c, 3)
    print("The slope is " + str(m) + " and the y - intercept = " + str(c))

    howBigIsTheTumor = float(input("How big is the tumor in cm? "))
    userPrediction = sigmoid((howBigIsTheTumor * m) + c)
    print("There is a " + str(userPrediction) + " that the tumor is malignant!")

    # Sample the fitted curve in steps of 0.0001, restarting at every integer
    # from 0 to 7 so rounding error does not accumulate across the whole range.
    biggest = []
    newY = []
    for f in range(0, 8):
        recentValue = f
        for i in range(10000):
            biggest.append(recentValue)
            newY.append(sigmoid((recentValue * m) + c))
            recentValue += 0.0001
    plt.plot(biggest, newY)
    plt.savefig("graph")

Linear Regression with any amount of variables

def _residuals(dataSet, values, boo, yIntercept):
    """Return ``target - prediction`` for each of the first ``boo`` samples.

    ``dataSet`` holds one list per input variable followed by the list of
    outputs as its last element; ``values`` holds one weight per input variable.
    """
    targets = dataSet[-1]
    residuals = []
    for sample in range(boo):
        # zip stops at len(values), so the trailing output column is not
        # treated as an input.
        predicted = sum(
            weight * column[sample] for weight, column in zip(values, dataSet)
        ) + yIntercept
        residuals.append(targets[sample] - predicted)
    return residuals


def slopeDerivativeOne(dataSet, values, hm, boo, yIntercept, wanted):
    """Return the mean-squared-error gradient for the weight at index ``wanted``.

    Args:
        dataSet: one list per input variable, then the outputs as the last list.
        values: current weights, one per input variable.
        hm: number of input variables (kept for interface compatibility).
        boo: number of samples in every list of ``dataSet``.
        yIntercept: current intercept.
        wanted: index of the input variable whose weight is differentiated.

    Raises ZeroDivisionError when ``boo`` is 0.
    """
    # The sum runs over the boo samples; it previously ran over hm (the number
    # of variables), which used the wrong rows or raised IndexError.
    residuals = _residuals(dataSet, values, boo, yIntercept)
    wantedValues = dataSet[wanted]
    total = sum(
        residual * wantedValues[sample]
        for sample, residual in enumerate(residuals)
    )
    return (-2 / boo) * total


def DerivativeOne(dataSet, values, hm, boo, yIntercept, wanted):
    """Return the mean-squared-error gradient for the intercept.

    Takes the same arguments as ``slopeDerivativeOne``; ``hm`` and ``wanted``
    do not influence the result and are kept for interface compatibility.
    Raises ZeroDivisionError when ``boo`` is 0.
    """
    total = sum(_residuals(dataSet, values, boo, yIntercept))
    return (-2 / boo) * total


def getCost(dataSet, values, hm, boo, yIntercept, wanted):
    """Return the absolute value of the summed residuals over all samples.

    Takes the same arguments as ``slopeDerivativeOne``; ``hm`` and ``wanted``
    do not influence the result.
    """
    return abs(sum(_residuals(dataSet, values, boo, yIntercept)))


if __name__ == "__main__":
    hm = int(input("Enter how many categories or inputs do you want: "))
    boo = int(input("How many inputs per category do you want: "))

    dataSet = []
    for be in range(hm):
        print("This is your " + str(be) + "th input set.")
        inputer = []
        for i in range(boo):
            point = float(input("Enter value: "))
            inputer.append(point)
        dataSet.append(inputer)

    outputs = []
    for i in range(boo):
        print("Enter you output.")
        output = float(input("Enter value: "))
        outputs.append(output)
    dataSet.append(outputs)

    values = [0] * hm
    yIntercept = 0

    l = 0.0001  # learning rate
    iterations = 1000000
    for j in range(iterations):
        # Evaluate every gradient at the current point before applying any
        # update, so each step is a plain gradient-descent step.
        gradients = [
            slopeDerivativeOne(dataSet, values, hm, boo, yIntercept, i)
            for i in range(hm)
        ]
        d1 = DerivativeOne(dataSet, values, hm, boo, yIntercept, 0)
        values = [
            weight - (l * gradient)
            for weight, gradient in zip(values, gradients)
        ]
        yIntercept = yIntercept - (l * d1)

    for i in range(len(values)):
        bob = round(values[i], 2)
        print(bob)
    bobby = round(yIntercept, 2)
    print("y-intercept" + str(bobby))

Multi-variable linear regression

This is linear regression with more than one input variable.

# Two input variables (x, y) and the output z to be fitted as
# slope * x + slope1 * y + yInt.
x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y = [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
z = [1, 8, 27, 64, 125, 216, 343, 512, 729, 1000]


def _gradient(x, y, z, slope, slope1, yInt, factors):
    """Return ``(-2 / n) * sum(residual[i] * factors[i])`` for the plane model."""
    length = len(x)
    total = 0
    for i in range(length):
        predicted = (slope * x[i]) + (slope1 * y[i]) + yInt
        total += (z[i] - predicted) * factors[i]
    return (-2 / length) * total


def slopeDerivativeOne(x, y, z, slope, slope1, yInt):
    """Return the mean-squared-error gradient with respect to ``slope`` (the x weight).

    Raises ZeroDivisionError when ``x`` is empty.
    """
    return _gradient(x, y, z, slope, slope1, yInt, x)


def slopeDerivativeTwo(x, y, z, slope, slope1, yInt):
    """Return the mean-squared-error gradient with respect to ``slope1`` (the y weight).

    Raises ZeroDivisionError when ``x`` is empty.
    """
    return _gradient(x, y, z, slope, slope1, yInt, y)


def interceptDerivative(x, y, z, slope, slope1, yInt):
    """Return the mean-squared-error gradient with respect to the intercept.

    Raises ZeroDivisionError when ``x`` is empty.
    """
    # The intercept multiplies a constant 1 in the model.
    return _gradient(x, y, z, slope, slope1, yInt, [1] * len(x))


if __name__ == "__main__":
    m = 0
    m1 = 0
    c = 0
    l = 0.0001  # learning rate
    iterations = 1000000
    for i in range(iterations):
        # All three gradients are taken at the same point before updating.
        d = slopeDerivativeOne(x, y, z, m, m1, c)
        d1 = slopeDerivativeTwo(x, y, z, m, m1, c)
        d2 = interceptDerivative(x, y, z, m, m1, c)
        m = m - (l * d)
        m1 = m1 - (l * d1)
        c = c - (l * d2)
    print(m, m1, c)

Linear Regression

This is linear regression from scratch. I used a Coursera course for help with this.

import numpy as np
import pandas as pd

# Sample points to fit with the line y = m * x + c.
x = [1, 5, 3, 4, 7, 9, 12, 13, 15, 16, 17, 4, 5, 2, 10, 23, 25]
y = [5, 12, 23, 14, 17, 8, 20, 21, 25, 38, 42, 10, 13, 7, 23, 50, 55]


def slopeDerivative(x, y, slope, yInt):
    """Return the partial derivative of the mean squared error w.r.t. the slope.

    The model is ``slope * x[i] + yInt``; ``x`` and ``y`` are equally long
    sequences of numbers. Raises ZeroDivisionError when ``x`` is empty.
    """
    length = len(x)
    total = sum((yi - ((slope * xi) + yInt)) * xi for xi, yi in zip(x, y))
    return (-2 / length) * total


def interceptDerivative(x, y, slope, yInt):
    """Return the partial derivative of the mean squared error w.r.t. the intercept.

    Same model and arguments as ``slopeDerivative``. Raises ZeroDivisionError
    when ``x`` is empty.
    """
    length = len(x)
    total = sum(yi - ((slope * xi) + yInt) for xi, yi in zip(x, y))
    return (-2 / length) * total


if __name__ == "__main__":
    # Imported here so the functions above can be imported without a plotting
    # backend being installed.
    import matplotlib.pyplot as plt

    plt.plot(x, y)
    plt.show()

    m = 0
    c = 0
    l = 0.0001  # learning rate
    iterations = 1000000
    for i in range(iterations):
        # Both gradients are taken at the same (m, c) before either is updated.
        derivativeSlope = slopeDerivative(x, y, m, c)
        derivativeIntercept = interceptDerivative(x, y, m, c)
        m = m - (l * derivativeSlope)
        c = c - (l * derivativeIntercept)
    print(m, c)

Naive Bayes Classifier

This is an example of the naive Bayes classifier. In this code, I used old data to determine whether it was a good day to golf.


import math
from statistics import mean

# Question answered below: should we play when the outlook is sunny, the
# temperature is cool, the humidity is high, and the wind is strong?
outlook = ["sunny", "sunny", "overcast", "rainy", "rainy", "rainy", "overcast", "sunny", "sunny", "rainy", "sunny", "overcast", "overcast", "rainy"]
temperature = ["hot", "hot", "hot", "mild", "cool", "cool", "cool", "mild", "cool", "mild", "mild", "mild", "hot", "mild"]
humidity = ["high", "high", "high", "high", "normal", "normal","normal", "high", "normal", "normal", "normal", "high", "normal", "high"]
windy = ["false", "true", "false", "false", "false", "true", "true", "false", "false", "false", "true", "true", "false", "true"]
play = ["no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"]


def countMatches(feature, value, label):
    """Count the days on which ``feature`` equals ``value`` and ``play`` equals ``label``.

    ``feature`` is one of the observation lists above and is read in lockstep
    with the module-level ``play`` list.
    """
    return sum(
        1
        for observed, played in zip(feature, play)
        if observed == value and played == label
    )


length = len(play)

# How often each queried feature value occurs overall.
sunny = outlook.count("sunny")
cool = temperature.count("cool")
high = humidity.count("high")
strong = windy.count("true")
print(sunny)
print(cool)
print(high)
print(strong)

# Every entry that is not "no" counts as "yes".
countNo = play.count("no")
countYes = length - countNo
print(countNo)
print(countYes)

# Class priors.
probYes = countYes / length
probNo = countNo / length

# Wind, conditioned on the outcome.
probYesWindy = countMatches(windy, "true", "yes")
probNoWindy = countMatches(windy, "true", "no")
probYesNoWindy = countMatches(windy, "false", "yes")
probNoNoWindy = countMatches(windy, "false", "no")
YesWindy = probYesWindy / countYes
NoWindy = probNoWindy / countNo
YesNoWindy = probYesNoWindy / countYes
NoNoWindy = probNoNoWindy / countNo
print(YesWindy)
print(NoWindy)

# Humidity, conditioned on the outcome.
probYesHigh = countMatches(humidity, "high", "yes")
probNoHigh = countMatches(humidity, "high", "no")
probYesNormal = countMatches(humidity, "normal", "yes")
probNoNormal = countMatches(humidity, "normal", "no")
yesHigh = probYesHigh / countYes
noHigh = probNoHigh / countNo
yesNormal = probYesNormal / countYes
noNormal = probNoNormal / countNo
print(yesHigh)
print(noHigh)

# Outlook, conditioned on the outcome.
probYesSunny = countMatches(outlook, "sunny", "yes")
probNoSunny = countMatches(outlook, "sunny", "no")
probYesOvercast = countMatches(outlook, "overcast", "yes")
probNoOvercast = countMatches(outlook, "overcast", "no")
probYesRainy = countMatches(outlook, "rainy", "yes")
probNoRainy = countMatches(outlook, "rainy", "no")
YesSunny = probYesSunny / countYes
NoSunny = probNoSunny / countNo
YesOvercast = probYesOvercast / countYes
NoOvercast = probNoOvercast / countNo
YesRainy = probYesRainy / countYes
NoRainy = probNoRainy / countNo
print(YesSunny)
print(NoSunny)

# Temperature, conditioned on the outcome.
probYesHot = countMatches(temperature, "hot", "yes")
probNoHot = countMatches(temperature, "hot", "no")
probYesMild = countMatches(temperature, "mild", "yes")
probNoMild = countMatches(temperature, "mild", "no")
probYesCool = countMatches(temperature, "cool", "yes")
probNoCool = countMatches(temperature, "cool", "no")
YesHot = probYesHot / countYes
NoHot = probNoHot / countNo
YesMild = probYesMild / countYes
NoMild = probNoMild / countNo
YesCool = probYesCool / countYes
NoCool = probNoCool / countNo
print(YesCool)
print(NoCool)

# Unnormalised posteriors: likelihood of the queried day times the class prior.
pYes = (YesSunny) * (YesCool) * (yesHigh) * (YesWindy) * (probYes)
pNo = (NoSunny) * (NoCool) * (noHigh) * (NoWindy) * (probNo)
print(pYes)
print(pNo)

# Normalise by the total evidence so the two results sum to 1. Dividing by the
# product of the feature marginals (as before) gave values summing to ~1.18.
commonP = pYes + pNo
finalYes = pYes / commonP
finalNo = pNo / commonP
if finalYes > finalNo:
    print("Go out and play. The chances of you playing are " + str(finalYes) + " and the chances you don't are " + str(finalNo))
else:
    print("Do not go out and play. The chances of you playing are " + str(finalYes) + " and the chances you don't are " + str(finalNo))

Implementing Hierarchical Clustering into a dataset

This code applies hierarchical clustering to a real dataset of credit card information. It outputs the centroids of the clusters once the data has been merged down to the number of clusters the user asked for.


import math
import random

import pandas as pd


def distance(x, y):
    """Return the Euclidean distance between two equally long numeric sequences."""
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))


def _closestPairIndices(clusters):
    """Return ``(i, j)`` with ``i < j`` for the two closest entries, or None.

    None is returned when there are fewer than two entries or when no pair has
    a comparable distance (e.g. every distance is NaN).
    """
    best = None
    bestDist = math.inf
    for i in range(len(clusters)):
        for j in range(i + 1, len(clusters)):
            dist = distance(clusters[i], clusters[j])
            if dist < bestDist:
                bestDist = dist
                best = (i, j)
    return best


def minDistance(clusters):
    """Return the two closest points in ``clusters`` as a tuple ``(x, y)``.

    Pairs are identified by position, so two identical points (distance 0) are
    a valid closest pair, and there is no upper limit on the distances.

    Raises:
        ValueError: if no pair of points can be compared.
    """
    pair = _closestPairIndices(clusters)
    if pair is None:
        raise ValueError("need at least two comparable points")
    return (clusters[pair[0]], clusters[pair[1]])


def findCentroid(minPoints):
    """Return the coordinate-wise midpoint of the two points in ``minPoints`` as a list."""
    x = minPoints[0]
    y = minPoints[1]
    return [(x[d] + y[d]) / 2 for d in range(len(x))]


if __name__ == "__main__":
    csv = pd.read_csv('Card.csv', header = 0)
    # Columns 1-6 of every row become one point; column 0 is skipped.
    data = list(csv.iloc[:, 1:7].itertuples(index=False, name=None))

    k = int(input("How many final clusters do you want(max is 97):"))
    # Fewer than one cluster is impossible; treat it as a request for one.
    k = max(k, 1)

    while k < len(data):
        pair = _closestPairIndices(data)
        if pair is None:
            break
        first, second = pair
        centroid = findCentroid((data[first], data[second]))
        # Delete by position, larger index first, so the right entries go even
        # when several points compare equal.
        del data[second]
        del data[first]
        data.append(centroid)

    print("Centroids:", data)

Implementing K Means Clustering into a dataset

This code takes a real dataset of information about credit card users and uses K-means clustering to sort the users into separate clusters. It outputs the lowest variation found and the centroids of the corresponding clusters.


import math
import random

import pandas as pd

# Shared state used by the functions below. `data` is filled from the CSV when
# the file is run as a script.
data = []
clstr1 = []
lowestVarClstr1 = []
clstr2 = []
lowestVarClstr2 = []
clstr3 = []
lowestVarClstr3 = []
centroids = []
lowestVarCentroids = []


def orignalCentroids():
    """Reset ``centroids`` to three distinct, randomly chosen entries of ``data``.

    Raises ValueError (from ``random.sample``) when ``data`` has fewer than
    three entries.
    """
    centroids.clear()
    centroids.extend(random.sample(data, 3))


def findDistance(x, y):
    """Return the Euclidean distance between two equally long numeric sequences."""
    return math.sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))


def sortIntoClstr():
    """Empty the three clusters and put every point of ``data`` into the nearest one.

    When a point is equally close to several centroids it goes to the first of
    them, so no point is left unassigned.
    """
    clusters = (clstr1, clstr2, clstr3)
    for cluster in clusters:
        cluster.clear()
    for d in data:
        dists = [findDistance(centroid, d) for centroid in centroids[:3]]
        clusters[dists.index(min(dists))].append(d)


def newCentroid(cluster):
    """Return the coordinate-wise mean of the six-dimensional points in ``cluster``.

    An empty cluster yields the all-zero tuple.
    """
    if not cluster:
        return (0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
    size = len(cluster)
    # Divide by the number of points; dividing by size + 1 pulled every
    # centroid towards the origin.
    return tuple(sum(point[axis] for point in cluster) / size for axis in range(6))


def findNewCentroids():
    """Replace each centroid with the mean of its cluster.

    A cluster that received no points keeps its previous centroid.
    """
    updated = []
    for cluster, previous in zip((clstr1, clstr2, clstr3), list(centroids)):
        updated.append(newCentroid(cluster) if cluster else previous)
    centroids.clear()
    centroids.extend(updated)


def varCluster(cluster, centroid):
    """Return the sum of the distances from ``centroid`` to every point in ``cluster``."""
    var = 0
    for c in cluster:
        var += findDistance(centroid, c)
    return var


def findVariation():
    """Return the total of ``varCluster`` over the three clusters and their centroids."""
    variation = 0
    variation += varCluster(clstr1, centroids[0])
    variation += varCluster(clstr2, centroids[1])
    variation += varCluster(clstr3, centroids[2])
    return variation


def oneCycle():
    """Run 100 assign/update rounds and return the resulting variation."""
    for x in range(0, 100, 1):
        sortIntoClstr()
        findNewCentroids()
    return findVariation()


if __name__ == "__main__":
    csv = pd.read_csv('Card.csv', header = 0)
    # Columns 1-6 of every row become one point; column 0 is skipped.
    data.extend(csv.iloc[:, 1:7].itertuples(index=False, name=None))

    lowestVariation = math.inf
    for c in range(21):
        # Each restart begins from fresh random centroids; the best run wins.
        orignalCentroids()
        variation = oneCycle()
        if variation < lowestVariation:
            lowestVariation = variation
            # Copy: `centroids` is cleared and refilled by the next restart.
            lowestVarCentroids = list(centroids)

    print('Lowest Variation: ')
    print(lowestVariation)
    print('Centroids: ')
    print(lowestVarCentroids)

Implementing KNN on a dataset

This code takes a real dataset of glass samples and uses K nearest neighbors to classify what a new glass sample (given by the user) is used for. To find out what type of glass they have, the user needs to enter the following values: RI: refractive index; Na: Sodium (unit of measurement: weight percent in the corresponding oxide, as are attributes 4-10); Mg: Magnesium; Al: Aluminum; Si: Silicon; K: Potassium; Ca: Calcium; Ba: Barium; Fe: Iron.


import math
from collections import Counter

import pandas as pd

# Message suffix for each glass type label. Any label outside 1-6 is counted
# as type 7.
GLASS_USES = {
    1: "building windows that float",
    2: "building windows that don't float",
    3: "vehicle windows that float",
    4: "vehicle windows that don't float",
    5: "containers",
    6: "tableware",
    7: "headlamps",
}


def nearestNeighborIndices(points, query, k):
    """Return the indices of the ``k`` entries of ``points`` closest to ``query``.

    Distance is Euclidean over the first ``len(query)`` components of each
    point, so trailing fields such as a class label are ignored. Indices are
    ordered nearest first; equal distances keep their original order. ``k`` is
    clamped to the range ``0..len(points)``.
    """
    def distanceTo(index):
        return math.sqrt(
            sum((a - b) ** 2 for a, b in zip(points[index], query))
        )

    k = max(0, min(k, len(points)))
    # Ranking indices (rather than removing found distances from a list) keeps
    # every index aligned with `points`.
    return sorted(range(len(points)), key=distanceTo)[:k]


def majorityGlassType(glassTypes):
    """Return the most frequent glass type (1-7) among ``glassTypes``.

    Labels other than 1-6 are counted as 7. When several types are equally
    frequent, the one that appears first in ``glassTypes`` wins.

    Raises:
        ValueError: if ``glassTypes`` is empty.
    """
    votes = Counter(
        int(t) if t in (1, 2, 3, 4, 5, 6) else 7 for t in glassTypes
    )
    if not votes:
        raise ValueError("no neighbors to vote")
    return votes.most_common(1)[0][0]


if __name__ == "__main__":
    csv = pd.read_csv('glass.csv', header = 0)
    # Columns 0-8 are the measurements, column 9 is the glass type.
    data = list(csv.iloc[:, 0:10].itertuples(index=False, name=None))

    userData = []
    for element in ("Rl", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe"):
        userData.append(float(input("Enter " + element + " value of glass: ")))

    k = int(input("Enter number of neighbors you wish to calculate (max is 214): "))
    # At least one neighbor is needed for a vote.
    k = max(k, 1)

    nearestNeighbors = nearestNeighborIndices(data, userData, k)
    glassTypes = [data[n][9] for n in nearestNeighbors]
    print("The glass is used for " + GLASS_USES[majorityGlassType(glassTypes)])

Hierarchical Clustering Algorithm (with a fixed k-value)

Given a set of data, this program works to cluster points together based on proximity to each other (in accordance with Euclidean distance; with the closest points at any given point in time being clumped together for the next iteration of the loop). A more detailed explanation on what exactly this code is doing can be found in the URL below.


import math
import random


def clusterPoints(points, k):
    """Merge the two closest points repeatedly until only ``k`` remain.

    Args:
        points: sequence of ``(x, y)`` pairs; it is not modified.
        k: number of points to keep; values below 1 are treated as 1.

    Returns:
        A new list holding the untouched points in their original order,
        followed by the midpoints created by the merges.
    """
    clusters = list(points)
    target = max(k, 1)
    while len(clusters) > target:
        # Look at every pair, not only pairs involving the first point, so the
        # globally closest two are merged.
        bestPair = None
        bestDist = math.inf
        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                dist = math.sqrt(
                    (clusters[i][0] - clusters[j][0]) ** 2
                    + (clusters[i][1] - clusters[j][1]) ** 2
                )
                if dist < bestDist:
                    bestDist = dist
                    bestPair = (i, j)
        if bestPair is None:
            break
        i, j = bestPair
        meanx = (clusters[i][0] + clusters[j][0]) / 2
        meany = (clusters[i][1] + clusters[j][1]) / 2
        # Delete by position, larger index first, so the earlier deletion does
        # not shift the other entry.
        del clusters[j]
        del clusters[i]
        clusters.append((meanx, meany))
    return clusters


data = [(1,5), (3,2), (4,7), (2,9), (7,1), (8,7), (9,6), (3,4), (6,4)]
datax = [1,3,4,2,7,8,9,3,6]
datay = [5,2,7,9,1,7,6,4,4]
k = 5

data = clusterPoints(data, k)
# Rebuild the coordinate lists from `data` so all three stay in step.
datax = [point[0] for point in data]
datay = [point[1] for point in data]
print(data)

Hierarchical Clustering

This is an example of Hierarchical Clustering



import math

dataList = [(5, 1), (4, 5), (5, 3), (1, 5), (10, 20), (6, 14),(6, 8), (3, 15), (1, 25), (0, 9), (13, 8), (21, 4), (14, 27), (11, 24), (22, 17)]
# Every cluster is a list: its centroid first, then its member points. Each
# point starts as its own cluster, so it appears twice.
data = [[point, point] for point in dataList]


def euclidDistance(x, y):
    """Return the Euclidean distance between two ``(x, y)`` points."""
    return math.sqrt((x[0] - y[0]) ** 2 + (x[1] - y[1]) ** 2)


def minDistance(clusters):
    """Return ``[i, j]`` (``i < j``), the indices of the two clusters with the closest centroids.

    Clusters are paired by position, so two clusters with identical centroids
    are a valid closest pair. ``[0, 0]`` is returned when there are fewer than
    two clusters.
    """
    centroids = [cluster[0] for cluster in clusters]
    minPoints = [0, 0]
    smallest = math.inf
    for i in range(len(centroids)):
        for j in range(i + 1, len(centroids)):
            dist = euclidDistance(centroids[i], centroids[j])
            if dist < smallest:
                smallest = dist
                minPoints = [i, j]
    return minPoints


def clustering(clusters, numClusters=None):
    """Merge the closest clusters in place until ``numClusters`` are left.

    Args:
        clusters: list of clusters, each ``[centroid, member, member, ...]``.
        numClusters: target count; when omitted, the module-level ``k`` is
            used. Values below 1 are treated as 1, and nothing happens when
            there are already that few clusters.

    Returns:
        The same ``clusters`` list, with merged clusters appended at the end.
    """
    target = int(k) if numClusters is None else int(numClusters)
    target = max(target, 1)
    while len(clusters) > target:
        first, second = minDistance(clusters)
        members = clusters[first][1:] + clusters[second][1:]
        if members:
            # The centroid is the mean of all members, so larger clusters
            # weigh more than smaller ones.
            centroid = (
                sum(point[0] for point in members) / len(members),
                sum(point[1] for point in members) / len(members),
            )
        else:
            centroid = (
                (clusters[first][0][0] + clusters[second][0][0]) / 2,
                (clusters[first][0][1] + clusters[second][0][1]) / 2,
            )
        # first < second, so deleting the larger index first keeps the other
        # one valid.
        del clusters[second]
        del clusters[first]
        clusters.append([centroid] + members)
    return clusters


if __name__ == "__main__":
    k = input("Enter number of clusters you would like: ")
    clusters = clustering(data)
    counter = 1
    for j in clusters:
        print("Cluster", counter)
        counter += 1
        # Index 0 is the centroid; only the members are printed.
        for i in range(1, len(j)):
            print(j[i])

K means Clustering example(without boolean statement to determine if clusters are changed)

This is an example of K-means clustering. It uses functions to assign the data to specific clusters. There are only three clusters, and the centroids are randomly assigned at the beginning. Since I haven't properly figured out how to implement a boolean expression to check whether any points have changed cluster, I just have a for loop that runs for 100 cycles. It's a work in progress.


import math
import random

data = [(1, 10), (3, 9), (4, 8),(6, 0), (6, 7), (5, 9),(3, 2), (4, 6), (4, 2),(3, 0,), (4, 7), (7, 4),(3, 11), (11, 9), (4, 54),(1, 14), (2, 2), (12, 8)]
greenCluster = []
redCluster = []
blueCluster = []
# Start from three distinct data points. They are sampled rather than popped,
# so they stay in `data` and are clustered like every other point.
centroids = random.sample(data, 3)


def findDistance(x, y):
    """Return the Euclidean distance between two ``(x, y)`` points."""
    return math.sqrt((x[0] - y[0]) ** 2 + (x[1] - y[1]) ** 2)


def sortIntoColors():
    """Empty the three clusters and put every point of ``data`` into the nearest one.

    The order is green, red, blue, matching ``centroids[0..2]``. A point that
    is equally close to several centroids goes to the first of them.
    """
    clusters = (greenCluster, redCluster, blueCluster)
    for cluster in clusters:
        cluster.clear()
    for d in data:
        dists = [findDistance(centroid, d) for centroid in centroids[:3]]
        clusters[dists.index(min(dists))].append(d)


def newCentroid(cluster):
    """Return the mean ``(x, y)`` of the points in ``cluster``.

    An empty cluster yields ``(0.0, 0.0)``.
    """
    if not cluster:
        return (0.0, 0.0)
    size = len(cluster)
    # Divide by the number of points; dividing by size + 1 pulled every
    # centroid towards the origin.
    x1 = sum(coor[0] for coor in cluster) / size
    x2 = sum(coor[1] for coor in cluster) / size
    return (x1, x2)


def findNewCentroids():
    """Replace each centroid with the mean of its cluster.

    A cluster that received no points keeps its previous centroid.
    """
    updated = []
    clusters = (greenCluster, redCluster, blueCluster)
    for cluster, previous in zip(clusters, list(centroids)):
        updated.append(newCentroid(cluster) if cluster else previous)
    centroids.clear()
    centroids.extend(updated)


for x in range(0, 100, 1):
    previousCentroids = list(centroids)
    sortIntoColors()
    findNewCentroids()
    # Once the centroids stop moving, further cycles would repeat the same
    # assignment, so stop early.
    if centroids == previousCentroids:
        break

print('green Cluster: ')
print(greenCluster)
print('red Cluster: ')
print(redCluster)
print('blue Cluster: ')
print(blueCluster)

Finding "k" number of neighbors to find closest color

This code asks the user for a point (x1, x2). Using that point and a list defined at the start, it finds the "k" points closest to the user's point. The code reports which color the majority of those "k" points have. It is solved by appending the distances to a list using a for/in loop, then finding the nearest points using a while loop. It then converts the nearest points into their colors and adds up the number of points of each color using a for/in loop. A simple if/else statement at the end works out which color the majority of the points have. min() finds the minimum value in a list; len() gives the length of a list; list_name.remove(value) removes the first item equal to value from a list (to remove an item by its index number, use list_name.pop(index)).

import math

# Labelled points: (x1, x2, color).
data = [(1, 10, "blue"), (2, 15, "red"), (2, 4, "blue"), (4, 2, "red"), (5, 5, "blue"), (6, 1, "red"), (1, 6, "blue"), (21, 19, "red"), (19, 21, "blue"), (15, 12, "red"), (6, 7, "blue"), (7, 6, "red"), (12, 13, "blue"), (13, 12, "red"), (16, 17, "blue")]


def nearestNeighborIndices(points, query, k):
    """Return the indices of the ``k`` entries of ``points`` closest to ``query``.

    Distance is Euclidean over the first ``len(query)`` components of each
    point, so a trailing color label is ignored. Indices are ordered nearest
    first; equal distances keep their original order. ``k`` is clamped to the
    range ``0..len(points)``.
    """
    def distanceTo(index):
        return math.sqrt(
            sum((a - b) ** 2 for a, b in zip(points[index], query))
        )

    k = max(0, min(k, len(points)))
    # Ranking indices (rather than removing found distances from a list) keeps
    # every index aligned with `points`.
    return sorted(range(len(points)), key=distanceTo)[:k]


def majorityColor(colors):
    """Return ``"red"`` when red is a strict majority of ``colors``, else ``"blue"``.

    Every entry that is not ``"red"`` counts as blue, so a tie or an empty
    list gives ``"blue"``.
    """
    red = sum(1 for c in colors if c == "red")
    blue = len(colors) - red
    return "red" if red > blue else "blue"


if __name__ == "__main__":
    userData = []
    # float accepts whole numbers too, and also allows fractional coordinates.
    userDataX1 = float(input("Enter x1 coordinate for your data: "))
    userDataX2 = float(input("Enter x2 coordinate for your data: "))
    userData.append(userDataX1)
    userData.append(userDataX2)

    k = int(input("Enter number of neighbors you wish to calculate (max is 15): "))

    nearestNeighbors = nearestNeighborIndices(data, userData, k)
    colors = [data[n][2] for n in nearestNeighbors]
    print("your coordinate would be " + majorityColor(colors))
1 2