Let a be the point to be predicted and b one of the sample points. In the vector space they form an angle θ; the smaller θ is (i.e., the closer cos θ is to 1), the more similar a is to b. So we can predict the class of a by looking at its cosine similarity to the known samples.
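As a quick illustration (separate from the classifier below), cosine similarity can be computed directly with NumPy; the vectors here are made up purely for the example:

import numpy as np

a = np.array([1.0, 2.0, 3.0])   # hypothetical "prediction" point
b = np.array([2.0, 4.0, 6.1])   # hypothetical sample, almost parallel to a
c = np.array([-3.0, 1.0, 0.0])  # hypothetical sample, nearly orthogonal to a

def cosine(u, v):
    # cos θ = (u · v) / (|u| * |v|)
    return (u @ v) / (np.linalg.norm(u) * np.linalg.norm(v))

print(cosine(a, b))  # close to 1: a and b point in almost the same direction
print(cosine(a, c))  # close to 0: a and c are far apart in angle

The closer the value is to 1, the more likely (under this metric) the two points belong to the same class.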
from collections import Counter
import numpy as np

class MyKnn:
    def __init__(self, neighbors):
        # Number of nearest neighbours to vote over
        self.k = neighbors

    def fit(self, X, Y):
        # kNN is a lazy learner: "fitting" just stores the training data
        self.X = np.array(X)
        self.Y = np.array(Y)
        if self.X.ndim != 2 or self.Y.ndim != 1:
            raise Exception("dimensions are wrong!")
        if self.X.shape[0] != self.Y.shape[0]:
            raise Exception("input labels are not correct!")

    def predict(self, X_pre):
        pre = np.array(X_pre)
        if self.X.ndim != pre.ndim:
            raise Exception("input dimensions are wrong!")
        rs = []
        for p in pre:
            # Cosine similarity between the query point and every training sample
            temp = []
            for a in self.X:
                cos = (p @ a) / np.linalg.norm(p) / np.linalg.norm(a)
                temp.append(cos)
            temp = np.array(temp)
            # Indices of the k samples with the largest cosine similarity
            indices = np.argsort(temp)[:-self.k - 1:-1]
            ss = np.take(self.Y, indices)
            # Majority vote among the k nearest neighbours
            found = Counter(ss).most_common(1)[0][0]
            rs.append(found)
        return np.array(rs)
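The nested Python loops in predict work, but the same cosine computation can also be written as a single matrix product. A minimal vectorized sketch of just that inner computation (an equivalent formulation, not the code above):

import numpy as np

def cosine_matrix(P, X):
    # Row-normalize the query points P and the training samples X,
    # then one matrix product gives all pairwise cosine similarities.
    P = P / np.linalg.norm(P, axis=1, keepdims=True)
    X = X / np.linalg.norm(X, axis=1, keepdims=True)
    return P @ X.T  # shape: (n_queries, n_samples)

Each row of the result plays the same role as the temp array built inside predict.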
Test:
# Validate the algorithm above on the iris dataset
from sklearn.datasets import load_iris
# Use train_test_split to split the data into a training part and a test part
from sklearn.model_selection import train_test_split
# 1. Create a kNN model
myknn = MyKnn(5)
# 2. Prepare the dataset: feature matrix X_train and label vector y_train
X_train, y_train = load_iris(return_X_y=True)
# Hold out 30% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3)
# 3. "Train" the model (store the training data)
myknn.fit(X_train, y_train)
# 4. Predict; acc holds the predicted labels for the test set
acc = myknn.predict(X_test)
# Compute the accuracy
(acc == y_test).mean()
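As an optional sanity check (not part of the original code), the result can be compared with scikit-learn's built-in KNeighborsClassifier using the cosine metric, reusing the split created above:

from sklearn.neighbors import KNeighborsClassifier

# The cosine metric requires brute-force neighbour search
sk_knn = KNeighborsClassifier(n_neighbors=5, metric="cosine", algorithm="brute")
sk_knn.fit(X_train, y_train)
print(sk_knn.score(X_test, y_test))  # should be in the same ballpark as our accuracy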
In fact, when classifying with cosine similarity, we can go back to the idea at the very start of the article and simply take the single sample with the largest cosine value, using its label as the prediction (effectively kNN with k = 1):
import numpy as np

class MyClassicfication:
    def fit(self, X, Y):
        # Store the training data; there is nothing to "learn" up front
        self.X = np.array(X)
        self.Y = np.array(Y)
        if self.X.ndim != 2 or self.Y.ndim != 1:
            raise Exception("dimensions are wrong!")
        if self.X.shape[0] != self.Y.shape[0]:
            raise Exception("input labels are not correct!")

    def predict(self, X_pre):
        pre = np.array(X_pre)
        if self.X.ndim != pre.ndim:
            raise Exception("input dimensions are wrong!")
        rs = []
        for p in pre:
            # Cosine similarity between the query point and every training sample
            temp = []
            for a in self.X:
                cos = (p @ a) / np.linalg.norm(p) / np.linalg.norm(a)
                temp.append(cos)
            temp = np.array(temp)
            # Index of the single most similar sample (largest cosine value)
            index = np.argsort(temp)[-1]
            found = np.take(self.Y, index)
            rs.append(found)
        return np.array(rs)
Test:
# Validate the algorithm above on the iris dataset
from sklearn.datasets import load_iris
# Use train_test_split to split the data into a training part and a test part
from sklearn.model_selection import train_test_split
# 1. Create the classifier (note the parentheses: we need an instance, not the class itself)
myCla = MyClassicfication()
# 2. Prepare the dataset: feature matrix X_train and label vector y_train
X_train, y_train = load_iris(return_X_y=True)
# Hold out 30% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3)
# 3. "Train" the model (store the training data)
myCla.fit(X_train, y_train)
# 4. Predict; acc holds the predicted labels for the test set
acc = myCla.predict(X_test)
# Compute the accuracy
(acc == y_test).mean()
In my tests, the two approaches above give roughly the same accuracy on this dataset.
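To make that comparison reproducible, both classifiers can be run on the same split; a minimal sketch using only the classes defined above (the fixed random_state is an assumption added here so both models see identical data):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

knn5 = MyKnn(5)
knn5.fit(X_train, y_train)
acc_knn = (knn5.predict(X_test) == y_test).mean()

nearest = MyClassicfication()
nearest.fit(X_train, y_train)
acc_nearest = (nearest.predict(X_test) == y_test).mean()

print(acc_knn, acc_nearest)  # the two accuracies are typically very close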