KNN-手写数字数据集:
使用 sklearn 中的 KNN 算法工具包(KNeighborsClassifier)替换实现分类器的构建,注意使用的是汉明距离;
运行结果(运行大约需要 4 分钟):
代码:
import pandas as pd
import os
def hamming(str1, str2):
    """Return the Hamming distance between two equal-length sequences.

    The distance is the number of positions at which the two inputs hold
    different elements.

    Args:
        str1: First sequence (a string in this script).
        str2: Second sequence; must have the same length as ``str1``.

    Returns:
        The count of differing positions as an ``int``.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    if len(str1) == len(str2):
        mismatches = 0
        for left, right in zip(str1, str2):
            if left != right:
                mismatches += 1
        return mismatches
    raise ValueError("两个字符串长度不相等")
def get_train():
    """Load the training digits into a DataFrame.

    Every file in ``digits/trainingDigits`` is read as text. Its content
    with newlines removed becomes one ``img`` string, and the part of the
    file name before the first underscore becomes the ``labels`` value.

    Directory entries whose names start with ``._`` are skipped (presumably
    macOS AppleDouble sidecar files -- confirm). Stripping that prefix, as
    was done previously, maps the entry onto the real file name and loads
    the same sample twice, which gives it double weight in a k-NN vote.

    Returns:
        A ``pandas.DataFrame`` with the columns ``img`` and ``labels``
        (both holding strings), one row per data file, ordered by file
        name.
    """
    path = 'digits/trainingDigits'
    img = []
    labels = []
    # os.listdir returns entries in arbitrary order; sort so the row order
    # (and therefore any tie-breaking downstream) is reproducible.
    for filename in sorted(os.listdir(path)):
        if filename.startswith('._'):
            continue
        with open(os.path.join(path, filename), 'r') as f:
            img.append(f.read().replace('\n', ''))
        labels.append(filename.split('_')[0])
    return pd.DataFrame({'img': img, 'labels': labels})
def get_test():
    """Load the test digits into a DataFrame.

    Every file in ``digits/testDigits`` is read as text. Its content with
    newlines removed becomes one ``img`` string, and the part of the file
    name before the first underscore becomes the ``labels`` value.

    Directory entries whose names start with ``._`` are skipped (presumably
    macOS AppleDouble sidecar files -- confirm). Stripping that prefix, as
    was done previously, maps the entry onto the real file name and loads
    the same sample twice, so it is counted twice in the accuracy.

    Returns:
        A ``pandas.DataFrame`` with the columns ``img`` and ``labels``
        (both holding strings), one row per data file, ordered by file
        name.
    """
    path = 'digits/testDigits'
    img = []
    labels = []
    # os.listdir returns entries in arbitrary order; sort so the row order
    # is reproducible between runs and machines.
    for filename in sorted(os.listdir(path)):
        if filename.startswith('._'):
            continue
        with open(os.path.join(path, filename), 'r') as f:
            img.append(f.read().replace('\n', ''))
        labels.append(filename.split('_')[0])
    return pd.DataFrame({'img': img, 'labels': labels})
def handwritingClass(train, test, k):
    """Classify every test image by a k-nearest-neighbour vote.

    For each test image the Hamming distance (via ``hamming``) to every
    training image is computed, the ``k`` closest training rows are kept,
    and the label that occurs most often among them is the prediction.
    The accuracy over the whole test set is printed.

    Distances are kept as integers. Converting them to strings before
    sorting ranks them lexicographically ("100" < "20" < "3"), which picks
    the wrong neighbours.

    Args:
        train: DataFrame whose first column holds the image strings and
            whose second column holds the labels.
        test: DataFrame with the same layout. It is modified in place: a
            ``predict`` column is added (or overwritten).
        k: Number of nearest training samples that take part in the vote.

    Returns:
        The same ``test`` DataFrame, now carrying the ``predict`` column.

    Raises:
        ValueError: Propagated from ``hamming`` when a training image and
            a test image differ in length.
    """
    # Pull the columns out once; per-cell ``iloc`` lookups inside the
    # double loop dominate the run time otherwise.
    train_imgs = train.iloc[:, 0].tolist()
    train_labels = train.iloc[:, 1].tolist()

    predictions = []
    for test_img in test.iloc[:, 0]:
        dist = [hamming(train_img, test_img) for train_img in train_imgs]
        dist_l = pd.DataFrame({'dist': dist, 'labels': train_labels})
        # Stable sort: among equal distances the earlier training row wins,
        # so the selected neighbours are reproducible.
        nearest = dist_l.sort_values(by='dist', kind='mergesort')[:k]
        votes = nearest['labels'].value_counts()
        # value_counts lists the most frequent label first.
        predictions.append(votes.index[0])

    # Assign a plain list so the values are placed positionally instead of
    # being aligned on the index of ``test``.
    test['predict'] = predictions
    acc = (test['predict'] == test.iloc[:, 1]).mean()
    print(f'模型预测准确率为{acc:.5f}')
    return test
# Load the training set and the test set.
train, test = get_train(), get_test()
# Run the classifier with k = 3 nearest neighbours.
handwritingClass(train, test, 3)