Example code
First, import the relevant packages:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd
Load the iris dataset
iris = datasets.load_iris()
print(type(iris))
<class 'sklearn.utils.Bunch'>
Feature data and class labels
X = iris.data
y = iris.target
print(X.shape)
print(y.shape)
# train_test_split
# y is sorted: the first 50 labels are 0, the middle 50 are 1, and the last 50 are 2
# We need to shuffle the original data; X and y are separate arrays, but their rows correspond one-to-one
# Option 1: stack X and y together, shuffle the combined array, then split it apart again (a sketch is shown after the shuffled indices below)
# Option 2 (used here): shuffle the indices instead
shuffle_index = np.random.permutation(len(X))
print(shuffle_index)
Output:
[ 30 22 56 92 55 2 85 118 5 60 36 33 81 75 83 117 12 80
128 19 105 42 98 69 68 35 20 0 88 38 65 149 58 119 45 48
109 43 113 79 114 29 74 90 11 91 124 57 148 130 63 129 139 17
32 93 40 7 146 21 136 8 16 127 140 62 13 14 59 121 4 54
49 41 132 67 110 64 78 137 24 97 122 47 108 1 28 61 82 86
111 116 87 72 101 31 89 102 96 143 107 126 120 25 115 18 145 10
133 134 142 27 84 39 138 104 26 125 77 71 103 50 106 76 100 95
52 34 9 123 3 6 73 51 144 66 44 99 147 37 15 23 141 131
70 112 135 46 94 53]
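For reference, a minimal sketch of option 1 (stack X and y together, shuffle, then split them apart) could look like the following. This snippet is only an illustration added here, not part of the original notes; note that stacking y next to X converts the labels to float, so they are cast back to int at the end.

Xy = np.hstack([X, y.reshape(-1, 1)])  # shape (150, 5): 4 feature columns + 1 label column
np.random.shuffle(Xy)                  # shuffle the rows in place, keeping each row's X and y together
X_shuffled = Xy[:, :-1]                # feature columns
y_shuffled = Xy[:, -1].astype(int)     # label column, cast back to int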
Specify the test set ratio
# fraction of the samples that go into the test set
test_ratio = 0.2
test_size = int(len(X) * test_ratio)  # 30 samples in this example
print(test_size)
Output:
30
Take the indices of the test set and the training set
test_indexs = shuffle_index[:test_size]
train_indexs = shuffle_index[test_size:]
print(test_indexs)
print(train_indexs)
X_train = X[train_indexs]
y_train = y[train_indexs]
X_test = X[test_indexs]
y_test = y[test_indexs]
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
Shapes of the resulting arrays:
(120, 4)
(120,)
(30, 4)
(30,)
Using our own encapsulated version
model_selection.py defines a train_test_split function.
Code:
import numpy as np

def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Split X and y into X_train, X_test, y_train, y_test according to test_ratio"""
    assert X.shape[0] == y.shape[0], \
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0, \
        "test_ratio must be valid"
    if seed:
        np.random.seed(seed)
    shuffle_index = np.random.permutation(len(X))
    test_size = int(len(X) * test_ratio)
    test_indexs = shuffle_index[:test_size]
    train_indexs = shuffle_index[test_size:]
    X_train = X[train_indexs]
    y_train = y[train_indexs]
    X_test = X[test_indexs]
    y_test = y[test_indexs]
    # return a 4-tuple
    return X_train, X_test, y_train, y_test
To use it, first import it:
# use our own implementation
from knn.model_selection import train_test_split
Split the data
X_train,X_test,y_train,y_test = train_test_split(X,y)
print(X_train.shape)
print(y_train.shape)
(120, 4)
(120,)
print(X_test.shape)
print(y_test.shape)
(30, 4)
(30,)
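Our train_test_split also accepts a seed parameter, so the split can be made reproducible; a quick example (the seed value 666 is arbitrary):

X_train, X_test, y_train, y_test = train_test_split(X, y, test_ratio=0.2, seed=666)  # same split every run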
We also encapsulated a KNN classifier (it can predict multiple samples at once).
One thing to note: the training data is stored in private attributes (prefixed with an underscore).
KNN.py:
# restructure our KNN code so its architecture looks more like sklearn's
import numpy as np
from collections import Counter
Define a class
class KNNClassifier:
Initialization
    def __init__(self, k):
        """Initialize the KNN classifier; the value of k must be passed in"""
        assert k >= 1, "k must be valid"
        self.k = k
        self._X_train = None  # store the training data in a private attribute (leading underscore)
        self._y_train = None  # private as well
fit
    def fit(self, X_train, y_train):
        """Train the classifier on the training set X_train and y_train"""
        # shape[0] is the number of rows, i.e. the number of training samples; k cannot exceed it
        assert 1 <= self.k <= X_train.shape[0], "k must be within a valid range"
        # the number of samples must equal the number of labels
        assert X_train.shape[0] == y_train.shape[0], "the number of samples must equal the number of labels"
        self._X_train = X_train
        self._y_train = y_train
        return self
Predict multiple samples (internally calls the single-sample prediction method _predict())
    def predict(self, X_predict):
        """Given a dataset X_predict to be predicted, return the vector of predicted labels"""
        # the classifier must have been fitted first
        assert self._X_train is not None and self._y_train is not None, 'must fit before predict!'
        # the number of features must match the training data
        assert X_predict.shape[1] == self._X_train.shape[1], 'the feature number of X_predict must be equal to X_train'
        y_predict = [self._predict(x) for x in X_predict]
        return np.array(y_predict)
Single-sample prediction function
    def _predict(self, x):
        """Given a single sample x to be predicted, return its predicted label"""
        # x must have the same number of features as the training data
        assert x.shape[0] == self._X_train.shape[1], "the feature number of x must be equal to X_train"
        # Euclidean distance from x to every training sample
        distances = [np.sqrt(np.sum((x_train - x) ** 2)) for x_train in self._X_train]
        # indices of the training samples sorted by distance
        nearest = np.argsort(distances)
        # labels of the k nearest neighbors
        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        # majority vote among the k neighbors
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]
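To see what the last two lines do, here is a small standalone illustration of the Counter vote (the label values are made up for the example):

from collections import Counter
votes = Counter([1, 1, 2])         # labels of the k = 3 nearest neighbors
print(votes.most_common(1))        # [(1, 2)]: label 1 appears twice
print(votes.most_common(1)[0][0])  # 1, the predicted label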
String representation
    def __repr__(self):
        """Show the classifier's parameters"""
        return "KNN(k=%d)" % self.k
Use this algorithm to make predictions
from knn.KNN import KNNClassifier
Instantiate
# instantiate the classifier
my_knn_clf = KNNClassifier(k=3)
fit
my_knn_clf.fit(X_train,y_train)
Predict
y_predict = my_knn_clf.predict(X_test)
print(y_predict)
Prediction results:
[0 2 2 2 2 2 0 0 2 2 1 1 0 2 2 0 0 1 0 1 2 1 2 1 1 2 2 1 0 0]
Accuracy
print(sum(y_predict == y_test) / len(y_test))
0.9333333333333333
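Equivalently, sklearn's accuracy_score computes the same ratio of correct predictions:

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_predict))  # same value as above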
Using sklearn's train_test_split
# train_test_split from sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)
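sklearn's version defaults to a 25% test set; to mirror our 0.2 ratio and get a reproducible split, pass test_size and random_state (the seed value here is arbitrary):

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
print(X_train.shape, X_test.shape)  # (120, 4) (30, 4)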