Python Machine Learning: KNN Algorithm 03 - Training Set and Test Set (train_test_split)

Example code
First, import the relevant packages

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd

Load the iris dataset

iris = datasets.load_iris()
print(type(iris))
<class 'sklearn.utils.Bunch'>
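
Before splitting, it can help to peek at what the Bunch contains. A small sketch; keys(), feature_names and target_names are standard attributes of the iris Bunch in scikit-learn:

print(iris.keys())          #dict-like keys: data, target, target_names, feature_names, ...
print(iris.feature_names)   #the four measurements (sepal/petal length and width)
print(iris.target_names)    #the three species names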

Features and labels

X = iris.data
y = iris.target
print(X.shape)
print(y.shape)

(150, 4)
(150,)
train_test_split

#train_test_split
#y is sorted: the first 50 labels are 0, the middle 50 are 1, the last 50 are 2
#so the original data must be shuffled; X and y are separate arrays, but row i of X corresponds to entry i of y, and that pairing must be kept
#Option 1: concatenate X and y, shuffle them together, then split them apart again (a sketch of this is shown after the output below)
#Here we use option 2: shuffle the indices instead
#random permutation of the indices
shuffle_index = np.random.permutation(len(X))
print(shuffle_index)

Output

[ 30  22  56  92  55   2  85 118   5  60  36  33  81  75  83 117  12  80
 128  19 105  42  98  69  68  35  20   0  88  38  65 149  58 119  45  48
 109  43 113  79 114  29  74  90  11  91 124  57 148 130  63 129 139  17
  32  93  40   7 146  21 136   8  16 127 140  62  13  14  59 121   4  54
  49  41 132  67 110  64  78 137  24  97 122  47 108   1  28  61  82  86
 111 116  87  72 101  31  89 102  96 143 107 126 120  25 115  18 145  10
 133 134 142  27  84  39 138 104  26 125  77  71 103  50 106  76 100  95
  52  34   9 123   3   6  73  51 144  66  44  99 147  37  15  23 141 131
  70 112 135  46  94  53]
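
As for option 1 mentioned above (concatenating X and y, shuffling, then splitting them apart again), a minimal sketch could look like this; note that hstack needs y reshaped into a column, and it promotes everything to float, so the labels have to be cast back to int:

#Option 1 (sketch): stack X and y into one array, shuffle its rows, then split it apart again
Xy = np.hstack([X, y.reshape(-1, 1)])  #shape (150, 5): four feature columns plus the label column
np.random.shuffle(Xy)                  #shuffle rows in place; each row keeps its own label
X_shuffled = Xy[:, :-1]
y_shuffled = Xy[:, -1].astype(int)     #hstack made the labels float, cast them back to int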

Specify the test set ratio

#specify what fraction of the samples should go into the test set
test_ratio = 0.2
test_size = int(len(X) * test_ratio)#30 test samples in this example
print(test_size)

Output

30

Take the indices of the test set and the training set

test_indexs = shuffle_index[:test_size]
train_indexs = shuffle_index[test_size:]
print(test_indexs)
print(train_indexs)

(output: the test index array and the training index array)
Use the indices to pull out the data (NumPy fancy indexing)


X_train = X[train_indexs]
y_train = y[train_indexs]

X_test = X[test_indexs]
y_test = y[test_indexs]

print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)

The resulting shapes

(120, 4)
(120,)
(30, 4)
(30,)

Using our own implementation
model_selection.py defines a function train_test_split
Code

import numpy as np

def train_test_split(X, y, test_ratio=0.2, seed=None):
    """Split X and y into X_train, X_test, y_train and y_test according to test_ratio"""
    assert X.shape[0] == y.shape[0],\
        "the size of X must be equal to the size of y"
    assert 0.0 <= test_ratio <= 1.0,\
        "test_ratio must be valid"

    if seed is not None:#use "is not None" so that a seed of 0 is not silently ignored
        np.random.seed(seed)


    shuffle_index = np.random.permutation(len(X))

    test_size = int(len(X) * test_ratio)

    test_indexs = shuffle_index[:test_size]
    train_indexs = shuffle_index[test_size:]

    X_train = X[train_indexs]
    y_train = y[train_indexs]

    X_test = X[test_indexs]
    y_test = y[test_indexs]
    #return a 4-tuple
    return X_train,X_test,y_train,y_test

To use it, first import it

#use our own train_test_split
from knn.model_selection import train_test_split

Split the data

X_train,X_test,y_train,y_test = train_test_split(X,y)
print(X_train.shape)
print(y_train.shape)

(120, 4)
(120,)
print(X_test.shape)
print(y_test.shape)
(30, 4)
(30,)
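
If you want a reproducible split, pass a seed (any fixed integer; 666 here is just an example) through the seed parameter defined in our function above:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_ratio=0.2, seed=666)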

We also wrapped up a KNN classifier (it can predict multiple samples at once)
One thing to note: the training data is stored in private attributes (prefixed with an underscore)

KNN.py

#restructure our KNN code so that its interface is more like sklearn's
import numpy as np
from collections import Counter

Define a class

class KNNClassifier():

Initialization

    def __init__(self,k):
        """Initialize the KNN classifier with the number of neighbors k"""
        assert k>=1,"k must be valid"
        self.k = k
        self._X_train = None#the training data is private: note the leading underscore
        self._y_train = None#private as well

fit

    def fit(self,X_train, y_train):
        """Train the classifier on the training set X_train and y_train"""

        # shape[0] is the number of rows, i.e. the number of training samples; k must not exceed it
        assert 1 <= self.k <= X_train.shape[0],'k must be in a valid range'

        # the number of samples must equal the number of labels
        assert X_train.shape[0] == y_train.shape[0],'the number of samples must equal the number of labels'

        self._X_train = X_train
        self._y_train = y_train
        return self

Predict multiple samples (internally it calls the single-sample method _predict())

    def predict(self,X_predict):
        """Given a set of samples X_predict, return the vector of predicted labels"""

        #the classifier must be fitted before predicting
        assert self._X_train is not None and self._y_train is not None,'must fit before predict!'

        #the number of features must match the training data
        assert X_predict.shape[1] == self._X_train.shape[1],'the feature number of X_predict must be equal to X_train'

        y_predict = [self._predict(x) for x in X_predict]

        return np.array(y_predict)

Predicting a single sample

    def _predict(self,x):
        """Given a single sample x, return its predicted label"""

        # x must have the same number of features as the training data X_train
        assert x.shape[0] == self._X_train.shape[1],"the feature number of x must be equal to X_train"

        distances = [np.sqrt(np.sum((x_train - x) ** 2)) for x_train in self._X_train]
        nearest = np.argsort(distances)

        topK_y = [self._y_train[i] for i in nearest[:self.k]]
        votes = Counter(topK_y)
        return votes.most_common(1)[0][0]

String representation

    def __repr__(self):
        """显示属性"""
        return "KNN(k=%d)" % self.k

Use this classifier to make predictions

from knn.KNN import KNNClassifier

Instantiate

#instantiate
my_knn_clf = KNNClassifier(k = 3)

fit

my_knn_clf.fit(X_train,y_train)

Predict

y_predict = my_knn_clf.predict(X_test)
print(y_predict)

Prediction results

[0 2 2 2 2 2 0 0 2 2 1 1 0 2 2 0 0 1 0 1 2 1 2 1 1 2 2 1 0 0]

Accuracy

print(sum(y_predict == y_test) / len(y_test))
0.9333333333333333
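
This accuracy calculation could also be wrapped into a small helper; the accuracy_score function below is just a hypothetical sketch, not something already defined in model_selection.py:

def accuracy_score(y_true, y_predict):
    """Fraction of predictions that match the true labels (hypothetical helper)"""
    assert y_true.shape[0] == y_predict.shape[0],\
        "the size of y_true must be equal to the size of y_predict"
    return np.sum(y_true == y_predict) / len(y_true)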

Using train_test_split from sklearn

#train_test_split from sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y)

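sklearn's train_test_split also accepts test_size and random_state parameters, which play the same roles as the test_ratio and seed in our own version:

#the same split with an explicit test proportion and a fixed random seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
print(X_train.shape, X_test.shape)#(120, 4) (30, 4)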
