import pandas as pd
import numpy as np
from pandas import Series,DataFrame
from sklearn.neighbors import KNeighborsClassifier # KNN 算法
from sklearn.model_selection import cross_val_score # 交叉表
from sklearn.model_selection import train_test_split # 随机分组
from sklearn.model_selection import GridSearchCV # gcv最优值预测,和结果预测
from sklearn.metrics import accuracy_score # 结果预测
# KFold StratifiedKFold 将数据分为多少份
from sklearn.model_selection import KFold,StratifiedKFold # 将数据均等分裂, sFold分布更合适
from sklearn.preprocessing import MinMaxScaler, StandardScaler # 归一化操作
1. Manual approach — hand-written search loop:
# Manually search for the best KNN hyperparameters: try every combination of
# k (number of neighbors) and weighting scheme, scoring each with 6-fold
# cross-validation. `data`, `X`, `y` are assumed defined earlier in the
# notebook (features/labels of the dataset) — TODO confirm.
weights = ['distance', 'uniform']
result = {}
# k up to sqrt(n_samples) — a common heuristic upper bound for KNN.
for k in range(1, int(data.shape[0] ** 0.5)):
    for w in weights:
        # BUG FIX: original passed weights=k (the integer) instead of the
        # weighting-scheme string w, so 'distance' was never actually tried.
        knn = KNeighborsClassifier(n_neighbors=k, weights=w)
        # BUG FIX: scoring was misspelled 'sccuracy'; sklearn requires the
        # exact scorer name 'accuracy'.
        sm = cross_val_score(knn, X, y, scoring='accuracy', cv=6).mean()
        result[w + str(k)] = sm
result
# Index of the best mean CV score, e.g. 21
np.array(list(result.values())).argmax()
# Corresponding key (scheme + k), e.g. 'uniform11'
list(result.keys())[np.array(list(result.values())).argmax()]
2. Built-in approach — GridSearchCV:
# GridSearchCV = grid search + cross-validation: replaces the manual nested
# for-loops above with one fitted search object that tries every parameter
# combination.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Hold out 20% of the samples for the final test score.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

knn = KNeighborsClassifier()
# Candidate hyperparameter grid (local name 'parmas' typo fixed to 'params'):
# p=1 is Manhattan distance, p=2 is Euclidean distance.
params = {'n_neighbors': list(range(1, 30)),
          'weights': ['distance', 'uniform'],
          'p': [1, 2]}
gcv = GridSearchCV(knn, params, scoring='accuracy', cv=6)
gcv.fit(X_train, y_train)
# Best estimator found by the search (duplicate display line removed).
gcv.best_estimator_

y_ = gcv.predict(X_test)
# Three equivalent ways to compute test-set accuracy (all ≈ 0.9474 here):
(y_ == y_test).mean()         # 1. manual elementwise comparison
gcv.score(X_test, y_test)     # 2. the estimator's own score method
accuracy_score(y_test, y_)    # 3. sklearn's metric function
# Cross-tabulation of true vs. predicted labels — same information as a
# confusion matrix, with rows = ground truth and columns = predictions;
# margins=True appends row/column totals. Reading row B: 78 true B samples,
# 77 predicted correctly, 1 misclassified as M. Row M: 36 true M samples,
# 32 predicted correctly, the rest misclassified as B.
pd.crosstab(index=y_test, columns=y_,
            rownames=['True'], colnames=['predict'], margins=True)

# The raw confusion matrix carries the same counts, without labels or margins.
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_)

# Per-class precision, recall, and their harmonic mean (f1-score).
from sklearn.metrics import classification_report
print(classification_report(y_test, y_, target_names=['B', 'M']))
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-max normalization: (X - X.min()) / (X.max() - X.min()),
# maps every feature into [0, 1].
mms = MinMaxScaler()
mms.fit(X)
X2 = mms.transform(X)
# BUG FIX: 'rount' is not an ndarray method (AttributeError); use round()
# to display the values at 6 decimal places.
X2.round(6)

# Standardization (z-score): (X - X.mean()) / X.std(),
# centers each feature at 0 with unit variance.
ss = StandardScaler()
X3 = ss.fit_transform(X)
X3
# Encode each categorical column as integer codes, ordered by first
# appearance (the order pd.Series.unique returns).
cols = ['marital_status', 'occupation',
        'relationship', 'race', 'sex', 'native_country']
for col in cols:
    u = X[col].unique()
    # PERF FIX: the original ran np.argwhere(u == x) for every element —
    # an O(n) scan per value, O(n^2) per column. A dict built once from u
    # yields identical codes (each value's index within u) in O(1) per lookup.
    code_of = {val: idx for idx, val in enumerate(u)}
    X[col] = X[col].map(code_of)
from sklearn import preprocessing
.
.
.
# Encode the same categorical columns using sklearn's LabelEncoder.
# NOTE: LabelEncoder sorts the classes, so the integer codes differ from
# the first-appearance encoding used in the manual version above.
cols = ['workclass', 'marital_status', 'occupation', 'relationship',
        'race', 'sex', 'native_country']
for col in cols:
    # BUG FIX: the original rebound the loop iterable's name ('cols')
    # inside the loop body — harmless in CPython because the iterator is
    # already bound, but a shadowing hazard; use a distinct name.
    categories = data[col].unique()
    le = preprocessing.LabelEncoder()
    le.fit(categories)
    list(le.classes_)  # the sorted class labels backing the codes
    # PERF FIX: one vectorized transform of the whole column replaces the
    # original per-value lambda (le.transform([x])[0]) — same codes,
    # thousands fewer Python-level calls.
    data[col] = le.transform(data[col])
# Demonstrate KFold vs. StratifiedKFold on a tiny random dataset.
data = np.random.randint(0, 10, size=(8, 2))
target = np.array([0, 0, 1, 0, 1, 1, 1, 0])
display(data, target)  # IPython/Jupyter display — notebook-only helper

# BUG FIX: the original wrote `KFold = KFold(n_splits=4)`, shadowing the
# imported class with an instance and breaking any later `KFold(...)` call;
# bind the instance to a distinct lowercase name instead.
kfold = KFold(n_splits=4)
# split() yields (train_idx, test_idx) index arrays; with indices any
# aligned array (data or target) can be sliced.
for train, test in kfold.split(data, target):
    print(target[train], target[test])
    # print(data[train], data[test])

# StratifiedKFold also makes 4 folds, but keeps each fold's class
# proportions the same as the full dataset's.
sKFold = StratifiedKFold(n_splits=4)
for train, test in sKFold.split(data, target):
    print(target[train], target[test])
望您:
“情深不寿,强极则辱,谦谦君子,温润如玉”。