"""
SVM (Support Vector Machines)
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
from sklearn import svm
mat = loadmat('data/ex6data1.mat')
'''mat.keys() returns: dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])'''
X = mat['X']
y = mat['y']
'''Most SVM libraries add the extra bias feature x0 automatically, so there is no need to add it by hand.'''
def plotData(X, y):
    plt.figure(figsize=(8, 5))
    plt.scatter(X[:, 0], X[:, 1], c=y.flatten(), cmap='rainbow')
    plt.xlabel('X1')
    plt.ylabel('X2')
plotData(X, y)
Scatter plot of the data:

def plotBoundary(clf, X):
    '''Plot the decision boundary of a fitted classifier over the range of X.'''
    x_min, x_max = X[:, 0].min() * 1.2, X[:, 0].max() * 1.1
    y_min, y_max = X[:, 1].min() * 1.1, X[:, 1].max() * 1.1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 500),
                         np.linspace(y_min, y_max, 500))
    '''
    >>> X, Y = np.meshgrid([1,2,3], [4,5,6,7])
    >>> X
    array([[1, 2, 3],
           [1, 2, 3],
           [1, 2, 3],
           [1, 2, 3]])
    >>> Y
    array([[4, 4, 4],
           [5, 5, 5],
           [6, 6, 6],
           [7, 7, 7]])
    '''
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    """
    ==> np.c_[X.ravel(), Y.ravel()]
    ==> [[1,4],
         [2,4],
         [3,4],
         [1,5],
         [2,5],
         ...
         [3,7]]
    """
    Z = Z.reshape(xx.shape)
    '''
    Resulting grid of predictions:
    [[0 0 0 ... 0 0 0]
     [0 0 0 ... 0 0 0]
     [0 0 0 ... 0 0 0]
     ...
     [1 1 1 ... 1 1 1]
     [1 1 1 ... 1 1 1]
     [1 1 1 ... 1 1 1]]
    '''
    plt.contour(xx, yy, Z)
models = [svm.SVC(C, kernel='linear') for C in [1, 100]]
clfs = [model.fit(X, y.ravel()) for model in models]
titles = ['SVM Decision Boundary with C = {} (Example Dataset 1)'.format(C) for C in [1, 100]]
for model, title in zip(clfs, titles):
    plt.figure(figsize=(8, 5))
    plotData(X, y)
    plotBoundary(model, X)
    plt.title(title)
"""
当C比较小时:模型对误分类的惩罚增大,间隔比较宽。
当C比较大时:模型对误分类的惩罚减小,间隔比较窄。
"""


The Gaussian (RBF) kernel formula: K(x1, x2) = exp(-||x1 - x2||² / (2σ²))

def gaussKernel(x1, x2, sigma):
    return np.exp(-((x1 - x2) ** 2).sum() / (2 * sigma ** 2))
print(gaussKernel(np.array([1, 2, 1]), np.array([0, 4, -1]), 2.))  # 0.32465246735834974
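As a cross-check (a sketch, not part of the exercise), sklearn parameterizes the RBF kernel as exp(-gamma * ||x1 - x2||²), so setting gamma = 1 / (2σ²) should reproduce gaussKernel:
from sklearn.metrics.pairwise import rbf_kernel
x1, x2 = np.array([[1, 2, 1]]), np.array([[0, 4, -1]])
print(rbf_kernel(x1, x2, gamma=1 / (2 * 2. ** 2))[0, 0])  # ~0.32465, same as above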
mat = loadmat('data/ex6data2.mat')
X2 = mat['X']
y2 = mat['y']
plotData(X2, y2)

sigma = 0.1
gamma = np.power(sigma, -2.) / 2
'''
In the RBF kernel, a larger gamma corresponds to a smaller σ, giving a taller, narrower Gaussian
(the model fits the training data more tightly).
A smaller gamma corresponds to a larger σ, giving a shorter, wider Gaussian (a smoother boundary).
'''
clf = svm.SVC(C=1, kernel='rbf', gamma=gamma)
model = clf.fit(X2, y2.flatten())
plotData(X2, y2)
plotBoundary(model, X2)
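To make the σ↔gamma tradeoff concrete, here is a minimal sketch (the σ values are chosen arbitrarily for illustration) that refits the same data at several kernel widths; smaller σ, i.e. larger gamma, hugs the training points more tightly:
for s in (0.01, 0.1, 1.):
    g = np.power(s, -2.) / 2
    m = svm.SVC(C=1, kernel='rbf', gamma=g).fit(X2, y2.flatten())
    print('sigma = {:5}, gamma = {:7.1f}, training accuracy = {:.3f}'.format(s, g, m.score(X2, y2.flatten())))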

Load the third dataset:
mat3 = loadmat('data/ex6data3.mat')
X3, y3 = mat3['X'], mat3['y']
Xval, yval = mat3['Xval'], mat3['yval']
plotData(X3, y3)

Cvalues = (0.01, 0.03, 0.1, 0.3, 1., 3., 10., 30.)
sigmavalues = Cvalues
best_pair, best_score = (0, 0), 0
for C in Cvalues:
    for sigma in sigmavalues:
        gamma = np.power(sigma, -2.) / 2
        model = svm.SVC(C=C, kernel='rbf', gamma=gamma)
        model.fit(X3, y3.flatten())
        this_score = model.score(Xval, yval)
        '''
        For a classifier such as SVC, model.score returns the mean accuracy on the
        given data and labels, a value between 0 and 1; the higher it is, the better
        the model generalizes to the validation set.
        (Only regressors return the coefficient of determination R².)
        '''
        if this_score > best_score:
            best_score = this_score
            best_pair = (C, sigma)
print('Best (C, sigma) pair:', best_pair, 'with validation accuracy:', best_score)
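As a check (a sketch), the value returned by model.score is just the fraction of correct predictions on the validation set, which can be recomputed by hand from the selected pair:
best_C, best_sigma = best_pair
check = svm.SVC(C=best_C, kernel='rbf', gamma=np.power(best_sigma, -2.) / 2)
check.fit(X3, y3.flatten())
print((check.predict(Xval) == yval.ravel()).mean())  # equals check.score(Xval, yval)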
model = svm.SVC(C=1., kernel='rbf', gamma=np.power(.1, -2.) / 2)
model.fit(X3, y3.flatten())
plotData(X3, y3)
plotBoundary(model, X3)
For a classifier such as SVC, model.score() returns the mean accuracy, not the coefficient of determination R² (R² is what score returns for regressors). It is computed as:

accuracy = (number of correctly classified samples) / (total number of samples)