import matplotlib.pyplot as plt
import numpy as np

def loadDataSet():
    dataMat=[]
    labelMat=[]
    fr=open('H:/机器学习课程资料/machinelearninginaction/Ch05/testSet.txt')
    for line in fr.readlines():
        lineArr=line.strip().split()
        dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])])  #prepend the constant term x0=1.0
        labelMat.append(int(lineArr[2]))
    fr.close()
    return dataMat,labelMat
#plot the dataset
def plotDataSet():
    dataMat,labelMat=loadDataSet()
    dataArr=np.array(dataMat)
    n=np.shape(dataArr)[0]  #number of samples
    xcord1=[]
    ycord1=[]
    xcord2=[]
    ycord2=[]
    for i in range(n):
        if int(labelMat[i])==1:  #class 1 points
            xcord1.append(dataArr[i,1])
            ycord1.append(dataArr[i,2])
        else:  #class 0 points
            xcord2.append(dataArr[i,1])
            ycord2.append(dataArr[i,2])
    fig=plt.figure()
    ax=fig.add_subplot(111)
    ax.scatter(xcord1,ycord1,s=20,c='r',marker='s',alpha=.5)
    ax.scatter(xcord2,ycord2,s=20,c='g',alpha=.5)
    plt.title('DataSet')
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()
#inspect the data distribution
plotDataSet()
For this data, if we write the sigmoid input as z, then z = w0*x0 + w1*x1 + w2*x2, where x0 is the constant 1.0 that loadDataSet() prepends to each sample.
#sigmoid function
def sigmoid(inx):
    return 1.0/(1+np.exp(-inx))
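A quick sanity check (illustrative only): sigmoid squashes any real z into (0, 1), and sigmoid(0) = 0.5, so thresholding the output at 0.5 is the same as thresholding z at 0.
#sigmoid maps large negative z toward 0, large positive z toward 1
print(sigmoid(np.array([-10.0,0.0,10.0])))  #approximately [4.5e-05, 0.5, 0.99995]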
#batch gradient ascent
def gradAscent(dataMatIn,classLabels):
    dataMatrix=np.mat(dataMatIn)  #convert to a NumPy matrix
    labelMat=np.mat(classLabels).T  #column vector of labels
    m,n=np.shape(dataMatrix)  #m rows (samples), n columns (features)
    alpha=0.001  #step size
    maxCycles=500  #maximum number of iterations
    weights=np.ones((n,1))  #initial weights
    for k in range(maxCycles):
        h=sigmoid(dataMatrix*weights)  #predicted probabilities for all samples at once
        error=labelMat-h
        weights=weights+alpha*dataMatrix.T*error
    return weights.getA()  #return a plain ndarray
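Why `error = labelMat - h` is the right quantity: the loop above is gradient ascent on the log-likelihood. For labels y_i in {0, 1} and predictions h_i = sigmoid(x_i · w), the log-likelihood is ℓ(w) = Σ_i [ y_i·log(h_i) + (1 − y_i)·log(1 − h_i) ], and its gradient simplifies to ∇ℓ(w) = Xᵀ(y − h). One ascent step is therefore w ← w + α·Xᵀ(y − h), which is exactly `weights + alpha*dataMatrix.T*error`.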
#test run
dataArr,labelMat=loadDataSet()
weights=gradAscent(dataArr,labelMat)
weights
array([[ 4.12414349],
[ 0.48007329],
[-0.6168482 ]])
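With these weights we can already score a new point. A minimal sketch (the point itself is made up for illustration): the predicted probability of class 1 is sigmoid(w0 + w1*x1 + w2*x2), labeled 1 when it exceeds 0.5.
#score a hypothetical point; the leading 1.0 is the constant x0 term
point=np.array([1.0,0.5,4.0])
prob=sigmoid(np.dot(point,weights[:,0]))  #weights from gradAscent has shape (3,1); take its column
print(prob,int(prob>0.5))  #probability of class 1 and the 0.5-threshold decision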
def plotBestFit(weights):
    dataMat,labelMat=loadDataSet()
    dataArr=np.array(dataMat)  #convert to an ndarray
    n=np.shape(dataArr)[0]  #number of samples
    xcord1=[]
    ycord1=[]
    xcord2=[]
    ycord2=[]
    for i in range(n):
        if int(labelMat[i])==1:
            xcord1.append(dataArr[i,1])
            ycord1.append(dataArr[i,2])
        else:
            xcord2.append(dataArr[i,1])
            ycord2.append(dataArr[i,2])
    fig=plt.figure()
    ax=fig.add_subplot(111)
    ax.scatter(xcord1,ycord1,s=20,c='r',marker='s')
    ax.scatter(xcord2,ycord2,s=20,c='g')
    x=np.arange(-3.0,3.0,0.1)
    y=(-weights[0]-weights[1]*x)/weights[2]  #the boundary is where the sigmoid input z=0; solve for x2 in terms of x1
    ax.plot(x,y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
#plot the fit
plotBestFit(weights)
Gradient ascent scans the entire dataset on every iteration. When there are many samples, we can instead use stochastic gradient ascent, which updates the weights with only one sample at a time.
#improved stochastic gradient ascent
import random

def stocGradAscent1(dataMatrix,classLabels,numIter=150):
    m,n=np.shape(dataMatrix)
    weights=np.ones(n)
    for j in range(numIter):
        dataIndex=list(range(m))
        for i in range(m):
            alpha=4/(1.0+j+i)+0.01  #decay alpha each update, with a 0.01 floor
            randIndex=int(random.uniform(0,len(dataIndex)))  #pick a random sample to reduce periodic oscillation
            sample=dataIndex[randIndex]  #index through dataIndex so each sample is used once per pass
            h=sigmoid(sum(dataMatrix[sample]*weights))  #a scalar here, vs. a vector in batch gradient ascent
            error=classLabels[sample]-h
            weights=weights+alpha*error*dataMatrix[sample]
            del(dataIndex[randIndex])
    return weights
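To see what the decaying step size actually does, here is a small trace (the numbers follow directly from the alpha formula): alpha shrinks as j and i grow but never reaches 0 because of the +0.01 floor, so late samples still nudge the weights.
#alpha for the first pass (j=0) and a late pass (j=149), first three updates each
for j in (0,149):
    print([round(4/(1.0+j+i)+0.01,4) for i in range(3)])
#j=0   -> [4.01, 2.01, 1.3433]
#j=149 -> [0.0367, 0.0365, 0.0363]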
#check the result
dataMat,labelMat=loadDataSet()
weights=stocGradAscent1(np.array(dataMat),labelMat)
plotBestFit(weights)
To show the convergence of the two algorithms more directly, we plot the weight trajectories against the number of updates.
#modified versions of gradient ascent and stochastic gradient ascent that record the weight history
def gradAscent(dataMatIn,classlabels):
    dataMatrix=np.mat(dataMatIn)
    labelMat=np.mat(classlabels).T
    m,n=np.shape(dataMatrix)
    alpha=0.01
    maxCycles=500
    weights=np.ones((n,1))
    weights_array=np.array([])
    for k in range(maxCycles):
        h=sigmoid(dataMatrix*weights)
        error=labelMat-h
        weights=weights+alpha*dataMatrix.T*error
        weights_array=np.append(weights_array,weights)  #append this cycle's weights; reshaped into one row per cycle below
    weights_array=weights_array.reshape(maxCycles,n)
    return weights.getA(),weights_array
def stocGradAscent1(dataMatrix,classLabels,numIter=150):
    m,n=np.shape(dataMatrix)
    weights=np.ones(n)
    weights_array=np.array([])
    for j in range(numIter):
        dataIndex=list(range(m))
        for i in range(m):
            alpha=4/(1.0+j+i)+0.01
            randIndex=int(random.uniform(0,len(dataIndex)))
            sample=dataIndex[randIndex]  #same fix as above: draw from the remaining indices
            h=sigmoid(sum(dataMatrix[sample]*weights))
            error=classLabels[sample]-h
            weights=weights+alpha*error*dataMatrix[sample]
            weights_array=np.append(weights_array,weights,axis=0)  #append this update's weights
            del(dataIndex[randIndex])
    weights_array=weights_array.reshape(numIter*m,n)
    return weights,weights_array
#plot the regression coefficients against the number of updates
def plotWeights(weights_array1,weights_array2):
    #split the canvas into three rows and two columns
    fig,axs=plt.subplots(nrows=3,ncols=2,sharex=False,sharey=False,figsize=(20,10))
    x1=np.arange(0,len(weights_array1),1)
    #w0 vs. iterations (gradient ascent)
    axs[0][0].plot(x1,weights_array1[:,0])
    axs0_title_text=axs[0][0].set_title('Gradient ascent: coefficients vs. iterations')
    axs0_ylabel_text=axs[0][0].set_ylabel('W0')
    plt.setp(axs0_title_text,size=20,weight='bold',color='black')
    plt.setp(axs0_ylabel_text,size=20,weight='bold',color='black')
    #w1 vs. iterations
    axs[1][0].plot(x1,weights_array1[:,1])
    axs1_ylabel_text=axs[1][0].set_ylabel('W1')
    plt.setp(axs1_ylabel_text,size=20,weight='bold',color='black')
    #w2 vs. iterations
    axs[2][0].plot(x1,weights_array1[:,2])
    axs2_xlabel_text=axs[2][0].set_xlabel('iterations')  #an x-axis label, not a title
    axs2_ylabel_text=axs[2][0].set_ylabel('W2')
    plt.setp(axs2_xlabel_text,size=20,weight='bold',color='black')
    plt.setp(axs2_ylabel_text,size=20,weight='bold',color='black')
    x2=np.arange(0,len(weights_array2),1)
    #w0 vs. updates (stochastic gradient ascent)
    axs[0][1].plot(x2,weights_array2[:,0])
    axs0_title_text=axs[0][1].set_title('Stochastic gradient ascent: coefficients vs. updates')
    axs0_ylabel_text=axs[0][1].set_ylabel('W0')
    plt.setp(axs0_title_text,size=20,weight='bold',color='black')
    plt.setp(axs0_ylabel_text,size=20,weight='bold',color='black')
    #w1 vs. updates
    axs[1][1].plot(x2,weights_array2[:,1])
    axs1_ylabel_text=axs[1][1].set_ylabel('W1')
    plt.setp(axs1_ylabel_text,size=20,weight='bold',color='black')
    #w2 vs. updates
    axs[2][1].plot(x2,weights_array2[:,2])
    axs2_xlabel_text=axs[2][1].set_xlabel('updates')  #an x-axis label, not a title
    axs2_ylabel_text=axs[2][1].set_ylabel('W2')
    plt.setp(axs2_xlabel_text,size=20,weight='bold',color='black')
    plt.setp(axs2_ylabel_text,size=20,weight='bold',color='black')
    plt.show()
#plot the trajectories
dataMat,labelMat=loadDataSet()
weights1,weights_array1=gradAscent(dataMat,labelMat)
weights2,weights_array2=stocGradAscent1(np.array(dataMat),labelMat)
plotWeights(weights_array1,weights_array2)
Stochastic gradient ascent runs only 150 iterations; its x-axis reaches 15000 because there are 100 samples, so the weights are updated 150*100 = 15000 times. The plot shows the coefficients settling after roughly 2000 updates, i.e. about 20 passes over the dataset, whereas batch gradient ascent needs around 300 passes over the full dataset to converge.
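The 15000 figure can be confirmed directly from the recorded trajectories (assuming the variables from the run above are still in scope):
#150 iterations x 100 samples = 15000 updates; the batch version records one row per cycle
print(weights_array2.shape)  #(15000, 3)
print(weights_array1.shape)  #(500, 3)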
def classifyVector(inX,weights):
    prob=sigmoid(sum(inX*weights))  #probability of class 1
    if prob>0.5:
        return 1.0
    else:
        return 0.0
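A minimal usage sketch, assuming the 1-D `weights` from the earlier stocGradAscent1 run on the toy dataset is still in scope (the point itself is made up):
#classify one [x0, x1, x2] point against the three learned weights
print(classifyVector(np.array([1.0,0.5,4.0]),weights))  #prints 1.0 or 0.0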
#horse-colic test using stochastic gradient ascent
def colicTest():
    frTrain=open('H:/机器学习课程资料/machinelearninginaction/Ch05/horseColicTraining.txt')
    frTest=open('H:/机器学习课程资料/machinelearninginaction/Ch05/horseColicTest.txt')
    trainingSet=[]
    trainingLabels=[]
    for line in frTrain.readlines():  #load the training set
        currLine=line.strip().split('\t')
        lineArr=[]
        for i in range(len(currLine)-1):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[-1]))
    trainWeights,_=stocGradAscent1(np.array(trainingSet),trainingLabels,500)  #the redefined version also returns the weight history; discard it
    errorCount=0
    numTestVec=0.0
    for line in frTest.readlines():  #score the test set
        numTestVec+=1.0
        currLine=line.strip().split('\t')
        lineArr=[]
        for i in range(len(currLine)-1):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(np.array(lineArr),trainWeights))!=int(currLine[-1]):
            errorCount+=1
    errorRate=(float(errorCount)/numTestVec)*100
    print('Test set error rate: %.2f%%'%errorRate)
#run the stochastic gradient ascent test
colicTest()
Test set error rate: 32.84%
#horse-colic test using batch gradient ascent
def colicTest2():
    frTrain=open('H:/机器学习课程资料/machinelearninginaction/Ch05/horseColicTraining.txt')
    frTest=open('H:/机器学习课程资料/machinelearninginaction/Ch05/horseColicTest.txt')
    trainingSet=[]
    trainingLabels=[]
    for line in frTrain.readlines():  #load the training set
        currLine=line.strip().split('\t')
        lineArr=[]
        for i in range(len(currLine)-1):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[-1]))
    trainWeights,_=gradAscent(np.array(trainingSet),trainingLabels)  #batch gradient ascent, not stochastic; discard the weight history
    errorCount=0
    numTestVec=0.0
    for line in frTest.readlines():  #score the test set
        numTestVec+=1.0
        currLine=line.strip().split('\t')
        lineArr=[]
        for i in range(len(currLine)-1):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(np.array(lineArr),trainWeights[:,0]))!=int(currLine[-1]):
            errorCount+=1
    errorRate=(float(errorCount)/numTestVec)*100
    print('Test set error rate: %.2f%%'%errorRate)
#run the batch gradient ascent test
colicTest2()
Test set error rate: 28.36%
Comparing the two results: with a small dataset, batch gradient ascent is the better choice; with a very large dataset, stochastic gradient ascent should be used instead.
from sklearn.linear_model import LogisticRegression

def colicSklearn():
    frTrain=open('H:/机器学习课程资料/machinelearninginaction/Ch05/horseColicTraining.txt')
    frTest=open('H:/机器学习课程资料/machinelearninginaction/Ch05/horseColicTest.txt')
    trainingSet=[]
    trainingLabels=[]
    testSet=[]
    testLabels=[]
    for line in frTrain.readlines():  #load the training set
        currLine=line.strip().split('\t')
        lineArr=[]
        for i in range(len(currLine)-1):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[-1]))
    for line in frTest.readlines():  #load the test set
        currLine=line.strip().split('\t')
        lineArr=[]
        for i in range(len(currLine)-1):
            lineArr.append(float(currLine[i]))
        testSet.append(lineArr)
        testLabels.append(float(currLine[-1]))
    classifier=LogisticRegression(solver='liblinear',max_iter=10)  #liblinear solver, capped at 10 iterations
    classifier.fit(trainingSet,trainingLabels)
    test_accuracy=classifier.score(testSet,testLabels)*100
    print('Accuracy: %f%%'%test_accuracy)

colicSklearn()
Accuracy: 73.134328%
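As a point of comparison, a hedged sketch of an alternative solver choice (the parameter values here are illustrative, not tuned): liblinear suits small datasets like this one, while solvers such as 'sag' are designed for large datasets and typically need scaled features and more iterations to converge. One could swap the classifier line inside colicSklearn for, e.g.:
from sklearn.linear_model import LogisticRegression
#'sag' targets large datasets; the higher max_iter gives it room to converge
classifier=LogisticRegression(solver='sag',max_iter=5000)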