机器学习第8章--预测回归

regression.py

from numpy import*
import matplotlib.pyplot as plt
#数据导入函数,返回dataMat,labelMat
def loadDataSet(fileName):
    """Parse a tab-delimited file of floats.

    Every column except the last is treated as a feature; the last column
    is the target value.

    Args:
        fileName: path to a tab-delimited text file of floats.

    Returns:
        (dataMat, labelMat): a list of feature rows (list of lists of
        floats) and a list of target floats.
    """
    # Count feature columns from the first line; the last field is the label.
    with open(fileName) as fr:
        numFeat = len(fr.readline().split('\t')) - 1
    dataMat = []
    labelMat = []
    # Reopen so the first data line is not skipped by the probe above.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            dataMat.append([float(curLine[i]) for i in range(numFeat)])
            labelMat.append(float(curLine[-1]))
    return dataMat, labelMat

#标准回归函数,返回ws
def standRegres(xArr, yArr):
    """Ordinary least-squares linear regression via the normal equation.

    Args:
        xArr: list of feature rows (each row should include the bias column).
        yArr: list of target values.

    Returns:
        Column matrix of regression weights ws, or None when X^T X is
        singular and cannot be inverted.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    xTx = xMat.T * xMat
    # A zero determinant means X^T X has no inverse; report and bail out.
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws
#测试普通的拟合
def _test_8_1():
    """Plot the ex0.txt data points together with the OLS fit line.

    Reads "ex0.txt" from the current directory and shows a matplotlib
    window; no value is returned.
    """
    xArr, yArr = loadDataSet("ex0.txt")
    xMat = mat(xArr)
    yMat = mat(yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0])
    # Sort the x values so the fitted line is drawn left to right.
    xCopy = xMat.copy()
    xCopy.sort(0)
    ws = standRegres(xArr, yArr)
    yHat = xCopy * ws
    ax.plot(xCopy[:, 1], yHat)
    ax.plot(ws)  # kept from the original: plots the raw coefficients too
    print(ws)
    plt.show()

#_test_8_1()

#局部加权线性回归函数,返回ws,这里是单个点的估计
def lwlr(testPoint, xArr, yArr, k=1.0):
    """Locally weighted linear regression estimate for a single point.

    Every training sample is weighted with a Gaussian kernel centered on
    testPoint; k controls the kernel width (smaller k -> more local fit).

    Args:
        testPoint: 1 x n row (matrix or array) to predict for.
        xArr: training feature rows.
        yArr: training target values.
        k: Gaussian kernel bandwidth.

    Returns:
        The prediction testPoint * ws (1x1 matrix), or None when the
        weighted X^T X matrix is singular.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    m = shape(xMat)[0]
    weights = mat(eye((m)))
    for j in range(m):
        # Gaussian kernel weight for training sample j.
        diffMat = testPoint - xMat[j, :]
        weights[j, j] = exp(diffMat * diffMat.T / (-2.0 * k ** 2))
    xTx = xMat.T * (weights * xMat)
    if linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * (weights * yMat))
    return testPoint * ws
#返回所有点的估计(yHat)
def lwlrTest(testArr, xArr, yArr, k=1.0):
    """Apply locally weighted regression to every point in testArr.

    Returns a 1-D array with one prediction per row of testArr.
    """
    numPoints = shape(testArr)[0]
    predictions = zeros(numPoints)
    for idx in range(numPoints):
        predictions[idx] = lwlr(testArr[idx], xArr, yArr, k)
    return predictions

#测试局部加权拟合
def _test_8_2(k):
    """Plot the ex0.txt data with a locally weighted regression fit.

    Args:
        k: kernel bandwidth passed through to lwlrTest (k=1 underfits,
           k=0.003 overfits on this data set).
    """
    xArr, yArr = loadDataSet("ex0.txt")
    yHat = lwlrTest(xArr, xArr, yArr, k)
    xMat = mat(xArr)
    yMat = mat(yArr)
    # Sort by the second column so the fitted curve is drawn in order.
    srtInd = xMat[:, 1].argsort(0)
    xSort = xMat[srtInd][:, 0, :]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xMat[:, 1].flatten().A[0], yMat.T[:, 0].flatten().A[0], s=2, c='red')
    ax.plot(xSort[:, 1], yHat[srtInd])
    plt.show()

#_test_8_2(0.01) #k=1时欠拟合,k=0.003时过拟合
#测试误差,模拟核
def rssError(yArr, yHatArr):
    """Residual sum of squares between actual and predicted value arrays."""
    residuals = yArr - yHatArr
    return (residuals ** 2).sum()
def _test_wucha_():
    """Compare LWLR training vs. held-out errors on the abalone data.

    Demonstrates that a small kernel (k=0.1) fits the training slice best
    but generalizes worst on new data.
    """
    abX, abY = loadDataSet("abalone.txt")
    # Errors measured on the training slice itself.
    yHat01 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1)
    yHat1 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1)
    yHat10 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10)
    print(rssError(abY[0:99], yHat01.T))
    print(rssError(abY[0:99], yHat1.T))
    print(rssError(abY[0:99], yHat10.T))
    print("Look at the new data\n")
    # Errors measured on held-out data.
    yHat01 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1)
    yHat1 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1)
    yHat10 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10)
    print(rssError(abY[100:199], yHat01.T))
    print(rssError(abY[100:199], yHat1.T))
    print(rssError(abY[100:199], yHat10.T))
#_test_wucha_()

#返回的单个的系数向量
def ridgeRegres(xMat, yMat, lam=0.2):
    """Ridge regression weights for a single value of lambda.

    Args:
        xMat: feature matrix (numpy matrix).
        yMat: column matrix of target values.
        lam: regularization strength added to the diagonal of X^T X.

    Returns:
        Column matrix of weights, or None when the regularized matrix is
        still singular (only possible when lam == 0).
    """
    xTx = xMat.T * xMat
    denom = xTx + eye(shape(xMat)[1]) * lam
    if linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws
#返回的是系数矩阵,系数矩阵里面的每一个系数向量对应了一个不同的拉姆达
def ridgeTest(xArr, yArr):
    """Run ridge regression over 30 lambda values exp(-10) .. exp(19).

    The targets are mean-centered and each feature column is standardized
    before fitting, so no bias column is required.

    Returns:
        A (30, n) array whose i-th row holds the weights for
        lambda = exp(i - 10).
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    # Center y: removes the need for an intercept term.
    yMat = yMat - mean(yMat, 0)
    # Standardize X: subtract column means, divide by column variances.
    xMat = (xMat - mean(xMat, 0)) / var(xMat, 0)
    numTestPts = 30
    wMat = zeros((numTestPts, shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, exp(i - 10))
        wMat[i, :] = ws.T
    return wMat
#测试一下加入拉姆达之后鲍鱼的年龄预测的误差还是不是和用局部线性回归那样大了,
#绘制了回归系数随着拉姆达的变化的变化,便于在拉姆达一定范围内去寻求最佳拟合。
def _test_8_3():
    """Plot how the ridge regression coefficients change with lambda.

    Reads "abalone.txt" and shows the coefficient trace, which helps pick
    a lambda range for the best fit.
    """
    abX, abY = loadDataSet("abalone.txt")
    ridgeWeights = ridgeTest(abX, abY)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(ridgeWeights)
    print(ridgeWeights)
    plt.show()
#_test_8_3()
#输入矩阵标准化
def regularize(xMat):
    """Standardize a matrix by columns: subtract the column mean and
    divide by the column variance (the book's convention, not std-dev)."""
    standardized = xMat.copy()
    columnMeans = mean(standardized, 0)
    columnVars = var(standardized, 0)
    standardized = (standardized - columnMeans) / columnVars
    return standardized
#逐步线性回归算法
def stageWise(xArr, yArr, eps=0.01, numIt=100):
    """Forward stagewise linear regression.

    On each of numIt iterations, every weight is tentatively nudged by
    +/- eps and the single change that lowers the residual squared error
    the most is kept.

    Args:
        xArr: training feature rows.
        yArr: training target values.
        eps: step size for each weight adjustment.
        numIt: number of iterations.

    Returns:
        A (numIt, n) array with the weight vector after each iteration.
    """
    xMat = mat(xArr)
    yMat = mat(yArr).T
    yMat = yMat - mean(yMat, 0)  # center the targets
    xMat = regularize(xMat)
    m, n = shape(xMat)
    weightHistory = zeros((numIt, n))
    ws = zeros((n, 1))
    wsMax = ws.copy()
    for it in range(numIt):
        bestError = inf
        for featIdx in range(n):
            for sign in (-1, 1):
                candidate = ws.copy()
                candidate[featIdx] += eps * sign
                yTest = xMat * candidate
                err = rssError(yMat.A, yTest.A)
                if err < bestError:
                    bestError = err
                    wsMax = candidate
        ws = wsMax.copy()
        weightHistory[it, :] = ws.T
    return weightHistory
#看迭代矩阵的变化
def _test_8_4():
    """Plot how the stagewise regression weights evolve over iterations.

    Reads "abalone.txt" and shows one trace per coefficient.
    """
    xArr, yArr = loadDataSet("abalone.txt")
    stageWeights = stageWise(xArr, yArr, 0.005, 1000)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(stageWeights)
    plt.show()


_test_8_4()

test_lego.py

# Fixed the paste-mangled "from time importsleep" (missing space made it a
# SyntaxError).
from time import sleep
import json
import urllib.request
import socket

socket.setdefaulttimeout(20)  # 20-second timeout for all socket operations

def searchForSet(retX, retY, setNum, yr, numPce, origPrc):
    """Query the Google Shopping API for a LEGO set and collect prices.

    Appends a feature row [year, pieces, newFlag, origPrice] to retX and
    the observed selling price to retY for every listing priced above
    half the original price.

    Args:
        retX, retY: output lists, mutated in place.
        setNum: LEGO set number to search for.
        yr: release year of the set.
        numPce: number of pieces in the set.
        origPrc: original retail price.
    """
    sleep(10)  # throttle requests to stay under the API rate limit
    myAPIstr = 'AIzaSyD2cR2KFyx12hXu6PFU-wrWot3NXvko8vY'
    searchURL = 'https://www.googleapis.com/shopping/search/v1/public/products?key=%s&country=US&q=lego+%d&alt=json' % (
        myAPIstr, setNum)
    pg = urllib.request.urlopen(searchURL)
    try:
        retDict = json.loads(pg.read())
        for i in range(len(retDict['items'])):
            try:
                currItem = retDict['items'][i]
                if currItem['product']['condition'] == 'new':
                    newFlag = 1
                else:
                    newFlag = 0
                listOfInv = currItem['product']['inventories']
                for item in listOfInv:
                    sellingPrice = item['price']
                    # Listings at <= half the original price are assumed to
                    # be incomplete sets and are skipped.
                    if sellingPrice > origPrc * 0.5:
                        print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                        retX.append([yr, numPce, newFlag, origPrc])
                        retY.append(sellingPrice)
            except Exception:  # was a bare except; narrowed so Ctrl-C still works
                print('problem with item %d' % i)
    finally:
        pg.close()  # always release the HTTP response, even on parse errors
def setDataCollect(retX, retY):
    """Collect price data for six well-known LEGO sets into retX/retY."""
    # (set number, release year, piece count, original retail price)
    knownSets = [
        (8288, 2006, 800, 49.99),
        (10030, 2002, 3096, 269.99),
        (10179, 2007, 5195, 499.99),
        (10181, 2007, 3428, 199.99),
        (10189, 2008, 5922, 299.99),
        (10196, 2009, 3263, 249.99),
    ]
    for setNum, year, pieces, price in knownSets:
        searchForSet(retX, retY, setNum, year, pieces, price)
# Collected feature rows and selling prices for the LEGO sets.
lgX = []
lgY = []
setDataCollect(lgX, lgY)

 


你可能感兴趣的:(机器学习第8章--预测回归)