from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()
>>> print(digits.data)
[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ...,
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]
>>> from sklearn import svm
>>> clf = svm.SVC(gamma=0.001, C=100.)
The estimator's core interface consists of fit(X, y) and predict(T): fit learns a model from training data, and predict labels new samples T, as in the sketch below.
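A minimal sketch of that interface, using the digits data and an SVC like the one above (holding out the last sample, as in the scikit-learn quick start):

from sklearn import datasets, svm

digits = datasets.load_digits()
clf = svm.SVC(gamma=0.001, C=100.)
clf.fit(digits.data[:-1], digits.target[:-1])  # learn from all but the last image
print(clf.predict(digits.data[-1:]))           # predict the digit of the held-out image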
X, y = iris.data, iris.target
iris.data.shape
gives (150, 4): 150 samples, each with 4 features.
digits.target gives the true digit for every sample in the digits dataset; it is what we want our program to learn.
6. Saving a scikit-learn model with pickle
>>> import pickle
>>> s = pickle.dumps(clf)
>>> clf2 = pickle.loads(s)
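For big models containing large numpy arrays, the scikit-learn docs of this era point to joblib as an alternative that persists to disk; a minimal sketch (the filename 'clf.pkl' is just an example):

>>> from sklearn.externals import joblib
>>> joblib.dump(clf, 'clf.pkl')
>>> clf3 = joblib.load('clf.pkl')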
An estimator is any object that learns from data: it may implement a classification, regression, or clustering algorithm, or be a transformer.
Whatever the algorithm, every estimator object exposes a fit method that operates on a dataset.
All of an estimator's parameters can be set at initialization time, or modified afterwards via the corresponding attributes:
>>> estimator = Estimator(param1=1, param2=2)
>>> estimator.param1
1
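A concrete sketch with SVC (the parameter values here are arbitrary), showing both ways to set a parameter:

from sklearn import svm

clf = svm.SVC(C=1.0)    # parameter set at construction time
clf.set_params(C=10.0)  # modified afterwards; every estimator has set_params
clf.C                   # -> 10.0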
Given samples X with unknown labels, predict returns the predicted labels y.
Every estimator also exposes a score method that evaluates the estimator on test data: the higher the score, the better the model fits the data.
>>> import numpy as np
>>> iris_X, iris_y = iris.data, iris.target
>>> # Split iris data in train and test data
>>> # A random permutation, to split the data randomly
>>> np.random.seed(0)
>>> indices = np.random.permutation(len(iris_X))
>>> iris_X_train = iris_X[indices[:-10]]
>>> iris_y_train = iris_y[indices[:-10]]
>>> iris_X_test = iris_X[indices[-10:]]
>>> iris_y_test = iris_y[indices[-10:]]
>>> # Create and fit a nearest-neighbor classifier
>>> from sklearn.neighbors import KNeighborsClassifier
>>> knn = KNeighborsClassifier()
>>> knn.fit(iris_X_train, iris_y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
metric_params=None, n_jobs=1, n_neighbors=5, p=2,
weights='uniform')
>>> knn.predict(iris_X_test)
array([1, 2, 1, 0, 0, 0, 2, 1, 2, 0])
>>> iris_y_test
array([1, 1, 1, 0, 0, 0, 2, 1, 2, 0])
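Since every estimator exposes score (see above), the same comparison collapses into one number: 9 of the 10 test samples are predicted correctly.

>>> knn.score(iris_X_test, iris_y_test)  # mean accuracy on the held-out samples
0.9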
import csv
rf = open('bank.csv','rb')
reader = csv.reader(rf)
Note: under Python 2 (used here), a csv file must be opened in binary mode, hence 'rb'.
reader is an iterator: it can only be consumed with next() and for loops.
reader.next() returns the first row.
To see all remaining rows, use a for loop:
for row in reader:
    print row
Next, writing to a csv file:
wf = open('bank2.csv','wb')
writer = csv.writer(wf)
writer.writerow(['id','age','sex','region','income','married','children','car','save_act','current_act','mortgage','pep'])
writer.writerow(reader.next())
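Under Python 3 the same pattern changes slightly: files are opened in text mode with newline='', and the iterator is advanced with the built-in next(). A sketch, reusing the same bank.csv/bank2.csv files:

import csv

with open('bank.csv', newline='') as rf, open('bank2.csv', 'w', newline='') as wf:
    reader = csv.reader(rf)
    writer = csv.writer(wf)
    writer.writerow(next(reader))  # copy the header row
    for row in reader:
        writer.writerow(row)       # copy the remaining rows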
In its simplest form, linear regression fits a linear model to a dataset by adjusting a set of parameters so that the residual sum of squares is as small as possible.
Linear model: y = Xβ + ε
- X: the data
- y: the target variable
- β: the coefficients
- ε: the observation noise
>>> from sklearn import linear_model
>>> diabetes = datasets.load_diabetes()
>>> diabetes_X_train = diabetes.data[:-20]
>>> diabetes_X_test = diabetes.data[-20:]
>>> diabetes_y_train = diabetes.target[:-20]
>>> diabetes_y_test = diabetes.target[-20:]
>>> regr = linear_model.LinearRegression()
>>> regr.fit(diabetes_X_train, diabetes_y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
>>> print(regr.coef_)
[ 0.30349955 -237.63931533 510.53060544 327.73698041 -814.13170937
492.81458798 102.84845219 184.60648906 743.51961675 76.09517222]
>>> # The mean squared error
>>> np.mean((regr.predict(diabetes_X_test)-diabetes_y_test)**2)
2004.56760268...
>>> # Explained variance score: 1 is perfect prediction
>>> # and 0 means that there is no linear relationship
>>> # between X and Y.
>>> regr.score(diabetes_X_test, diabetes_y_test)
0.5850753022690...
>>> import numpy as np
>>> digits = datasets.load_digits()
>>> X_digits, y_digits = digits.data, digits.target
>>> X_folds = np.array_split(X_digits, 3)
>>> y_folds = np.array_split(y_digits, 3)
Cross-validation generators produce lists of indices that split the data:
>>> from sklearn import cross_validation
>>> k_fold = cross_validation.KFold(n=6, n_folds=3)
>>> for train_indices, test_indices in k_fold:
... print('Train: %s | test: %s' % (train_indices, test_indices))
Train: [2 3 4 5] | test: [0 1]
Train: [0 1 4 5] | test: [2 3]
Train: [0 1 2 3] | test: [4 5]
With a cross-validation generator, implementing cross-validation becomes straightforward:
>>> from sklearn import svm
>>> svc = svm.SVC(C=1, kernel='linear')
>>> kfold = cross_validation.KFold(len(X_digits), n_folds=3)
>>> [svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test])
... for train, test in kfold]
[0.93489148580968284, 0.95659432387312182, 0.93989983305509184]
To compute an estimator's score over each cross-validation fold, sklearn exposes a helper function:
>>> cross_validation.cross_val_score(svc, X_digits, y_digits, cv=kfold, n_jobs=-1)
array([ 0.93489149, 0.95659432, 0.93989983])
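cv may also simply be an integer; in the scikit-learn versions of this era an integer cv for a classifier uses stratified folds, so the scores can differ slightly from the plain KFold numbers above. A minimal sketch:

>>> cross_validation.cross_val_score(svc, X_digits, y_digits, cv=3)

The n_jobs=-1 in the earlier call means the fold computations are dispatched over all CPU cores of the machine.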
from numpy.linalg import inv
from numpy import dot, transpose
X = [[1, 6, 2], [1, 8, 1], [1, 10, 0], [1, 14, 2], [1, 18, 0]]
y = [[7], [9], [13], [17.5], [18]]
# Ordinary least squares via the normal equation: beta = inv(X'X) X'y
print(dot(inv(dot(transpose(X), X)), dot(transpose(X), y)))
from numpy.linalg import lstsq
print(lstsq(X, y)[0])
(1) Build simple and multiple linear regression models with linear_model.LinearRegression; build polynomial regression models by combining LinearRegression with preprocessing.PolynomialFeatures; build stochastic gradient descent (SGD) models with linear_model.SGDRegressor;
(2) Fit with model.fit(), predict with model.predict(), and compute the test-set R-squared with model.score();
(3) Using cross_validation, split training and test sets with train_test_split() and compute cross-validated R-squared with cross_val_score();
from sklearn.linear_model import LinearRegression
X = [[8], [9], [11], [16], [12]]
y = [[11], [8.5], [15], [18], [11]]
model = LinearRegression()
model.fit(X, y)  # fit a simple one-feature linear regression model
print('Predicted price of a 12-inch pizza: $%.2f' % model.predict([[12]])[0])  # single-sample prediction; predict expects a 2-d input
X = [[6, 2], [8, 1], [10, 0], [14, 2], [18, 0]]
y = [[7], [9], [13], [17.5], [18]]
model = LinearRegression()
model.fit(X, y)  # fit a two-feature linear regression model
X_test = [[8, 2], [9, 0], [11, 2], [16, 2], [12, 0]]
y_test = [[11], [8.5], [15], [18], [11]]
predictions = model.predict(X_test)  # predict a batch of samples
model.score(X_test, y_test)  # LinearRegression's score method computes R-squared
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
X_train = [[6], [8], [10], [14], [18]]
y_train = [[7], [9], [13], [17.5], [18]]  # targets must be column vectors, not row vectors
X_test = [[6], [8], [11], [16]]  # test data
y_test = [[8], [12], [15], [18]]
quadratic_featurizer = PolynomialFeatures(degree=2)  # degree sets the highest polynomial order
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)  # expand the inputs for fit()
regressor_quadratic = LinearRegression()
regressor_quadratic.fit(X_train_quadratic, y_train)  # fit the polynomial model
X_test_quadratic = quadratic_featurizer.transform(X_test)  # expand the test inputs the same way
y_test_quadratic = regressor_quadratic.predict(X_test_quadratic)  # predict with the fitted model
regressor = LinearRegression()
regressor.fit(X_train, y_train)  # plain linear model, for comparison
print('simple linear regression r-squared', regressor.score(X_test, y_test))
print('quadratic regression r-squared', regressor_quadratic.score(X_test_quadratic, y_test))
import pandas as pd
from sklearn.cross_validation import train_test_split
df = pd.read_csv('mlslpic/winequality-red.csv', sep=';')
X = df[list(df.columns)[:-1]]
y = df['quality']
X_train, X_test, y_train, y_test = train_test_split(X, y)
The train_size argument changes the train/test split ratio:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)
from sklearn.cross_validation import cross_val_score
regressor = LinearRegression()
scores = cross_val_score(regressor, X, y, cv=5)
print(scores.mean(), scores)
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
data = load_boston()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target)
X_scaler = StandardScaler()
y_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
y_train = y_scaler.fit_transform(y_train)
X_test = X_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)
regressor = SGDRegressor(loss='squared_loss')
regressor.fit(X_train, y_train)  # fit the model (fit, not fit_transform)
print('Test-set R-squared:', regressor.score(X_test, y_test))
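As a possible sanity check, cross_val_score from earlier can be reused on the scaled training data (a sketch; cv=5 is arbitrary):

from sklearn.cross_validation import cross_val_score
scores = cross_val_score(regressor, X_train, y_train, cv=5)
print('cross-validated R-squared:', scores.mean())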
>>> a = [1,2,3,4,5,6]
>>> a[:-1]  # output: [1, 2, 3, 4, 5]
>>> a[1:2]  # output: [2]
>>> a[1:]   # output: [2, 3, 4, 5, 6]
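Two more slice forms that come up often (same list a):

>>> a[-2:]   # last two elements: [5, 6]
>>> a[::-1]  # reversed copy: [6, 5, 4, 3, 2, 1]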
Looping with enumerate() yields each element's index idx (first) and its value val (second):
for idx, val in enumerate(ints):
    print(idx, val)
linspace() splits an interval into a fixed number of evenly spaced points:
xx = np.linspace(0, 26, 5)
xx  # output: array([  0. ,   6.5,  13. ,  19.5,  26. ])
The difference between arange (takes a step size) and linspace (takes a number of points):
X = np.arange(-6, 6, 5); X
# output: array([-6, -1,  4])
X = np.arange(-6, 6, 1); X
# output: array([-6, -5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5])
X = np.linspace(-6, 6, 5); X
# output: array([-6., -3.,  0.,  3.,  6.])
Training and classifying with the LogisticRegression classifier:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
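As with the other estimators, score then gives a quick accuracy check (assuming an X_test/y_test split like the one prepared earlier):

classifier.score(X_test, y_test)  # mean accuracy of the predictions on the test set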
The math library:
import math
print math.sin(10)    # sine
print math.cos(10)    # cosine
print math.tan(10)    # tangent
print math.asin(0.5)  # arc sine; the argument must lie in [-1, 1]
print math.acos(0.5)  # arc cosine; the argument must lie in [-1, 1]
print math.atan(10)   # arc tangent
print math.sinh(10) # hyperbolic sine
print math.cosh(10) # hyperbolic cosine
print math.tanh(10) # hyperbolic tangent
print math.pow(2, 4) # 2 raised to 4
print math.exp(4) # e ^ 4
print math.sqrt(10) # square root
print math.pow(5, 1/3.0) # cubic root of 5
print math.log(3) # ln; natural logarithm
print math.log(100, 10) # base 10
print math.ceil(2.3) # ceiling
print math.floor(2.7) # floor
print math.pi
print math.e
numpy.linalg.eig() computes eigenvalues and eigenvectors:
import numpy as np
w, v = np.linalg.eig(np.array([[1, -2], [2, -6]]))
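A quick sketch verifying what eig returns: column i of v is the eigenvector belonging to eigenvalue w[i], so A·v_i = w_i·v_i up to floating-point tolerance:

import numpy as np

A = np.array([[1, -2], [2, -6]])
w, v = np.linalg.eig(A)
for i in range(len(w)):
    assert np.allclose(A.dot(v[:, i]), w[i] * v[:, i])  # A v_i = w_i v_i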
scikit-learn user manual: http://download.csdn.net/detail/ssrob/8757217