%matplotlib inline
import seaborn
import matplotlib as mpl
# Use a serif font family for all matplotlib figures.
mpl.rcParams['font.family'] = 'serif'
import warnings; warnings.simplefilter('ignore') # Suppress warnings that may appear; warnings are not errors and can be ignored.
# 1. Data acquisition
import pandas as pd
import numpy as np
import tushare as ts
# Training set: HS300 index bars fetched through tushare for
# 2013-01-01 .. 2016-06-25 (presumably daily bars, tushare's default -- confirm).
hs300 = ts.get_k_data('hs300', start='2013-01-01', end='2016-06-25')
# Index by trade date exactly once. Calling set_index('date') a second time
# raises KeyError because 'date' is no longer a column after the first call.
hs300.set_index('date', inplace=True)
# Simple return of the close price from one row to the next.
hs300['returns'] = hs300['close'].pct_change()
# Drop rows that contain NaN (at least the first row, whose return is
# undefined); NaN values would disturb the machine-learning algorithms.
hs300.dropna(inplace=True)
# 2. Data processing -- feature engineering
# Feature generation: for each lag i = 1..7 add a column holding the close
# price shifted down by i rows (the close observed i rows earlier).
for i in range(1, 8):
    hs300['close - ' + str(i) + 'd'] = hs300['close'].shift(i)
# Keep every column whose name contains 'close' and start at the 8th row,
# the first row where none of the lagged columns is NaN; these are the features.
hs_7d = hs300[[x for x in hs300.columns if 'close' in x]].iloc[7:]
import sklearn
import sklearn.preprocessing  # load the submodule explicitly instead of relying on a transitive import
from sklearn import linear_model
X_train = hs_7d
X_train = sklearn.preprocessing.scale(X_train)  # standardize the features
X_train
# 3. Implementing logistic regression to predict the stock-price trend
# Logistic-regression classifier; C is the inverse regularization strength,
# so C=1000 applies only weak regularization.
lm = linear_model.LogisticRegression(C=1000)
# Training labels: the sign of the NEXT row's close-to-close return.
# shift(-1) pulls the following row's return up onto the current row, so each
# feature row is paired with the move that comes after it (this is essential).
y_train = np.sign(hs_7d['close'].pct_change().shift(-1))
# The last row has no following return; label that NaN as 0.
y_train = y_train.fillna(0)
# Hand scikit-learn a plain 1-D array: Series.reshape no longer exists in
# pandas, and a (n, 1) column vector is not the expected target shape.
y_train = y_train.values
y_train[-10:]
lm.fit(X_train, y_train)
lm.score(X_train, y_train)  # in-sample accuracy on the training set
hs300['prediction'] = np.nan
# Write the predicted training labels into rows 7.. (the rows that make up
# hs_7d). A single positional .iloc assignment replaces the removed .ix
# indexer and avoids chained assignment, which may silently write to a copy.
hs300.iloc[7:, hs300.columns.get_loc('prediction')] = lm.predict(X_train)
# Strategy equity curve: the previous row's prediction (-1, 0 or 1) is the
# position held over the current row's return.
hs300['strategy'] = (hs300['prediction'].shift(1) * hs300['returns'] + 1).cumprod()
# Benchmark equity curve: cumulative product of (1 + return).
hs300['cum_ret'] = (hs300['returns'] + 1).cumprod()
hs300[['strategy', 'cum_ret']].dropna().plot(figsize=(10, 6))
# 4. Switching the algorithm: SVM
# NOTE(review): the SVM is fitted on the raw, unscaled feature frame, unlike
# the logistic regression above which used standardized features. This is
# kept as-is because the SVM test section also feeds it unscaled features;
# confirm whether scaling both was intended.
X_train = hs_7d
from sklearn.svm import SVC
clf_SVC = SVC(kernel='linear')
clf_SVC.fit(X_train, y_train)
clf_SVC.score(X_train, y_train)  # in-sample accuracy on the training set
hs300['prediction'] = np.nan
# Write the predicted training labels into rows 7.. with one positional .iloc
# assignment; the removed .ix indexer combined with chained assignment may
# silently write to a copy.
hs300.iloc[7:, hs300.columns.get_loc('prediction')] = clf_SVC.predict(X_train)
# Strategy equity curve: the previous row's prediction is the position held
# over the current row's return.
hs300['strategy'] = (hs300['prediction'].shift(1) * hs300['returns'] + 1).cumprod()
# Benchmark equity curve: cumulative product of (1 + return).
hs300['cum_ret'] = (hs300['returns'] + 1).cumprod()
hs300[['strategy', 'cum_ret']].dropna().plot(figsize=(10, 6))
# 5. Validating the logistic-regression model on the test set
# Test set: HS300 bars for 2016-07-01 .. 2017-06-30, prepared with the same
# steps as the training set.
hs300_test = ts.get_k_data('hs300', start='2016-07-01', end='2017-06-30')
hs300_test.set_index('date', inplace=True)
hs300_test['returns'] = hs300_test['close'].pct_change()
hs300_test.dropna(inplace=True)
# Lagged close prices for lags 1..7, mirroring the training features.
for i in range(1, 8):
    hs300_test['close - ' + str(i) + 'd'] = hs300_test['close'].shift(i)
# Keep the 'close' columns from the 8th row on, where no lag is NaN.
hs_7d_test = hs300_test[[x for x in hs300_test.columns if 'close' in x]].iloc[7:]
X_test = hs_7d_test
# NOTE(review): the test features are standardized with their own mean/std
# rather than the training-set statistics; consider reusing a scaler fitted
# on the training data.
X_test = sklearn.preprocessing.scale(X_test)
X_test
hs300_test['prediction'] = np.nan
# Given the test features, predict() returns the predicted test labels. They
# are written into rows 7.. with one positional .iloc assignment (the removed
# .ix indexer with chained assignment may silently write to a copy).
hs300_test.iloc[7:, hs300_test.columns.get_loc('prediction')] = lm.predict(X_test)
# Strategy equity curve: the previous row's prediction is the position held
# over the current row's return.
hs300_test['strategy'] = (hs300_test['prediction'].shift(1) * hs300_test['returns'] + 1).cumprod()
# Benchmark equity curve: cumulative product of (1 + return).
hs300_test['cum_ret'] = (hs300_test['returns'] + 1).cumprod()
hs300_test[['strategy', 'cum_ret']].dropna().plot(figsize=(10, 6))
# 6. Validating the SVM model on the test set
# The SVM receives the raw (unscaled) test feature frame.
X_test = hs_7d_test
hs300_test['prediction'] = np.nan
# Write the SVM's predicted test labels into rows 7.. with one positional
# .iloc assignment; the removed .ix indexer combined with chained assignment
# may silently write to a copy.
hs300_test.iloc[7:, hs300_test.columns.get_loc('prediction')] = clf_SVC.predict(X_test)
# Strategy equity curve: the previous row's prediction is the position held
# over the current row's return.
hs300_test['strategy'] = (hs300_test['prediction'].shift(1) * hs300_test['returns'] + 1).cumprod()
# Benchmark equity curve: cumulative product of (1 + return).
hs300_test['cum_ret'] = (hs300_test['returns'] + 1).cumprod()
hs300_test[['strategy', 'cum_ret']].dropna().plot(figsize=(10, 6))