# optimize test1
# 先简单了解一下数据的大概情况
# import pandas as pd
# data = pd.read_excel('data.xlsx')
# print(data.shape) # (27722,8)
# print(data.isnull().any()) # 查看数据每列是否有缺失值
# print(data.isnull().sum().tolist()) # 查看数据每列一共有多少个缺失值 各列缺失值总和分别为[0, 0, 3, 16, 3, 111, 3, 3]
# stocks = pd.unique(data['Stkcd'])
# print(stocks.size) # 输出3620,一共有3620支股票
def question_one(df):
"""
:param df: 原始数据,DataFrame类型
:return: 一个Series类型数据,包含每支股票四个季度每股同比增长率;一个列表类型数据,为连续四个季度每股收益同比增长率大于等于20%的股票的代码
"""
# 定义计算每股收益同比增长率函数
# a季度每股收益同比增长率 = (今年a季度母公司的每股收益 - 去年a季度母公司的每股收益) ÷ 去年a季度母公司的每股收益
def calculate_rate(x):
"""
:param x: df分组后的数据
:return: 每支股票的四个季度每股同比增长率的列表
"""
r1 = (x.iloc[4, 2] - x.iloc[0, 2]) / x.iloc[0, 2]
r2 = (x.iloc[5, 2] - x.iloc[1, 2]) / x.iloc[1, 2]
r3 = (x.iloc[6, 2] - x.iloc[2, 2]) / x.iloc[2, 2]
r4 = (x.iloc[7, 2] - x.iloc[3, 2]) / x.iloc[3, 2]
return list(map(lambda t:round(t,2), [r1,r2,r3,r4]))
df.dropna(inplace=True)
# 考虑到仅仅是删除缺失值,这种操作可能导致某一股票的四个季度中缺少几个季度的数据,因此将数据不足8份的股票都删除
filtered_df = df.groupby(by='Stkcd').filter(lambda t: len(t)==8)
cal_df = filtered_df.groupby(by='Stkcd').apply(calculate_rate,include_groups=False)
# 筛选每股收益同比增长率连续四个季度大于20%的股票
all_great_then = cal_df.apply(lambda t: all(list(map(lambda y: y>=0.2,t))))
result_stock = all_great_then[all_great_then.values == True]
return cal_df,result_stock.index.tolist()
def question_two(df):
"""
:param df: 原始数据,DataFrame类型
:return: 2017年、2018年每股资本公积和每股未分配利润最大的10只股票,DataFrame类型
"""
# 同问题一一样处理数据
df.dropna(inplace=True)
filtered_df = df.groupby(by='Stkcd').filter(lambda t:len(t)==8)
def calculate_gonji(x):
s1 = x.head(4).iloc[:,4].sum()
s2 = x.tail(4).iloc[:,4].sum()
return round(s1,2),round(s2,2)
def calculate_wfp(x):
s1 = x.head(4).iloc[:,5].sum()
s2 = x.tail(4).iloc[:,5].sum()
return round(s1,2),round(s2,2)
result_gonji = filtered_df.groupby(by='Stkcd').apply(calculate_gonji,include_groups=False)
result_wfp = filtered_df.groupby(by='Stkcd').apply(calculate_wfp,include_groups=False)
# 取出2017年、2018男的每股资本公积年度总和,每股未分配利润年度总和
gonji_2017 = []
gonji_2018 = []
wfp_2017 = []
wfp_2018 = []
for i in range(result_gonji.shape[0]):
gonji_2017.append(result_gonji.values[i][0])
gonji_2018.append(result_gonji.values[i][1])
wfp_2017.append(result_wfp.values[i][0])
wfp_2018.append(result_wfp.values[i][1])
# 整理成数据框DataFrame
R = pd.DataFrame(
data={"gonji_2017":gonji_2017, "gonji_2018":gonji_2018, "wfp_2017":wfp_2017, "wfp_2018":wfp_2018},
index=result_gonji.index
)
# 排序,获取排名前十的股票
sort_gonji_17 = R['gonji_2017'].sort_values(ascending=False)
sort_gonji_18 = R['gonji_2018'].sort_values(ascending=False)
sort_wfp_17 = R['wfp_2017'].sort_values(ascending=False)
sort_wfp_18 = R['wfp_2018'].sort_values(ascending=False)
return sort_gonji_17.head(10),sort_gonji_18.head(10),sort_wfp_17.head(10),sort_wfp_18.head(10)
def question_three(df2):
    """
    Standardize the 2018 rows of the second data set and run PCA on them.

    :param df2: second raw data set (data2), DataFrame with an ``Accper``
        date column; it is NOT modified
    :return: in order: the standardized data, the principal-component
        scores, and ``pca.components_`` (the component vectors)
    """
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # dropna() returns a new frame, so the date conversion below is applied
    # to that copy and never reaches the caller's DataFrame.
    cleaned = df2.dropna()
    cleaned['Accper'] = pd.to_datetime(cleaned['Accper'])
    data_2018 = cleaned[cleaned['Accper'].dt.year == 2018]
    # Everything from the third column onwards is used as a feature.
    x = data_2018.iloc[:, 2:]
    X = StandardScaler().fit_transform(x)
    # A float n_components asks PCA for as many components as are needed to
    # reach that share of explained variance (see scikit-learn PCA docs).
    pca = PCA(n_components=0.95)
    Y = pca.fit_transform(X)
    tzxl = pca.components_
    return X, Y, tzxl
def question_four(ndarray):
    """
    Cluster the data with KMeans, choosing the cluster count by silhouette.

    Every cluster count from 2 to 10 is tried; the silhouette score of each
    fit is rounded to 2 decimals and the count with the highest rounded
    score wins (on a tie the smallest count is kept).

    :param ndarray: array data, the principal components returned by
        question_three()
    :return: cluster centers of the final model, as an array
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    silhouette_by_k = {}
    for n_clusters in range(2, 11):
        candidate = KMeans(n_clusters=n_clusters, random_state=0, max_iter=300)
        candidate.fit(ndarray)
        silhouette_by_k[n_clusters] = round(
            silhouette_score(ndarray, candidate.labels_), 2
        )
    # max() keeps the first maximum, i.e. the smallest k among equal scores.
    best_kind = max(silhouette_by_k, key=silhouette_by_k.get)

    kmeans = KMeans(n_clusters=best_kind, random_state=0, max_iter=300)
    kmeans.fit(ndarray)
    return kmeans.cluster_centers_
# 主程序
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load both workbooks (paths are relative to the working directory).
data = pd.read_excel('data.xlsx')
data2 = pd.read_excel('data2.xlsx')

# Question one
r1 = question_one(data)

# Approach 1 (disabled, kept for reference)
# a0 = pd.DataFrame(data=r1[0],columns=['四个季度同比增长率'])
# a0_index = pd.Series(a0.index).astype(str).apply(lambda x:x.zfill(6))
# a0.set_index(a0_index,inplace=True)
# a0[['第一季度同比增长率','第二季度同比增长率','第三季度同比增长率','第四季度同比增长率']] = a0['四个季度同比增长率'].apply(lambda x: pd.Series(x))
# a0.drop(columns='四个季度同比增长率',inplace=True)
# a1 = pd.DataFrame(data=r1[1],columns=['股票代码'])
# a1['股票代码'] = a1['股票代码'].astype(str).apply(lambda x: x.zfill(6))
# print(a0)

# Approach 2: spread each stock's list of four rates over four columns.
quarter_columns = [
    '第一个季度同比增长率',
    '第二个季度同比增长率',
    '第三个季度同比增长率',
    '第四个季度同比增长率',
]
expand = pd.DataFrame(
    r1[0].values.tolist(),
    columns=quarter_columns,
    index=r1[0].index,
)
a0 = expand.copy()
# Show stock codes as zero-padded 6-character strings.
a0.index = a0.index.map(lambda code: str(code).zfill(6))
print(a0)
# 第三种处理方式
# a0 = pd.DataFrame(data=r1[0].values,columns=['四个季度同比增长率'],index=r1[0].index)
# a0['第一个季度同比增长率'] = [x[0] for x in a0['四个季度同比增长率'].values]
# a0['第二个季度同比增长率'] = [x[1] for x in a0['四个季度同比增长率'].values]
# a0['第三个季度同比增长率'] = [x[2] for x in a0['四个季度同比增长率'].values]
# a0['第四个季度同比增长率'] = [x[3] for x in a0['四个季度同比增长率'].values]
# a0.drop('四个季度同比增长率',axis=1,inplace=True)
# a0.index = a0.index.map(lambda x: str(x).zfill(6))
# # print(a0['四个季度同比增长率'].values)
# print(a0)
# # print(type(a0.index))
# r1[0].to_excel("question_one.xlsx",sheet_name='每个季度每股同比增长率')
# a1.to_excel("question_one.xlsx",sheet_name='连续4个季度每股收益同比增长率大于20%的股票代码') # 第二次调用会覆盖掉第一次调用
# with pd.ExcelWriter('question_one.xlsx') as writer:
# a0.to_excel(writer, sheet_name='Sheet1')
# a1.to_excel(writer, sheet_name='Sheet2')
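# optimize test1 -- NOTE: from here on the file repeats the script above; the definitions below replace the earlier ones for all code that runs after this point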
# 先简单了解一下数据的大概情况
# import pandas as pd
# data = pd.read_excel('data.xlsx')
# print(data.shape) # (27722,8)
# print(data.isnull().any()) # 查看数据每列是否有缺失值
# print(data.isnull().sum().tolist()) # 查看数据每列一共有多少个缺失值 各列缺失值总和分别为[0, 0, 3, 16, 3, 111, 3, 3]
# stocks = pd.unique(data['Stkcd'])
# print(stocks.size) # 输出3620,一共有3620支股票
def question_one(df):
"""
:param df: 原始数据,DataFrame类型
:return: 一个Series类型数据,包含每支股票四个季度每股同比增长率;一个列表类型数据,为连续四个季度每股收益同比增长率大于等于20%的股票的代码
"""
# 定义计算每股收益同比增长率函数
# a季度每股收益同比增长率 = (今年a季度母公司的每股收益 - 去年a季度母公司的每股收益) ÷ 去年a季度母公司的每股收益
def calculate_rate(x):
"""
:param x: df分组后的数据
:return: 每支股票的四个季度每股同比增长率的列表
"""
r1 = (x.iloc[4, 2] - x.iloc[0, 2]) / x.iloc[0, 2]
r2 = (x.iloc[5, 2] - x.iloc[1, 2]) / x.iloc[1, 2]
r3 = (x.iloc[6, 2] - x.iloc[2, 2]) / x.iloc[2, 2]
r4 = (x.iloc[7, 2] - x.iloc[3, 2]) / x.iloc[3, 2]
return list(map(lambda t:round(t,2), [r1,r2,r3,r4]))
df.dropna(inplace=True)
# # 考虑到仅仅是删除缺失值,这种操作可能导致某一股票的四个季度中缺少几个季度的数据,因此将数据不足8份的股票都删除
filtered_df = df.groupby(by='Stkcd').filter(lambda t: len(t)==8)
cal_df = filtered_df.groupby(by='Stkcd').apply(calculate_rate,include_groups=False)
# # 筛选每股收益同比增长率连续四个季度大于20%的股票
all_great_then = cal_df.apply(lambda t: all(list(map(lambda y: y>=0.2,t))))
result_stock = all_great_then[all_great_then.values == True]
return cal_df,result_stock.index.tolist()
def question_two(df):
"""
:param df: 原始数据,DataFrame类型
:return: 2017年、2018年每股资本公积和每股未分配利润最大的10只股票,DataFrame类型
"""
# 同问题一一样处理数据
df.dropna(inplace=True)
filtered_df = df.groupby(by='Stkcd').filter(lambda t:len(t)==8)
def calculate_gonji(x):
s1 = x.head(4).iloc[:,4].sum()
s2 = x.tail(4).iloc[:,4].sum()
return round(s1,2),round(s2,2)
def calculate_wfp(x):
s1 = x.head(4).iloc[:,5].sum()
s2 = x.tail(4).iloc[:,5].sum()
return round(s1,2),round(s2,2)
result_gonji = filtered_df.groupby(by='Stkcd').apply(calculate_gonji,include_groups=False)
result_wfp = filtered_df.groupby(by='Stkcd').apply(calculate_wfp,include_groups=False)
# 取出2017年、2018男的每股资本公积年度总和,每股未分配利润年度总和
gonji_2017 = []
gonji_2018 = []
wfp_2017 = []
wfp_2018 = []
for i in range(result_gonji.shape[0]):
gonji_2017.append(result_gonji.values[i][0])
gonji_2018.append(result_gonji.values[i][1])
wfp_2017.append(result_wfp.values[i][0])
wfp_2018.append(result_wfp.values[i][1])
# 整理成数据框DataFrame
R = pd.DataFrame(
data={"gonji_2017":gonji_2017, "gonji_2018":gonji_2018, "wfp_2017":wfp_2017, "wfp_2018":wfp_2018},
index=result_gonji.index
)
# 排序,获取排名前十的股票
sort_gonji_17 = R['gonji_2017'].sort_values(ascending=False)
sort_gonji_18 = R['gonji_2018'].sort_values(ascending=False)
sort_wfp_17 = R['wfp_2017'].sort_values(ascending=False)
sort_wfp_18 = R['wfp_2018'].sort_values(ascending=False)
return sort_gonji_17.head(10),sort_gonji_18.head(10),sort_wfp_17.head(10),sort_wfp_18.head(10)
def question_three(df2):
    """
    Standardize the 2018 rows of the second data set and run PCA on them.

    :param df2: second raw data set (data2), DataFrame with an ``Accper``
        date column; it is NOT modified
    :return: in order: the standardized data, the principal-component
        scores, and ``pca.components_`` (the component vectors)
    """
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # dropna() returns a new frame, so the date conversion below is applied
    # to that copy and never reaches the caller's DataFrame.
    cleaned = df2.dropna()
    cleaned['Accper'] = pd.to_datetime(cleaned['Accper'])
    data_2018 = cleaned[cleaned['Accper'].dt.year == 2018]
    # Everything from the third column onwards is used as a feature.
    x = data_2018.iloc[:, 2:]
    X = StandardScaler().fit_transform(x)
    # A float n_components asks PCA for as many components as are needed to
    # reach that share of explained variance (see scikit-learn PCA docs).
    pca = PCA(n_components=0.95)
    Y = pca.fit_transform(X)
    tzxl = pca.components_
    return X, Y, tzxl
def question_four(ndarray):
    """
    Cluster the data with KMeans, choosing the cluster count by silhouette.

    Every cluster count from 2 to 10 is tried; the silhouette score of each
    fit is rounded to 2 decimals and the count with the highest rounded
    score wins (on a tie the smallest count is kept).

    :param ndarray: array data, the principal components returned by
        question_three()
    :return: a tuple ``(cluster_result, centers)``. ``cluster_result`` is
        the output of ``KMeans.fit_transform`` on the data -- per the
        scikit-learn docs that is the data expressed as distances to the
        cluster centers, not cluster labels; ``centers`` is the array of
        cluster centers
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    silhouette_by_k = {}
    for n_clusters in range(2, 11):
        candidate = KMeans(n_clusters=n_clusters, random_state=0, max_iter=300)
        candidate.fit(ndarray)
        silhouette_by_k[n_clusters] = round(
            silhouette_score(ndarray, candidate.labels_), 2
        )
    # max() keeps the first maximum, i.e. the smallest k among equal scores.
    best_kind = max(silhouette_by_k, key=silhouette_by_k.get)

    kmeans = KMeans(n_clusters=best_kind, random_state=0, max_iter=300)
    cluster_result = kmeans.fit_transform(ndarray)
    return cluster_result, kmeans.cluster_centers_
# 主程序
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the two Excel workbooks (paths are relative to the working directory).
data = pd.read_excel('data.xlsx')
data2 = pd.read_excel('data2.xlsx')
# 问题一
# r1 = question_one(data)
# a0 = pd.DataFrame(data=r1[0].values,columns=['四季度每股收益同比增长率'])
# columns = ['第一季度每股收益同比增长率','第二季度每股收益同比增长率','第三季度每股收益同比增长率','第四季度每股收益同比增长率']
# a0[columns] = a0['四季度每股收益同比增长率'].apply(lambda x: pd.Series(x))
# a0.drop(columns='四季度每股收益同比增长率',inplace=True)
# a0.index = a0.index.map(lambda x: str(x).zfill(6))
# a1 = pd.DataFrame(
# data = map(lambda x: str(x).zfill(6),r1[1]),
# columns = ['股票代码']
# )
# a1['排名'] = a1.index.values + 1
# a1 = a1[['排名','股票代码']] # 交换列的位置
# print(a1)
# # r1[0].to_excel("question_one.xlsx",sheet_name='每个季度每股同比增长率')
# # a1.to_excel("question_one.xlsx",sheet_name='连续4个季度每股收益同比增长率大于20%的股票代码') # 第二次调用会覆盖掉第一次调用
# with pd.ExcelWriter('question_one.xlsx',engine='openpyxl') as writer:
# a0.to_excel(writer, sheet_name='每个季度每股同比增长率')
# a1.to_excel(writer, sheet_name='连续4个季度每股收益同比增长率大于20%的股票代码',index=False) # 设置index=False不把索引也导入进Excel表格中
# 问题二
# r = question_two(data)
# sub_plot = [221,222,223,224]
# years = [2017,2018,2017,2018]
# category = ['每股资本公积','每股未分配利润']
# import matplotlib.pyplot as plt
# plt.rcParams['font.sans-serif'] = "SimHei"
# plt.figure(figsize=[10, 8])
# for i in range(4):
# plt.subplot(sub_plot[i])
# # plt.bar(r1.index,r1.values)
# r[i].plot(kind='bar')
# plt.title(f"{years[i]}年{category[i//2]}Top10股票榜")
# plt.xlabel("股票代码")
# plt.ylabel(category[i//2])
# plt.xticks(rotation=90)
# plt.tight_layout()
# plt.show()
# Question three
# Re-read the workbook so question_three() starts from the raw data.
data2 = pd.read_excel('data2.xlsx')
X, Y, tzxl = question_three(data2)
# Transpose so that column i holds the coefficients of component i.
tzxl = tzxl.T
# PCA(n_components=0.95) keeps a data-dependent number of components, so the
# loop bound and the feature count are taken from the array instead of being
# hard-coded.
n_features, n_components = tzxl.shape
for i in range(n_components):
    print('第', i + 1, '个主成分的表达式为: X * ', tzxl[:, i], '\n'
          , f'\t表示该主成分与原始数据X的{n_features}个特征的关系程度系数依次为:\n\t'
          , tzxl[:, i])
    print()
# Question four
r4 = question_four(Y)
# r4 is (cluster_result, centers); only the centers are reported.
print('聚类中心为:\n', r4[1])