# bigtwotwo实验六

# optimize test1
# 先简单了解一下数据的大概情况
# import pandas as pd
# data = pd.read_excel('data.xlsx')
# print(data.shape) # (27722,8)
# print(data.isnull().any()) # 查看数据每列是否有缺失值
# print(data.isnull().sum().tolist()) # 查看数据每列一共有多少个缺失值 各列缺失值总和分别为[0, 0, 3, 16, 3, 111, 3, 3]
# stocks = pd.unique(data['Stkcd'])
# print(stocks.size) # 输出3620,一共有3620支股票


def question_one(df):
    """
    :param df: 原始数据,DataFrame类型
    :return: 一个Series类型数据,包含每支股票四个季度每股同比增长率;一个列表类型数据,为连续四个季度每股收益同比增长率大于等于20%的股票的代码
    """
    # 定义计算每股收益同比增长率函数
    # a季度每股收益同比增长率 = (今年a季度母公司的每股收益 - 去年a季度母公司的每股收益) ÷ 去年a季度母公司的每股收益
    def calculate_rate(x):
        """
        :param x: df分组后的数据
        :return: 每支股票的四个季度每股同比增长率的列表
        """
        r1 = (x.iloc[4, 2] - x.iloc[0, 2]) / x.iloc[0, 2]
        r2 = (x.iloc[5, 2] - x.iloc[1, 2]) / x.iloc[1, 2]
        r3 = (x.iloc[6, 2] - x.iloc[2, 2]) / x.iloc[2, 2]
        r4 = (x.iloc[7, 2] - x.iloc[3, 2]) / x.iloc[3, 2]
        return list(map(lambda t:round(t,2), [r1,r2,r3,r4]))

    df.dropna(inplace=True)
    # 考虑到仅仅是删除缺失值,这种操作可能导致某一股票的四个季度中缺少几个季度的数据,因此将数据不足8份的股票都删除
    filtered_df = df.groupby(by='Stkcd').filter(lambda t: len(t)==8)
    cal_df = filtered_df.groupby(by='Stkcd').apply(calculate_rate,include_groups=False)
    # 筛选每股收益同比增长率连续四个季度大于20%的股票
    all_great_then = cal_df.apply(lambda t: all(list(map(lambda y: y>=0.2,t))))
    result_stock = all_great_then[all_great_then.values == True]

    return cal_df,result_stock.index.tolist()


def question_two(df):
    """
    :param df: 原始数据,DataFrame类型
    :return: 2017年、2018年每股资本公积和每股未分配利润最大的10只股票,DataFrame类型
    """
    # 同问题一一样处理数据
    df.dropna(inplace=True)
    filtered_df = df.groupby(by='Stkcd').filter(lambda t:len(t)==8)

    def calculate_gonji(x):
        s1 = x.head(4).iloc[:,4].sum()
        s2 = x.tail(4).iloc[:,4].sum()
        return round(s1,2),round(s2,2)

    def calculate_wfp(x):
        s1 = x.head(4).iloc[:,5].sum()
        s2 = x.tail(4).iloc[:,5].sum()
        return round(s1,2),round(s2,2)

    result_gonji = filtered_df.groupby(by='Stkcd').apply(calculate_gonji,include_groups=False)
    result_wfp = filtered_df.groupby(by='Stkcd').apply(calculate_wfp,include_groups=False)

    # 取出2017年、2018男的每股资本公积年度总和,每股未分配利润年度总和
    gonji_2017 = []
    gonji_2018 = []
    wfp_2017 = []
    wfp_2018 = []
    for i in range(result_gonji.shape[0]):
        gonji_2017.append(result_gonji.values[i][0])
        gonji_2018.append(result_gonji.values[i][1])
        wfp_2017.append(result_wfp.values[i][0])
        wfp_2018.append(result_wfp.values[i][1])
    # 整理成数据框DataFrame
    R = pd.DataFrame(
        data={"gonji_2017":gonji_2017, "gonji_2018":gonji_2018, "wfp_2017":wfp_2017, "wfp_2018":wfp_2018},
        index=result_gonji.index
    )
    # 排序,获取排名前十的股票
    sort_gonji_17 = R['gonji_2017'].sort_values(ascending=False)
    sort_gonji_18 = R['gonji_2018'].sort_values(ascending=False)
    sort_wfp_17 = R['wfp_2017'].sort_values(ascending=False)
    sort_wfp_18 = R['wfp_2018'].sort_values(ascending=False)

    return sort_gonji_17.head(10),sort_gonji_18.head(10),sort_wfp_17.head(10),sort_wfp_18.head(10)


def question_three(df2):
    """
    Standardize the 2018 rows of the second data set and reduce them with PCA.

    Rows containing missing values are dropped, 'Accper' is parsed as dates,
    and only rows whose 'Accper' year is 2018 are used. All columns from the
    third one onward are treated as features.

    :param df2: the second raw data set (data2), a DataFrame with an 'Accper'
        column; it is not modified
    :return: a tuple ``(X, Y, tzxl)``: the standardized feature matrix, the
        principal components keeping 95% of the variance, and the PCA
        component vectors (``pca.components_``)
    """
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # dropna() and assign() both return new frames, so the caller's DataFrame
    # keeps its rows and its original 'Accper' values.
    cleaned = df2.dropna()
    cleaned = cleaned.assign(Accper=pd.to_datetime(cleaned['Accper']))
    data_2018 = cleaned[cleaned['Accper'].dt.year == 2018]
    x = data_2018.iloc[:, 2:]

    X = StandardScaler().fit_transform(x)
    # A float n_components asks PCA for as many components as are needed to
    # explain that share of the variance.
    pca = PCA(n_components=0.95)
    Y = pca.fit_transform(X)
    tzxl = pca.components_

    return X, Y, tzxl


def question_four(ndarray):
    """
    Cluster the samples with K-Means, choosing the cluster count by silhouette score.

    K-Means is fitted for every cluster count from 2 to 10. Each fit is scored
    with the silhouette coefficient rounded to 2 decimals, and the count with
    the highest rounded score wins (the smallest count when scores tie).

    :param ndarray: array of samples to cluster, e.g. the principal components
        returned by question_three()
    :return: the cluster centers of the winning model, as an array
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    best_score = None
    best_model = None
    for n_clusters in range(2, 11):
        model = KMeans(n_clusters=n_clusters, random_state=0, max_iter=300)
        model.fit(ndarray)
        score = round(silhouette_score(ndarray, model.labels_), 2)
        # Strict '>' keeps the earliest (smallest) cluster count on a tie.
        if best_score is None or score > best_score:
            best_score, best_model = score, model

    # The winning model is already fitted; building a fresh KMeans with the
    # same parameters and random_state would only repeat the same work.
    return best_model.cluster_centers_


# Main program
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_excel('data.xlsx')
data2 = pd.read_excel('data2.xlsx')
# Question 1
r1 = question_one(data)
# Approach 1 (kept for reference)
# a0 = pd.DataFrame(data=r1[0],columns=['四个季度同比增长率'])
# a0_index = pd.Series(a0.index).astype(str).apply(lambda x:x.zfill(6))
# a0.set_index(a0_index,inplace=True)
# a0[['第一季度同比增长率','第二季度同比增长率','第三季度同比增长率','第四季度同比增长率']] = a0['四个季度同比增长率'].apply(lambda x: pd.Series(x))
# a0.drop(columns='四个季度同比增长率',inplace=True)
# a1 = pd.DataFrame(data=r1[1],columns=['股票代码'])
# a1['股票代码'] = a1['股票代码'].astype(str).apply(lambda x: x.zfill(6))
# print(a0)

# Approach 2: spread each stock's list of four rates over four named columns
quarter_columns = ['第一个季度同比增长率','第二个季度同比增长率','第三个季度同比增长率','第四个季度同比增长率']
growth_series = r1[0]
expand = pd.DataFrame(
    growth_series.values.tolist(),
    columns=quarter_columns,
    index=growth_series.index,
)
# print(expand)
a0 = expand.copy()
# Show the stock codes as zero-padded 6-character strings.
a0.index = a0.index.map(lambda x: str(x).zfill(6))

print(a0)


# 第三种处理方式
# a0 = pd.DataFrame(data=r1[0].values,columns=['四个季度同比增长率'],index=r1[0].index)
# a0['第一个季度同比增长率'] = [x[0] for x in a0['四个季度同比增长率'].values]
# a0['第二个季度同比增长率'] = [x[1] for x in a0['四个季度同比增长率'].values]
# a0['第三个季度同比增长率'] = [x[2] for x in a0['四个季度同比增长率'].values]
# a0['第四个季度同比增长率'] = [x[3] for x in a0['四个季度同比增长率'].values]
# a0.drop('四个季度同比增长率',axis=1,inplace=True)
# a0.index = a0.index.map(lambda x: str(x).zfill(6))
# # print(a0['四个季度同比增长率'].values)
# print(a0)
# # print(type(a0.index))



# r1[0].to_excel("question_one.xlsx",sheet_name='每个季度每股同比增长率')
# a1.to_excel("question_one.xlsx",sheet_name='连续4个季度每股收益同比增长率大于20%的股票代码') # 第二次调用会覆盖掉第一次调用
# with pd.ExcelWriter('question_one.xlsx') as writer:
#     a0.to_excel(writer, sheet_name='Sheet1')
#     a1.to_excel(writer, sheet_name='Sheet2')

# optimize test1
# 先简单了解一下数据的大概情况
# import pandas as pd
# data = pd.read_excel('data.xlsx')
# print(data.shape) # (27722,8)
# print(data.isnull().any()) # 查看数据每列是否有缺失值
# print(data.isnull().sum().tolist()) # 查看数据每列一共有多少个缺失值 各列缺失值总和分别为[0, 0, 3, 16, 3, 111, 3, 3]
# stocks = pd.unique(data['Stkcd'])
# print(stocks.size) # 输出3620,一共有3620支股票


def question_one(df):
    """
    :param df: 原始数据,DataFrame类型
    :return: 一个Series类型数据,包含每支股票四个季度每股同比增长率;一个列表类型数据,为连续四个季度每股收益同比增长率大于等于20%的股票的代码
    """
    # 定义计算每股收益同比增长率函数
    # a季度每股收益同比增长率 = (今年a季度母公司的每股收益 - 去年a季度母公司的每股收益) ÷ 去年a季度母公司的每股收益
    def calculate_rate(x):
        """
        :param x: df分组后的数据
        :return: 每支股票的四个季度每股同比增长率的列表
        """
        r1 = (x.iloc[4, 2] - x.iloc[0, 2]) / x.iloc[0, 2]
        r2 = (x.iloc[5, 2] - x.iloc[1, 2]) / x.iloc[1, 2]
        r3 = (x.iloc[6, 2] - x.iloc[2, 2]) / x.iloc[2, 2]
        r4 = (x.iloc[7, 2] - x.iloc[3, 2]) / x.iloc[3, 2]
        return list(map(lambda t:round(t,2), [r1,r2,r3,r4]))

    df.dropna(inplace=True)
    # # 考虑到仅仅是删除缺失值,这种操作可能导致某一股票的四个季度中缺少几个季度的数据,因此将数据不足8份的股票都删除
    filtered_df = df.groupby(by='Stkcd').filter(lambda t: len(t)==8)
    cal_df = filtered_df.groupby(by='Stkcd').apply(calculate_rate,include_groups=False)
    # # 筛选每股收益同比增长率连续四个季度大于20%的股票
    all_great_then = cal_df.apply(lambda t: all(list(map(lambda y: y>=0.2,t))))
    result_stock = all_great_then[all_great_then.values == True]

    return cal_df,result_stock.index.tolist()

def question_two(df):
    """
    :param df: 原始数据,DataFrame类型
    :return: 2017年、2018年每股资本公积和每股未分配利润最大的10只股票,DataFrame类型
    """
    # 同问题一一样处理数据
    df.dropna(inplace=True)
    filtered_df = df.groupby(by='Stkcd').filter(lambda t:len(t)==8)

    def calculate_gonji(x):
        s1 = x.head(4).iloc[:,4].sum()
        s2 = x.tail(4).iloc[:,4].sum()
        return round(s1,2),round(s2,2)

    def calculate_wfp(x):
        s1 = x.head(4).iloc[:,5].sum()
        s2 = x.tail(4).iloc[:,5].sum()
        return round(s1,2),round(s2,2)

    result_gonji = filtered_df.groupby(by='Stkcd').apply(calculate_gonji,include_groups=False)
    result_wfp = filtered_df.groupby(by='Stkcd').apply(calculate_wfp,include_groups=False)

    # 取出2017年、2018男的每股资本公积年度总和,每股未分配利润年度总和
    gonji_2017 = []
    gonji_2018 = []
    wfp_2017 = []
    wfp_2018 = []
    for i in range(result_gonji.shape[0]):
        gonji_2017.append(result_gonji.values[i][0])
        gonji_2018.append(result_gonji.values[i][1])
        wfp_2017.append(result_wfp.values[i][0])
        wfp_2018.append(result_wfp.values[i][1])
    # 整理成数据框DataFrame
    R = pd.DataFrame(
        data={"gonji_2017":gonji_2017, "gonji_2018":gonji_2018, "wfp_2017":wfp_2017, "wfp_2018":wfp_2018},
        index=result_gonji.index
    )
    # 排序,获取排名前十的股票
    sort_gonji_17 = R['gonji_2017'].sort_values(ascending=False)
    sort_gonji_18 = R['gonji_2018'].sort_values(ascending=False)
    sort_wfp_17 = R['wfp_2017'].sort_values(ascending=False)
    sort_wfp_18 = R['wfp_2018'].sort_values(ascending=False)

    return sort_gonji_17.head(10),sort_gonji_18.head(10),sort_wfp_17.head(10),sort_wfp_18.head(10)


def question_three(df2):
    """
    Standardize the 2018 rows of the second data set and reduce them with PCA.

    Rows containing missing values are dropped, 'Accper' is parsed as dates,
    and only rows whose 'Accper' year is 2018 are used. All columns from the
    third one onward are treated as features.

    :param df2: the second raw data set (data2), a DataFrame with an 'Accper'
        column; it is not modified
    :return: a tuple ``(X, Y, tzxl)``: the standardized feature matrix, the
        principal components keeping 95% of the variance, and the PCA
        component vectors (``pca.components_``)
    """
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    # dropna() and assign() both return new frames, so the caller's DataFrame
    # keeps its rows and its original 'Accper' values.
    cleaned = df2.dropna()
    cleaned = cleaned.assign(Accper=pd.to_datetime(cleaned['Accper']))
    data_2018 = cleaned[cleaned['Accper'].dt.year == 2018]
    x = data_2018.iloc[:, 2:]

    X = StandardScaler().fit_transform(x)
    # A float n_components asks PCA for as many components as are needed to
    # explain that share of the variance.
    pca = PCA(n_components=0.95)
    Y = pca.fit_transform(X)
    tzxl = pca.components_

    return X, Y, tzxl


def question_four(ndarray):
    """
    Cluster the samples with K-Means, choosing the cluster count by silhouette score.

    K-Means is fitted for every cluster count from 2 to 10. Each fit is scored
    with the silhouette coefficient rounded to 2 decimals, and the count with
    the highest rounded score wins (the smallest count when scores tie).

    :param ndarray: array of samples to cluster, e.g. the principal components
        returned by question_three()
    :return: a tuple ``(cluster_result, centers)``. ``cluster_result`` is the
        input transformed by the winning model into cluster-distance space
        (one column per cluster center) -- note that it is not an array of
        cluster labels; ``centers`` is the array of cluster centers
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    best_score = None
    best_model = None
    for n_clusters in range(2, 11):
        model = KMeans(n_clusters=n_clusters, random_state=0, max_iter=300)
        model.fit(ndarray)
        score = round(silhouette_score(ndarray, model.labels_), 2)
        # Strict '>' keeps the earliest (smallest) cluster count on a tie.
        if best_score is None or score > best_score:
            best_score, best_model = score, model

    # The winning model is already fitted, so transform with it directly
    # rather than fitting an identical KMeans a second time.
    cluster_result = best_model.transform(ndarray)
    centers = best_model.cluster_centers_
    return cluster_result, centers


# Main program
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
data = pd.read_excel('data.xlsx')
data2 = pd.read_excel('data2.xlsx')
# Question 1 (disabled; kept for reference)
# r1 = question_one(data)
# a0 = pd.DataFrame(data=r1[0].values,columns=['四季度每股收益同比增长率'])
# columns = ['第一季度每股收益同比增长率','第二季度每股收益同比增长率','第三季度每股收益同比增长率','第四季度每股收益同比增长率']
# a0[columns] = a0['四季度每股收益同比增长率'].apply(lambda x: pd.Series(x))
# a0.drop(columns='四季度每股收益同比增长率',inplace=True)
# a0.index = a0.index.map(lambda x: str(x).zfill(6))
# a1 = pd.DataFrame(
#     data = map(lambda x: str(x).zfill(6),r1[1]),
#     columns = ['股票代码']
# )
# a1['排名'] = a1.index.values + 1
# a1 = a1[['排名','股票代码']] # swap the column order
# print(a1)
# # r1[0].to_excel("question_one.xlsx",sheet_name='每个季度每股同比增长率')
# # a1.to_excel("question_one.xlsx",sheet_name='连续4个季度每股收益同比增长率大于20%的股票代码') # the second call would overwrite the first
# with pd.ExcelWriter('question_one.xlsx',engine='openpyxl') as writer:
#     a0.to_excel(writer, sheet_name='每个季度每股同比增长率')
#     a1.to_excel(writer, sheet_name='连续4个季度每股收益同比增长率大于20%的股票代码',index=False) # index=False keeps the index out of the Excel sheet

# Question 2 (disabled; kept for reference)
# r = question_two(data)
# sub_plot = [221,222,223,224]
# years = [2017,2018,2017,2018]
# category = ['每股资本公积','每股未分配利润']
# import matplotlib.pyplot as plt
# plt.rcParams['font.sans-serif'] = "SimHei"
# plt.figure(figsize=[10, 8])

# for i in range(4):
#     plt.subplot(sub_plot[i])
#     # plt.bar(r1.index,r1.values)
#     r[i].plot(kind='bar')
#     plt.title(f"{years[i]}年{category[i//2]}Top10股票榜")
#     plt.xlabel("股票代码")
#     plt.ylabel(category[i//2])
#     plt.xticks(rotation=90)
# plt.tight_layout()
# plt.show()

# Question 3
# data2 was already loaded above and nothing has touched it since, so the
# workbook is not read from disk a second time.
X,Y,tzxl = question_three(data2)
# After the transpose, column i holds the coefficients of principal component i.
tzxl = tzxl.T
# The number of components is not fixed in advance (question_three asks PCA
# for 95% of the variance), so loop over the columns actually present instead
# of assuming there are exactly 5.
# NOTE(review): the printed text still hard-codes "6" features -- confirm it
# matches the number of feature columns in data2.
for i in range(tzxl.shape[1]):
    print('第',i+1,'个主成分的表达式为: X * ',tzxl[:,i],'\n'
          ,'\t表示该主成分与原始数据X的6个特征的关系程度系数依次为:\n\t'
          ,tzxl[:,i])
    print()

# Question 4
r4 = question_four(Y)
# question_four returns a pair here; index 1 is the array of cluster centers.
print('聚类中心为:\n',r4[1])

# 你可能感兴趣的:(python,开发语言)