主成分分析-----学Python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneOut
from mpl_toolkits import mplot3d
%matplotlib inline
plt.rcParams['font.sans-serif']=['SimHei']  #解决中文显示乱码问题
df = pd.read_excel('数字乡村指数数据.xlsx')
主成分分析
# Restrict the working frame to the six columns analysed in this notebook.
selected_columns = [
    '数字金融基础设施',
    '数字基础设施指数',
    '乡村经济数字化',
    '乡村治理数字化指数',
    '乡村生活数字化指数',
    '数字乡村指数',
]
data1 = df.loc[:, selected_columns]
# Bare expression: shown as the cell output when this is the last line of a cell.
data1
最大最小值归一化(Min-Max Scaling)

from sklearn import preprocessing  # provides MinMaxScaler

# Every column except '数字乡村指数' is treated as a feature to be rescaled.
features_columns = [col for col in data1.columns if col != '数字乡村指数']

# Signature: MinMaxScaler(feature_range=(0, 1), copy=True).
# With these defaults each feature is mapped onto [0, 1] and the input data
# is copied rather than modified in place.
min_max_scaler = preprocessing.MinMaxScaler()

# Learn each feature's min/max and apply the rescaling in a single pass.
# (The original ran fit + transform separately and repeated the transform.)
scaled_values = min_max_scaler.fit_transform(data1[features_columns])

# Wrap the raw array in a labelled frame. Reusing data1's index guarantees
# that the column assignment below, which aligns on index labels, pairs each
# scaled row with its own target value even if the index is not 0..n-1.
data1_scaler = pd.DataFrame(
    scaled_values,
    columns=features_columns,
    index=data1.index,
)

# The '数字乡村指数' column is carried over unscaled and ends up as the last column.
data1_scaler['数字乡村指数'] = data1['数字乡村指数']
display(data1_scaler.describe())
Q型主成分分析
# Keep the columns whose absolute correlation with '数字乡村指数' exceeds the
# threshold. DataFrame.corr() defaults to the Pearson coefficient.
threshold = 0.5
corrmat = data1.corr()
target_corr = corrmat["数字乡村指数"].abs()
top_corr_features = corrmat.index[target_corr > threshold]
# NOTE(review): sort_values() on an Index orders the column *labels* in
# descending order; it does not rank them by correlation strength. The
# '数字乡村指数' column itself is included because its self-correlation is 1.
print(top_corr_features.sort_values(ascending=False))
Index(['数字金融基础设施', '数字基础设施指数', '数字乡村指数', '乡村经济数字化', '乡村生活数字化指数', '乡村治理数字化指数'], dtype='object')
# Absolute Spearman rank correlations between every pair of columns of the
# scaled frame.
column = list(data1_scaler.columns)
mcorr = data1_scaler[column].corr(method="spearman").abs()

# Columns whose |rho| against '数字乡村指数' is above 0.5, strongest first.
numerical_corr = mcorr.loc[mcorr['数字乡村指数'] > 0.5, '数字乡村指数']
ranked_corr = numerical_corr.sort_values(ascending=False)
print(ranked_corr)

# Signed Spearman matrix restricted to (and ordered by) those columns.
index0 = ranked_corr.index
print(data1_scaler[index0].corr('spearman'))
from sklearn.decomposition import PCA  # principal component analysis

# A float n_components between 0 and 1 asks PCA to keep as many components as
# are needed to explain that fraction of the variance (here 90%).
pca = PCA(n_components=0.9)

# Fit on every column except the last one; this assumes the unscaled
# '数字乡村指数' column sits in the last position of data1_scaler.
pca_scores_90 = pca.fit_transform(data1_scaler.iloc[:, :-1])

# Build the score frame on data1_scaler's index: the assignment below aligns
# on index labels, so a fresh 0..n-1 index would mispair rows (or produce NaN)
# whenever data1_scaler is indexed differently.
new_data1_pca_90 = pd.DataFrame(pca_scores_90, index=data1_scaler.index)
new_data1_pca_90['数字乡村指数'] = data1_scaler['数字乡村指数']
new_data1_pca_90.describe()
# Keep exactly three principal components.
pca = PCA(n_components=3)

# Fit on every column except the last one; this assumes the unscaled
# '数字乡村指数' column sits in the last position of data1_scaler.
pca_scores_3 = pca.fit_transform(data1_scaler.iloc[:, :-1])

# Build the score frame on data1_scaler's index so the label-aligned column
# assignment below pairs every row with its own '数字乡村指数' value.
new_data1_pca_3 = pd.DataFrame(pca_scores_3, index=data1_scaler.index)
new_data1_pca_3['数字乡村指数'] = data1_scaler['数字乡村指数']
new_data1_pca_3.describe()

你可能感兴趣的:(python,sklearn)