# Create a dictionary with curly braces (named person, not dict, to avoid shadowing the built-in)
person = {"name": "Alice", "age": 25, "city": "New York"}
person
{'name': 'Alice', 'age': 25, 'city': 'New York'}
# Access a value in the dictionary
person["name"]
'Alice'
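Indexing with a missing key raises a KeyError, while dict.get() returns a default instead. A minimal sketch (the "phone" key is a hypothetical example, not part of the dictionary's data):
# get() returns None, or a supplied default, for a missing key instead of raising
person.get("phone", "unknown")
'unknown'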
Discrete features:
If there is no inherent order, use the get_dummies() function for one-hot encoding (a sketch follows below);
If there is an order or magnitude relationship, use the map() function for label encoding.
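A minimal one-hot sketch with pd.get_dummies on a toy, unordered column (the color values are hypothetical; the label-encoding branch is demonstrated on real data below):
# One-hot encoding of an unordered categorical column
import pandas as pd
colors = pd.Series(["red", "green", "blue", "green"], name="color")
pd.get_dummies(colors, prefix="color")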
# Read the data
import pandas as pd
data = pd.read_csv(r"data.csv")
data.head()
| | Id | Home Ownership | Annual Income | Years in current job | Tax Liens | Number of Open Accounts | Years of Credit History | Maximum Open Credit | Number of Credit Problems | Months since last delinquent | Bankruptcies | Purpose | Term | Current Loan Amount | Current Credit Balance | Monthly Debt | Credit Score | Credit Default |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Own Home | 482087.0 | NaN | 0.0 | 11.0 | 26.3 | 685960.0 | 1.0 | NaN | 1.0 | debt consolidation | Short Term | 99999999.0 | 47386.0 | 7914.0 | 749.0 | 0 |
| 1 | 1 | Own Home | 1025487.0 | 10+ years | 0.0 | 15.0 | 15.3 | 1181730.0 | 0.0 | NaN | 0.0 | debt consolidation | Long Term | 264968.0 | 394972.0 | 18373.0 | 737.0 | 1 |
| 2 | 2 | Home Mortgage | 751412.0 | 8 years | 0.0 | 11.0 | 35.0 | 1182434.0 | 0.0 | NaN | 0.0 | debt consolidation | Short Term | 99999999.0 | 308389.0 | 13651.0 | 742.0 | 0 |
| 3 | 3 | Own Home | 805068.0 | 6 years | 0.0 | 8.0 | 22.5 | 147400.0 | 1.0 | NaN | 1.0 | debt consolidation | Short Term | 121396.0 | 95855.0 | 11338.0 | 694.0 | 0 |
| 4 | 4 | Rent | 776264.0 | 8 years | 0.0 | 13.0 | 13.6 | 385836.0 | 1.0 | NaN | 0.0 | debt consolidation | Short Term | 125840.0 | 93309.0 | 7180.0 | 719.0 | 0 |
The Home Ownership feature
data["Home Ownership"].value_counts()
Home Ownership
Home Mortgage 3637
Rent 3204
Own Home 647
Have Mortgage 12
Name: count, dtype: int64
Custom ordering by risk resistance: Own Home < Rent < Have Mortgage < Home Mortgage
# Define the mapping dictionary
mapping = {"Own Home": 0, "Rent": 1, "Have Mortgage": 2, "Home Mortgage": 3}
data["Home Ownership"].head()
0 Own Home
1 Own Home
2 Home Mortgage
3 Own Home
4 Rent
Name: Home Ownership, dtype: object
# Label encoding
data["Home Ownership"] = data["Home Ownership"].map(mapping)
data["Home Ownership"].head()
0 0
1 0
2 3
3 0
4 1
Name: Home Ownership, dtype: int64
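One caveat worth knowing: Series.map() returns NaN for any value missing from its dictionary, which is why checking value_counts() first pays off. A sketch with a toy Series and a deliberately incomplete mapping (hypothetical values):
# map() silently yields NaN for values absent from the mapping
s = pd.Series(["Rent", "Own Home", "Other"])
s.map({"Rent": 1, "Own Home": 0})
0    1.0
1    0.0
2    NaN
dtype: float64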
The Term feature
data["Term"].value_counts()
Term
Short Term 5556
Long Term 1944
Name: count, dtype: int64
# Define the mapping dictionary
mapping = {"Short Term": 1, "Long Term": 0}
# Label encoding
data["Term"] = data["Term"].map(mapping)
data["Term"].head()
0 1
1 0
2 1
3 1
4 1
Name: Term, dtype: int64
Both encodings from a single nested mapping dictionary
# Re-read the data
import pandas as pd
data = pd.read_csv(r"data.csv")
mapping = {
    "Home Ownership": {"Own Home": 0, "Rent": 1, "Have Mortgage": 2, "Home Mortgage": 3},
    "Term": {"Short Term": 1, "Long Term": 0}
}
# Map the Home Ownership column
data["Home Ownership"] = data["Home Ownership"].map(mapping["Home Ownership"])
data["Home Ownership"].head()
0 0
1 0
2 3
3 0
4 1
Name: Home Ownership, dtype: int64
# Map the Term column
data["Term"] = data["Term"].map(mapping["Term"])
data["Term"].head()
0 1
1 0
2 1
3 1
4 1
Name: Term, dtype: int64
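As an aside, pandas can apply the whole nested dictionary in a single call with DataFrame.replace; unlike map(), values not listed in the dictionary are left unchanged rather than becoming NaN. A sketch, assuming the same nested mapping as above:
# Apply both column mappings in one call (unlisted values stay as-is)
data = pd.read_csv(r"data.csv")
data = data.replace(mapping)
data[["Home Ownership", "Term"]].head()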
The Annual Income feature
# Min-max normalization helper: manual_normalize
def manual_normalize(data):
    # Scale to [0, 1]: (x - min) / (max - min)
    min_val = data.min()
    max_val = data.max()
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data

data["Annual Income"] = manual_normalize(data["Annual Income"])
data["Annual Income"].head()
0 0.031798
1 0.086221
2 0.058771
3 0.064145
4 0.061260
Name: Annual Income, dtype: float64
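A quick sanity check on the formula: after min-max scaling, the smallest observed value maps to 0 and the largest to 1:
# The scaled column spans exactly [0, 1]
data["Annual Income"].min(), data["Annual Income"].max()
(0.0, 1.0)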
# Normalize with the sklearn library
from sklearn.preprocessing import MinMaxScaler
# Re-read the data
import pandas as pd
data = pd.read_csv(r"data.csv")
min_max_scaler = MinMaxScaler()
# Double brackets select a one-column DataFrame: the scaler expects 2-D input
data["Annual Income"] = min_max_scaler.fit_transform(data[["Annual Income"]])
data["Annual Income"].head()
0 0.031798
1 0.086221
2 0.058771
3 0.064145
4 0.061260
Name: Annual Income, dtype: float64
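The two approaches agree on this column. A quick check with np.allclose, assuming the manual_normalize helper defined earlier is still in scope (equal_nan=True tolerates any missing incomes):
# Confirm sklearn's MinMaxScaler matches the manual formula
import numpy as np
raw = pd.read_csv(r"data.csv")["Annual Income"]
np.allclose(manual_normalize(raw), data["Annual Income"], equal_nan=True)
True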
# Standardize
from sklearn.preprocessing import StandardScaler
# Re-read the data
import pandas as pd
data = pd.read_csv(r"data.csv")
scaler = StandardScaler()
# z-score: (x - mean) / standard deviation
data["Annual Income"] = scaler.fit_transform(data[["Annual Income"]])
data["Annual Income"].head()
0 -1.046183
1 -0.403310
2 -0.727556
3 -0.664078
4 -0.698155
Name: Annual Income, dtype: float64
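Both scalers are invertible. A sketch recovering the raw incomes from the standardized column, assuming the fitted scaler object above is still in scope:
# Undo the standardization to recover the original values
scaler.inverse_transform(data[["Annual Income"]])[:5]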
# Read the data
import pandas as pd
data = pd.read_csv(r"heart.csv")
# Discrete features
list_discrete = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
# Use astype to mark the discrete columns as object dtype
for i in list_discrete:
    data[i] = data[i].astype("object")
data.dtypes
age int64
sex object
cp object
trestbps int64
chol int64
fbs object
restecg object
thalach int64
exang object
oldpeak float64
slope object
ca int64
thal object
target int64
dtype: object
# Define the mapping dictionaries (integer keys still work: astype("object")
# keeps the underlying values as Python ints)
mapping = {
    "sex": {0: 0, 1: 1},
    "cp": {0: 1, 1: 2, 2: 3, 3: 4},
    "fbs": {0: 0, 1: 1},
    "restecg": {0: 0, 1: 1, 2: 2},
    "exang": {0: 0, 1: 1},
    "slope": {0: 1, 1: 2, 2: 3},
    "thal": {0: 1, 1: 2, 2: 3, 3: 4}
}
# Label encoding
for i in list_discrete:
    data[i] = data[i].map(mapping[i])
data.head()
| | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 63 | 1 | 4 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 1 | 0 | 2 | 1 |
| 1 | 37 | 1 | 3 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 1 | 0 | 3 | 1 |
| 2 | 41 | 0 | 2 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 3 | 0 | 3 | 1 |
| 3 | 56 | 1 | 2 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 3 | 0 | 3 | 1 |
| 4 | 57 | 0 | 1 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 3 | 0 | 3 | 1 |
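Since map() turns any code missing from its dictionary into NaN, a quick check that every observed code was covered (all counts should be 0):
# Verify that the label encoding introduced no NaN
data[list_discrete].isnull().sum()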
# Read the data
import pandas as pd
data = pd.read_csv(r"heart.csv")
# Continuous features
list_continuous = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]
# Min-max normalization helper: manual_normalize
def manual_normalize(data):
    # Scale to [0, 1]: (x - min) / (max - min)
    min_val = data.min()
    max_val = data.max()
    normalized_data = (data - min_val) / (max_val - min_val)
    return normalized_data

# Normalize each continuous column
for i in list_continuous:
    data[i] = manual_normalize(data[i])
data.head()
| | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.708333 | 1 | 3 | 0.481132 | 0.244292 | 1 | 0 | 0.603053 | 0 | 0.370968 | 0 | 0.0 | 1 | 1 |
| 1 | 0.166667 | 1 | 2 | 0.339623 | 0.283105 | 0 | 1 | 0.885496 | 0 | 0.564516 | 0 | 0.0 | 2 | 1 |
| 2 | 0.250000 | 0 | 1 | 0.339623 | 0.178082 | 0 | 0 | 0.770992 | 0 | 0.225806 | 2 | 0.0 | 2 | 1 |
| 3 | 0.562500 | 1 | 1 | 0.245283 | 0.251142 | 0 | 1 | 0.816794 | 0 | 0.129032 | 2 | 0.0 | 2 | 1 |
| 4 | 0.583333 | 0 | 0 | 0.245283 | 0.520548 | 0 | 1 | 0.702290 | 1 | 0.096774 | 2 | 0.0 | 2 | 1 |
# Normalize with the sklearn library
from sklearn.preprocessing import MinMaxScaler
# Re-read the data
import pandas as pd
data = pd.read_csv(r"heart.csv")
# Continuous features
list_continuous = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]
# Normalize each continuous column
for i in list_continuous:
    min_max_scaler = MinMaxScaler()
    data[i] = min_max_scaler.fit_transform(data[[i]])
data.head()
| | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.708333 | 1 | 3 | 0.481132 | 0.244292 | 1 | 0 | 0.603053 | 0 | 0.370968 | 0 | 0.0 | 1 | 1 |
| 1 | 0.166667 | 1 | 2 | 0.339623 | 0.283105 | 0 | 1 | 0.885496 | 0 | 0.564516 | 0 | 0.0 | 2 | 1 |
| 2 | 0.250000 | 0 | 1 | 0.339623 | 0.178082 | 0 | 0 | 0.770992 | 0 | 0.225806 | 2 | 0.0 | 2 | 1 |
| 3 | 0.562500 | 1 | 1 | 0.245283 | 0.251142 | 0 | 1 | 0.816794 | 0 | 0.129032 | 2 | 0.0 | 2 | 1 |
| 4 | 0.583333 | 0 | 0 | 0.245283 | 0.520548 | 0 | 1 | 0.702290 | 1 | 0.096774 | 2 | 0.0 | 2 | 1 |
# Standardize
from sklearn.preprocessing import StandardScaler
# Re-read the data
import pandas as pd
data = pd.read_csv(r"heart.csv")
# Continuous features
list_continuous = ["age", "trestbps", "chol", "thalach", "oldpeak", "ca"]
# Standardize each continuous column
for i in list_continuous:
    scaler = StandardScaler()
    data[i] = scaler.fit_transform(data[[i]])
data.head()
| | age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.952197 | 1 | 3 | 0.763956 | -0.256334 | 1 | 0 | 0.015443 | 0 | 1.087338 | 0 | -0.714429 | 1 | 1 |
| 1 | -1.915313 | 1 | 2 | -0.092738 | 0.072199 | 0 | 1 | 1.633471 | 0 | 2.122573 | 0 | -0.714429 | 2 | 1 |
| 2 | -1.474158 | 0 | 1 | -0.092738 | -0.816773 | 0 | 0 | 0.977514 | 0 | 0.310912 | 2 | -0.714429 | 2 | 1 |
| 3 | 0.180175 | 1 | 1 | -0.663867 | -0.198357 | 0 | 1 | 1.239897 | 0 | -0.206705 | 2 | -0.714429 | 2 | 1 |
| 4 | 0.290464 | 0 | 0 | -0.663867 | 2.082050 | 0 | 1 | 0.583939 | 1 | -0.379244 | 2 | -0.714429 | 2 | 1 |
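Fitting one scaler per column in a loop works, but a scaler can fit several columns at once, and sklearn's ColumnTransformer can scale only the continuous columns while passing the discrete ones through untouched. A minimal sketch of that alternative, using the same heart.csv columns as above:
# One-step alternative: standardize only the continuous columns,
# leave everything else unchanged ("passthrough")
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import pandas as pd
data = pd.read_csv(r"heart.csv")
ct = ColumnTransformer([("num", StandardScaler(), list_continuous)],
                       remainder="passthrough")
# Result is an ndarray: scaled continuous columns first, then the passthrough columns
scaled = ct.fit_transform(data)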