Kaggle in Practice (3): Feature Engineering on the Dataset

This section walks through transforming the features of the given data and constructing new ones.

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import StratifiedKFold
from MeanEncoder import MeanEncoder  # local helper module: MeanEncoder.py must sit next to this script

dpath = 'F:/Python_demo/XGBoost/data/'
train = pd.read_json(dpath + "RentListingInquries_train.json")
test = pd.read_json(dpath + "RentListingInquries_test.json")
train.head()

[Figure 1: the first rows of the training set, as shown by train.head()]

1. Encode the class labels as integers

y_map = {'low': 2, 'medium': 1, 'high': 0}
train['interest_level'] = train['interest_level'].apply(lambda x: y_map[x])  # lambda defines an inline expression: x is the input, y_map[x] the corresponding numeric code
#y_train = train.interest_level.values
y_train = train.interest_level
train = train.drop(['listing_id', 'interest_level'], axis=1)  # drop the id and target columns

listing_id = test.listing_id.values
test = test.drop('listing_id', axis=1)  # drop the id column

ntrain = train.shape[0]
train_test = pd.concat((train, test), axis=0).reset_index(drop=True)  # concatenate train and test so both go through the same feature transforms

2. Remove noise and outliers

# cap extreme prices
#ulimit = np.percentile(train_test.price.values, 99)
train_test.loc[train_test['price'] > 13000, 'price'] = 13000
# fix values that were most likely data-entry errors
train_test.loc[train_test["bathrooms"] == 112, "bathrooms"] = 1.5
train_test.loc[train_test["bathrooms"] == 10, "bathrooms"] = 1
train_test.loc[train_test["bathrooms"] == 20, "bathrooms"] = 2

3. Construct new features

# price per room; the +1.0 in the denominator avoids division by zero
train_test['price_bathrooms'] = train_test["price"] / (train_test["bathrooms"] + 1.0)
train_test['price_bedrooms'] = train_test["price"] / (train_test["bedrooms"] + 1.0)
train_test["room_diff"] = train_test["bathrooms"] - train_test["bedrooms"]
train_test["room_num"] = train_test["bedrooms"] + train_test["bathrooms"]

4. Handle the date variable

# parse the creation timestamp and expand it into calendar features
train_test['Date'] = pd.to_datetime(train_test['created'])
train_test['Year'] = train_test['Date'].dt.year
train_test['Month'] = train_test['Date'].dt.month
train_test['Day'] = train_test['Date'].dt.day
train_test['Wday'] = train_test['Date'].dt.dayofweek
train_test['Yday'] = train_test['Date'].dt.dayofyear
train_test['hour'] = train_test['Date'].dt.hour

train_test = train_test.drop(['Date', 'created'], axis=1)
train_test['hour'].head()

5. Process the text features

# number of words in the description
train_test["num_description_words"] = train_test["description"].apply(lambda x: len(x.split(" ")))
train_test = train_test.drop(['description'], axis=1)
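TfidfVectorizer is imported at the top but never used. As a sketch, the raw description text could also be vectorized, provided it runs before the drop on the line above (max_features and the fillna handling are illustrative choices, not from the original):

# TF-IDF over the raw descriptions; must run before 'description' is dropped
tfidf = TfidfVectorizer(stop_words='english', max_features=100)
desc_sparse = tfidf.fit_transform(train_test['description'].fillna(''))
# desc_sparse could later be hstack-ed together with c_vect_sparse (section 8)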

# flag managers in the top 1%, 2%, 5%, 10%, 15%, 20%, 25%, 30% and 50% by listing volume
managers_count = train_test['manager_id'].value_counts()  # distinct manager ids with their listing counts
for pct in [1, 2, 5, 10, 15, 20, 25, 30, 50]:
    # a manager is "top pct%" if their listing count is at or above the (100 - pct)th percentile;
    # e.g. top_10_manager uses the 90th percentile of all managers' counts
    threshold = np.percentile(managers_count.values, 100 - pct)
    top_managers = set(managers_count.index.values[managers_count.values >= threshold])
    train_test[f'top_{pct}_manager'] = train_test['manager_id'].isin(top_managers).astype(int)
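Beyond these tier flags, the raw listing count per manager can itself serve as a numeric feature; a one-line sketch (manager_count is a new, illustrative column name):

# number of listings handled by each manager, mapped back onto every row
train_test['manager_count'] = train_test['manager_id'].map(managers_count)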

6. Cluster encoding and L1 distance

# Clustering on location
train_location = train_test.loc[:ntrain-1, ['latitude', 'longitude']]
test_location = train_test.loc[ntrain:, ['latitude', 'longitude']]

kmeans_cluster = KMeans(n_clusters=20)  # cluster the training locations into 20 groups
kmeans_cluster.fit(train_location)  # fit on the training rows only
res = kmeans_cluster.predict(pd.concat((train_location, test_location), axis=0).reset_index(drop=True))  # assign every row to a cluster

train_test['centroid'] = res

# L1 norm: distance from each listing to the mean of the training coordinates
center = [train_location['latitude'].mean(), train_location['longitude'].mean()]
train_test['distance'] = abs(train_test['latitude'] - center[0]) + abs(train_test['longitude'] - center[1])
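A related feature would be each listing's distance to its own cluster centroid rather than to the global mean; a minimal sketch using the fitted KMeans centers (centroid_distance is an illustrative new column name):

# L1 distance from each listing to its assigned cluster centroid
centers = kmeans_cluster.cluster_centers_  # shape (20, 2): [latitude, longitude]
assigned = centers[res]  # the centroid of each row's assigned cluster
train_test['centroid_distance'] = (
    np.abs(train_test['latitude'].values - assigned[:, 0])
    + np.abs(train_test['longitude'].values - assigned[:, 1]))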

7. Encode categorical features

categoricals = ['building_id', 'manager_id', 'display_address', 'street_address']
#categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
for feat in categoricals:
    lbl = LabelEncoder()  # map each distinct string to an integer id
    lbl.fit(list(train_test[feat].values))
    train_test[feat] = lbl.transform(list(train_test[feat].values))
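The imports include MeanEncoder and StratifiedKFold, which this post never actually uses; they suggest target (mean) encoding as an alternative to plain label encoding. Below is a hand-rolled, out-of-fold sketch for manager_id, written with plain pandas so it does not depend on the MeanEncoder module (manager_high_rate and all parameters here are illustrative):

# fraction of "high interest" listings (y == 0) per manager, filled out-of-fold on train
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
is_high = (y_train.values == 0).astype(float)
mids = train_test.loc[:ntrain-1, 'manager_id'].values
oof = np.full(ntrain, is_high.mean())
for fit_idx, enc_idx in skf.split(mids, y_train.values):
    # per-manager rate computed on the fitting folds only
    rates = pd.Series(is_high[fit_idx]).groupby(mids[fit_idx]).mean()
    oof[enc_idx] = pd.Series(mids[enc_idx]).map(rates).fillna(is_high.mean()).values
train_test.loc[:ntrain-1, 'manager_high_rate'] = oof
# test rows get the rate computed on the full training set
full_rates = pd.Series(is_high).groupby(mids).mean()
train_test.loc[ntrain:, 'manager_high_rate'] = (
    train_test.loc[ntrain:, 'manager_id'].map(full_rates).fillna(is_high.mean()))

The out-of-fold split matters here: filling a training row from statistics that include its own target would leak the label into the feature.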

8. Count word occurrences

train_test['features_count'] = train_test['features'].apply(lambda x: len(x))  # number of feature tags per listing
train_test['features2'] = train_test['features']
train_test['features2'] = train_test['features2'].apply(lambda x: ' '.join(x))  # join the tag list into one space-separated string

c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))  # count each token's occurrences, drop English stop words, keep only the 200 most frequent tokens
c_vect_sparse = c_vect.fit_transform(train_test['features2'])  # fit the vocabulary and build the sparse count matrix
c_vect_sparse_cols = c_vect.get_feature_names_out()  # the vocabulary (use get_feature_names() on scikit-learn < 1.0)

train_test.drop(['features', 'features2'], axis=1, inplace=True)
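scipy's sparse.hstack requires every remaining column of train_test to be numeric; a quick guard one can place just before the hstack below makes any stray string column fail loudly:

# every remaining column must be numeric before stacking
non_numeric = train_test.select_dtypes(include='object').columns.tolist()
assert not non_numeric, 'non-numeric columns remain: %s' % non_numeric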

# hstack is the last step of feature processing; every other feature must already be numeric
train_test_sparse = sparse.hstack([train_test, c_vect_sparse]).tocsr()

9. Save the engineered features to files

# save as CSV so the result is easy to inspect in Excel
train_test_new = pd.DataFrame(train_test_sparse.toarray())
X_train = train_test_new.iloc[:ntrain, :]
X_test = train_test_new.iloc[ntrain:, :]

train_new = pd.concat((X_train, y_train.reset_index(drop=True)), axis=1)  # reset y_train's index so the labels line up with the re-indexed features
train_new.to_csv(dpath + 'RentListingInquries_FE_train.csv', index=False)
X_test.to_csv(dpath + 'RentListingInquries_FE_test.csv', index=False)
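Densifying with .toarray() is convenient for Excel but memory-hungry on wide data; a sketch of keeping everything sparse instead (the .npz filenames are illustrative, and the label vector would still need to be stored separately):

# keep the matrix sparse; row-slicing a CSR matrix is cheap
sparse.save_npz(dpath + 'RentListingInquries_FE_train.npz', train_test_sparse[:ntrain])
sparse.save_npz(dpath + 'RentListingInquries_FE_test.npz', train_test_sparse[ntrain:])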

 
