# This script performs feature transformation on the given data and constructs new features.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from nltk.metrics import distance as distance
from sklearn.model_selection import StratifiedKFold
from MeanEncoder import MeanEncoder
# Data directory and raw inputs (RentListingInquries train/test JSON dumps).
dpath = 'F:/Python_demo/XGBoost/data/'
# Each record is one rental listing; train additionally carries `interest_level`.
train = pd.read_json(dpath +"RentListingInquries_train.json")
test = pd.read_json(dpath+"RentListingInquries_test.json")
train.head()  # quick visual sanity check (only shows output in a notebook)
# 1. Encode the class labels as integers
# Map the target onto integers: high=0, medium=1, low=2.
y_map = {'low': 2, 'medium': 1, 'high': 0}
train['interest_level'] = train['interest_level'].apply(lambda lvl: y_map[lvl])
# Keep the label as a Series so it can be concatenated back before saving.
y_train = train.interest_level
# The id and the target must not take part in feature transformation.
train = train.drop(columns=['listing_id', 'interest_level'])
listing_id = test.listing_id.values
test = test.drop(columns=['listing_id'])
ntrain = train.shape[0]
# Stack train and test vertically so both go through identical transforms;
# the first `ntrain` rows of `train_test` are the training set.
train_test = pd.concat([train, test], axis=0).reset_index(drop=True)
# 2. Remove noise and outliers
# Clip outliers and correct obvious data-entry errors.
# Cap extreme prices at 13000 (roughly the 99th percentile of training prices).
# Use a single .loc[row_mask, column] assignment: the original chained form
# train_test['price'].loc[mask] = v selects a Series first and may write to a
# copy instead of the frame (pandas SettingWithCopyWarning).
train_test.loc[train_test['price'] > 13000, 'price'] = 13000
# Fix bathroom counts that look like typos (112 -> 1.5, 10 -> 1, 20 -> 2).
train_test.loc[train_test["bathrooms"] == 112, "bathrooms"] = 1.5
train_test.loc[train_test["bathrooms"] == 10, "bathrooms"] = 1
train_test.loc[train_test["bathrooms"] == 20, "bathrooms"] = 2
# 3. Construct new features
# Derived per-room price and room-count features.
# The +1.0 in the denominators guards against division by zero for studios.
train_test['price_bathrooms'] = train_test['price'] / (train_test['bathrooms'] + 1.0)
train_test['price_bedrooms'] = train_test['price'] / (train_test['bedrooms'] + 1.0)
# Imbalance and total of the two room counts.
train_test['room_diff'] = train_test['bathrooms'] - train_test['bedrooms']
train_test['room_num'] = train_test['bedrooms'] + train_test['bathrooms']
# 4. Process the date variable
# Expand the listing creation timestamp into calendar components.
# (The original materialized a temporary 'Date' column and dropped it again;
# working from a local Series yields the identical final frame.)
created_dt = pd.to_datetime(train_test['created'])
train_test['Year'] = created_dt.dt.year
train_test['Month'] = created_dt.dt.month
train_test['Day'] = created_dt.dt.day
train_test['Wday'] = created_dt.dt.dayofweek
train_test['Yday'] = created_dt.dt.dayofyear
train_test['hour'] = created_dt.dt.hour
# The raw timestamp string is no longer needed.
train_test = train_test.drop(['created'], axis=1)
train_test['hour'].head()
# 5. Process text information
# 描述中出现的单词个数
# Number of space-separated tokens in the free-text description,
# then drop the raw text column.
train_test['num_description_words'] = train_test['description'].apply(lambda text: len(text.split(' ')))
train_test = train_test.drop(['description'], axis=1)
# 将manager分为几个等级 top 1%, 2%, 5, 10, 15, 20, 25, 30, 50,
# Flag managers whose listing counts fall into the top X% of the
# manager-frequency distribution (top 1, 2, 5, 10, 15, 20, 25, 30, 50 percent).
managers_count = train_test['manager_id'].value_counts()  # listings per manager
# The original repeated the same statement nine times and tested membership
# against an ndarray (O(n) per row, O(n^2) overall). One loop with a
# vectorized isin() is equivalent and linear. The iteration order matches the
# original column-creation order so the final column layout is unchanged.
for pct in (10, 25, 5, 50, 1, 2, 15, 20, 30):
    threshold = np.percentile(managers_count.values, 100 - pct)
    top_ids = set(managers_count.index[managers_count.values >= threshold])
    train_test['top_{0}_manager'.format(pct)] = train_test['manager_id'].isin(top_ids).astype(int)
# 6. Cluster encoding and L1 distance to the center
# Location features: k-means cluster id and L1 distance to the center.
# Fit k-means on the training rows only (no leakage of test coordinates into
# the cluster centers), then assign every row (train + test) to one of the
# 20 clusters.
train_location = train_test.loc[:ntrain-1, ['latitude', 'longitude']]
kmeans_cluster = KMeans(n_clusters=20)
# NOTE(review): no random_state is set, so cluster ids are not reproducible
# between runs — consider pinning one.
kmeans_cluster.fit(train_location)  # the original bound the estimator to `res` and immediately overwrote it
# Predict directly on the combined frame's coordinate columns; this is the
# same row set the original rebuilt via concat(train, test).
# Column renamed from the original's misspelled 'cenroid'; nothing else in
# this file reads it by name (hstack below discards column labels).
train_test['centroid'] = kmeans_cluster.predict(train_test[['latitude', 'longitude']])
# L1 (Manhattan) distance from the mean training coordinate.
center = [train_location['latitude'].mean(), train_location['longitude'].mean()]
train_test['distance'] = abs(train_test['latitude'] - center[0]) + abs(train_test['longitude'] - center[1])
# 7. Encode categorical features
# Integer-encode the high-cardinality categorical id/address columns.
categoricals = ['building_id', 'manager_id', 'display_address', 'street_address']
#categoricals = [x for x in train_test.columns if train_test[x].dtype == 'object']
for col in categoricals:
    encoder = LabelEncoder()
    # fit_transform is the one-step equivalent of the original fit + transform.
    train_test[col] = encoder.fit_transform(list(train_test[col].values))
# 8. Compute word frequencies
# Bag-of-words features from the `features` tag-list column.
train_test['features_count'] = train_test['features'].apply(len)  # number of tags per listing
# Join each listing's tag list into one space-separated string for the vectorizer.
# (The original staged this in a temporary 'features2' column and dropped it;
# a local Series leaves the final frame identical.)
feature_text = train_test['features'].apply(lambda tags: ' '.join(tags))
# Term counts over the 200 most frequent unigrams, English stop words removed.
c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect_sparse = c_vect.fit_transform(feature_text)
c_vect_sparse_cols = c_vect.get_feature_names()  # NOTE(review): removed in scikit-learn >= 1.2; use get_feature_names_out() there
train_test.drop(['features'], axis=1, inplace=True)
# Final assembly: every remaining column must already be numeric before the
# sparse horizontal stack.
train_test_sparse = sparse.hstack([train_test, c_vect_sparse]).tocsr()
# 9. Save the feature-engineering results to files
# Persist the engineered matrix as CSV so it can be inspected in Excel.
# Densify the sparse matrix; column labels become 0..n-1.
train_test_new = pd.DataFrame(train_test_sparse.toarray())
X_train = train_test_new.iloc[:ntrain]   # first ntrain rows came from the training set
X_test = train_test_new.iloc[ntrain:]    # remainder is the test set
# Re-attach the label column before writing the training file.
train_new = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
train_new.to_csv(dpath + 'RentListingInquries_FE_train.csv', index=False)
X_test.to_csv(dpath + 'RentListingInquries_FE_test.csv', index=False)