二手车价格预测-特征工程

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the three CSV files; each one is parsed with a single space as the
# field separator.
sample = pd.read_csv('used_car_sample_submit.csv', sep=' ')
test = pd.read_csv('used_car_testA_20200313.csv', sep=' ')
train = pd.read_csv('used_car_train_20200313.csv', sep=' ')

封装一个基于箱线图(IQR)的异常值处理函数:

def outliers_proc(data, col_name, scale=3):
    """
    Remove box-plot (IQR) outliers of one column and plot before/after.

    A value counts as an outlier when it lies more than ``scale`` times the
    interquartile range below the first quartile or above the third
    quartile. The input frame is not modified; a filtered copy is returned.
    Summary information is printed and two box plots (original column on
    the left, cleaned column on the right) are drawn.

    param data: pandas.DataFrame to clean
    param col_name: label of the column to test for outliers
    param scale: multiplier applied to the IQR (default 3)
    return: new DataFrame without the outlier rows, index reset to 0..n-1
    """
    def box_plot_outliers(data_ser, box_scale):
        """
        Flag the values of a Series that fall outside the box-plot fences.

        param data_ser: pandas.Series to inspect
        param box_scale: multiplier applied to the IQR
        return: (rule_low, rule_up) boolean Series marking values below the
                lower fence and above the upper fence
        """
        iqr = box_scale * (data_ser.quantile(0.75) - data_ser.quantile(0.25))
        val_low = data_ser.quantile(0.25) - iqr
        val_up = data_ser.quantile(0.75) + iqr
        return data_ser < val_low, data_ser > val_up

    data_series = data[col_name]
    rule_low, rule_up = box_plot_outliers(data_series, box_scale=scale)

    # Work with plain positional boolean arrays. Dropping rows by
    # position numbers via DataFrame.drop (which expects index *labels*) is
    # only correct for a default 0..n-1 index; on any other index it removes
    # the wrong rows or raises KeyError. Positional masks are always right.
    low_mask = rule_low.to_numpy()
    outlier_mask = low_mask | rule_up.to_numpy()

    print('Delete number is:{}'.format(int(outlier_mask.sum())))
    data_n = data.loc[~outlier_mask].reset_index(drop=True)
    # The message says "column" but the value is the remaining row count;
    # the text is kept unchanged so existing output stays the same.
    print('now column number is:{}'.format(data_n.shape[0]))

    outliers = data_series[low_mask]
    print('Description of data less than the lower bound is:')
    print(outliers.describe())

    # Left: column before cleaning. Right: column after cleaning.
    _, ax = plt.subplots(1, 2, figsize=(10, 7))
    sns.boxplot(y=data[col_name], data=data, palette='Set1', ax=ax[0])
    sns.boxplot(y=data_n[col_name], data=data_n, palette='Set2', ax=ax[1])
    return data_n

删除某列异常值:

# Apply the outlier filter to the 'power' column of the training frame.
# NOTE(review): the returned frame is not assigned to anything, so `train`
# itself is left unchanged by this call — confirm that is intended.
outliers_proc(data=train, col_name='power', scale=3)

(图1:power 列异常值处理前(左)与处理后(右)的箱线图对比)

你可能感兴趣的:(二手车价格预测)