异常值检测

# Expected share of outliers in the data; passed to every detector below as
# its `contamination` argument.
outliers_fraction = 0.01

# Base detectors for the LSCP ensemble: LOF with n_neighbors = 5, 10, ..., 50.
detector_list = [LOF(n_neighbors=k) for k in range(5, 51, 5)]

# Keyword arguments shared by the detectors: all of them take the same
# contamination, and the ones that accept a seed are pinned to random_state=0.
_base_kwargs = {'contamination': outliers_fraction}
_seeded_kwargs = {'contamination': outliers_fraction, 'random_state': 0}

# Display name -> detector instance. The names are used later as plot titles,
# and the insertion order fixes the model numbering and subplot order.
classifiers = {
    'Angle-based Outlier Detector (ABOD)': ABOD(**_base_kwargs),
    'Cluster-based Local Outlier Factor (CBLOF)': CBLOF(
        check_estimator=False, **_seeded_kwargs),
    'Feature Bagging': FeatureBagging(LOF(n_neighbors=35), **_seeded_kwargs),
    'Histogram-base Outlier Detection (HBOS)': HBOS(**_base_kwargs),
    'Isolation Forest': IForest(**_seeded_kwargs),
    'K Nearest Neighbors (KNN)': KNN(**_base_kwargs),
    'Average KNN': KNN(method='mean', **_base_kwargs),
    'Local Outlier Factor (LOF)': LOF(n_neighbors=35, **_base_kwargs),
    'Minimum Covariance Determinant (MCD)': MCD(**_seeded_kwargs),
    'One-class SVM (OCSVM)': OCSVM(**_base_kwargs),
    'Principal Component Analysis (PCA)': PCA(**_seeded_kwargs),
    'Locally Selective Combination (LSCP)': LSCP(
        detector_list, **_seeded_kwargs),
}

# Print the configured detectors, numbered from 1 in dictionary order.
for model_no, model_name in enumerate(classifiers, start=1):
    print('Model', model_no, model_name)
# Stack the num_people and num_order columns side by side into a single
# two-column numpy array (one row per DataFrame row).
X1 = df['num_people'].values[:, np.newaxis]
X2 = df['num_order'].values[:, np.newaxis]
X = np.hstack((X1, X2))

# Evaluation grid for the decision-surface plots: 100 x 100 points over the
# unit square.
# NOTE(review): the grid and the xlim/ylim of (0, 1) below assume both
# features were already scaled to [0, 1] -- confirm against the preprocessing.
xx, yy = np.meshgrid(np.linspace(0, 1, 100), np.linspace(0, 1, 100))
# The grid points are the same for every detector, so build them once.
grid_points = np.c_[xx.ravel(), yy.ravel()]

# Subplot layout: four columns and as many rows as the number of detectors
# requires (3 x 4 for the twelve detectors configured above).
n_cols = 4
n_rows = -(-len(classifiers) // n_cols)  # ceiling division

plt.figure(figsize=(20, 15))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    # Fit the detector on the two-feature data.
    clf.fit(X)
    # Negated decision scores of the training points. PyOD documents larger
    # raw scores as more abnormal, so after negation LOWER means more abnormal.
    scores_pred = clf.decision_function(X) * -1
    # Predicted labels; the code below treats 1 as outlier and 0 as inlier.
    y_pred = clf.predict(X)
    n_inliers = len(y_pred) - np.count_nonzero(y_pred)
    n_outliers = np.count_nonzero(y_pred == 1)

    # NOTE: df1 is an alias of df, not a copy. The 'outlier' column is written
    # into df itself and overwritten on every iteration, so after the loop it
    # holds the labels of the last detector.
    df1 = df
    df1['outlier'] = y_pred.tolist()
    inlier_mask = df1['outlier'] == 0
    outlier_mask = df1['outlier'] == 1

    # num_people / num_order values of the points labelled as inliers.
    inliers_people = df1['num_people'][inlier_mask].values.reshape(-1, 1)
    inliers_order = df1['num_order'][inlier_mask].values.reshape(-1, 1)

    # num_people / num_order values of the points labelled as outliers.
    outliers_people = df1['num_people'][outlier_mask].values.reshape(-1, 1)
    outliers_order = df1['num_order'][outlier_mask].values.reshape(-1, 1)

    # Cut-off on the negated scores: the outliers_fraction percentile of the
    # training scores, i.e. roughly that share of training points lies below.
    threshold = np.percentile(scores_pred, 100 * outliers_fraction)

    # Negated decision scores over the whole grid, shaped like the grid.
    Z = clf.decision_function(grid_points) * -1
    Z = Z.reshape(xx.shape)
    z_min, z_max = Z.min(), Z.max()

    plt.subplot(n_rows, n_cols, i + 1)
    # Blue gradient over the region whose negated score lies between the grid
    # minimum and the threshold. contourf requires strictly increasing levels,
    # so the fill is skipped when the threshold is not above the grid minimum.
    if z_min < threshold:
        plt.contourf(xx, yy, Z, levels=np.linspace(z_min, threshold, 7),
                     cmap=plt.cm.Blues_r)
    # Red line where the negated score equals the threshold.
    a = plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')

    # Orange fill over the region between the threshold and the grid maximum,
    # skipped for the same reason when the threshold is not below the maximum.
    if threshold < z_max:
        plt.contourf(xx, yy, Z, levels=[threshold, z_max], colors='orange')
    b = plt.scatter(x=inliers_people, y=inliers_order, c='white', s=20,
                    edgecolor='k')
    c = plt.scatter(x=outliers_people, y=outliers_order, c='black', s=20,
                    edgecolor='k')

    plt.axis('tight')
    # ContourSet.collections was deprecated in Matplotlib 3.8 and later
    # removed; legend_elements() supplies an equivalent legend handle.
    boundary_handle = a.legend_elements()[0][0]
    plt.legend([boundary_handle, b, c], ['决策函数', '正常值', '异常值'],
               prop=matplotlib.font_manager.FontProperties(size=12),
               loc='upper right')

    plt.xlim((0, 1))
    plt.ylim((0, 1))
    # The x-axis label doubles as a summary of the outlier / inlier counts.
    ss = '异常值数量: ' + str(n_outliers) + ' 正常值数量: ' + str(n_inliers)
    plt.title(clf_name)
    plt.xlabel(ss)
plt.show()


你可能感兴趣的:(python,python,开发语言)