SMOTE上采样生成平衡数据

由于经常需要处理类别不平衡的数据,因此经常会用到 SMOTE。
注意:采样倍数参数 = 目标倍数 - 1(原始样本会与合成样本一并保留),比如要把样本扩充到原来的 10 倍,则参数写为 9。

import random
from sklearn.neighbors import NearestNeighbors
import numpy as np


class Smote:
    """
    SMOTE (Synthetic Minority Over-sampling Technique) over-sampler.

    Every minority sample is interpolated towards randomly chosen nearest
    neighbours from the same class to create new, synthetic samples.

    Parameters
    ----------
    sampling_rate : int
        Number of synthetic samples generated per minority sample.  ``fit``
        returns the original samples as well, so the minority class ends up
        ``sampling_rate + 1`` times its original size.
    k : int
        Number of nearest neighbours considered for each sample.  Neighbours
        are drawn with replacement, so ``sampling_rate`` may exceed ``k``.

    Attributes
    ----------
    newindex : int
        Row of ``synthetic`` that the next generated sample is written to.
    synthetic : numpy.ndarray
        Matrix of generated samples; created by ``fit``.
    """

    def __init__(self, sampling_rate=5, k=5):
        self.sampling_rate = sampling_rate
        self.k = k
        self.newindex = 0

    def fit(self, X, y=None):
        """Generate synthetic minority samples and return the enlarged data.

        Parameters
        ----------
        X : array-like, 2-D
            Feature matrix.  Without ``y`` every row is treated as a
            minority sample.
        y : array-like, optional
            Labels aligned with ``X``; rows labelled 1 are over-sampled and
            rows labelled 0 are passed through unchanged.

        Returns
        -------
        numpy.ndarray or tuple
            Without ``y``: the synthetic samples followed by the original
            samples.  With ``y``: ``(features, labels)`` ordered as
            synthetic samples, original minority samples, label-0 samples.

        Raises
        ------
        ValueError
            If fewer than two minority samples are available, because no
            neighbour exists to interpolate towards.
        """
        X = np.asarray(X)
        negative_X = None
        if y is not None:
            y = np.asarray(y)
            negative_X = X[y == 0]
            X = X[y == 1]

        n_samples, n_features = X.shape
        if n_samples < 2:
            raise ValueError(
                "SMOTE needs at least 2 minority samples, got %d" % n_samples)

        # Restart the write cursor so the same instance can be fitted again.
        self.newindex = 0
        # Pre-allocated storage for the synthetic samples.
        self.synthetic = np.zeros((n_samples * self.sampling_rate, n_features))

        # A sample cannot have more than n_samples - 1 distinct neighbours.
        n_neighbors = min(self.k, n_samples - 1)
        knn = NearestNeighbors(n_neighbors=n_neighbors).fit(X)
        # Querying without an argument looks up the neighbours of every
        # training sample in one call and leaves the sample itself out of
        # its own neighbour list.
        all_neighbors = knn.kneighbors(return_distance=False)
        for i, k_neighbors in enumerate(all_neighbors):
            self.synthetic_samples(X, i, k_neighbors)

        if y is not None:
            n_positive = len(self.synthetic) + len(X)
            features = np.concatenate((self.synthetic, X, negative_X), axis=0)
            labels = np.concatenate(
                (np.ones(n_positive, dtype=int), y[y == 0]), axis=0)
            return features, labels

        return np.concatenate((self.synthetic, X), axis=0)

    def synthetic_samples(self, X, i, k_neighbors):
        """Create ``sampling_rate`` synthetic samples around ``X[i]``.

        Each new sample lies at a random position on the segment between
        ``X[i]`` and a randomly chosen neighbour.  The results are written
        to ``self.synthetic`` starting at ``self.newindex``, which is then
        advanced by ``sampling_rate``.

        Parameters
        ----------
        X : numpy.ndarray
            Minority-class feature matrix.
        i : int
            Row index of the sample being over-sampled.
        k_neighbors : array-like of int
            Row indices of the neighbours of ``X[i]``.  The index ``i``
            itself is ignored when present, because interpolating a sample
            with itself would only produce an exact duplicate.
        """
        k_neighbors = np.asarray(k_neighbors)
        candidates = k_neighbors[k_neighbors != i]
        if candidates.size == 0:
            # Nothing but the sample itself was supplied; fall back to it
            # (yields copies of X[i]) rather than failing.
            candidates = k_neighbors

        # Both draws come from NumPy's global generator, so seeding
        # np.random is enough to make the output reproducible.
        picks = np.random.choice(candidates, size=self.sampling_rate)
        gaps = np.random.random((self.sampling_rate, 1))

        start = self.newindex
        stop = start + self.sampling_rate
        self.synthetic[start:stop] = X[i] + gaps * (X[picks] - X[i])
        self.newindex = stop

def mat(inputfile, n_features=39):
    """Load the numeric body of a comma-separated file into a float32 matrix.

    The first line is treated as a header and skipped.  Every remaining
    non-blank line is split on commas and parsed as floats; the first field
    of each line is dropped and the rest becomes one matrix row.

    Parameters
    ----------
    inputfile : str
        Path of the file to read.
    n_features : int, optional
        Number of columns kept per row (fields per line minus the first).
        Defaults to 39.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(n_data_lines, n_features)``.

    Raises
    ------
    ValueError
        If a field is not a valid float or a row does not fit
        ``n_features`` columns.
    """
    rows = []
    # A single pass inside a context manager: the handle is always closed
    # and the file is not read twice just to count its lines.
    with open(inputfile, "r") as fr:
        next(fr, None)  # skip the header line
        for raw_line in fr:
            if not raw_line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            values = [float(field) for field in raw_line.split(",")]
            rows.append(values[1:])

    # Size the matrix by the number of data rows only, so that no all-zero
    # phantom row is left behind for the header line.
    data = np.zeros((len(rows), n_features), np.float32)
    for row_index, row in enumerate(rows):
        data[row_index] = row
    return data
import pandas as pd
# Script section: load one feature CSV, over-sample it with Smote and write
# the enlarged matrix next to the input.
path = "F:/M6A/featureFile/huang/ANF/"
inputfilename = "FullTrainPosCD8T.csv"

# Read through a context manager so the file handle is closed once pandas
# has parsed it.
with open(path + inputfilename, "r") as csv_file:
    Mat_Label = pd.read_csv(csv_file, delimiter=",", skiprows=0)
Mat_Label = Mat_Label.fillna(0)  # missing values become 0

Mat_array = np.array(Mat_Label)
# Drop the first column of the table before sampling.
Mat_array = np.delete(Mat_array, 0, axis=1)

# One label "1" per loaded row; the count follows the data instead of being
# hard-coded for one particular file.
labels1 = list(np.ones(len(Mat_array), int))
labels = labels1
Label = np.array(labels)
X = Mat_array
y = Label

smote = Smote(sampling_rate=9, k=5)
# fit is called without y, so every row of X is over-sampled.
a = smote.fit(X)
# NOTE(review): the "smote/" sub-directory presumably has to exist already;
# nothing here creates it - confirm.
np.savetxt(path + "smote/balan_" + inputfilename, a, delimiter=",")

你可能感兴趣的:(python代码)