[Data Mining] Classification Algorithm Study: ID3

        ID3 (Iterative Dichotomiser 3) is a classic decision-tree learning algorithm proposed by Ross Quinlan in 1986, used mainly for classification problems with discrete features. Its core idea is to select the best feature for each node split by information gain and to build the decision tree recursively.
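
At each node, ID3 evaluates every candidate feature by the entropy of the class labels and the resulting information gain (the standard definitions, stated here for reference):

H(S) = -\sum_{k} p_k \log_2 p_k

\mathrm{Gain}(S, A) = H(S) - \sum_{v \in \mathrm{Values}(A)} \frac{|S_v|}{|S|} H(S_v)

where p_k is the fraction of samples in S belonging to class k, and S_v is the subset of S on which feature A takes the value v. The feature with the largest gain is chosen for the split.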

Requirements:

        Understand and master the ID3 algorithm: understand how it works, implement it, classify the given data set, and analyze each individual's personal equity plan (PEP) uptake.

Code implementation:

import pandas as pd
import numpy as np

# Data preprocessing
# Read the data (skip the ARFF metadata lines)
data_path = r"D:\课程\数据挖掘\实验三\实验3-bank-data.csv"
df = pd.read_csv(data_path, skiprows=12, header=None, on_bad_lines='skip')

# Define the column names (from the @attribute section of the ARFF header)
columns = [
    "id", "age", "sex", "region", "income",
    "married", "children", "car", "save_act",
    "current_act", "mortgage", "pep"
]
df.columns = columns

# Drop any extra empty columns (adjust to the actual data)
df = df.iloc[:, :12]

# Discretize the numeric attributes (ID3 handles only discrete features)
def discretize_age(age):
    if age < 30: return "Young"
    elif 30 <= age <= 50: return "Middle"
    else: return "Senior"

# Compute the income tercile boundaries once rather than on every call
income_q1 = df["income"].quantile(0.33)
income_q2 = df["income"].quantile(0.66)

def discretize_income(income):
    if income <= income_q1: return "Low"
    elif income <= income_q2: return "Medium"
    else: return "High"

df["age_group"] = df["age"].apply(discretize_age)
df["income_level"] = df["income"].apply(discretize_income)

# Define the features and the target variable
features = ["age_group", "sex", "region", "income_level", "married", "children"]
target_col = "pep"

# Core algorithm implementation
class TreeNode:
    def __init__(self, feature=None, value=None, results=None, branches=None):
        self.feature = feature   # feature this node splits on
        self.value = value       # branch value that leads to this node
        self.results = results   # class distribution at a leaf (None for internal nodes)
        self.branches = branches # dict mapping feature values to child nodes

def entropy(labels):
    # Shannon entropy of a label array; a 50/50 split yields 1.0 bit
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / len(labels)
    return -np.sum(probs * np.log2(probs + 1e-10))  # epsilon guards against log2(0)

def information_gain(data, feature, target):
    # Entropy of the target minus the weighted entropy after splitting on feature
    total_entropy = entropy(data[target])
    values, counts = np.unique(data[feature], return_counts=True)
    weighted_entropy = 0
    for val, cnt in zip(values, counts):
        subset = data[data[feature] == val]
        weighted_entropy += (cnt/len(data)) * entropy(subset[target])
    return total_entropy - weighted_entropy

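# C4.5 scores splits with the gain ratio instead of raw information gain:
# normalizing by the split information (the entropy of the feature's own
# value distribution) penalizes features with many distinct values.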
def gain_ratio(data, feature, target):
    info_gain = information_gain(data, feature, target)
    split_info = entropy(data[feature])
    return info_gain / split_info if split_info > 0 else 0

def id3(data, features, target):
    # Stopping condition 1: all samples belong to the same class
    class_counts = data[target].value_counts().to_dict()
    if len(class_counts) == 1:
        return TreeNode(results=class_counts)
    
    # Stopping condition 2: no remaining features
    if len(features) == 0:
        majority_class = max(class_counts, key=class_counts.get)
        return TreeNode(results={majority_class: class_counts[majority_class]})
    
    # Choose the feature with the largest information gain
    gains = {f: information_gain(data, f, target) for f in features}
    best_feature = max(gains, key=gains.get)
    
    # Build the node
    node = TreeNode(feature=best_feature)
    remaining_features = [f for f in features if f != best_feature]
    node.branches = {}
    
    # Recursively build the subtrees
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        if len(subset) == 0:
            # Empty subset: fall back to the parent's majority class
            majority_class = max(class_counts, key=class_counts.get)
            node.branches[value] = TreeNode(results={majority_class: class_counts[majority_class]})
        else:
            node.branches[value] = id3(subset, remaining_features, target)
    return node

def c45(data, features, target):
    # Same stopping conditions as ID3
    if len(np.unique(data[target])) == 1:
        return TreeNode(results=data[target].value_counts().to_dict())
    if len(features) == 0:
        return TreeNode(results=data[target].value_counts().to_dict())
    
    # Choose the best feature by gain ratio
    ratios = {f: gain_ratio(data, f, target) for f in features}
    best_feature = max(ratios, key=ratios.get)
    
    # Build the node
    node = TreeNode(feature=best_feature)
    remaining_features = [f for f in features if f != best_feature]
    node.branches = {}
    
    # Recursively build the subtrees
    for value in data[best_feature].unique():
        subset = data[data[best_feature] == value]
        if len(subset) == 0:
            node.branches[value] = TreeNode(results=data[target].value_counts().to_dict())
        else:
            node.branches[value] = c45(subset, remaining_features, target)
    return node

# Model training and evaluation
# Split into training and test sets (70/30)
train_data = df.sample(frac=0.7, random_state=42)
test_data = df.drop(train_data.index)

# Train both models
id3_tree = id3(train_data, features, target_col)
c45_tree = c45(train_data, features, target_col)

# Classification function
def subtree_distribution(tree):
    # Merged class counts of all leaves under a node
    if tree.results is not None:
        return dict(tree.results)
    merged = {}
    for branch in tree.branches.values():
        for cls, cnt in subtree_distribution(branch).items():
            merged[cls] = merged.get(cls, 0) + cnt
    return merged

def classify(tree, sample):
    if tree.results is not None:
        return max(tree.results, key=tree.results.get)
    value = sample[tree.feature]
    if value not in tree.branches:
        # Unseen feature value: fall back to the subtree's majority class
        dist = subtree_distribution(tree)
        return max(dist, key=dist.get)
    return classify(tree.branches[value], sample)

# Evaluation function: classification accuracy on the test set
def evaluate(tree, test_data):
    correct = 0
    for _, row in test_data.iterrows():
        pred = classify(tree, row)
        if pred == row[target_col]:
            correct += 1
    return correct / len(test_data)

# Print the results
print(f"ID3 accuracy: {evaluate(id3_tree, test_data):.2f}")
print(f"C4.5 accuracy: {evaluate(c45_tree, test_data):.2f}")

Execution results:
