ID3(Iterative Dichotomiser 3)是一种经典的决策树学习算法,由Ross Quinlan于1986年提出,主要用于处理离散特征的分类问题。其核心思想是通过信息增益选择最优特征进行节点分裂,递归构建决策树。
实验目的:理解并掌握ID3算法及其原理,能够自行实现该算法,并用它对给定的数据集进行分类,分析个人参股(pep)的情况。
import pandas as pd
import numpy as np
from math import log2
# Data preprocessing
# Read the data, skipping the leading ARFF metadata lines.
data_path = r"D:\课程\数据挖掘\实验三\实验3-bank-data.csv"
df = pd.read_csv(data_path, skiprows=12, header=None, on_bad_lines='skip')
# Column names (taken from the @attribute section of the file).
columns = [
    "id", "age", "sex", "region", "income",
    "married", "children", "car", "save_act",
    "current_act", "mortgage", "pep",
]
# Drop surplus trailing columns BEFORE naming: assigning `columns` to a frame
# that is wider than the name list raises a length-mismatch error, so the trim
# has to happen first for it to have any effect.
df = df.iloc[:, :len(columns)]
df.columns = columns
# Discretize the numeric attributes
def discretize_age(age, young_below=30, senior_above=50):
    """Map a numeric age onto one of three labels.

    Args:
        age: Numeric age.
        young_below: Ages strictly below this value are "Young" (default 30).
        senior_above: Ages strictly above this value are "Senior" (default 50).

    Returns:
        "Young", "Middle" (young_below <= age <= senior_above) or "Senior".
        A value that fails both comparisons (e.g. NaN) falls through to
        "Senior".
    """
    if age < young_below:
        return "Young"
    if age <= senior_above:
        return "Middle"
    return "Senior"
def discretize_income(income, low_cut=None, high_cut=None):
    """Map a numeric income onto "Low", "Medium" or "High".

    Args:
        income: Numeric income value.
        low_cut: Upper bound (inclusive) of the "Low" band. When omitted it is
            the 0.33 quantile of the module-level ``df["income"]`` column.
        high_cut: Upper bound (inclusive) of the "Medium" band. When omitted it
            is the 0.66 quantile of the module-level ``df["income"]`` column.

    Returns:
        "Low" if income <= low_cut, "Medium" if low_cut < income <= high_cut,
        otherwise "High".

    Note:
        Without explicit cut-offs both quantiles are recomputed on every call.
        When labelling a whole column, compute them once and pass them in
        (e.g. ``series.apply(discretize_income, args=(q1, q2))``).
    """
    if low_cut is None:
        low_cut = df["income"].quantile(0.33)
    if high_cut is None:
        high_cut = df["income"].quantile(0.66)

    if income <= low_cut:
        return "Low"
    if low_cut < income <= high_cut:
        return "Medium"
    return "High"
# Add the discretized versions of the two numeric attributes.
df["age_group"] = df["age"].map(discretize_age)
df["income_level"] = df["income"].map(discretize_income)
# Target variable and the attributes the trees may split on.
target_col = "pep"
features = [
    "age_group",
    "sex",
    "region",
    "income_level",
    "married",
    "children",
]
# Core algorithm implementation
class TreeNode:
    """One node of a decision tree.

    A node whose ``results`` is not None is treated as a leaf by the
    classifier in this file; otherwise it is an internal node that splits on
    ``feature`` and routes through ``branches``.
    """

    def __init__(self, feature=None, value=None, results=None, branches=None):
        self.feature = feature    # attribute this node splits on
        self.value = value        # branch value (not set by the builders in this file)
        self.results = results    # leaf only: mapping of class label -> count
        self.branches = branches  # internal only: mapping of attribute value -> child node

    def __repr__(self):
        # Debug-friendly dump; child nodes are rendered recursively.
        return (
            f"{type(self).__name__}(feature={self.feature!r}, "
            f"value={self.value!r}, results={self.results!r}, "
            f"branches={self.branches!r})"
        )
def entropy(labels):
    """Return the Shannon entropy (in bits) of a sequence of labels.

    Args:
        labels: Array-like of class labels (list, numpy array, pandas Series).

    Returns:
        The entropy ``-sum(p * log2(p))`` over the observed label
        frequencies; 0.0 for an empty input.
    """
    total = len(labels)
    if total == 0:
        return 0.0
    _, counts = np.unique(labels, return_counts=True)
    probs = counts / total
    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and no epsilon is needed inside the log. Adding one
    # would push the entropy of a pure set slightly below zero.
    return -np.sum(probs * np.log2(probs))
def information_gain(data, feature, target):
    """Return the information gain of splitting ``data`` on ``feature``.

    Args:
        data: DataFrame holding both the ``feature`` and ``target`` columns.
        feature: Name of the column to split on.
        target: Name of the class-label column.

    Returns:
        Entropy of the target column minus the size-weighted entropy of the
        target within each distinct value of ``feature``.
    """
    n_rows = len(data)
    split_column = data[feature]
    levels, level_sizes = np.unique(split_column, return_counts=True)
    # Conditional entropy: each partition's entropy weighted by its share of rows.
    conditional_entropy = sum(
        (size / n_rows) * entropy(data.loc[split_column == level, target])
        for level, size in zip(levels, level_sizes)
    )
    return entropy(data[target]) - conditional_entropy
def gain_ratio(data, feature, target):
    """Return the gain ratio of splitting ``data`` on ``feature``.

    The gain ratio is the information gain divided by the split information,
    i.e. the entropy of the ``feature`` column itself.

    Args:
        data: DataFrame holding both the ``feature`` and ``target`` columns.
        feature: Name of the column to split on.
        target: Name of the class-label column.

    Returns:
        ``information_gain / split_info``, or 0.0 when the split information
        is (numerically) zero, as happens when the column has a single
        distinct value.
    """
    split_info = entropy(data[feature])
    # The entropy of a single-valued column is a floating-point result that is
    # not guaranteed to be exactly 0 (it can even come out marginally
    # negative), so an exact `!= 0` test is unreliable. Compare against a
    # small tolerance instead to avoid a near-zero or negative denominator.
    if split_info <= 1e-10:
        return 0.0
    return information_gain(data, feature, target) / split_info
def id3(data, features, target):
    """Recursively build a decision tree, choosing splits by information gain.

    Args:
        data: DataFrame with the candidate feature columns and ``target``.
        features: Names of the columns still available for splitting.
        target: Name of the class-label column.

    Returns:
        A TreeNode. Leaves carry ``results`` (class -> count); internal nodes
        carry ``feature`` and a ``branches`` dict keyed by feature value.
    """
    class_counts = data[target].value_counts().to_dict()

    # Stop 1: every sample has the same class.
    if len(class_counts) == 1:
        return TreeNode(results=class_counts)

    def majority_leaf():
        # Leaf holding only the most frequent class of `data` and its count.
        top_class = max(class_counts, key=class_counts.get)
        return TreeNode(results={top_class: class_counts[top_class]})

    # Stop 2: no feature left to split on.
    if len(features) == 0:
        return majority_leaf()

    # Pick the feature with the highest information gain.
    gain_by_feature = {
        name: information_gain(data, name, target) for name in features
    }
    best_feature = max(gain_by_feature, key=gain_by_feature.get)
    remaining_features = [name for name in features if name != best_feature]

    # Grow one subtree per distinct value of the chosen feature.
    node = TreeNode(feature=best_feature, branches={})
    split_column = data[best_feature]
    for value in split_column.unique():
        subset = data[split_column == value]
        if len(subset) > 0:
            node.branches[value] = id3(subset, remaining_features, target)
        else:
            # No row matched this value (possible for values that do not
            # compare equal to themselves, e.g. NaN): use the parent majority.
            node.branches[value] = majority_leaf()
    return node
def c45(data, features, target):
    """Recursively build a decision tree, choosing splits by gain ratio.

    Args:
        data: DataFrame with the candidate feature columns and ``target``.
        features: Names of the columns still available for splitting.
        target: Name of the class-label column.

    Returns:
        A TreeNode. Leaves carry ``results`` (the full class -> count
        distribution of their samples); internal nodes carry ``feature`` and
        a ``branches`` dict keyed by feature value.
    """

    def distribution_leaf():
        # Leaf holding the complete class distribution of `data`.
        return TreeNode(results=data[target].value_counts().to_dict())

    # Stopping rules (same as id3): a pure node, or no feature left.
    if len(np.unique(data[target])) == 1 or len(features) == 0:
        return distribution_leaf()

    # Pick the feature with the highest gain ratio.
    ratio_by_feature = {
        name: gain_ratio(data, name, target) for name in features
    }
    best_feature = max(ratio_by_feature, key=ratio_by_feature.get)
    remaining_features = [name for name in features if name != best_feature]

    # Grow one subtree per distinct value of the chosen feature.
    node = TreeNode(feature=best_feature, branches={})
    split_column = data[best_feature]
    for value in split_column.unique():
        subset = data[split_column == value]
        if len(subset) > 0:
            node.branches[value] = c45(subset, remaining_features, target)
        else:
            # No row matched this value (possible for values that do not
            # compare equal to themselves, e.g. NaN): use the parent distribution.
            node.branches[value] = distribution_leaf()
    return node
# Model training and evaluation
# Split: a reproducible 70% sample for training, every remaining row for testing.
train_data = df.sample(frac=0.7, random_state=42)
test_data = df[~df.index.isin(train_data.index)]
# Fit both trees on the same training split.
id3_tree, c45_tree = (
    build(train_data, features, target_col) for build in (id3, c45)
)
# Classification
def _leaf_class_counts(node):
    """Sum the class -> count mappings of every leaf at or below ``node``."""
    if node.results is not None:
        return dict(node.results)
    totals = {}
    for child in (node.branches or {}).values():
        for label, count in _leaf_class_counts(child).items():
            totals[label] = totals.get(label, 0) + count
    return totals


def classify(tree, sample):
    """Predict the class of ``sample`` by walking ``tree`` from the root.

    Args:
        tree: Root node; a node with ``results`` set is a leaf, otherwise it
            splits on ``feature`` and routes through ``branches``.
        sample: Mapping-like object (dict, pandas row) indexed by feature name.

    Returns:
        The most frequent class of the leaf that is reached. If the sample
        has a feature value with no matching branch, the most frequent class
        among the leaves beneath that node is returned; "UNKNOWN" only when
        no leaf exists there.
    """
    node = tree
    while node.results is None:
        value = sample[node.feature]
        if value not in node.branches:
            # Internal nodes carry no `results` of their own, so reading it
            # here would always yield "UNKNOWN". Aggregate the leaves below
            # to obtain a real majority-class fallback.
            counts = _leaf_class_counts(node)
            return max(counts, key=counts.get) if counts else "UNKNOWN"
        node = node.branches[value]
    return max(node.results, key=node.results.get)
# Evaluation
def evaluate(tree, test_data):
    """Return the fraction of rows in ``test_data`` that ``tree`` classifies correctly.

    Args:
        tree: Root node of a trained decision tree.
        test_data: DataFrame containing the feature columns and the
            module-level ``target_col`` column holding the true labels.

    Returns:
        Accuracy in [0, 1]; 0.0 when ``test_data`` has no rows (instead of
        raising ZeroDivisionError).
    """
    total = len(test_data)
    if total == 0:
        return 0.0
    correct = sum(
        1
        for _, row in test_data.iterrows()
        if classify(tree, row) == row[target_col]
    )
    return correct / total
# Report the test-set accuracy of each tree, to two decimals.
id3_accuracy = evaluate(id3_tree, test_data)
print(f"ID3 算法结果: {id3_accuracy:.2f}")
c45_accuracy = evaluate(c45_tree, test_data)
print(f"C4.5 算法结果: {c45_accuracy:.2f}")