K-Means文档聚类 - 关键代码详解

1. 数据加载与预处理
import net.sf.javaml.core.Dataset;
import net.sf.javaml.core.DefaultDataset;
import net.sf.javaml.core.DenseInstance;

/**
 * Builds a Java-ML {@link Dataset} from a TF-IDF matrix.
 *
 * <p>Each row of the matrix becomes one {@link DenseInstance}, added in row order. Every
 * instance is labelled {@code "Document_<n>"} (1-based row number) through its class value,
 * so an instance can be traced back to its source document via {@code classValue()}.
 *
 * @param tfidfVectors TF-IDF matrix with one row per document; must not be {@code null}
 * @return a new dataset holding one instance per row of {@code tfidfVectors}
 * @throws NullPointerException if {@code tfidfVectors} is {@code null}
 */
public static Dataset loadData(double[][] tfidfVectors) {
    if (tfidfVectors == null) {
        throw new NullPointerException("tfidfVectors must not be null");
    }
    Dataset dataset = new DefaultDataset();
    for (int docId = 0; docId < tfidfVectors.length; docId++) {
        // Java-ML's Instance offers no setID(String); its getID() is a read-only,
        // auto-assigned number. The tracking label is therefore stored as the class
        // value, which is supplied directly to the DenseInstance constructor.
        String label = "Document_" + (docId + 1);
        dataset.add(new DenseInstance(tfidfVectors[docId], label));
    }
    return dataset;
}
2. 自定义余弦相似度实现
import net.sf.javaml.distance.DistanceMeasure;

/**
 * 自定义余弦距离度量(替代默认的欧氏距离)
 * 注:Java-ML中距离值越小表示越相似,因此measure()返回的是 1 - 余弦相似度,而不是余弦相似度本身
 */
public class CosineSimilarity implements DistanceMeasure {
   
    @Override
    public double measure(Instance a, Instance b) {
   
        double dotProduct = 0.0;
        double normA = 0.0;
        double normB = 0.0;
        
        // 计算点积和模长
        for (int i = 0; i < a.noAttributes(); i++) {
   
            dotProduct += a.get(i) * b.get(i);
            normA += Math.pow(a.get(

你可能感兴趣的:(kmeans,聚类,算法)