// 计算文本信息熵 (Calculate the information entropy of a text file)

package Nlp;

import org.apache.commons.lang.StringUtils;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * @author ruyi.yury
 * @date 2018/6/11 0011 20:29
 */
public class CalculateEntropy {
    /**
     * Default input file path.
     */
    private final static String path = "H:\\weibo.txt";

    public static void main(String[] args) {
        calculate(path);
    }

    /**
     * Counts character frequencies in the file at {@code path}, prints each
     * character with its count, then prints the Shannon entropy (in bits) of
     * the character distribution and the elapsed time in seconds.
     * I/O errors are reported via a stack trace rather than propagated.
     *
     * @param path path of the text file to analyse (read with the platform
     *             default charset)
     */
    public static void calculate(String path) {
        long startTime = System.currentTimeMillis();
        try {
            // (character -> occurrence count) over the whole file
            Map<String, Long> counts = countCharacters(path);
            // Total character count; long avoids int overflow on large files.
            long total = counts.values().stream().mapToLong(Long::longValue).sum();
            counts.forEach((k, v) -> System.out.println(k + ":" + v));
            System.out.println("文本的信息熵为:" + entropyOf(counts, total));
        } catch (IOException e) {
            e.printStackTrace();
        }
        long useTime = System.currentTimeMillis() - startTime;
        System.out.println("耗时:" + useTime / 1000 + "秒");
    }

    /**
     * Reads the file and tallies how often each character occurs.
     * The line stream is closed via try-with-resources (the original leaked it).
     *
     * @param path file to read with the platform default charset
     * @return map from single-character string to its occurrence count
     * @throws IOException if the file cannot be read
     */
    static Map<String, Long> countCharacters(String path) throws IOException {
        try (Stream<String> lines = Files.lines(Paths.get(path), Charset.defaultCharset())) {
            return lines
                    .flatMap(line -> line.chars().mapToObj(c -> String.valueOf((char) c)))
                    .collect(Collectors.groupingBy(c -> c, Collectors.counting()));
        }
    }

    /**
     * Shannon entropy in bits: H = -sum(p * log2(p)).
     * Fixes the original's misplaced parenthesis, which computed
     * {@code log(p / log(2))} instead of {@code log(p) / log(2)}.
     *
     * @param counts character occurrence counts
     * @param total  total number of characters (must be positive when
     *               {@code counts} is non-empty)
     * @return entropy in bits; 0.0 for an empty map
     */
    static double entropyOf(Map<String, Long> counts, long total) {
        double entropy = 0.0;
        for (long v : counts.values()) {
            double p = (double) v / total;
            entropy -= p * (Math.log(p) / Math.log(2));
        }
        return entropy;
    }
}


// 你可能感兴趣的:(自然语言处理,自然语言处理,文本信息熵)