统计一篇文章中出现次数最多的前 k 个单词

import com.google.common.collect.Lists;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import com.google.common.io.LineProcessor;
import com.google.common.util.concurrent.AtomicLongMap;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.lang3.StringUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.regex.Pattern;

import static com.google.common.base.Charsets.UTF_8;

/**
 * Reports the {@code k} most frequent words of a UTF-8 text file.
 *
 * <p>The work is done in three steps: the file is split into words, the occurrences of each
 * word are counted, and a bounded min-heap selects the {@code k} words with the highest counts.
 * Words are compared exactly as written (case-sensitive).
 */
public class TopKWord {
    /** Input file used when no path is given on the command line. */
    private static final String DEFAULT_SOURCE_PATH = "D:\\CodeBetter\\src\\main\\resources\\io\\sourceA";

    /** Number of words reported when no count is given on the command line. */
    private static final int DEFAULT_K = 10;

    /** Word separator: one or more whitespace characters (spaces, tabs, ...). */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Prints the most frequent words of a file to standard output, most frequent first.
     *
     * @param args optional arguments: {@code args[0]} is the path of the file to analyse
     *             (defaults to {@link #DEFAULT_SOURCE_PATH}), {@code args[1]} is the number of
     *             words to report (defaults to {@link #DEFAULT_K})
     * @throws IOException           if the file cannot be read
     * @throws NumberFormatException if {@code args[1]} is present but is not an integer
     */
    public static void main(String[] args) throws IOException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_SOURCE_PATH;
        int k = (args != null && args.length > 1) ? Integer.parseInt(args[1]) : DEFAULT_K;

        // 1. Split the file into a list of words.
        List<String> wordList = getWordListBySplitPaper(new File(path));

        // 2. Count how many times each word occurs.
        Map<String, Long> wordAndCountMap = getWordAndCountMap(wordList);

        // 3. Pick the k words with the highest counts.
        Queue<String> wordQ = topKWord(k, wordAndCountMap);
        System.out.println(wordQ);
    }

    /**
     * Reads a UTF-8 text file and splits it into words.
     *
     * <p>Each line is split on whitespace. Trailing characters that are neither letters nor
     * digits (punctuation, quotes, ...) are removed from every token, and tokens that end up
     * empty are dropped, so blank lines, repeated spaces and stand-alone punctuation never
     * produce a word.
     *
     * @param file the file to read; {@code null} yields an empty list
     * @return the words in reading order (mutable, never {@code null})
     * @throws IOException if the file cannot be opened or read
     */
    private static List<String> getWordListBySplitPaper(File file) throws IOException {
        List<String> words = new ArrayList<>();
        if (file == null) {
            return words;
        }

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                for (String token : WHITESPACE.split(line)) {
                    String word = stripTrailingNonAlphanumerics(token);
                    // Leading whitespace and punctuation-only tokens leave an empty string here.
                    if (!word.isEmpty()) {
                        words.add(word);
                    }
                }
            }
        }
        return words;
    }

    /** Returns {@code token} without its trailing non-letter, non-digit characters. */
    private static String stripTrailingNonAlphanumerics(String token) {
        int end = token.length();
        while (end > 0 && !Character.isLetterOrDigit(token.charAt(end - 1))) {
            end--;
        }
        return token.substring(0, end);
    }

    /**
     * Counts the occurrences of every distinct word.
     *
     * @param wordList the words to count; {@code null} is treated as an empty list
     * @return a mutable map from each distinct word to its number of occurrences
     */
    private static Map<String, Long> getWordAndCountMap(List<String> wordList) {
        Map<String, Long> counts = new HashMap<>();
        if (wordList == null) {
            return counts;
        }
        for (String word : wordList) {
            counts.merge(word, 1L, Long::sum);
        }
        return counts;
    }

    /**
     * Selects the {@code k} words with the highest counts.
     *
     * <p>A min-heap of at most {@code k} (word, count) entries is maintained while scanning the
     * map: the heap root is always the weakest entry kept so far and is evicted whenever a
     * stronger candidate shows up. Word and count travel together in one entry, so the evicted
     * word is always the one that owns the smallest count. Ties on the count are resolved in
     * favour of the lexicographically smaller word, which makes the result deterministic.
     *
     * @param k               the maximum number of words to return
     * @param wordAndCountMap the occurrences per word; keys and values must not be {@code null}
     * @return the selected words ordered from most to least frequent; empty when {@code k <= 0}
     *         or the map is {@code null} or empty
     */
    private static Queue<String> topKWord(int k, Map<String, Long> wordAndCountMap) {
        Deque<String> mostFrequentFirst = new ArrayDeque<>();
        if (k <= 0 || wordAndCountMap == null || wordAndCountMap.isEmpty()) {
            return mostFrequentFirst;
        }

        // Lowest count first; among equal counts the lexicographically larger word is weaker.
        Comparator<Map.Entry<String, Long>> weakestFirst = (left, right) -> {
            int byCount = Long.compare(left.getValue(), right.getValue());
            return byCount != 0 ? byCount : right.getKey().compareTo(left.getKey());
        };

        // Never pre-allocate more slots than there are distinct words, however large k is.
        int capacity = Math.min(k, wordAndCountMap.size());
        PriorityQueue<Map.Entry<String, Long>> heap = new PriorityQueue<>(capacity, weakestFirst);
        for (Map.Entry<String, Long> candidate : wordAndCountMap.entrySet()) {
            if (heap.size() < k) {
                // Copy the entry: some Map implementations reuse or invalidate entry objects.
                heap.add(new AbstractMap.SimpleImmutableEntry<>(candidate));
            } else if (weakestFirst.compare(heap.peek(), candidate) < 0) {
                heap.poll();
                heap.add(new AbstractMap.SimpleImmutableEntry<>(candidate));
            }
        }

        // The heap yields the weakest entry first, so prepend to end up strongest-first.
        while (!heap.isEmpty()) {
            mostFrequentFirst.addFirst(heap.poll().getKey());
        }
        return mostFrequentFirst;
    }
}

你可能感兴趣的:(guava,java)