Collections.sort()
或Arrays.sort()
)排序。
/**
 * Splits {@code input} into sorted chunk files using the default chunk size
 * of 10,000,000 lines.
 *
 * @param input text file to split; read line by line
 * @return the sorted chunk files, in the order they were written
 * @throws IOException if reading the input or writing a chunk fails
 */
List<File> splitAndSort(File input) throws IOException {
    return splitAndSort(input, 10_000_000);
}

/**
 * Splits {@code input} into chunk files of at most {@code maxLinesPerChunk}
 * lines each. Every chunk is sorted and written to a temporary file by
 * {@code sortAndSave}.
 *
 * <p>If reading or writing fails part-way through, the chunk files that were
 * already written are deleted before the exception is rethrown, so a failed
 * run does not leave large temporary files behind until JVM exit.
 *
 * @param input            text file to split; read line by line
 * @param maxLinesPerChunk maximum number of lines held in memory and written
 *                         to a single chunk; must be positive
 * @return the sorted chunk files, in the order they were written; empty when
 *         the input contains no lines
 * @throws IllegalArgumentException if {@code maxLinesPerChunk} is not positive
 * @throws IOException              if reading the input or writing a chunk fails
 */
List<File> splitAndSort(File input, int maxLinesPerChunk) throws IOException {
    if (maxLinesPerChunk <= 0) {
        throw new IllegalArgumentException(
                "maxLinesPerChunk must be positive: " + maxLinesPerChunk);
    }
    List<File> chunks = new ArrayList<>();
    try (BufferedReader reader = new BufferedReader(new FileReader(input))) {
        List<String> buffer = new ArrayList<>();
        String line;
        while ((line = reader.readLine()) != null) {
            buffer.add(line);
            if (buffer.size() >= maxLinesPerChunk) { // chunk is full: flush it
                chunks.add(sortAndSave(buffer));
                buffer.clear(); // reuse the same list for the next chunk
            }
        }
        // Flush the trailing, partially filled chunk.
        if (!buffer.isEmpty()) {
            chunks.add(sortAndSave(buffer));
        }
    } catch (IOException | RuntimeException e) {
        // Best-effort cleanup; the result of delete() is deliberately ignored
        // because the original failure is the one worth reporting.
        for (File chunk : chunks) {
            chunk.delete();
        }
        throw e;
    }
    return chunks;
}
/**
 * Sorts {@code data} in place (natural {@code String} order) and writes it,
 * one element per line, to a newly created temporary file.
 *
 * <p>The temporary file is named with prefix {@code "chunk"} and suffix
 * {@code ".txt"} and is registered for deletion when the JVM exits.
 *
 * @param data lines to sort and persist; the list itself is reordered
 * @return the temporary file holding the sorted lines
 * @throws IOException if the temporary file cannot be created or written
 */
File sortAndSave(List<String> data) throws IOException {
    // In-memory sort of this chunk; mutates the caller's list.
    Collections.sort(data);

    File chunkFile = File.createTempFile("chunk", ".txt");
    chunkFile.deleteOnExit();

    try (BufferedWriter out = new BufferedWriter(new FileWriter(chunkFile))) {
        for (String entry : data) {
            out.write(entry);
            out.newLine();
        }
    }

    return chunkFile;
}
/**
 * Performs a k-way merge of the sorted {@code chunks} into {@code output}.
 *
 * <p>One reader is opened per chunk and the current line of each reader is
 * kept in a min-heap. The smallest line is repeatedly written out and replaced
 * by the next line from the same reader. Lines are ordered by their full text
 * (including any trailing digits); the trailing run of digits is stripped only
 * when the line is written.
 *
 * <p>All chunk readers are closed before this method returns, whether it
 * completes normally or throws.
 *
 * @param chunks sorted chunk files to merge
 * @param output destination file; created or overwritten
 * @throws IOException if a chunk cannot be read, the output cannot be written,
 *                     or a reader fails to close
 */
void mergeFiles(List<File> chunks, File output) throws IOException {
    PriorityQueue<BufferedLine> minHeap = new PriorityQueue<>();
    List<BufferedReader> readers = new ArrayList<>();
    // Compiled once instead of on every line; same regex as before.
    java.util.regex.Pattern trailingDigits = java.util.regex.Pattern.compile("\\d+$");

    Throwable pending = null;
    try {
        // Prime the heap with the first line of every chunk.
        for (File file : chunks) {
            BufferedReader reader = new BufferedReader(new FileReader(file));
            readers.add(reader);
            String line = reader.readLine();
            if (line != null) {
                minHeap.add(new BufferedLine(line, reader));
            }
        }

        try (BufferedWriter writer = new BufferedWriter(new FileWriter(output))) {
            while (!minHeap.isEmpty()) {
                BufferedLine min = minHeap.poll();
                // Strip the trailing sequence number before writing.
                writer.write(trailingDigits.matcher(min.line).replaceAll(""));
                writer.newLine();
                String nextLine = min.reader.readLine();
                if (nextLine != null) {
                    minHeap.add(new BufferedLine(nextLine, min.reader));
                }
            }
        }
    } catch (IOException | RuntimeException e) {
        pending = e;
        throw e;
    } finally {
        // Close every reader even if one close() fails, and never let a close
        // failure hide the exception that is already propagating.
        IOException closeError = null;
        for (BufferedReader reader : readers) {
            try {
                reader.close();
            } catch (IOException e) {
                if (pending != null) {
                    pending.addSuppressed(e);
                } else if (closeError == null) {
                    closeError = e;
                } else {
                    closeError.addSuppressed(e);
                }
            }
        }
        if (closeError != null) {
            throw closeError;
        }
    }
}
class BufferedLine implements Comparable<BufferedLine> {
String line;
BufferedReader reader;
public BufferedLine(String line, BufferedReader reader) {
this.line = line; this.reader = reader;
}
@Override
public int compareTo(BufferedLine o) {
return this.line.compareTo(o.line); // 按字符串排序
}
}
减少IO开销:使用 BufferedReader 和 BufferedWriter 加速读写。
动态分块:根据可用内存调整每个分块的行数(示例代码中固定为 10_000_000 行)。
归并路数控制:归并路数 k 需满足 k × 缓冲区大小 ≤ 1GB,例如 11 个文件时每个缓冲区需 ≤ 90MB。
资源清理:临时文件通过 File.deleteOnExit() 在 JVM 退出时自动删除。
此方案兼顾功能性(排序+数据清洗)与效率(O(n log n)时间复杂度),是Java面试中考察分布式处理能力的典型解法。