When I was just starting to learn Java, I knew nothing about data structures and algorithms, and I kept running into blog posts titled things like "Master XXX in one step, hand-rolled...". They sounded impressive.
Now I have walked the hand-rolling road myself, with Huffman encoding and decoding, and it turns out they are all paper tigers.
For the concrete flow, please read the code directly; I have added detailed comments wherever something might be confusing.
Note: if you still have questions after reading, leave a comment below and I will reply as soon as I see it. The code can also be copied and used as-is.
/**
 * Huffman encoding
 * 1. Count how often each distinct byte occurs and build the Huffman tree nodes
 * 2. Build the Huffman tree
 * 3. Build the Huffman code table
 * 4. Compress the input bytes according to the code table
 * @param srcBytes the original, uncompressed bytes
 * @return the compressed bytes
 */
public static byte[] haFuManEncode(byte[] srcBytes) {
    List<TreeNode> treeNodes = countBytes(srcBytes);
    TreeNode tree = createHaFuManTree(treeNodes);
    Map<Byte, String> encodeMap = createEncodeMap(tree);
    byte[] zipBytes = zip(srcBytes, encodeMap);
    return zipBytes;
}
private static byte[] zip(byte[] srcBytes, Map<Byte, String> encodeMap2) {
    StringBuilder sb = new StringBuilder();
    for (byte b : srcBytes) {
        sb.append(encodeMap2.get(b));
    }
    //System.out.println(sb.toString());
    //pack the bits in groups of eight into the new, compressed array;
    //one extra byte is reserved, so when the bit string is an exact multiple of eight
    //the last byte stays zero and is ignored again at the end of decoding
    int len = sb.length() / 8 + 1;
    byte[] zipBytes = new byte[len];
    int t = 0;
    for (int i = 0; i < sb.length(); i += 8) {
        String binaryString;
        if (i + 8 > sb.length()) {
            binaryString = sb.substring(i);
        } else {
            binaryString = sb.substring(i, i + 8);
        }
        //parse the eight-bit chunk (shorter for the tail) as a binary number
        int b = Integer.valueOf(binaryString, 2);
        zipBytes[t++] = (byte) b;
    }
    return zipBytes;
}
//the code table: byte -> bit string
static Map<Byte, String> encodeMap = new HashMap<Byte, String>();
static StringBuilder lastBuilder = new StringBuilder();
private static Map<Byte, String> createEncodeMap(TreeNode tree) {
    if (tree == null) return null;
    //paths through the left subtree are marked with "0"
    getRoute(tree.leftNode, "0", lastBuilder);
    //paths through the right subtree are marked with "1"
    getRoute(tree.rigNode, "1", lastBuilder);
    return encodeMap;
}
private static void getRoute(TreeNode node, String code, StringBuilder last) {
    //copy the path accumulated so far and append the current branch
    StringBuilder sb = new StringBuilder(last);
    sb.append(code);
    if (node.data == null) {
        //internal node: keep walking down
        getRoute(node.leftNode, "0", sb);
        getRoute(node.rigNode, "1", sb);
    } else {
        //leaf node: the accumulated path is this byte's code
        encodeMap.put(node.data, sb.toString());
    }
}
private static TreeNode createHaFuManTree(List<TreeNode> treeNodes) {
    if (treeNodes == null) return null;
    // treeNodes.forEach(System.out::println);
    while (treeNodes.size() > 1) {
        //always merge the two nodes with the smallest weights
        Collections.sort(treeNodes);
        TreeNode left = treeNodes.get(0);
        TreeNode right = treeNodes.get(1);
        TreeNode newNode = new TreeNode(null, left.value + right.value);
        newNode.leftNode = left;
        newNode.rigNode = right;
        treeNodes.remove(left);
        treeNodes.remove(right);
        treeNodes.add(newNode);
    }
    return treeNodes.get(0);
}
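//A side note on the construction above: re-sorting the whole list on every merge works,
//but a java.util.PriorityQueue hands out the two smallest nodes directly. The sketch
//below is just one possible alternative (hypothetical method name; needs
//"import java.util.PriorityQueue;"):
private static TreeNode createHaFuManTreeWithHeap(List<TreeNode> treeNodes) {
    if (treeNodes == null || treeNodes.isEmpty()) return null;
    //TreeNode implements Comparable, so the heap orders nodes by weight
    PriorityQueue<TreeNode> heap = new PriorityQueue<TreeNode>(treeNodes);
    while (heap.size() > 1) {
        TreeNode left = heap.poll();   //smallest weight
        TreeNode right = heap.poll();  //second smallest weight
        TreeNode parent = new TreeNode(null, left.value + right.value);
        parent.leftNode = left;
        parent.rigNode = right;
        heap.offer(parent);
    }
    return heap.poll();
}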
private static List<TreeNode> countBytes(byte[] srcBytes) {
    //count how many times each distinct byte occurs
    Map<Byte, Integer> countMap = new HashMap<Byte, Integer>();
    for (byte b : srcBytes) {
        Integer value = countMap.get(b);
        countMap.put(b, value == null ? 1 : value + 1);
    }
    //turn every (byte, count) pair into a leaf node
    List<TreeNode> list = new ArrayList<TreeNode>();
    for (Map.Entry<Byte, Integer> entry : countMap.entrySet()) {
        list.add(new TreeNode(entry.getKey(), entry.getValue()));
    }
    return list;
}
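//Just a design note on the counting loop above: on Java 8+ the null check can be
//collapsed into a single call with the same behaviour (not used in this listing):
//    countMap.merge(b, 1, Integer::sum);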
/**
 * Huffman decoding
 * 1. Swap the keys and values of the code table to build a decoding table
 * 2. Turn the compressed bytes back into one continuous binary string
 * 3. Pad every byte back to eight bits except the last one, which may hold fewer bits (controlled by a flag)
 * 4. Walk the binary string and translate it back into bytes via the decoding table (growing the window one bit at a time)
 * @param zipBytes the compressed bytes
 * @param encodeMap the code table produced during encoding
 * @return the decompressed bytes
 */
public static byte[] haFuManDeCode(byte[] zipBytes, Map<Byte, String> encodeMap) {
    //invert the code table: bit string -> byte
    Map<String, Byte> decodeMap = new HashMap<String, Byte>();
    for (Map.Entry<Byte, String> entry : encodeMap.entrySet()) {
        decodeMap.put(entry.getValue(), entry.getKey());
    }
    //rebuild the continuous binary string; only the last byte is left unpadded
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < zipBytes.length; i++) {
        byte b = zipBytes[i];
        boolean flag = (i == zipBytes.length - 1);
        sb.append(convertToBinaryBit(!flag, b));
    }
    //System.out.println(sb.toString());
    //we do not know the decompressed size in advance, so collect into a list first
    List<Byte> list = new ArrayList<Byte>();
    for (int i = 0; i < sb.length();) {
        //how many bits the current code occupies
        int step = 1;
        Byte value = null;
        while (true) {
            String binaryString;
            if (step + i > sb.length())
                binaryString = sb.substring(i);
            else
                binaryString = sb.substring(i, i + step);
            //look up the byte for the current bit pattern
            value = decodeMap.get(binaryString);
            if (step + i < sb.length() && value == null)
                step++;
            else
                break;
        }
        //a window that only ends by reaching the very end of the string is treated as trailing padding and skipped
        if (i + step < sb.length()) {
            list.add(value);
        }
        //advance i by the number of bits just consumed
        i += step;
    }
    byte[] unzipBytes = new byte[list.size()];
    for (int i = 0; i < list.size(); i++) {
        unzipBytes[i] = list.get(i);
    }
    return unzipBytes;
}
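//Why the growing window in step 4 is safe: Huffman codes are prefix-free, no code is a
//prefix of another, so the decoder can emit a byte as soon as the window matches an
//entry of decodeMap. With a made-up table {"0" -> a, "10" -> b, "11" -> c}, the bit
//string "0100011" is consumed as "0" | "10" | "0" | "0" | "11", i.e. a, b, a, a, c.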
/**
 * Convert one byte into its binary string representation
 * @param flag true: pad the result to exactly eight bits; false: leave it unpadded (used for the last byte)
 * @param b2 the byte to convert
 */
private static String convertToBinaryBit(boolean flag, byte b2) {
    int temp = b2;
    if (flag)
        //set bit 8 so that Integer.toBinaryString keeps the leading zeros of the low eight bits
        temp |= 256;
    String binaryString = Integer.toBinaryString(temp);
    if (flag)
        //keep only the low eight bits, i.e. the original byte
        return binaryString.substring(binaryString.length() - 8);
    return binaryString;
}
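A quick illustration of the `|= 256` trick above, in case it looks magical. These statements can be dropped into any main method; the values in the comments are what I expect Integer.toBinaryString to print:
byte positive = 5;
//without the trick the leading zeros disappear: toBinaryString(5) is just "101"
System.out.println(Integer.toBinaryString(positive));                  // 101
//setting bit 8 forces at least nine binary digits, and the low eight are the byte's bits
String padded = Integer.toBinaryString(positive | 256);
System.out.println(padded.substring(padded.length() - 8));             // 00000101
//negative bytes are sign-extended to 32 bits, so only the low eight bits matter
String negative = Integer.toBinaryString((byte) -3 | 256);
System.out.println(negative.substring(negative.length() - 8));         // 11111101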
package org.demo;
public class TreeNode implements Comparable<TreeNode> {
    Byte data;  //the byte this leaf represents; null for internal nodes
    int value;  //weight, i.e. how often the byte occurs
    TreeNode leftNode;
    TreeNode rigNode;
    public TreeNode(Byte data, int value) {
        this.data = data;
        this.value = value;
    }
    @Override
    public int compareTo(TreeNode o) {
        return this.value - o.value;
    }
    @Override
    public String toString() {
        return data + "-" + value;
    }
}
public static void main(String[] args) {
    String msg = "hello, hi hello";
    byte[] bytes = msg.getBytes();
    byte[] zip = haFuManEncode(bytes);
    byte[] unzip = haFuManDeCode(zip, encodeMap);
    System.out.println("Length before compression: " + bytes.length);
    System.out.println("Length after compression: " + zip.length);
    System.out.println("Decompressed data: " + new String(unzip));
}
Test output:
Length before compression: 15
Length after compression: 6
Decompressed data: hello, hi hello
Most readers probably already know what a Huffman tree is, but for data compression its use cases are actually fairly limited. You can feel this while building the tree and the code table: Huffman coding pays off most when the data contains a lot of repetition; if the repetition rate is low, or every byte is different, the compression ratio drops off considerably.
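To make that concrete, here is a small experiment sketched on top of the haFuManEncode method above (the method name compareCompression is made up; the static encodeMap is shared between the two calls, so this only compares compressed lengths and does not try to decode). On my reading of the code, the repetitive input should shrink to just a few bytes, while the input made of sixteen distinct bytes stays at more than half its original size, and that is before accounting for the code table itself, which needs one entry per distinct byte.
public static void compareCompression() {
    //mostly one repeated value: its one-bit code is reused over and over
    byte[] repetitive = "aaaaaaaaaaaaaaab".getBytes();
    //sixteen distinct bytes, each appearing once: every code ends up four bits long
    byte[] diverse = "abcdefghijklmnop".getBytes();
    System.out.println("repetitive: " + repetitive.length + " -> " + haFuManEncode(repetitive).length + " bytes");
    System.out.println("diverse:    " + diverse.length + " -> " + haFuManEncode(diverse).length + " bytes");
}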