????????赫夫曼压缩算法属于可变字长编码(VLC)的一种,该算法的基本原理是将字符按权重生成赫夫曼树,特点是WPL(加权路径长度)最小,且保证权重大(出现次数多)的字符离根节点近。解压过程则是将压缩编码通过匹配赫夫曼编码表,得到原始数据。
package com.example.datastructures.huffmancode;
import java.io.*;
import java.util.*;
/**
* @author maoyouhua
* @version jdk21
*
* 赫夫曼压缩算法属于可变字长编码(VLC)的一种,
* 该算法的基本原理是将字符按权重生成赫夫曼树,特点是WPL(加权路径长度)最小,且保证权重大(出现次数多)的字符离根节点近。
* 解压过程则是将压缩编码通过匹配赫夫曼编码表,得到原始数据
*
*/
public class HuffmanCode {
/**
* 接收字节数组转化为Node
* @param bytes
* @return
*/
public static List<Node> getNodes(byte[] bytes){
List<Node> nodes = new ArrayList<>();
Map<Byte,Integer> map = new HashMap<>();
for (byte item : bytes) {
if (map.containsKey(item)) {
map.put(item,map.get(item) + 1);
} else {
map.put(item,1);
}
}
for (Map.Entry<Byte, Integer> entry : map.entrySet()) {
nodes.add(new Node(entry.getKey(), entry.getValue()));
}
return nodes;
}
/**
* 生成赫夫曼树
* @param nodes
* @return
*/
public static Node createHuffmanTree(List<Node> nodes){
while (nodes.size() > 1) {
Collections.sort(nodes);
Node leftNode = nodes.get(0);
Node rightNode = nodes.get(1);
Node parent = new Node(null,leftNode.getWeight() + rightNode.getWeight(), leftNode, rightNode);
nodes.remove(leftNode);
nodes.remove(rightNode);
nodes.add(parent);
}
return nodes.get(0);
}
static Map<Byte, String> huffmanCodes = new HashMap<>();
/**
* 将传入的node节点的所有叶子节点的赫夫曼编码得到,并放入huffmanCodes集合
* @param node 传入节点
* @param code 路径 左边是0,右边是1 ,最后得到的code是二进制表示的路径(因为权值大离根结点近,所以路径短,故重复率高压缩效率明显)
* @param stringBuilder 用于拼接路径
*/
public static void getCodes(Node node, String code, StringBuilder stringBuilder){
StringBuilder stringBuilder2 = new StringBuilder(stringBuilder);
stringBuilder2.append(code);
if (node != null) {
//说明是非叶子节点
if (node.getData() == null) {
getCodes(node.getLeft(),"0",stringBuilder2);
getCodes(node.getRight(),"1",stringBuilder2);
} else {
//说明是叶子节点
huffmanCodes.put(node.getData(),stringBuilder2.toString());
}
}
}
public static Map<Byte, String> getCodes(Node root){
if (root != null) {
getCodes(root,"",new StringBuilder());
} else {
return null;
}
return huffmanCodes;
}
/**
*
* @param bytes 原始字符串对应的字节数组
* @param huffmanCodes 赫夫曼编码
* @return 经过赫夫曼编码压缩后的数组
*/
public static byte[] zip(byte[] bytes, Map<Byte, String> huffmanCodes){
StringBuilder stringBuilder = new StringBuilder();
for (byte item :bytes){
stringBuilder.append(huffmanCodes.get(item));
}
//字节字符串转字节数组
int len;
if (stringBuilder.length() % 8 == 0) {
len = stringBuilder.length() / 8;
} else {
len = stringBuilder.length() / 8 + 1;
}
byte[] huffmanCodesBytes = new byte[len];
int index = 0;
for (int i = 0; i < stringBuilder.length(); i += 8) {
String strByte;
if (i + 8 > stringBuilder.length()) {
strByte = stringBuilder.substring(i);
} else {
strByte = stringBuilder.substring(i, i + 8);
}
huffmanCodesBytes[index] = (byte)Integer.parseInt(strByte,Character.MIN_RADIX);
index++;
}
return huffmanCodesBytes;
}
/**
* 赫夫曼压缩
* @param bytes 字符串对应的字节数组
* @return 赫夫曼压缩数组
*/
public static byte[] huffmanZip(byte[] bytes){
Node huffmanTree = createHuffmanTree(getNodes(bytes));
Map<Byte, String> huffmanCodes = getCodes(huffmanTree);
byte[] zipBytes = zip(bytes, huffmanCodes);
return zipBytes;
}
/**
* 将byte转成二进制的字符串
* @param b
* @return
*/
public static String byteToBitString(boolean flag, byte b){
int temp = b;
if (flag) {
temp |= 256;
}
String str = Integer.toBinaryString(temp);
if (flag) {
return str.substring(str.length() - 8);
} else {
return str;
}
}
/**
*
* @param huffmanCodes 赫夫曼编码表
* @param huffmanBytes 经赫夫曼压缩的字节数组
* @return
*/
public static byte[] decode(Map<Byte, String> huffmanCodes, byte[] huffmanBytes){
StringBuilder stringBuilder = new StringBuilder();
for (int i = 0; i < huffmanBytes.length; i++) {
boolean flag = (i == huffmanBytes.length - 1);
stringBuilder.append(byteToBitString(!flag,huffmanBytes[i]));
}
Map<String, Byte> map = new HashMap<>();
for (Map.Entry<Byte, String> entry : huffmanCodes.entrySet()) {
map.put(entry.getValue(), entry.getKey());
}
List<Byte> list = new ArrayList<>();
for (int i = 0; i < stringBuilder.length(); ) {
int count = 1;
boolean flag = true;
Byte b = null;
while (flag) {
String key = stringBuilder.substring(i, i + count);
b = map.get(key);
if (b == null) {
count++;
} else {
flag = false;
}
}
list.add(b);
i += count;
}
byte[] bytes = new byte[list.size()];
for (int i = 0; i < bytes.length; i++) {
bytes[i] = list.get(i);
}
return bytes;
}
/**
* 压缩文件
* @param srcFile 源文件
* @param destFile 目标文件
*/
public static void zipFile(String srcFile, String destFile){
try (FileInputStream is = new FileInputStream(srcFile); FileOutputStream os = new FileOutputStream(destFile);
ObjectOutputStream oos = new ObjectOutputStream(os)
) {
byte[] bytes = new byte[is.available()];
is.read(bytes);
byte[] huffmanBytes = huffmanZip(bytes);
oos.writeObject(huffmanBytes);
oos.writeObject(huffmanCodes);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* 解压文件
* @param zipFile 待解压文件
* @param destFile 目标文件
*/
public static void unZipFile(String zipFile, String destFile){
try (FileInputStream is = new FileInputStream(zipFile); ObjectInputStream ois = new ObjectInputStream(is);
FileOutputStream os = new FileOutputStream(destFile)
) {
byte[] huffmanBytes = (byte[]) ois.readObject();
Map<Byte, String> huffmanCodes = (Map<Byte, String>) ois.readObject();
byte[] decodeBytes = decode(huffmanCodes, huffmanBytes);
os.write(decodeBytes);
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/**
* 标准ASCII码也叫基础ASCII码,使用7位二进制数(剩下的1位二进制为0)来表示所有的大写和小写字母,数字0到9、标点符号,以及在美式英语中使用的特殊控制字符。
* 其中,数字0~9、标点符号等是基础打印字符,共102个,可在键盘上直接输入;
* 大写字母A~Z和小写字母a~z各有32个,代表其在电脑上的码值为65~90和97~122;还包括一些常用符号,如空格、回车、换行等。
*
* String 底层为字节数组
*
* {32=01, 97=100, 100=11000, 117=11001, 101=1110, 118=11011, 105=101, 121=11010, 106=0010, 107=1111, 108=000, 111=0011}
*
* [-88, -65, -56, -65, -56, -65, -55, 77, -57, 6, -24, -14, -117, -4, -60, -90, 28] 17个字节
*
* @param args
*/
public static void main(String[] args) {
// String content = "i like like like java do you like a java"; // 40个字节
// byte[] bytes = huffmanZip(content.getBytes());
// System.out.println(Arrays.toString(bytes));
// byte[] decodeBytes = decode(huffmanCodes, bytes);
// System.out.println(new String(decodeBytes));
zipFile("D:\\test\\aaa.txt","D:\\test\\aaa.zip");
unZipFile("D:\\test\\aaa.zip","D:\\test\\bbb.txt");
}
}
class Node implements Comparable<Node> {
private Byte data;
private Integer weight;
private Node left;
private Node right;
public Node(byte data, int weight) {
this.data = data;
this.weight = weight;
}
public Node(Byte data, int weight, Node left, Node right) {
this.data = data;
this.weight = weight;
this.left = left;
this.right = right;
}
public void preOrder(){
System.out.println(this);
if (this.left != null) {
this.left.preOrder();
}
if (this.right != null) {
this.right.preOrder();
}
}
public Node getLeft() {
return left;
}
public Node getRight() {
return right;
}
public Byte getData() {
return data;
}
public int getWeight() {
return weight;
}
@Override
public int compareTo(Node o) {
return this.weight - o.weight;
}
@Override
public String toString() {
return "Node{" +
"data=" + data +
", weight=" + weight +
'}';
}
}