java基础算法之赫夫曼压缩算法

发布时间:2024年01月21日

????????赫夫曼压缩算法属于可变字长编码(VLC)的一种,该算法的基本原理是将字符按权重生成赫夫曼树,特点是WPL(加权路径长度)最小,且保证权重大(出现次数多)的字符离根节点近。解压过程则是将压缩编码通过匹配赫夫曼编码表,得到原始数据。

package com.example.datastructures.huffmancode;

import java.io.*;
import java.util.*;

/**
 * @author maoyouhua
 * @version jdk21
 *
 *          赫夫曼压缩算法属于可变字长编码(VLC)的一种,
 *          该算法的基本原理是将字符按权重生成赫夫曼树,特点是WPL(加权路径长度)最小,且保证权重大(出现次数多)的字符离根节点近。
 *          解压过程则是将压缩编码通过匹配赫夫曼编码表,得到原始数据
 *
 */
public class HuffmanCode {

    /**
     *              接收字节数组转化为Node
     * @param bytes
     * @return
     */
    public static List<Node> getNodes(byte[] bytes){

        List<Node> nodes = new ArrayList<>();
        Map<Byte,Integer> map = new HashMap<>();
        for (byte item : bytes) {
            if (map.containsKey(item)) {
               map.put(item,map.get(item) + 1);
            } else {
                map.put(item,1);
            }
        }
        for (Map.Entry<Byte, Integer> entry : map.entrySet()) {
            nodes.add(new Node(entry.getKey(), entry.getValue()));
        }
        return nodes;
    }

    /**
     *          生成赫夫曼树
     * @param nodes
     * @return
     */
    public static Node createHuffmanTree(List<Node> nodes){
        while (nodes.size() > 1) {
            Collections.sort(nodes);
            Node leftNode = nodes.get(0);
            Node rightNode = nodes.get(1);
            Node parent = new Node(null,leftNode.getWeight() + rightNode.getWeight(), leftNode, rightNode);
            nodes.remove(leftNode);
            nodes.remove(rightNode);
            nodes.add(parent);
        }
        return nodes.get(0);
    }
    static Map<Byte, String> huffmanCodes = new HashMap<>();

    /**
     *              将传入的node节点的所有叶子节点的赫夫曼编码得到,并放入huffmanCodes集合
     * @param node      传入节点
     * @param code      路径 左边是0,右边是1 ,最后得到的code是二进制表示的路径(因为权值大离根结点近,所以路径短,故重复率高压缩效率明显)
     * @param stringBuilder     用于拼接路径
     */
    public static void getCodes(Node node, String code, StringBuilder stringBuilder){
        StringBuilder stringBuilder2 = new StringBuilder(stringBuilder);
        stringBuilder2.append(code);
        if (node != null) {
            //说明是非叶子节点
            if (node.getData() == null) {
                getCodes(node.getLeft(),"0",stringBuilder2);
                getCodes(node.getRight(),"1",stringBuilder2);
            } else {
               //说明是叶子节点
                huffmanCodes.put(node.getData(),stringBuilder2.toString());
            }
        }
    }
    public static Map<Byte, String> getCodes(Node root){
        if (root != null) {
            getCodes(root,"",new StringBuilder());
        } else {
            return null;
        }
        return huffmanCodes;
    }

    /**
     *
     * @param bytes         原始字符串对应的字节数组
     * @param huffmanCodes      赫夫曼编码
     * @return                  经过赫夫曼编码压缩后的数组
     */
    public static byte[] zip(byte[] bytes, Map<Byte, String> huffmanCodes){
        StringBuilder stringBuilder = new StringBuilder();
        for (byte item :bytes){
            stringBuilder.append(huffmanCodes.get(item));
        }
        //字节字符串转字节数组
        int len;
        if (stringBuilder.length() % 8 == 0) {
            len = stringBuilder.length() / 8;
        } else {
            len = stringBuilder.length() / 8 + 1;
        }

        byte[] huffmanCodesBytes = new byte[len];
        int index = 0;
        for (int i = 0; i < stringBuilder.length(); i += 8) {
            String strByte;
            if (i + 8 > stringBuilder.length()) {
                strByte = stringBuilder.substring(i);
            } else {
                strByte = stringBuilder.substring(i, i + 8);
            }
            huffmanCodesBytes[index] = (byte)Integer.parseInt(strByte,Character.MIN_RADIX);
            index++;
        }
        return huffmanCodesBytes;
    }

    /**
     *          赫夫曼压缩
     * @param bytes     字符串对应的字节数组
     * @return      赫夫曼压缩数组
     */
    public static byte[] huffmanZip(byte[] bytes){
        Node huffmanTree = createHuffmanTree(getNodes(bytes));
        Map<Byte, String> huffmanCodes = getCodes(huffmanTree);
        byte[] zipBytes = zip(bytes, huffmanCodes);
        return zipBytes;
    }

    /**
     *          将byte转成二进制的字符串
     * @param b
     * @return
     */
    public static String byteToBitString(boolean flag, byte b){
        int temp = b;
        if (flag) {
            temp |= 256;
        }
        String str = Integer.toBinaryString(temp);
        if (flag) {
            return str.substring(str.length() - 8);
        } else {
            return str;
        }
    }

    /**
     *
     * @param huffmanCodes      赫夫曼编码表
     * @param huffmanBytes      经赫夫曼压缩的字节数组
     * @return
     */
    public static byte[] decode(Map<Byte, String> huffmanCodes, byte[] huffmanBytes){
        StringBuilder stringBuilder = new StringBuilder();
        for (int i = 0; i < huffmanBytes.length; i++) {
            boolean flag = (i  == huffmanBytes.length - 1);
            stringBuilder.append(byteToBitString(!flag,huffmanBytes[i]));
        }
        Map<String, Byte> map = new HashMap<>();
        for (Map.Entry<Byte, String> entry : huffmanCodes.entrySet()) {
            map.put(entry.getValue(), entry.getKey());
        }

        List<Byte> list = new ArrayList<>();
        for (int i = 0; i < stringBuilder.length(); ) {
            int count = 1;
            boolean flag = true;
            Byte b = null;
            while (flag) {
                String key = stringBuilder.substring(i, i + count);
                b = map.get(key);
                if (b == null) {
                    count++;
                } else {
                  flag = false;
                }
            }
            list.add(b);
            i += count;
        }
        byte[] bytes = new byte[list.size()];
        for (int i = 0; i < bytes.length; i++) {
            bytes[i] =  list.get(i);
        }
        return bytes;
    }

    /**
     *              压缩文件
     * @param srcFile       源文件
     * @param destFile      目标文件
     */
    public static void zipFile(String srcFile, String destFile){

        try (FileInputStream is = new FileInputStream(srcFile); FileOutputStream os = new FileOutputStream(destFile);
             ObjectOutputStream oos = new ObjectOutputStream(os)
        ) {

            byte[] bytes = new byte[is.available()];
            is.read(bytes);
            byte[] huffmanBytes = huffmanZip(bytes);
            oos.writeObject(huffmanBytes);
            oos.writeObject(huffmanCodes);

        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     *                  解压文件
     * @param zipFile       待解压文件
     * @param destFile      目标文件
     */
    public static void unZipFile(String zipFile, String destFile){
        try (FileInputStream is = new FileInputStream(zipFile); ObjectInputStream ois = new ObjectInputStream(is);
             FileOutputStream os = new FileOutputStream(destFile)
        ) {
            byte[] huffmanBytes = (byte[]) ois.readObject();
            Map<Byte, String> huffmanCodes = (Map<Byte, String>) ois.readObject();
            byte[] decodeBytes = decode(huffmanCodes, huffmanBytes);
            os.write(decodeBytes);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }


    /**
     * 标准ASCII码也叫基础ASCII码,使用7位二进制数(剩下的1位二进制为0)来表示所有的大写和小写字母,数字0到9、标点符号,以及在美式英语中使用的特殊控制字符。
     * 其中,数字0~9、标点符号等是基础打印字符,共102个,可在键盘上直接输入;
     * 大写字母A~Z和小写字母a~z各有32个,代表其在电脑上的码值为65~90和97~122;还包括一些常用符号,如空格、回车、换行等。
     *
     *  String 底层为字节数组
     *
     * {32=01, 97=100, 100=11000, 117=11001, 101=1110, 118=11011, 105=101, 121=11010, 106=0010, 107=1111, 108=000, 111=0011}
     *
     * [-88, -65, -56, -65, -56, -65, -55, 77, -57, 6, -24, -14, -117, -4, -60, -90, 28]    17个字节
     *
     * @param args
     */
    public static void main(String[] args) {
//        String content = "i like like like java do you like a java";  // 40个字节
//        byte[] bytes = huffmanZip(content.getBytes());
//        System.out.println(Arrays.toString(bytes));
//        byte[] decodeBytes = decode(huffmanCodes, bytes);
//        System.out.println(new String(decodeBytes));
        zipFile("D:\\test\\aaa.txt","D:\\test\\aaa.zip");
        unZipFile("D:\\test\\aaa.zip","D:\\test\\bbb.txt");
    }
}
class Node implements Comparable<Node> {
    private Byte data;
    private Integer weight;
    private Node left;
    private Node right;

    public Node(byte data, int weight) {
        this.data = data;
        this.weight = weight;
    }
    public Node(Byte data, int weight, Node left, Node right) {
        this.data = data;
        this.weight = weight;
        this.left = left;
        this.right = right;
    }
    public void preOrder(){
        System.out.println(this);
        if (this.left != null) {
            this.left.preOrder();
        }
        if (this.right != null) {
            this.right.preOrder();
        }
    }

    public Node getLeft() {
        return left;
    }

    public Node getRight() {
        return right;
    }

    public Byte getData() {
        return data;
    }

    public int getWeight() {
        return weight;
    }

    @Override
    public int compareTo(Node o) {
        return this.weight - o.weight;
    }
    @Override
    public String toString() {
        return "Node{" +
                "data=" + data +
                ", weight=" + weight +
                '}';
    }
}
文章来源:https://blog.csdn.net/weixin_46375204/article/details/135728312
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。