日撸java 三百行（day 17 ）哈夫曼编码（01定义+文件预处理）

哈夫曼编码是哈夫曼树的应用之一

编码是计算机很重要的步骤，编码分为等长编码与不等长编码。

等长编码顾名思义，对于不同的编码对象所对应的长度相同，优点就是简单暴力，众生平等，缺点也很明显我们所有的编码长度都是以最长的编码为基准，比较浪费

不等长编码就得需要精心设计，哈夫曼编码就属于不等长编码。

哈夫曼编码是一种可以被唯一解读的二进制编码
频率低的采用短编码，频率高的采用长编码

今天主要是节点的定义、文件读取与节点数组的确定（预处理）

package com.day17;

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.stream.Collectors;


public class MyHuffman {

    
    class HuffmanNode {
        
        char character;

        
        int weight;

        
        HuffmanNode leftChild;

        
        HuffmanNode rightChild;

        
        HuffmanNode parent;

        
        public HuffmanNode(char paraCharacter, int paraWeight, HuffmanNode paraLeftChild,
                           HuffmanNode paraRightChild, HuffmanNode paraParent) {
            character = paraCharacter;
            weight = paraWeight;
            leftChild = paraLeftChild;
            rightChild = paraRightChild;
            parent = paraParent;
        }// Of HuffmanNode

        
        public String toString() {
            String resultString = "(" + character + ", " + weight + ")";

            return resultString;
        }// Of toString

    }// Of class HuffmanNode

    
    public static final int NUM_CHARS = 256;

    
    String inputText;

    
    int alphabetLength;

    
    char[] alphabet;

    
    int[] charCounts;

    
    int[] charMapping;

    
    String[] huffmanCodes;

    
    HuffmanNode[] nodes;

    
    public MyHuffman(String paraFilename) {
        charMapping = new int[NUM_CHARS];

        readText(paraFilename);
    }// Of the first constructor

    
    public void readText(String paraFilename) {
        try {
            inputText = Files.newBufferedReader(Paths.get(paraFilename), StandardCharsets.UTF_8)
                    .lines().collect(Collectors.joining("n"));
        } catch (Exception ee) {
            System.out.println(ee);
            System.exit(0);
        } // Of try

        System.out.println("The text is:rn" + inputText);
    }// Of readText

    
    public void constructAlphabet() {
        // Initialize.
        Arrays.fill(charMapping, -1);

        // The count for each char. At most NUM_CHARS chars.
        int[] tempCharCounts = new int[NUM_CHARS];

        // The index of the char in the ASCII charset.
        int tempCharIndex;

        // Step 1. Scan the string to obtain the counts.
        char tempChar;
        for (int i = 0; i < inputText.length(); i++) {
            tempChar = inputText.charAt(i);
            tempCharIndex = (int) tempChar;

            System.out.print("" + tempCharIndex + " ");

            tempCharCounts[tempCharIndex]++;
        } // Of for i

        // Step 2. Scan to determine the size of the alphabet.
        alphabetLength = 0;
        for (int i = 0; i < 255; i++) {
            if (tempCharCounts[i] > 0) {
                alphabetLength++;
            } // Of if
        } // Of for i

        // Step 3. Compress to the alphabet
        alphabet = new char[alphabetLength];
        charCounts = new int[2 * alphabetLength - 1];

        int tempCounter = 0;
        for (int i = 0; i < NUM_CHARS; i++) {
            if (tempCharCounts[i] > 0) {
                alphabet[tempCounter] = (char) i;
                charCounts[tempCounter] = tempCharCounts[i];
                charMapping[i] = tempCounter;
                tempCounter++;
            } // Of if
        } // Of for i

        System.out.println("The alphabet is: " + Arrays.toString(alphabet));
        System.out.println("Their counts are: " + Arrays.toString(charCounts));
        System.out.println("The char mappings are: " + Arrays.toString(charMapping));
    }// Of constructAlphabet



    public static void main(String[] args) {
        MyHuffman tempHuffman = new MyHuffman("C:/Users/胡来的魔术师/Desktop/test.txt");
        tempHuffman.constructAlphabet();
    }

}

这里属性有点多，慢慢理：

首先是内部类代表节点，节点有5个属性分别是：节点值、对应权重（该节点值出现的频率）、左右孩子指针、父指针（构建哈夫曼树用）
需要读取本地的文本，将每个字符的ASCLL码做哈希映射，这样相同的字符就会被映射到同一个位置，对该位置进行计数即可获取每个字符出现的频率，同时也能得到节点数组的长度（不重复的字符的个数）

哈夫曼编码这个地方以前一直停留在手算阶段，今天也是第一次上代码，说实话不是想象中那么简单，今天主要是处理读取的文本，中间转换过度的数组比较多，也是费了一定时间才能理清每个数组是干嘛的以及为什么需要这个数组，这就导致我默写的时候总是漏东西，属性、参数过多也是一种折磨，这就必须花时间理解之后才能写出来。不过今天把这些折磨人的准备工作干了，明天就可以直接上算法。

今天的运行截图：

日撸java 三百行 （day 17 ）哈夫曼编码 （01定义+文件预处理）

Java相关栏目本月热门文章

日撸java 三百行（day 17 ）哈夫曼编码（01定义+文件预处理）