哈夫曼编码(文件编码与解码)

前言
读取text文件
计算各字符的权重
编码
- 建立编码表
- 全文编码
译码
主函数例程

前言

本文主要介绍对text文件中的大量英文字母进行哈夫曼编码，并根据编码表对编码进行译码。
编码原理参考哈夫曼编码

读取text文件

思路

（1）先读取一次，获得文件中数据的长度
（2）根据获得的长度申请用于存储数据的字符数组
（3）再次读取文件，将数据存入数组中

char *ReadFile(int &num)
{
    int n = 0;
    char x;
    ifstream inFile;
    inFile.open("test.txt"); //打开文件

    if(!inFile) //不能打开文件（不存在或被占用）
    {
        cout << "Unable to open file !" << endl;
    }
    while (inFile >> x) //计算字符串长度
    {
        num ++;
    }
    inFile.close(); //关闭文件

    cout << "字符串长度：" << num << endl;
    char *str = new char[num]; //为str申请长度为num的内存
    inFile.open("test.txt");
    while (inFile >> str[n++]); //将文件中的数据存入str中
    inFile.close();

    return str;
}

计算各字符的权重

这里只统计英文字母，如果想要将标点符号等也一并编码，可以自行添加。
思路：
（1）遍历所有字符，判断是否是A-Z和a-z的字符
（2）权重数组weight，其0-25位对应大写字母A-Z，26-51位对应小写字母a-z
（3）每遍历到一个字母，则weight对应的位置+1

void CalWeight(char *Str, int Weight[])
{
    int Pos = 0;
    while (*Str != '') //遍历结束则退出
    {
        if (*Str >= 'A' && *Str <= 'Z')
        {
            Pos = *Str - 'A';
            Weight[Pos] ++; //对应的位置权重+1
        }

        if (*Str >= 'a' && *Str <= 'z')
        {
            Pos = *Str - 'a' + 26; // 小写字母对应的位置
            Weight[Pos] ++;
        }
        cout << *Str; // 输出字符串
        Str ++;

    }
}

编码建立编码表

思路：
（1）首先应该建立哈夫曼树，将52个字符作为叶结点，进行合并。
（2）再根据所得的哈夫曼树进行哈夫曼编码，获得每个字符对应的编码，即编码表。
详细请参考哈夫曼编码
这里将哈夫曼编码程序单独打包在一个.cpp和.h文件中，一个文件中代码尽量不要太多。

头文件：

#ifndef __CRHUFFMAN_H
#define __CRHUFFMAN_H

#include 
#include 
using namespace std;

class Node //定义哈夫曼树的结点
{
  public:
    int lchild = -1, rchild = -1;
    int parent = -1;
    int weight = 0;
};

class huffman
{
public: //公开函数声明
    void CreateHuffman(Node *p, int *weight, int numLeafs); //创建哈夫曼树
    void HuffmanCode(Node *p, int numLeafs, string *codes); //哈夫曼编码


private: //私有函数和变量声明
    void InitWeight(Node *p, int *weight, int numLeafs); //初始化各结点的权重
    void SearchMin(const Node *p, int num, int &min1, int &min2); //搜索权重最小的根结点
    Node *p; //结点（叶结点+合并结点）
    int *weight; //各结点权重
    int num; //结点数
    int min1; //权重最小结点
    int min2; //权重第二小结点
    int numLeafs;//叶结点数
    string *codes; //编码表
};

#endif // __CRHUFFMAN_H

.cpp文件：

#include "crhuffman.h"


void huffman::InitWeight(Node *p, int *weight, int numLeafs)
{
    for (int i = 0; i < numLeafs; i++)
    {
        p[i].weight = weight[i];
    }
}


void huffman::SearchMin(const Node *p, int num, int &min1, int &min2)
{
    for (int i = 0; i < num; i++) //找到第一个根结点
    {
        if (p[i].parent == -1)
        {
            min1 = i;
            break;
        }
    }

    for (int i = 0; i < num; i++) //遍历全部根结点，找到权重最小的根结点
    {
        if ((p[i].parent == -1) && (p[i].weight < p[min1].weight))
        {
            min1 = i;
        }
    }

    for (int i = 0; i < num; i++) //找到第二个根结点
    {
        if ((p[i].parent == -1) && (i != min1))
        {
            min2 = i;
            break;
        }
    }

    for (int i = 0; i < num; i++) //遍历全部根结点，找到权重第二小的根结点
    {
        if ((p[i].parent == -1) && (p[i].weight < p[min2].weight) && (i != min1))
        {
            min2 = i;
        }
    }
}


void huffman::CreateHuffman(Node *p, int *weight, int numLeafs)
{
    int PosMin1, PosMin2;
    InitWeight(p, weight, numLeafs); //初始化叶结点权重

    for (int i = 0; i < numLeafs - 1; i++)
    {
        SearchMin(p, numLeafs + i, PosMin1, PosMin2); //搜索最小的两个根结点

        p[numLeafs + i].weight = p[PosMin1].weight + p[PosMin2].weight;
        p[numLeafs + i].lchild = PosMin1;
        p[numLeafs + i].rchild = PosMin2;
        p[PosMin1].parent = numLeafs + i;
        p[PosMin2].parent = numLeafs + i;
    }
}


void huffman::HuffmanCode(Node *p, int numLeafs, string *codes)
{
    int parent;

    for (int i = 0; i < numLeafs; i++)
    {
        int j = i;
        while (p[j].parent != -1)
        {
            parent = p[j].parent;

            if (p[parent].lchild == j)
            {
                codes[i].insert(codes[i].begin(),'0');
            }
            else
            {
                codes[i].insert(codes[i].begin(),'1');
            }

            j = parent;

        }
    }
}

全文编码

思路：
（1）就是根据编码表对整个text文章进行编码
（2）因为需要创建一个字符数组用于存储编码，因此先计算所需的长度
（3）遍历所有字符，将每个字符对应的编码拼接到strcode中。

int GetCodeLen(int *weight, string *codes)
{
    int Len = 0;
    for (int i = 0; i < 52; i++) //总共52个英文字符
    {
        //sum(每个字符的数量*对应的编码长度)
        Len += weight[i] * codes[i].length();
    }
    return Len;
}


char *Encode(char *str, int *weight, string *codes, int num)
{
    int Len = GetCodeLen(weight, codes); //计算编码后所占用的长度
    cout << "编码长度：" << Len << endl; //打印编码长度

    char *strcode = new char[Len]; //为strcode申请长度为Len的字符数组
    strcode[0] = '';//清空字符串

    for (int i = 0; i < num; i++) //遍历所有字符
    {
        if (str[i] >= 'A' && str[i] <= 'Z')
        {
           char *m = (char *)codes[(int)(str[i] - 'A')].data(); //当前字符对应的编码（string转换为char[])

           strcat(strcode, m); //将当前对应的编码拼接在strcode尾部
        }
        else if (str[i] >= 'a' && str[i] <= 'z')
        {
            char *m = (char *)codes[(int)(str[i]-'a'+26)].data();
 
            strcat(strcode, m);
        }
        else
        {
        }
    }
    
    return strcode;
}

写到这里的时候，发现可以使用string字符串来存储编码更好，因此C++中string在插入字符串货字符时，会自动为其分配合适的内存。
利用库中的append函数或者operator +=来将字符串拼接在strcode尾部。

string *EncodeRepl(char *str, int *weight, string *codes, int num)
{
    string *strcode = new string();
    for (int i = 0; i < num; i++)
    {
        if (str[i] >= 'A' && str[i] <= 'Z')
        {
            //(*strcode).append(codes[(int)(str[i] - 'A')]);
            (*strcode) += codes[(int)(str[i] - 'A')];

        }
        else if (str[i] >= 'a' && str[i] <= 'z')
        {
            //(*strcode).append(codes[(int)(str[i]-'a'+26)]);
            (*strcode) += codes[(int)(str[i]-'a'+26)];
        }
        else
        {

        }

    }
    return strcode;
}

注：如果使用这个函数，其返回的是string类型，因此对于译码函数的输入参数，需要将char *str改为string str。

译码

思路：
（1）遍历全文编码段，将编码表依次与编码段进行匹配，如果匹配成功，则在继续匹配下一位；
（2）如果匹配不成功，则回溯到当前字符编码段的开头位置，继续与下一个编码表匹配。
（3）这里使用的是链表结构存储译码的字符串，链表代码参考线性表总结。简单点可以使用string字符串存储，利用push_back或者operator +=将单个字符拼接在字符串的结尾即可。

.h文件如下：

#ifndef __DECODE_H
#define __DECODE_H
#include 
#include "LinkList.h"
using namespace std;


class decode
{
public:
    LinkList *Readhuffman(char *str, string *codes); //译码函数

private:
    char *str; //编码字符串
    string *codes; //编码表
};

#endif // __DECODE_H

.cpp文件如下：

#include "decode.h"


LinkList *decode::Readhuffman(char *strcode, string *codes)
{
    int i = 0; //编码当前匹配的位置
    int k = 0; //编码表的某一字母编码的匹配位置
    int j = 0; //当前匹配的编码表的某一字母
    int CurPos = 0; //当前正在匹配中的编码段的开头位置
    char Alp; //当前匹配成功的字符
    Link DeLink; //Link类对象创建

    LinkList *stList = new LinkList(); //创建链表

    while (strcode[i] != '') //遍历编码字符串
    {
        if (strcode[i] == codes[j][k]) //匹配成功
        {
            i++;
            k++;
            if (k >= codes[j].length())
            {
                if (j < 26)
                {
                    Alp = (char)(j + 'A');
                }
                else
                {
                    Alp = (char)(j - 26 + 'a');
                }
                
                DeLink.CreatList(Alp, stList); //将译码成功的字符添加到链表尾部

                k = 0; //从编码表的头开始
                CurPos = i; //下一个编码匹配的第一位
                j = 0;
            }

        }
        else //匹配不成功
        {
            i = CurPos;//返回当前编码段的开头
            j++;  //编码表的下一位
            if (j >= 52)
            {
                cout << "n decode error !" << endl;
                break;
            }

            k = 0;
        }

    }

    return stList;
}

主函数例程

#include 
#include 
#include 
#include 
#include "calweight.h"
#include "crhuffman.h"
#include "decode.h"
using namespace std;

int main()
{
    int num = 0;
    char *str = NULL;
    str = ReadFile(num); //读取文件数据

    int weight[52] = {0};
    CalW Cal;
    Cal.CalWeight(str, weight); //计算权重

    cout << "n------------------------------------------------------------------------------n" <<
    "权重：" << endl;
    cout << setw(8);
    for (int i = 0; i < 52; i++) //打印各个字符及其对应的权重
    {
        if (i < 26)
        {
          cout << (char)(i+'A') << weight[i] << setw(8);
        }
        else
        {
            cout << (char)(i-26 +'a') << weight[i] << setw(8);
        }
        if (!((i+1)%10) && i)
        {
            cout << endl;
        }
    }
    
    
    Node N[2*52 - 1]; //创建哈夫曼树的结点（52个叶结点+51个合并结点）
    string codes[52]; //52个编码表
    
    huffman hu;
    hu.CreateHuffman(N, weight, 52); //创建哈夫曼树
    hu.HuffmanCode(N, 52, codes); //哈夫曼编码

    cout << "n哈夫曼编码表：" << endl;
    for (int i = 0; i < 52; i++) //打印编码表
    {
        if (i < 26)
        {
            cout << (char)(i+'A') << setw(20) << codes[i] << endl;
        }
        else
        {
            cout << (char)(i-26 +'a') << setw(20) << codes[i] << endl;
        }
    }

    char *strcode = Encode(str, weight, codes, num); //全文编码
    cout << "编码: n" << strcode << endl; //打印编码
    cout << "n----------------------------------------------------------------------" << endl;
    
    
    decode Dec;
    LinkList *DecList = NULL; //定义链表
    Link L;
    cout << "译码：" << endl;
    DecList = Dec.Readhuffman(strcode, codes); //译码并将结构保存在链表中
    L.ScanList(DecList);  //打印链表数据
    
    
    delete [] strcode;
    delete [] str;
    delete DecList;

    return 0;
}

哈夫曼编码(文件编码与解码)

C/C++/C#相关栏目本月热门文章