Huffman编码实现文件的压缩和解压缩

news2025/4/26 8:50:28

一个项目，不过处理起来也比较麻烦，配套可以和文件传输放一起

前提知识：

哈夫曼树和哈夫曼编码的概念和构建

1：n个数构成的哈夫曼树一共有2*n-1个结点=>8 -> 15

2：数字越大的数离根节点越近，越小的数离根节点越近。因为每次是挑权值最小的两个结点当叶子结点从下往上一直到根去构建哈夫曼树

“abcdefgh”这一串字符统计各个字符出现的次数，根据次数当做权重去构建哈夫曼树，每个字符都占8个比特，把a弄成哈夫曼编码例如“11111”,不过这样也是个字符串，每个'1'字符都是8bit，等于没压缩反而大了，哈夫曼编码代替源文件字符内容，再把他转成二进制，关键点就是把这个哈夫曼编码的"1"字符8bit转成二进制里面的1，这样8bit->成了1bit，这样a->8bit就转换为5bit，压缩成功

这种对于文本文件效率还行13%~17%，图片视频音频二进制的文件效率不太高，基本可以说没有

构建哈夫曼树，利用哈夫曼编码去替换

技术点：

哈夫曼树构建，哈夫曼编码构建
哈希映射
内存对齐的问题，压缩解压缩转二进制体现
位运算，是压缩解压缩弄二进制关键性逻辑
文件操作

大体思路：

压缩：
1、统计字符种类及频度：ASCII码值当下标，字符频度当对应下标的值。

2、构建哈夫曼树：在没有访问过的节点中，找最小字符频度下标来构建哈夫曼树。

3、构建哈夫曼编码

4、生成配置文件，记录源文件字符种类，对应频度，用来后面解压缩用

5、生成压缩文件：把字符的哈夫曼编码以二进制形式写入目标文件中。给压缩文件前面写入源文件数据大小，解压缩时需使用根据这个去判断什么时候停止解压(循环次数)

注意内存对齐的问题

解压缩
1、根据配置文件，得到字符种类和频度。

2、构建哈夫曼树：在没有访问过的节点中，找最小字符频度下标来构建哈夫曼树。

3、生成解压缩的文件，这个时候需要用到压缩文件头部那几个字节的信息了

关键结构：

static const int BUFFLEN = 256;//写文件 读文件，缓冲区大小
static const int KIND = 256; // 字符种类个数
typedef int FREP[KIND];      // 频度数组


typedef unsigned char uchar;
typedef struct
{
	uchar ch;   // 字符类型
	int   frep; // 频度
}CharFrep;      //字符种类

typedef struct
{
	int total;  // 文件总的字节个数；
	int chkind; // 文件字符种类数
	CharFrep* frepdata;//指向字符种类的指针
}FileInfo;  //得到当前文件的信息

                 
                 //哈夫曼树结构
typedef struct
{
	uchar ch;
	int   cfreq; // 频度
	int   parent;
	int   leftchild;
	int   rightchild;
}hfmnode;
                     
typedef struct
{
	int chkind;  //文件字符种类数
	int sum;     //总共节点个数
	hfmnode* node;
}HuffmanTree;

struct Index_Fref//形成下标和权值的对应放进优先级队列里面
{                 //可以直接拿pair对组弄
	int index;
	int freq;
	operator int()const { return freq; }
};


                //哈夫曼编码 处理
typedef struct   
{
	uchar ch;
	char* code;
}hfmcode;               
typedef struct
{
	int chkind; //文件字符种类数
	hfmcode* coding;   //直接开辟256而不是按照chkind去开辟，为了提高效率
	                   //读到文件一个字符直接能映射到256里的下标，而不用一个个查找
}HuffMan

我就不对哈夫曼树和哈夫曼编码的结构做解释了，对于最上面俩关于文件操作的结构说明一下

static const int BUFFLEN = 256;//写文件 读文件，缓冲区大小
static const int KIND = 256; // 字符种类个数
typedef int FREP[KIND];      // 频度数组

为什么KING=256? ASCII种类 2的8次方，直接用频度数组，数组下标对应的就是字符的ASCII，比如FREP frep[97]+=1 说明遍历到a了，出现的次数+1

我们用以二进制编辑器打开一个文件

typedef unsigned char uchar;
typedef struct
{
	uchar ch;   // 字符类型
	int   frep; // 频度
}CharFrep;      //字符种类

typedef struct
{
	int total;  // 文件总的字节个数；
	int chkind; // 文件字符种类数
	CharFrep* frepdata;//指向字符种类的指针
}FileInfo;  //得到当前文件的信息

压缩解压缩：

写文件的时候给压缩文件前四个字节写的是源文件的大小，方便后面解压根据大小去判断解压循环多少次就结束了，要不然解压到后面二进制都是0，就会多解压出来一串cccc

压缩的关键性的代码：重点还是while循环里面字符转二进制

void CompressFile(HuffManCode* phfc, FileInfo* pfileinfo, const char* inputfile, const char* outputfile)
{
	assert(phfc != nullptr);
	assert(pfileinfo != nullptr);
	assert(inputfile != nullptr);
	assert(outputfile);
	FILE* srcfile = nullptr, * destfile = nullptr;
	errno_t tag = fopen_s(&srcfile, inputfile, "rb");//打开源文件
	if (tag)
	{
		LOG("open file %s error \n", inputfile);
		exit(EXIT_FAILURE);
	}
	tag = fopen_s(&destfile, outputfile, "wb"); //打开目标文件
	if (tag)
	{
		LOG("open file %s error \n", outputfile);
		exit(EXIT_FAILURE);
	}
	uchar buff[BUFFLEN] = { 0 };
	uchar writebuff[BUFFLEN] = { 0 };
	int wpos = 0;
	int ret = 0;
	uchar tmp = 0;
	uchar bitpos = 0x80;  //1000 0000
	int total = 0;
	rewind(srcfile);

	fwrite(&pfileinfo->total, sizeof(int), 1, destfile);//头部把文件大小填充上
	while ((ret = fread(buff, sizeof(uchar), BUFFLEN, srcfile)) > 0)
	{
		for (int i = 0; i < ret; ++i)
		{
			if (phfc->coding[buff[i]].code == nullptr)
			{
				printf("error %d \n", buff[i]);
				exit(1);
			}
			for (int j = 0; phfc->coding[buff[i]].code[j] != '\0'; ++j)
			{
				tmp |= (phfc->coding[buff[i]].code[j] == '1') ? bitpos : 0;
				bitpos = bitpos >> 1;
				if (bitpos == 0) // 一个字节数据完成
				{
					bitpos = 0x80;
					writebuff[wpos++] = tmp;
					tmp = 0;
					if (wpos == BUFFLEN)
					{
						fwrite(writebuff, sizeof(uchar), wpos, destfile);
						total += wpos;
						wpos = 0;
					}
				}
			}
		}
	}
	if (bitpos != 0)
	{
		writebuff[wpos++] = tmp;
	}
	if (wpos > 0)
	{
		total += wpos;
		fwrite(writebuff, sizeof(uchar), wpos, destfile);
	}
	LOG("压缩文件成功\n");
	fclose(destfile);
	fclose(srcfile);
	destfile = nullptr;
	srcfile = nullptr;

	free(pfileinfo->frepdata);
	pfileinfo->frepdata = nullptr;
	for (int i = 0; i < phfc->chkind; ++i)
	{
		free(phfc->coding[i].code);
		phfc->coding[i].code = nullptr;
	}
	free(phfc->coding);
	phfc->coding = nullptr;
}

解压缩代码：拿到配置文件信息，和压缩文件的头部信息，就开始解压了，构建哈夫曼树，不用构建编码了，遍历压缩文件，根据哈夫曼树结点权值找到对应字符读到缓冲区，写到新文件

void DeCompress(const char* newfilename, const char* compfilename, HuffmanTree* phft)
{
	uchar buff[BUFFLEN] = { 0 };
	uchar writebuff[BUFFLEN] = { 0 };
	int wpos = 0;
	int ret = 0;
	uchar tmp = 0;
	uchar bitpos = 0x80;  //1000 0000
	int total = 0;
	FILE* newfile = nullptr;
	FILE* compfile = nullptr;
	errno_t tag = fopen_s(&newfile, newfilename, "wb");
	if (tag)
	{
		LOG("open file %s fail \n", newfilename);
		exit(EXIT_FAILURE);
	}
	tag = fopen_s(&compfile, compfilename, "rb");
	if (tag)
	{
		LOG("open file %s fail \n", compfilename);
		exit(EXIT_FAILURE);
	}
	fread(&total, sizeof(int), 1, compfile);
	bitpos = 0x80;
	wpos = 0;
	int npos = phft->sum - 2;
	while ((ret = fread(buff, sizeof(uchar), BUFFLEN, compfile)) > 0)
	{
		int i = 0;
		while (i < ret)
		{
			do
			{
				if (phft->node[npos].leftchild == 0 && phft->node[npos].rightchild == 0)
				{
					break;
				}
				npos = (buff[i] & bitpos) ? phft->node[npos].rightchild : phft->node[npos].leftchild;
				bitpos = bitpos >> 1;
			} while (bitpos != 0);
			if (phft->node[npos].leftchild == 0 && phft->node[npos].rightchild == 0)
			{
				writebuff[wpos++] = phft->node[npos].ch;
				npos = phft->sum - 2;
				if (wpos == BUFFLEN)
				{
					fwrite(writebuff, sizeof(uchar), wpos, newfile);
					wpos = 0;
				}
				if (--total == 0) break;
			}
			if (bitpos == 0)
			{
				bitpos = 0x80;
				++i;
			}
		}
	}
	if (wpos > 0)
	{
		fwrite(writebuff, sizeof(uchar), wpos, newfile);
	}
	fclose(newfile);
	fclose(compfile);
	newfile = nullptr;
	compfile = nullptr;
}