C#，数值计算——哈夫曼编码与数据压缩技术（Huffman Coding and Compression of Data）源代码

news2025/7/7 21:05:57

1 霍夫曼编码导论

霍夫曼编码是一种基于数据集中符号频率的无损数据压缩形式。
它是一种前缀编码方案，这意味着编码的数据不包含任何冗余比特。
霍夫曼编码广泛应用于各种应用，如图像和视频压缩、数据传输和数据存储。

2 霍夫曼编码的优点

以下是霍夫曼编码的5个优点：

霍夫曼编码是一种有效的数据压缩方法，因为它将较短的代码分配给数据集中出现频率较高的符号。这导致了更高的压缩比。
霍夫曼编码是一种前缀编码方案，这意味着它不需要任何特殊的标记来分隔不同的代码。这使得它易于实现和解码。
霍夫曼编码是一种广泛使用的数据压缩方法，并得到许多软件库和工具的支持，使其易于集成到现有系统中。
霍夫曼编码是一种无损压缩方法，这意味着可以从压缩数据中准确地重建原始数据。
霍夫曼编码是一种简单有效的算法，可以很容易地在软件和硬件上实现。

3 霍夫曼编码的缺点

以下是霍夫曼编码的5个缺点：

霍夫曼编码要求预先知道每个符号的频率，这使得它不太适合于符号分布未知或动态变化的情况。
霍夫曼树可能很复杂，很难理解，这使得调试和维护代码变得更加困难。
编码过程可能耗时且计算成本高昂，尤其是对于大型数据集。
霍夫曼编码并不总是最有效的压缩方法，可能还有其他方法可以为给定的数据集提供更好的压缩比。
在只有很少的唯一符号或符号已经被高度压缩的情况下，霍夫曼编码对数据的效果可能较差。

using System;

namespace Legalsoft.Truffer
{
/// <summary>
/// Huffman Coding and Compression of Data
/// </summary>
public class Huffcode
{
private int nch { get; set; }
private int nodemax { get; set; }
private int mq { get; set; }
private int ilong { get; set; }
private int nlong { get; set; }
private int[] ncod { get; set; }
private int[] left { get; set; }
private int[] right { get; set; }
private uint[] icod { get; set; }
private uint[] setbit { get; set; } = new uint[32];

/// <summary>
/// Given the frequency of occurrence table nfreq[0..nnch - 1] for nnch charac-
/// ters, constructs the Huffman code.Also sets ilong and nlong as the
/// character number that produced the longest code symbol, and the length of
/// that symbol.
/// </summary>
/// <param name="nnch"></param>
/// <param name="nfreq"></param>
/// <exception cref="Exception"></exception>
public Huffcode(int nnch, int[] nfreq)
{
this.nch = nnch;
this.mq = 2 * nch - 1;
this.icod = new uint[mq];
this.ncod = new int[mq];
this.left = new int[mq];
this.right = new int[mq];

int[] index = new int[mq];
int[] nprob = new int[mq];
int[] up = new int[mq];
for (int j = 0; j < 32; j++)
{
setbit[j] = (uint)(1 << j);
}
int nused = 0;
for (int j = 0; j < nch; j++)
{
nprob[j] = nfreq[j];
icod[j] = 0;
ncod[j] = 0;
if (nfreq[j] != 0)
{
index[nused++] = j;
}
}
for (int j = nused - 1; j >= 0; j--)
{
heep( index, nprob, nused, j);
}
int k = nch;
while (nused > 1)
{
int node = index[0];
index[0] = index[(nused--) - 1];
heep( index, nprob, nused, 0);
nprob[k] = nprob[index[0]] + nprob[node];
left[k] = node;
right[k++] = index[0];
up[index[0]] = -(int)k;
index[0] = k - 1;
up[node] = k;
heep( index, nprob, nused, 0);
}
up[(nodemax = k) - 1] = 0;
for (int j = 0; j < nch; j++)
{
if (nprob[j] != 0)
{
int n = 0;
int ibit = 0;
for (int node = up[j]; node > 0; node = up[node - 1], ibit++)
{
if (node < 0)
{
n |= (int)setbit[ibit];
node = -node;
}
}
icod[j] = (uint)n;
ncod[j] = ibit;
}
}
nlong = 0;
for (int j = 0; j < nch; j++)
{
if (ncod[j] > nlong)
{
nlong = ncod[j];
ilong = j;
}
}
if (nlong > 16)// numeric_limits<uint>.digits)
{
throw new Exception("Code too long in Huffcode. See text.");
}
}

/// <summary>
/// Huffman encode the single character ich(in the range 0..nch-1), write the
/// result to the byte array code starting at bit nb(whose smallest valid
/// value is zero), and increment nb to the first unused bit.This routine is
/// called repeatedly to encode consecutive characters in a message. The user
/// is responsible for monitoring that the value of nb does not overrun the
/// length of code.
/// </summary>
/// <param name="ich"></param>
/// <param name="code"></param>
/// <param name="nb"></param>
/// <exception cref="Exception"></exception>
public void codeone(int ich, ref string code, ref int nb)
{
char[] cc = code.ToCharArray();

if (ich >= nch)
{
throw new Exception("bad ich (out of range) in Huffcode");
}
if (ncod[ich] == 0)
{
throw new Exception("bad ich (zero prob) in Huffcode");
}
for (int n = ncod[ich] - 1; n >= 0; n--, ++nb)
{
int nc = nb >> 3;
int m = nb & 7;
if (m == 0)
{
code = code.Substring(0, nc);
}
if ((icod[ich] & setbit[n]) != 0)
{
//code[nc] |= setbit[m];
cc[nc] |= (char)setbit[m];
}
}

code = new string(cc);
}

/// <summary>
/// Starting at bit number nb in the byte array code, decode a single character
/// (returned as ich in the range 0..nch-1) and increment nb appropriately.
/// Repeated calls, starting with nb D 0, will return successive characters in
/// a compressed message. The user is responsible for detecting EOM from the
/// message content.
/// </summary>
/// <param name="code"></param>
/// <param name="nb"></param>
/// <returns></returns>
public int decodeone(ref string code, ref int nb)
{
int node = nodemax - 1;
for (; ; )
{
int nc = nb >> 3;
node = ((code[nc] & setbit[7 & nb++]) != 0 ? right[node] : left[node]);
if (node < nch)
{
return node;
}
}
}

/// <summary>
/// Used by the constructor to maintain a heap structure in the array
/// index[0..m - 1].
/// </summary>
/// <param name="index"></param>
/// <param name="nprob"></param>
/// <param name="n"></param>
/// <param name="m"></param>
public void heep(int[] index, int[] nprob, int n, int m)
{
int i = m;
int k = index[i];
int j;
while (i < (n >> 1))
{
if ((j = 2 * i + 1) < n - 1 && nprob[index[j]] > nprob[index[j + 1]])
{
j++;
}
if (nprob[k] <= nprob[index[j]])
{
break;
}
index[i] = index[j];
i = j;
}
index[i] = k;
}
}
}