huffman encoding

🏠

For the impatient, the full implementation is at the bottom of this page.

Huffman Encoding is an easy to understand, and efficient data compression algorithm (in this example, ~75% compression ratio is attained). On this page we go through a working implementation. I wrote it in a couple hours whilst watching this great lecture on Huffman Encoding; needless to say this is for educational purposes only.

Please note, it uses the third-party bitarray python module.

Generate The Huffman Tree

graph TD 5 --> E 5 --> A 9 --> D 9 --> B 11 --> 5 11 --> C 20 --> 9 20 --> 11

The first step with Huffman encoding is to generate the encoding tree. This is done by taking the counts of each letter, and creating a hierarchy of counts.

 1 from collections import Counter
 2 from collections import namedtuple
 3 from bitarray import bitarray
 4 
 5 node = namedtuple('node', 'char count left right')
 6 hcode = namedtuple('hcode', 'char count code')
 7 
 8 def gen_huffman_tree(message):
 9     counts = Counter(message)
10     htree = {node(x[0], x[1], 0, 0) for x in sorted(counts.items(), key=lambda x: x[1])}
11     huff_min = lambda: min(htree, key=lambda x: x.count)
12     while len(htree) > 1:
13         a = huff_min()
14         htree.remove(a)
15         b = huff_min()
16         htree.remove(b)
17         htree.add(node('', a.count + b.count, a, b))
18     return next(iter(htree)), len(counts)
19 
20 def main():
21     # this is the message we want to encode
22     message  = 'BCCABBDDAECCBBAEDDCC'
23     
24     # compute the huffman tree
25     htree, count = gen_huffman_tree(message)
26 
27     print(htree)
28 
29 if __name__ == '__main__':
30     main()

Generating the tree is quite straightforward, you can find out more on generating callgraphs in python with mermaid.

Generate the Huffman Codes

Next you'll want to generate the table of codes. i.e the letter frequencies, and the encoded path each letter takes down the Huffman tree.

 1 def gen_huffman_codes(htree, count):
 2     code = [0]*count
 3     hcodes = {}
 4     def gen_codes(htree, depth=0):
 5         if htree.left:
 6             code[depth] = 0
 7             gen_codes(htree.left, depth+1)
 8         if htree.right:
 9             code[depth] = 1
10             gen_codes(htree.right, depth+1)
11         is_leaf = htree.left == htree.right == 0
12         if is_leaf:
13             hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
14     gen_codes(htree)
15     return hcodes
16 
17 def print_huffman_codec(hcodes):
18     for c, v in hcodes.items():
19         print(v)
20 
21 def main():
22     # this is the message we want to encode
23     message  = 'BCCABBDDAECCBBAEDDCC'
24     
25     # compute the huffman tree
26     htree, count = gen_huffman_tree(message)
27 
28     # compute our huffman encoder
29     hcodes = gen_huffman_codes(htree, count)
30  
31     print_huffman_codec(hcodes)

Output:

1 hcode(char='D', count=4, code=bitarray('00'))
2 hcode(char='E', count=2, code=bitarray('010'))
3 hcode(char='A', count=3, code=bitarray('011'))
4 hcode(char='B', count=5, code=bitarray('10'))
5 hcode(char='C', count=6, code=bitarray('11'))

Compress the Message

Next we'll want to encode the actual message.

 1 def encode_message(message, hcodes):
 2     huff_encoding = bitarray()
 3     for c in message:
 4         huff_encoding.extend(hcodes[c].code)
 5     return huff_encoding
 6 
 7 
 8 def main():
 9     # this is the message we want to encode
10     message  = 'BCCABBDDAECCBBAEDDCC'
11     
12     # compute the huffman tree
13     htree, count = gen_huffman_tree(message)
14 
15     # compute our huffman encoder
16     hcodes = gen_huffman_codes(htree, count)
17 
18     # print our huffman codec
19     print_huffman_codec(hcodes)
20 
21     # compress the message into our huffman encoding
22     huff_encoding = encode_message(message, hcodes)

Output:

1 bitarray('101111011101000000110101111101001101000001111')

Decompress the Message

Next we'll want to decode our Huffman encoded message. For that we simply perform a traveral, while following the directional rules of our encoded message. When we hit a leaf, that's the character we are after.

 1 def decode_huffman_encoding(htree, huff_encoding):
 2     def decode(depth, htree):
 3         is_leaf = htree.left == htree.right == 0
 4         if is_leaf:
 5             offset[0] += depth
 6             decompressed_message.append(htree.char)
 7         else:
 8             bit = huff_encoding[depth+offset[0]]
 9             if htree.left and not bit:
10                 decode(depth+1, htree.left)
11             elif htree.right and bit:
12                 decode(depth+1, htree.right)
13 
14     decompressed_message = []
15     offset = [0]
16 
17     while offset[0] < len(huff_encoding):
18         decode(0, htree)
19 
20     decompressed_message = ''.join(decompressed_message)
21     return decompressed_message
22 
23 def main():
24     # this is the message we want to encode
25     message  = 'BCCABBDDAECCBBAEDDCC'
26     
27     # compute the huffman tree
28     htree, count = gen_huffman_tree(message)
29 
30     # compute our huffman encoder
31     hcodes = gen_huffman_codes(htree, count)
32 
33     # print our huffman codec
34     print_huffman_codec(hcodes)
35 
36     # compress the message into our huffman encoding
37     huff_encoding = encode_message(message, hcodes)
38 
39     # decompress our huffman encoding back into a string
40     decompressed_message = decode_huffman_encoding(htree, huff_encoding)
41  
42     print(decompressed_message)

Output:

1 BCCABBDDAECCBBAEDDCC

Printing some stats

The final step is to print some stats. Please note, i did not include the step of including the hcodes and htree into a final output, which would have indeed lead to a slight drop in the compression ratio.

 1 def main():
 2     # this is the message we want to encode
 3     message  = 'BCCABBDDAECCBBAEDDCC'
 4     
 5     # compute the huffman tree
 6     htree, count = gen_huffman_tree(message)
 7 
 8     # compute our huffman encoder
 9     hcodes = gen_huffman_codes(htree, count)
10 
11     # print our huffman codec
12     print_huffman_codec(hcodes)
13 
14     # compress the message into our huffman encoding
15     huff_encoding = encode_message(message, hcodes)
16 
17     # decompress our huffman encoding back into a string
18     decompressed_message = decode_huffman_encoding(htree, huff_encoding)
19 
20     # print stats and sanity check
21     print(f'Original message: {message}')
22     print(f'Huffman Encoded Message: {huff_encoding}')
23     print(f'Original Size: {len(message)*8} bits.')
24     print(f'New Size: {len(huff_encoding)} bits.')
25     print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
26     print(f'decompressed message: {decompressed_message}')
27     print(f'decompression success: {message == decompressed_message}')

Output:

1 Original message: BCCABBDDAECCBBAEDDCC
2 Huffman Encoded Message: bitarray('101111011101000000110101111101001101000001111')
3 Original Size: 160 bits.
4 New Size: 45 bits.
5 Compression: 71.875%.
6 decompressed message: BCCABBDDAECCBBAEDDCC
7 decompression success: True

Calculating Encoding Size

Using the codes

The size of the final encoding can be calculated from the hcode var:

It can also be calculated with with htree var:

Where di is the depth of the leaf, and fi is the count for the leaf.

Putting it all together

Here's the full implementation for your fun and enjoyment.

 1 from collections import Counter
 2 from collections import namedtuple
 3 from bitarray import bitarray
 4 
 5 node = namedtuple('node', 'char count left right')
 6 hcode = namedtuple('hcode', 'char count code')
 7 
 8 def gen_huffman_tree(message):
 9     counts = Counter(message)
10     htree = {node(x[0], x[1], 0, 0) for x in sorted([*counts.items()], key=lambda x: x[1])}
11     huff_min = lambda: min(htree, key=lambda x: x.count)
12     while len(htree) > 1:
13         a = huff_min()
14         htree.remove(a)
15         b = huff_min()
16         htree.remove(b)
17         htree.add(node('', a.count + b.count, a, b))
18     return next(iter(htree)), len(counts)
19 
20 def gen_huffman_codes(htree, count):
21     code = [0]*count
22     hcodes = {}
23     def gen_codes(htree, depth=0):
24         if htree.left:
25             code[depth] = 0
26             gen_codes(htree.left, depth+1)
27         if htree.right:
28             code[depth] = 1
29             gen_codes(htree.right, depth+1)
30         is_leaf = htree.left == htree.right == 0
31         if is_leaf:
32             hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
33     gen_codes(htree)
34     return hcodes
35 
36 def decode_huffman_encoding(htree, huff_encoding):
37     def decode(depth, htree):
38         is_leaf = htree.left == htree.right == 0
39         if is_leaf:
40             offset[0] += depth
41             decompressed_message.append(htree.char)
42         else:
43             bit = huff_encoding[depth+offset[0]]
44             if htree.left and not bit:
45                 decode(depth+1, htree.left)
46             elif htree.right and bit:
47                 decode(depth+1, htree.right)
48 
49     decompressed_message = []
50     offset = [0]
51 
52     while offset[0] < len(huff_encoding):
53         decode(0, htree)
54 
55     decompressed_message = ''.join(decompressed_message)
56     return decompressed_message
57 
58 def print_huffman_codec(hcodes):
59     for c, v in hcodes.items():
60         print(v)
61 
62 def encode_message(message, hcodes):
63     huff_encoding = bitarray()
64     for c in message:
65         huff_encoding.extend(hcodes[c].code)
66     return huff_encoding
67 
68 def main():
69     # this is the message we want to encode
70     message  = 'BCCABBDDAECCBBAEDDCC'
71     
72     # compute the huffman tree
73     htree, count = gen_huffman_tree(message)
74 
75     # compute our huffman encoder
76     hcodes = gen_huffman_codes(htree, count)
77 
78     # print our huffman codec
79     print_huffman_codec(hcodes)
80 
81     # compress the message into our huffman encoding
82     huff_encoding = encode_message(message, hcodes)
83 
84     # decompress our huffman encoding back into a string
85     decompressed_message = decode_huffman_encoding(htree, huff_encoding)
86 
87     # print stats and sanity check
88     print(f'Original message: {message}')
89     print(f'Huffman Encoded Message: {huff_encoding}')
90     print(f'Original Size: {len(message)*8} bits.')
91     print(f'New Size: {len(huff_encoding)} bits.')
92     print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
93     print(f'decompressed message: {decompressed_message}')
94     print(f'decompression success: {message == decompressed_message}')
95 
96 if __name__ == '__main__':
97     main()