# huffman encoding

🏠

For the impatient, the full implementation is at the bottom of this page.

Huffman Encoding is an easy to understand, and efficient data compression algorithm (in this example, ~75% compression ratio is attained). On this page we go through a working implementation. I wrote it in a couple hours whilst watching this great lecture on Huffman Encoding; needless to say this is for educational purposes only.

Please note, it uses the third-party bitarray python module.

# Generate The Huffman Tree

graph TD 5 --> E 5 --> A 9 --> D 9 --> B 11 --> 5 11 --> C 20 --> 9 20 --> 11

The first step with Huffman encoding is to generate the encoding tree. This is done by taking the counts of each letter, and creating a hierarchy of counts.

 1 from collections import Counter
2 from collections import namedtuple
3 from bitarray import bitarray
4
5 node = namedtuple('node', 'char count left right')
6 hcode = namedtuple('hcode', 'char count code')
7
8 def gen_huffman_tree(message):
9     counts = Counter(message)
10     htree = {node(x, x, 0, 0) for x in sorted(counts.items(), key=lambda x: x)}
11     huff_min = lambda: min(htree, key=lambda x: x.count)
12     while len(htree) > 1:
13         a = huff_min()
14         htree.remove(a)
15         b = huff_min()
16         htree.remove(b)
17         htree.add(node('', a.count + b.count, a, b))
18     return next(iter(htree)), len(counts)
19
20 def main():
21     # this is the message we want to encode
22     message  = 'BCCABBDDAECCBBAEDDCC'
23
24     # compute the huffman tree
25     htree, count = gen_huffman_tree(message)
26
27     print(htree)
28
29 if __name__ == '__main__':
30     main()


Generating the tree is quite straightforward, you can find out more on generating callgraphs in python with mermaid.

# Generate the Huffman Codes

Next you'll want to generate the table of codes. i.e the letter frequencies, and the encoded path each letter takes down the Huffman tree.

 1 def gen_huffman_codes(htree, count):
2     code = *count
3     hcodes = {}
4     def gen_codes(htree, depth=0):
5         if htree.left:
6             code[depth] = 0
7             gen_codes(htree.left, depth+1)
8         if htree.right:
9             code[depth] = 1
10             gen_codes(htree.right, depth+1)
11         is_leaf = htree.left == htree.right == 0
12         if is_leaf:
13             hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
14     gen_codes(htree)
15     return hcodes
16
17 def print_huffman_codec(hcodes):
18     for c, v in hcodes.items():
19         print(v)
20
21 def main():
22     # this is the message we want to encode
23     message  = 'BCCABBDDAECCBBAEDDCC'
24
25     # compute the huffman tree
26     htree, count = gen_huffman_tree(message)
27
28     # compute our huffman encoder
29     hcodes = gen_huffman_codes(htree, count)
30
31     print_huffman_codec(hcodes)


Output:

1 hcode(char='D', count=4, code=bitarray('00'))
2 hcode(char='E', count=2, code=bitarray('010'))
3 hcode(char='A', count=3, code=bitarray('011'))
4 hcode(char='B', count=5, code=bitarray('10'))
5 hcode(char='C', count=6, code=bitarray('11'))


# Compress the Message

Next we'll want to encode the actual message.

 1 def encode_message(message, hcodes):
2     huff_encoding = bitarray()
3     for c in message:
4         huff_encoding.extend(hcodes[c].code)
5     return huff_encoding
6
7
8 def main():
9     # this is the message we want to encode
10     message  = 'BCCABBDDAECCBBAEDDCC'
11
12     # compute the huffman tree
13     htree, count = gen_huffman_tree(message)
14
15     # compute our huffman encoder
16     hcodes = gen_huffman_codes(htree, count)
17
18     # print our huffman codec
19     print_huffman_codec(hcodes)
20
21     # compress the message into our huffman encoding
22     huff_encoding = encode_message(message, hcodes)


Output:

1 bitarray('101111011101000000110101111101001101000001111')


# Decompress the Message

Next we'll want to decode our Huffman encoded message. For that we simply perform a traveral, while following the directional rules of our encoded message. When we hit a leaf, that's the character we are after.

 1 def decode_huffman_encoding(htree, huff_encoding):
2     def decode(depth, htree):
3         is_leaf = htree.left == htree.right == 0
4         if is_leaf:
5             offset += depth
6             decompressed_message.append(htree.char)
7         else:
8             bit = huff_encoding[depth+offset]
9             if htree.left and not bit:
10                 decode(depth+1, htree.left)
11             elif htree.right and bit:
12                 decode(depth+1, htree.right)
13
14     decompressed_message = []
15     offset = 
16
17     while offset < len(huff_encoding):
18         decode(0, htree)
19
20     decompressed_message = ''.join(decompressed_message)
21     return decompressed_message
22
23 def main():
24     # this is the message we want to encode
25     message  = 'BCCABBDDAECCBBAEDDCC'
26
27     # compute the huffman tree
28     htree, count = gen_huffman_tree(message)
29
30     # compute our huffman encoder
31     hcodes = gen_huffman_codes(htree, count)
32
33     # print our huffman codec
34     print_huffman_codec(hcodes)
35
36     # compress the message into our huffman encoding
37     huff_encoding = encode_message(message, hcodes)
38
39     # decompress our huffman encoding back into a string
40     decompressed_message = decode_huffman_encoding(htree, huff_encoding)
41
42     print(decompressed_message)


Output:

1 BCCABBDDAECCBBAEDDCC


# Printing some stats

The final step is to print some stats. Please note, i did not include the step of including the hcodes and htree into a final output, which would have indeed lead to a slight drop in the compression ratio.

 1 def main():
2     # this is the message we want to encode
3     message  = 'BCCABBDDAECCBBAEDDCC'
4
5     # compute the huffman tree
6     htree, count = gen_huffman_tree(message)
7
8     # compute our huffman encoder
9     hcodes = gen_huffman_codes(htree, count)
10
11     # print our huffman codec
12     print_huffman_codec(hcodes)
13
14     # compress the message into our huffman encoding
15     huff_encoding = encode_message(message, hcodes)
16
17     # decompress our huffman encoding back into a string
18     decompressed_message = decode_huffman_encoding(htree, huff_encoding)
19
20     # print stats and sanity check
21     print(f'Original message: {message}')
22     print(f'Huffman Encoded Message: {huff_encoding}')
23     print(f'Original Size: {len(message)*8} bits.')
24     print(f'New Size: {len(huff_encoding)} bits.')
25     print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
26     print(f'decompressed message: {decompressed_message}')
27     print(f'decompression success: {message == decompressed_message}')


Output:

1 Original message: BCCABBDDAECCBBAEDDCC
2 Huffman Encoded Message: bitarray('101111011101000000110101111101001101000001111')
3 Original Size: 160 bits.
4 New Size: 45 bits.
5 Compression: 71.875%.
6 decompressed message: BCCABBDDAECCBBAEDDCC
7 decompression success: True


# Calculating Encoding Size

## Using the codes

The size of the final encoding can be calculated from the hcode var:

It can also be calculated with with htree var:

Where di is the depth of the leaf, and fi is the count for the leaf.

# Putting it all together

Here's the full implementation for your fun and enjoyment.

 1 from collections import Counter
2 from collections import namedtuple
3 from bitarray import bitarray
4
5 node = namedtuple('node', 'char count left right')
6 hcode = namedtuple('hcode', 'char count code')
7
8 def gen_huffman_tree(message):
9     counts = Counter(message)
10     htree = {node(x, x, 0, 0) for x in sorted([*counts.items()], key=lambda x: x)}
11     huff_min = lambda: min(htree, key=lambda x: x.count)
12     while len(htree) > 1:
13         a = huff_min()
14         htree.remove(a)
15         b = huff_min()
16         htree.remove(b)
17         htree.add(node('', a.count + b.count, a, b))
18     return next(iter(htree)), len(counts)
19
20 def gen_huffman_codes(htree, count):
21     code = *count
22     hcodes = {}
23     def gen_codes(htree, depth=0):
24         if htree.left:
25             code[depth] = 0
26             gen_codes(htree.left, depth+1)
27         if htree.right:
28             code[depth] = 1
29             gen_codes(htree.right, depth+1)
30         is_leaf = htree.left == htree.right == 0
31         if is_leaf:
32             hcodes[htree.char] = hcode(htree.char, htree.count, bitarray(code[:depth]))
33     gen_codes(htree)
34     return hcodes
35
36 def decode_huffman_encoding(htree, huff_encoding):
37     def decode(depth, htree):
38         is_leaf = htree.left == htree.right == 0
39         if is_leaf:
40             offset += depth
41             decompressed_message.append(htree.char)
42         else:
43             bit = huff_encoding[depth+offset]
44             if htree.left and not bit:
45                 decode(depth+1, htree.left)
46             elif htree.right and bit:
47                 decode(depth+1, htree.right)
48
49     decompressed_message = []
50     offset = 
51
52     while offset < len(huff_encoding):
53         decode(0, htree)
54
55     decompressed_message = ''.join(decompressed_message)
56     return decompressed_message
57
58 def print_huffman_codec(hcodes):
59     for c, v in hcodes.items():
60         print(v)
61
62 def encode_message(message, hcodes):
63     huff_encoding = bitarray()
64     for c in message:
65         huff_encoding.extend(hcodes[c].code)
66     return huff_encoding
67
68 def main():
69     # this is the message we want to encode
70     message  = 'BCCABBDDAECCBBAEDDCC'
71
72     # compute the huffman tree
73     htree, count = gen_huffman_tree(message)
74
75     # compute our huffman encoder
76     hcodes = gen_huffman_codes(htree, count)
77
78     # print our huffman codec
79     print_huffman_codec(hcodes)
80
81     # compress the message into our huffman encoding
82     huff_encoding = encode_message(message, hcodes)
83
84     # decompress our huffman encoding back into a string
85     decompressed_message = decode_huffman_encoding(htree, huff_encoding)
86
87     # print stats and sanity check
88     print(f'Original message: {message}')
89     print(f'Huffman Encoded Message: {huff_encoding}')
90     print(f'Original Size: {len(message)*8} bits.')
91     print(f'New Size: {len(huff_encoding)} bits.')
92     print(f'Compression: {(1-len(huff_encoding)/(len(message)*8))*100}%.')
93     print(f'decompressed message: {decompressed_message}')
94     print(f'decompression success: {message == decompressed_message}')
95
96 if __name__ == '__main__':
97     main()