1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
#!/usr/bin/env perl
#
# Write a script to implement Lempel–Ziv–Welch (LZW) compression algorithm. The
# script should have method to encode/decode algorithm. The wiki page explains
# the compression algorithm very nicely.
#
# (https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch).
################################################################################
use strict;
use warnings;
use POSIX qw<ceil>;
use Data::Dumper;
################################################################################
# Naïve implementation of a bidirectional map
sub bimap_new {
return {sym2bin=>{}, bin2sym=>{}};
}
sub bimap_insert {
my ($dict, $sym) = @_;
my ($sym2bin, $bin2sym) = @$dict{'sym2bin', 'bin2sym'};
die "Symbol '$sym' already in dict" if exists $dict->{sym2bin}{$sym};
my $ord = keys %{$dict->{sym2bin}}; # Ordinal of symbol to insert in decimal
my $bin = sprintf "%b", $ord; # ... and in binary (string of 1/0s)
$sym2bin->{$sym} = $bin; # Update symbol -> binary mapping
$bin2sym->{$bin} = $sym; # Update binary -> symbol mapping
# Extend with left zeroes the previously inserted binaries
my $binlen = length $bin;
foreach my $sym (keys %$sym2bin) {
my $bin = $sym2bin->{$sym};
delete $bin2sym->{$bin};
$bin = "0" x ($binlen - length $bin) . $bin;
$sym2bin->{$sym} = $bin;
$bin2sym->{$bin} = $sym;
}
}
################################################################################
# Default dictionary
my $DEFAULT_DICT = bimap_new();
bimap_insert($DEFAULT_DICT, $_) foreach ('', 'A' .. 'Z');
sub dict_print {
my $bin2sym = shift()->{bin2sym};
foreach my $bin (sort keys %$bin2sym) {
printf "%s -> %s\n", $bin2sym->{$bin}, $bin;
}
}
sub lzw_encode {
my $text = shift;
my %dict = %$DEFAULT_DICT;
my ($sym2bin, $bin2sym) = @dict{'sym2bin', 'bin2sym'};
my $out = '';
while ($text) {
# Find largest match in dictionary
foreach my $sym (sort { length $b cmp length $a } keys %$sym2bin) {
if ($text =~ /^$sym(.*)$/) {
# Attach the binary corresponding to the matching symbol
$out .= $sym2bin->{$sym};
# ... and cut the symbol from the input text
$text = substr($text, length $sym);
if ($1) {
# New dict entry ($sym + next character)
bimap_insert(\%dict, $sym . substr($1, 0, 1));
} else {
# No next character: insert EOM marker and we're finished
$out .= $sym2bin->{''};
}
last;
}
}
print "[debug] compressing $text\n";
}
return $out;
}
sub usage {
die "Usage: $0 -e | --encode | -d | --decode\n";
}
my $mode = shift || usage;
if ($mode eq '-e' || $mode eq '--encode') {
my $text = <>;
my $bin = lzw_encode($text);
my $comprate = 100 * length($bin) / (8 * length($text));
printf STDERR "Compressed to %.2f%% of original size\n", $comprate;
print $bin;
} else {
usage;
}
|