summaryrefslogtreecommitdiff
path: root/022/ch2.pl
blob: 012407134f7b216bfb971121e495291691d63015 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env perl
#
# Write a script to implement Lempel–Ziv–Welch (LZW) compression algorithm. The
# script should have method to encode/decode algorithm. The wiki page explains
# the compression algorithm very nicely.
#
# (https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Welch).
################################################################################

use strict;
use warnings;

use POSIX qw<ceil>;
use Data::Dumper;

################################################################################
# Naïve implementation of a bidirectional map
sub bimap_new {
    return {sym2bin=>{}, bin2sym=>{}};
}
sub bimap_insert {
    my ($dict, $sym) = @_;
    my ($sym2bin, $bin2sym) = @$dict{'sym2bin', 'bin2sym'};
    die "Symbol '$sym' already in dict" if exists $dict->{sym2bin}{$sym};
    my $ord = keys %{$dict->{sym2bin}}; # Ordinal of symbol to insert in decimal
    my $bin = sprintf "%b", $ord; # ... and in binary (string of 1/0s)
    $sym2bin->{$sym} = $bin; # Update symbol -> binary mapping
    $bin2sym->{$bin} = $sym; # Update binary -> symbol mapping

    # Extend with left zeroes the previously inserted binaries
    my $binlen = length $bin;
    foreach my $sym (keys %$sym2bin) {
        my $bin = $sym2bin->{$sym};
        delete $bin2sym->{$bin};
        $bin = "0" x ($binlen - length $bin) . $bin;
        $sym2bin->{$sym} = $bin;
        $bin2sym->{$bin} = $sym;
    }
}


################################################################################
# Default dictionary
my $DEFAULT_DICT = bimap_new();
bimap_insert($DEFAULT_DICT, $_) foreach ('', 'A' .. 'Z');
sub dict_print {
    my $bin2sym = shift()->{bin2sym};
    foreach my $bin (sort keys %$bin2sym) {
        printf "%s -> %s\n", $bin2sym->{$bin}, $bin;
    }
}

sub lzw_encode {
    my $text = shift;
    my %dict = %$DEFAULT_DICT;
    my ($sym2bin, $bin2sym) = @dict{'sym2bin', 'bin2sym'};

    my $out = '';
    while ($text) {
        # Find largest match in dictionary
        foreach my $sym (sort { length $b cmp length $a } keys %$sym2bin) {
            if ($text =~ /^$sym(.*)$/) {
                # Attach the binary corresponding to the matching symbol
                $out .= $sym2bin->{$sym};
                # ... and cut the symbol from the input text
                $text = substr($text, length $sym);
                if ($1) {
                    # New dict entry ($sym + next character)
                    bimap_insert(\%dict, $sym . substr($1, 0, 1));
                } else {
                    # No next character: insert EOM marker and we're finished
                    $out .= $sym2bin->{''};
                }
                last;
            }
        }
        print "[debug] compressing $text\n";
    }
    return $out;
}

sub usage {
    die "Usage: $0 -e | --encode | -d | --decode\n";
}
my $mode = shift || usage;
if ($mode eq '-e' || $mode eq '--encode') {
    my $text = <>;
    my $bin = lzw_encode($text);
    my $comprate = 100 * length($bin) / (8 * length($text));
    printf STDERR "Compressed to %.2f%% of original size\n", $comprate;
    print $bin;
} else {
    usage;
}