summaryrefslogtreecommitdiff
path: root/024/ch2.pl
blob: 0ba06b256d24628983f82a3c05d4d0340917d425 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env perl
#
# Create a script to implement full text search functionality using Inverted
# Index. According to wikipedia:
#
# In computer science, an inverted index (also referred to as a postings file or
# inverted file) is a database index storing a mapping from content, such as
# words or numbers, to its locations in a table, or in a document or a set of
# documents (named in contrast to a forward index, which maps from documents to
# content). The purpose of an inverted index is to allow fast full-text
# searches, at a cost of increased processing when a document is added to the
# database.
#
# Here is a nice example of Inverted Index.
#
# (https://en.wikipedia.org/wiki/Search_engine_indexing#Inverted_indices).
################################################################################

use strict;
use warnings;

use autodie;

use DBI;

# SQLite file that stores the inverted index (relative to the current working
# directory).
my $DBFILE = 'ii.db';
my $USAGE = "Usage: $0 [--index-doc <doc_path> | --search <word>]\n";

# The first command-line argument selects the mode; without it there is nothing
# to do, so print the usage text and exit.
my $opt = shift or die $USAGE;

# Connect to database
# - AutoCommit => 0: changes are only persisted by an explicit commit().
# - RaiseError => 1: autodie does not cover DBI, so without this a failed
#   connect/prepare/execute/commit would be reported as a warning at most and
#   the script would keep going.
# - PrintError => 0: avoid reporting the same failure twice (warn + die).
my $dbh = DBI->connect("dbi:SQLite:dbname=$DBFILE", "", "",
    { AutoCommit => 0, RaiseError => 1, PrintError => 0 });

# Initialize database
# One row per (word, document) pair; the composite primary key keeps the pairs
# unique so a document can be re-indexed without creating duplicates.
$dbh->do('CREATE TABLE IF NOT EXISTS ii (word TEXT, docpath TEXT, PRIMARY KEY (word, docpath));');
$dbh->do('CREATE INDEX IF NOT EXISTS word_idx ON ii (word);');
$dbh->commit();

# Dispatch on the selected mode:
#   --index-doc <doc_path>  add every word of the document to the index
#   --search <word>         list the documents that contain the word
# Any other mode, or a missing argument, prints the usage text and exits.
if ($opt eq '--index-doc') {
    # Index the given document
    my $doc_path = shift;
    defined $doc_path or die $USAGE;

    # Slurp the whole file; $/ is localised so the record separator is only
    # changed for this single read instead of for the rest of the program.
    open(my $dh, '<:encoding(UTF-8)', $doc_path);
    my $content = do { local $/; <$dh> };
    close($dh);

    # Words are stored lowercased so lookups are case-insensitive. Duplicates
    # are dropped up front to avoid issuing redundant INSERTs.
    my %seen;
    my @words = grep { !$seen{$_}++ } map { lc($_) } $content =~ /(\w+)/g;

    # Both values are bound as placeholders: interpolating the path into the
    # SQL text breaks (or injects SQL) as soon as the path contains a quote.
    my $sth = $dbh->prepare('INSERT OR IGNORE INTO ii VALUES (?, ?);');
    $sth->execute($_, $doc_path) foreach @words;
    $dbh->commit();
} elsif ($opt eq '--search') {
    # Search the given word
    my $word = shift;
    defined $word or die $USAGE;

    # The index only contains lowercased words, so the query has to be
    # lowercased as well or any capitalised search term would never match.
    # NOTE(review): @ARGV is not decoded here, so lc() only folds ASCII
    # letters; non-ASCII queries may need explicit decoding - confirm.
    my $sth = $dbh->prepare('SELECT DISTINCT docpath FROM ii WHERE word = ?');
    $sth->execute(lc($word));
    my @files = map { $_->[0] } @{$sth->fetchall_arrayref()};

    if (@files) {
        # The word is passed as a printf argument rather than interpolated
        # into the format string, so a '%' in the word cannot corrupt output.
        printf "Word '%s' appears in documents: %s\n", $word, join(', ', @files);
    } else {
        print "Word '$word' does not appear in any document\n";
    }
} else {
    die $USAGE;
}

$dbh->disconnect();