1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
#!/usr/bin/env perl
#
# Create a script that implements full-text search using an inverted index.
# According to Wikipedia:
#
# In computer science, an inverted index (also referred to as a postings file or
# inverted file) is a database index storing a mapping from content, such as
# words or numbers, to its locations in a table, or in a document or a set of
# documents (named in contrast to a forward index, which maps from documents to
# content). The purpose of an inverted index is to allow fast full-text
# searches, at a cost of increased processing when a document is added to the
# database.
#
# Here is a nice example of an inverted index:
#
# (https://en.wikipedia.org/wiki/Search_engine_indexing#Inverted_indices).
################################################################################
use strict;
use warnings;
use autodie;
use DBI;
# SQLite database file that stores the inverted index.
my $DBFILE = 'ii.db';
my $USAGE = "Usage: $0 [--index-doc <doc_path> | --search <word>]\n";

# Validate the command line before touching the database, so that a usage
# error neither creates the database file nor leaves a connection open.
my $opt = shift or die $USAGE;
die $USAGE unless $opt eq '--index-doc' || $opt eq '--search';
my $arg = shift;
die $USAGE unless defined $arg && length $arg;

# When indexing, read the document first: an unreadable file then fails
# (through autodie) before any database work has started.
my @words;
if ($opt eq '--index-doc') {
    open(my $dh, '<:encoding(UTF-8)', $arg);

    # Slurp the whole file; "local" confines the change of $/ to this block.
    my $text = do { local $/; <$dh> };
    close($dh);
    $text = '' unless defined $text;

    # Lower-case every word and drop repeats, so each distinct word costs a
    # single INSERT instead of one per occurrence.
    my %seen;
    @words = grep { !$seen{$_}++ } map { lc($_) } $text =~ /(\w+)/g;
}

# Connect to database. RaiseError turns every DBI failure into an exception
# instead of an unchecked false return value.
my $dbh = DBI->connect("dbi:SQLite:dbname=$DBFILE", "", "",
    { AutoCommit => 0, RaiseError => 1, PrintError => 0 });

# Initialize database: one row per (word, document) pair.
$dbh->do('CREATE TABLE IF NOT EXISTS ii (word TEXT, docpath TEXT, PRIMARY KEY (word, docpath));');
$dbh->do('CREATE INDEX IF NOT EXISTS word_idx ON ii (word);');
$dbh->commit();

if ($opt eq '--index-doc') {
    # Index the given document. Both values are bound as placeholders, so a
    # path containing quotes can neither break nor alter the statement.
    my $sth = $dbh->prepare('INSERT OR IGNORE INTO ii VALUES (?, ?)');
    $sth->execute($_, $arg) foreach @words;
    $dbh->commit();
} else {
    # Search the given word. Indexed words are stored lower-cased, so the
    # query is lower-cased as well; otherwise any upper-case letter in the
    # query would make a match impossible. (@ARGV is not decoded here, so
    # only ASCII letters are case-folded.)
    my $files = $dbh->selectcol_arrayref(
        'SELECT DISTINCT docpath FROM ii WHERE word = ? ORDER BY docpath',
        undef, lc($arg));

    # The word is passed as a printf argument rather than interpolated into
    # the format string, so a '%' in it is printed literally.
    if (@$files) {
        printf "Word '%s' appears in documents: %s\n", $arg, join(', ', @$files);
    } else {
        printf "Word '%s' does not appear in any document\n", $arg;
    }
}
$dbh->disconnect();
|