From d3e5b64eaa7fb3e05e776c180bc3f5ccc1573382 Mon Sep 17 00:00:00 2001 From: Guillermo Ramos Date: Sun, 8 Sep 2019 17:19:51 +0200 Subject: [024#2] --- 024/ch2.pl | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100755 024/ch2.pl diff --git a/024/ch2.pl b/024/ch2.pl new file mode 100755 index 0000000..0ba06b2 --- /dev/null +++ b/024/ch2.pl @@ -0,0 +1,65 @@ +#!/usr/bin/env perl +# +# Create a script to implement full text search functionality using Inverted +# Index. According to wikipedia: +# +# In computer science, an inverted index (also referred to as a postings file or +# inverted file) is a database index storing a mapping from content, such as +# words or numbers, to its locations in a table, or in a document or a set of +# documents (named in contrast to a forward index, which maps from documents to +# content). The purpose of an inverted index is to allow fast full-text +# searches, at a cost of increased processing when a document is added to the +# database. +# +# Here is a nice example of Inverted Index. +# +# (https://en.wikipedia.org/wiki/Search_engine_indexing#Inverted_indices). +################################################################################ + +use strict; +use warnings; + +use autodie; + +use DBI; + +my $DBFILE = 'ii.db'; +my $USAGE = "Usage: $0 [--index-doc | --search ]\n"; + +my $opt = shift or die $USAGE; + +# Connect to database +my $dbh = DBI->connect("dbi:SQLite:dbname=$DBFILE","","", { AutoCommit => 0 }); + +# Initialize database +$dbh->prepare('CREATE TABLE IF NOT EXISTS ii (word TEXT, docpath TEXT, PRIMARY KEY (word, docpath));') + ->execute(); +$dbh->prepare('CREATE INDEX IF NOT EXISTS word_idx ON ii (word);') + ->execute(); +$dbh->commit(); + +if ($opt eq '--index-doc') { + # Index the given document + my $doc_path = shift; + open(my $dh, '<:encoding(UTF-8)', $doc_path); + $/ = undef; + my @words = map { lc($_) } <$dh> =~ /(\w+)/g; + my $sth = $dbh->prepare("INSERT OR IGNORE INTO ii VALUES (?, \"$doc_path\");"); + $sth->execute($_) foreach @words; + $dbh->commit(); +} elsif ($opt eq '--search') { + # Search the given word + my $word = shift; + my $sth = $dbh->prepare('SELECT DISTINCT docpath FROM ii WHERE word = ?'); + $sth->execute($word); + my @files = map { $_->[0] } @{$sth->fetchall_arrayref()}; + if (@files) { + printf "Word '$word' appears in documents: %s\n", join(', ', @files); + } else { + print "Word '$word' does not appear in any document\n"; + } +} else { + die $USAGE; +} + +$dbh->disconnect(); -- cgit v1.2.3