1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
|
#!/usr/bin/env perl
#
# Write a script for URL normalization based on rfc3986
#
# According to Wikipedia, URL normalization is the process by which URLs are
# modified and standardized in a consistent manner. The goal of the
# normalization process is to transform a URL into a normalized URL so it is
# possible to determine if two syntactically different URLs may be equivalent.
#
# (https://en.wikipedia.org/wiki/URL_normalization).
################################################################################
use strict;
use warnings;
my $url = shift or die "Usage: $0 <url>\n";
my $scheme_re = qr<([hH][tT][tT][pP][sS]?)>;
my $ipv4_re = qr<(?:[0-9]{1,3}\.){3}[0-9]{1,3}>;
my $hostname_re = qr<[a-zA-Z0-9.\-]+>;
my $host_re = qr<($ipv4_re|$hostname_re)>;
my $port_re = qr<(?::([0-9]+))?>;
my $rest_re = qr<([a-zA-Z0-9/\-._~\?=&%#]*)>;
# Capitalizing letters in escape sequences
$url =~ s<%[a-zA-Z0-9]{2}>< uc($&) >ge;
sub decode_octet {
my $octet = shift;
my $hex = hex(substr($octet, 1, length($octet)));
if (grep /^$hex$/, map ord, 'a'..'z', 'A'..'Z', '0'..'9', qw<- . _ ~>) {
return chr($hex);
} else {
return $octet;
}
}
# Decoding percent-encoded octets of unreserved characters
$url =~ s<%[a-zA-Z0-9]{2}>< decode_octet($&) >ge;
my ($scheme, $host, $port, $rest) = $url =~
m<^$scheme_re://$host_re$port_re$rest_re$>;
die "Unable to decode URI" unless $scheme and $host;
# Converting the scheme and host to lower case
$scheme = lc($scheme);
$host = lc($host);
# Removing the default port
if (!$port || $scheme eq 'http' && $port == 80 || $scheme eq 'https' && $port == 443) {
$port = '';
} else {
$port = ":$port";
}
# Removing dot-segments
$rest =~ s</\./|/\.$></>g; # Remove all '.'
$rest =~ s</[^/]+/\.\.><>g; # Remove instances of the pattern: '/x/..'
$rest =~ s<(\.\./)+><>g; # Remove all '..'s left at the beginning
print "$scheme://$host$port$rest\n";
|