diff options
author | Guillermo Ramos | 2019-08-16 21:53:57 +0200 |
---|---|---|
committer | Guillermo Ramos | 2019-08-16 21:56:27 +0200 |
commit | 5ebb802fbeaf78da370e1026cffc3d129d0be8e2 (patch) | |
tree | 71d88de10508e320187046526dde267522d1f031 | |
parent | 6d0ba18b52f0c430617f9640da11810da8075c98 (diff) | |
download | perlweekly-5ebb802fbeaf78da370e1026cffc3d129d0be8e2.tar.gz |
[021#2]
-rwxr-xr-x | 021/ch2.pl | 62 |
1 files changed, 62 insertions, 0 deletions
diff --git a/021/ch2.pl b/021/ch2.pl new file mode 100755 index 0000000..a3970a2 --- /dev/null +++ b/021/ch2.pl @@ -0,0 +1,62 @@ +#!/usr/bin/env perl +# +# Write a script for URL normalization based on rfc3986 +# +# According to Wikipedia, URL normalization is the process by which URLs are +# modified and standardized in a consistent manner. The goal of the +# normalization process is to transform a URL into a normalized URL so it is +# possible to determine if two syntactically different URLs may be equivalent. +# +# (https://en.wikipedia.org/wiki/URL_normalization). +################################################################################ + +use strict; +use warnings; + +my $url = shift or die "Usage: $0 <url>\n"; + +my $scheme_re = qr<([hH][tT][tT][pP][sS]?)>; +my $ipv4_re = qr<(?:[0-9]{1,3}\.){3}[0-9]{1,3}>; +my $hostname_re = qr<[a-zA-Z0-9.\-]+>; +my $host_re = qr<($ipv4_re|$hostname_re)>; +my $port_re = qr<(?::([0-9]+))?>; +my $rest_re = qr<([a-zA-Z0-9/\-._~\?=&%#]*)>; + +# Capitalizing letters in escape sequences +$url =~ s<%[a-zA-Z0-9]{2}>< uc($&) >ge; + +sub decode_octet { + my $octet = shift; + my $hex = hex(substr($octet, 1, length($octet))); + if (grep /^$hex$/, map ord, 'a'..'z', 'A'..'Z', '0'..'9', qw<- . _ ~>) { + return chr($hex); + } else { + return $octet; + } +} + +# Decoding percent-encoded octets of unreserved characters +$url =~ s<%[a-zA-Z0-9]{2}>< decode_octet($&) >ge; + +my ($scheme, $host, $port, $path) = $url =~ + m<^$scheme_re://$host_re$port_re$rest_re>; + +die "Unable to decode URI" unless $scheme and $host; + +# Converting the scheme and host to lower case +$scheme = lc($scheme); +$host = lc($host); + +# Removing the default port +if (!$port || $scheme eq 'http' && $port == 80 || $scheme eq 'https' && $port == 443) { + $port = ''; +} else { + $port = ":$port"; +} + +# Removing dot-segments +$path =~ s</\./|/\.$></>g; # Remove all '.' +$path =~ s</[^/]+/\.\.><>g; # Remove instances of the pattern: '/x/..' +$path =~ s<(\.\./)+><>g; # Remove all '..'s left at the beginning + +print "$scheme://$host$port$path\n"; |