/usr/share/perl5/URI/Find/Schemeless.pm is in liburi-find-perl 20100505-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | # Copyright (c) 2000, 2009 Michael G. Schwern. All rights reserved.
# This program is free software; you can redistribute it and/or modify
# it under the same terms as Perl itself.
package URI::Find::Schemeless;
use strict;
use base qw(URI::Find);
# base.pm error in 5.005_03 prevents it from loading URI::Find if I'm
# required first.
use URI::Find ();
use vars qw($VERSION);
$VERSION = 20100505;
my($dnsSet) = 'A-Za-z0-9-';
my($cruftSet) = __PACKAGE__->cruft_set . '<>?}';
my($tldRe) = __PACKAGE__->top_level_domain_re;
my($uricSet) = __PACKAGE__->uric_set;
=head1 NAME
URI::Find::Schemeless - Find schemeless URIs in arbitrary text.
=head1 SYNOPSIS
require URI::Find::Schemeless;
my $finder = URI::Find::Schemeless->new(\&callback);
The rest is the same as URI::Find.
=head1 DESCRIPTION
URI::Find finds absolute URIs in plain text with some weak heuristics
for finding schemeless URIs. This subclass is for finding things
which might be URIs in free text. Things like "www.foo.com" and
"lifes.a.bitch.if.you.aint.got.net".
The heuristics are such that it hopefully finds a minimum of false
positives, but there's no easy way for it know if "COMMAND.COM" refers
to a web site or a file.
=cut
sub schemeless_uri_re {
@_ == 1 || __PACKAGE__->badinvo;
return qr{
# Originally I constrained what couldn't be before the match
# like this: don't match email addresses, and don't start
# anywhere but at the beginning of a host name
# (?<![\@.$dnsSet])
# but I switched to saying what can be there after seeing a
# false match of "Lite.pm" via "MIME/Lite.pm".
(?: ^ | (?<=[\s<>()\{\}\[\]]) )
# hostname
(?: [$dnsSet]+(?:\.[$dnsSet]+)*\.$tldRe
| (?:\d{1,3}\.){3}\d{1,3} ) # not inet_aton() complete
(?:
(?=[\s\Q$cruftSet\E]) # followed by unrelated thing
(?!\.\w) # but don't stop mid foo.xx.bar
(?<!\.p[ml]) # but exclude Foo.pm and Foo.pl
|$ # or end of line
(?<!\.p[ml]) # but exclude Foo.pm and Foo.pl
|/[$uricSet#]* # or slash and URI chars
)
}x;
}
=head3 top_level_domain_re
my $tld_re = $self->top_level_domain_re;
Returns the regex for matching top level DNS domains. The regex shouldn't
be anchored, it shouldn't do any capturing matches, and it should make
itself ignore case.
=cut
sub top_level_domain_re {
@_ == 1 || __PACKAGE__->badinvo;
my($self) = shift;
# Updated from http://www.iana.org/domains/root/db/
my $plain = join '|', qw(
AERO
ARPA
ASIA
BIZ
CAT
COM
COOP
EDU
GOV
INFO
INT
JOBS
MIL
MOBI
MUSEUM
NAME
NET
ORG
PRO
TEL
TRAVEL
);
return qr/(?:[a-z]{2}|$plain)/i;
}
=head1 AUTHOR
Original code by Roderick Schertler <roderick@argon.org>, adapted by
Michael G Schwern <schwern@pobox.com>.
Currently maintained by Roderick Schertler <roderick@argon.org>.
=head1 SEE ALSO
L<URI::Find>
=cut
1;
|