/usr/share/perl5/URI/Title/HTML.pm is in liburi-title-perl 1.86-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | =head1 NAME

URI::Title::HTML - get titles of html files

=cut
package URI::Title::HTML;
use warnings;
use strict;
use HTML::Entities;
use utf8;
# Package flag: true when the optional Encode module was loaded successfully
# at compile time, false otherwise.
our $CAN_USE_ENCODE;

BEGIN {
    # Encode is treated as optional. Attempt to load it and pull decode()
    # into this package; any failure is trapped by the eval and simply
    # recorded in the flag instead of aborting compilation.
    eval {
        require Encode;
        Encode->import('decode');
    };
    my $load_error = $@;
    $CAN_USE_ENCODE = $load_error ? !!0 : !!1;
}
# types()
#
# Returns the list of content-type keys this handler is listed under:
# 'text/html' and the catch-all key 'default'.
sub types {
    return qw(text/html default);
}
# title($class, $url, $data, $type, $cset)
#
# Extract a human-readable title from an HTML document.
#
#   $class - invocant (this is called as a class method); not used here
#   $url   - URL the document came from; selects the site-specific rules
#   $data  - document body to search
#   $type  - content type; accepted for interface compatibility, not used here
#   $cset  - charset hint from the caller; a charset= declaration found in
#            $data takes precedence over it
#
# Returns the cleaned-up title (tags stripped, whitespace collapsed, entities
# decoded), optionally prefixed with a site label, or nothing when no title
# can be found.
sub title {
    my ($class, $url, $data, $type, $cset) = @_;

    # Nothing to search in.
    return unless defined $data;

    my $title = '';
    my $special_case;
    my $default_match = '<title.*?>(.+?)</title';

    # special case for the iTMS. Only taken when the iTMS handler has already
    # been loaded by someone else.
    if (    $INC{'URI/Title/iTMS.pm'}
        and $url  =~ m{phobos\.apple\.com}
        and $data =~ m{(itms://[^']*)} )
    {
        return URI::Title::iTMS->title($1);
    }

    # TODO - work this out from the headers of the HTML
    # The value may be unquoted, double-quoted or single-quoted.
    if ( $data =~ /charset=["']?([\w-]+)/i ) {
        $cset = lc $1;
    }

    if ($CAN_USE_ENCODE) {
        # Try UTF-8 first, then the declared charset, and fall back to the
        # undecoded data. The CHECK value 1 (Encode::FB_CROAK) makes decode()
        # die on malformed input; each eval traps that, as well as a missing
        # or unknown charset name.
        $data = eval { decode( 'utf-8', $data, 1 ) }
            || eval { decode( $cset, $data, 1 ) }
            || $data;
    }

    # Site-specific rules. Each one sets a pattern with exactly one capture
    # group and, for some sites, a prefix for the returned title.
    if ( $url =~ m{use\.perl\.org/~([^/]+).*journal/\d}i ) {
        my $journal_owner = $1;
        $special_case = '<FONT FACE="geneva,verdana,sans-serif" SIZE="1"><B>(.+?)<';
        $title        = "use.perl journal of $journal_owner - ";
    }
    elsif ( $url =~ /(?:pants\.heddley\.com|dailychump\.org).*#(.*)$/i ) {
        # The fragment comes straight from the URL: quote it so that regex
        # metacharacters in it can neither break nor alter the pattern.
        my $id = quotemeta $1;
        $special_case = 'id="a' . $id . '.*?></a>(.+?)<';
        $title        = "pants daily chump - ";
    }
    elsif ( $url =~ /paste\.husk\.org/i ) {
        $special_case = 'Summary: (.+?)<';
        $title        = "paste - ";
    }
    elsif ( $url =~ m{twitter\.com/.*?/status(?:es)?/\d+}i ) {
        $special_case = '<p class="js-tweet-text tweet-text ">([^\<]+)';
        $title        = "twitter - ";
    }
    elsif ( $url =~ /independent\.co\.uk/i ) {
        $special_case = '<h1 class=head1>(.+?)<';
    }
    elsif ( $url =~ m{www\.hs\.fi/english/article}i ) {
        $special_case = '<h1>(.+?)</h1>';
    }
    elsif ( $url =~ /google\.com/i and $data =~ /calc_img/ ) {
        # google can be used as a calculator. Try to find the result.
        $special_case = 'calc_img.*<td nowrap>(.+?)</td';
    }
    elsif ( $url =~ /spotify\.url\.fi/ ) {
        $special_case = '<title>\s*(.+?)\s+—\s+Decode\s+Spotify\s+URIs\s*</title>';
    }

    # Prefer the site-specific pattern; fall back to the plain <title> element.
    # Tested with defined() so that a literal title of "0" is not thrown away.
    my $found_title;
    if ( defined $special_case ) {
        ($found_title) = $data =~ /$special_case/ims;
    }
    if ( !defined $found_title ) {
        ($found_title) = $data =~ /$default_match/ims;
    }
    return unless defined $found_title;

    $found_title =~ s/<sup>(.+?)<\/sup>/^$1/g;    # for the google math output
    $found_title =~ s/<[^>]*>//g;                 # drop tags, even multi-line ones
    $title .= $found_title;

    # Collapse every run of whitespace (newlines included) into one space, so
    # words separated only by a line break are not glued together, then trim.
    $title =~ s/\s+/ /g;
    $title =~ s/^ //;
    $title =~ s/ $//;

    $title = decode_entities($title);

    # decode nasty number-encoded entities. Mostly works
    # Only values up to the Unicode maximum are converted; anything larger is
    # left as text, because chr() on an oversized number can be fatal.
    $title =~ s/(&\#(\d+);?)/$2 <= 0x10FFFF ? chr($2) : $1/eg;

    return $title;
}
1;
|