/usr/share/perl5/XMLTV/Grab_XML.pm is in libxmltv-perl 0.5.70-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# $Id: Grab_XML.pm,v 1.20 2015/07/05 04:32:22 knowledgejunkie Exp $
package XMLTV::Grab_XML;
use strict;
use Getopt::Long;
use Date::Manip;
use XMLTV;
use XMLTV::Usage;
use XMLTV::Memoize;
use XMLTV::ProgressBar;
use XMLTV::Ask;
use XMLTV::TZ qw(parse_local_date);
use XMLTV::Get_nice qw();
use XMLTV::Date;
# Use Log::TraceMessages if installed.
# Install the tracing helpers t() and d() at compile time, so that
# later code can call them like built-in list operators.  When
# Log::TraceMessages cannot be loaded they become silent stubs.
BEGIN {
    eval { require Log::TraceMessages };
    my $load_error = $@;
    if (not $load_error) {
        # Module available: alias its functions into this package and
        # let it inspect the command line.
        *t = \&Log::TraceMessages::t;
        *d = \&Log::TraceMessages::d;
        Log::TraceMessages::check_argv();
    }
    else {
        # Module missing: t() does nothing, d() yields an empty string.
        *t = sub {};
        *d = sub { '' };
    }
}
=pod
=head1 NAME
XMLTV::Grab_XML - Perl extension to fetch raw XMLTV data from a site
=head1 SYNOPSIS
package Grab_XML_rur;
use base 'XMLTV::Grab_XML';
sub urls_by_date( $ ) { my $pkg = shift; ... }
sub country( $ ) { my $pkg = shift; return 'Ruritania' }
# Maybe override a couple of other methods as described below...
Grab_XML_rur->go();
=head1 DESCRIPTION
This module helps to write grabbers which fetch pages in XMLTV format
from some website and output the data. It is not used for grabbers
which scrape human-readable sites.
It consists of several class methods (package methods). The way to
use it is to subclass it and override some of these.
=head1 METHODS
=over
=item XMLTV::Grab_XML->date_init()
Called at the start of the program to set up Date::Manip. You might
want to override this with a method that sets the timezone.
=cut
# Class method: initialise Date::Manip with its defaults by calling
# Date_Init() with no arguments.  Returns whatever Date_Init() returns.
sub date_init( $ ) {
    my ($class) = @_;
    return Date_Init();
}
=pod
=item XMLTV::Grab_XML->urls_by_date()
Returns a hash mapping YYYYMMDD dates to a URL where listings for that
date can be downloaded. This method is abstract, you must override
it.
Arguments: the command line options for --config-file and --quiet.
=cut
# Abstract class method: the base implementation always dies, so a
# subclass has to supply its own version.
sub urls_by_date( $$$ ) {
    my ($class) = @_;
    die 'abstract class method: override in subclass';
}
=pod
=item XMLTV::Grab_XML->xml_from_data(data)
Given page data for a particular day, turn it into XML. The default
implementation just returns the data unchanged, but you might override
it if you need to decompress the data or patch it up.
=cut
# Class method: convert downloaded page data to XML.  This default
# emits a trace message and hands the data back untouched.
sub xml_from_data( $$ ) {
    my ($class, $data) = @_;
    t('Grab_XML::xml_from_data()');
    return $data;    # leave unchanged
}
=pod
=item XMLTV::Grab_XML->configure()
Configure the grabber if needed. Arguments are --config-file option
(or undef) and --quiet flag (or undef).
This method is not provided in the base class; if you don't provide it
then attempts to --configure will give a message that configuration is
not necessary.
=item XMLTV::Grab_XML->nextday(day)
Bump a YYYYMMDD date by one. You probably shouldnE<39>t override this.
=cut
# Class method: given an eight-digit date string, return the following
# day formatted with UnixDate's '%Q'.  Dies if the argument is not
# exactly eight digits or if the date arithmetic yields undef.
sub nextday( $$ ) {
    my ($class, $day) = @_;
    die unless $day =~ /^\d{8}$/;
    my $next = DateCalc(parse_date($day), '+ 1 day');
    die if not defined $next;
    return UnixDate($next, '%Q');
}
=item XMLTV::Grab_XML->country()
Return the name of the country youE<39>re grabbing for, used in usage
messages. Abstract.
=cut
# Abstract class method: the base implementation always dies, so a
# subclass has to supply its own version.
sub country( $ ) {
    my ($class) = @_;
    die 'abstract class method: override in subclass';
}
=item XMLTV::Grab_XML->usage_msg()
Return a command-line usage message. This calls C<country()>, so you
probably need to override only that method.
=cut
# Class method: build the command-line usage text.  The country name
# comes from country(); the program name is taken from $0.  Classes
# that provide a configure() method get the longer synopsis which
# mentions --configure and --config-file.
sub usage_msg( $ ) {
    my ($class) = @_;
    my $where = $class->country();

    # No configure() method: short form without the config options.
    if (not $class->can('configure')) {
        return <<"END";
$0: get $where television listings in XMLTV format
usage: $0 [--output FILE] [--days N] [--offset N] [--quiet]
$0 --help
END
    }

    return <<"END";
$0: get $where television listings in XMLTV format
usage: $0 --configure [--config-file FILE]
$0 [--output FILE] [--days N] [--offset N] [--quiet] [--config-file FILE]
$0 --help
END
}
=item XMLTV::Grab_XML->get()
Given a URL, fetch the content at that URL. The default
implementation calls XMLTV::Get_nice::get_nice() but you might want to
override it if you need to do wacky things with http requests, like
cookies.
Note that while this method fetches a page, C<xml_from_data()> does
any further processing of the result to turn it into XML.
=cut
# Class method: fetch the content at a URL.  This default simply
# delegates to XMLTV::Get_nice::get_nice() and returns its result.
sub get( $$ ) {
    my ($class, $location) = @_;
    return XMLTV::Get_nice::get_nice($location);
}
=item XMLTV::Grab_XML->go()
The main program. Parse command line options, fetch and write data.
Most of the options are fairly self-explanatory but this routine also
calls the XMLTV::Memoize module to look for a B<--cache> argument.
The functions memoized are those given by the C<cachables()> method.
=cut
# Class method: the main program of a grabber.
#
# Steps, in order:
#   1. Hand the names from cachables() to XMLTV::Memoize::check_argv().
#   2. Parse the command line with GetOptions; bad options, --help and
#      leftover arguments all go to usage().
#   3. With --configure, call the class's configure() method if it has
#      one (otherwise print a note) and exit.
#   4. Work out "today" (local now plus --offset days), then call
#      date_init() and urls_by_date().
#   5. Choose the URLs to fetch: with --list-channels only the URL of
#      the latest date; otherwise consecutive days starting today,
#      stopping at the first day without a URL or after --days days.
#   6. Download each URL with get(), convert it with xml_from_data()
#      and parse it with XMLTV::parse().  A failed download or
#      conversion is reported and skipped.
#   7. Write the result with XMLTV::write_data(), to the --output file
#      when one was given.
#
# Takes no arguments besides the class; reads @ARGV.  Dies on a
# negative --days, on --days/--offset combined with --list-channels,
# when no dates are available for --list-channels, or when the output
# file cannot be opened.
sub go( $ ) {
    my $pkg = shift;
    XMLTV::Memoize::check_argv($pkg->cachables());

    my ($opt_days,
        $opt_help,
        $opt_output,
        $opt_share,
        $opt_gui,
        $opt_offset,
        $opt_quiet,
        $opt_configure,
        $opt_config_file,
        $opt_list_channels,
       );
    $opt_offset = 0; # default
    $opt_quiet  = 0; # default
    GetOptions('days=i'        => \$opt_days,
               'help'          => \$opt_help,
               'output=s'      => \$opt_output,
               'share=s'       => \$opt_share, # undocumented
               'gui:s'         => \$opt_gui,
               'offset=i'      => \$opt_offset,
               'quiet'         => \$opt_quiet,
               'configure'     => \$opt_configure,
               'config-file=s' => \$opt_config_file,
               'list-channels' => \$opt_list_channels,
              )
      or usage(0, $pkg->usage_msg());
    die 'number of days must not be negative'
      if (defined $opt_days && $opt_days < 0);
    usage(1, $pkg->usage_msg()) if $opt_help;
    usage(0, $pkg->usage_msg()) if @ARGV;
    XMLTV::Ask::init($opt_gui);

    if ($opt_share) {
        if ($pkg->can('set_share_dir')) {
            $pkg->set_share_dir($opt_share);
        }
        else {
            print STDERR "share directory not in use\n";
        }
    }

    my $has_config = $pkg->can('configure');
    if ($opt_configure) {
        if ($has_config) {
            $pkg->configure($opt_config_file, $opt_quiet);
        }
        else {
            print STDERR "no configuration necessary\n";
        }
        exit;
    }

    # A --config-file is meaningless for a grabber without configure():
    # say so and forget the option.
    if (defined $opt_config_file and not $has_config) {
        warn("this grabber has no configuration, so ignoring --config-file\n");
        undef $opt_config_file;
    }

    # Need to call parse_local_date() before any resetting of
    # Date::Manip's timezone.
    #
    my $now = DateCalc(parse_local_date('now'), "$opt_offset days");
    die if not defined $now;
    $pkg->date_init();
    my $today = UnixDate($now, '%Q');

    my %urls = $pkg->urls_by_date($opt_config_file, $opt_quiet);
    t('URLs by date: ' . d(\%urls));

    my @to_get;
    if ($opt_list_channels) {
        # We won't bother to do an exhaustive check for every option
        # that is ignored with --list-channels.
        #
        die "useless to give --days or --offset with --list-channels\n"
          if defined $opt_days or $opt_offset != 0;

        # For now, assume that the upstream site doesn't provide any
        # way to get just the channels, so we'll have to pick a
        # listings file and then discard most of it.
        #
        my @dates = sort keys %urls;
        die 'no dates found on site' if not @dates;
        my $latest = $dates[-1];
        @to_get = $urls{$latest};
    }
    else {
        # Getting programme listings.
        my $days_left = $opt_days;
        t('$days_left starts at ' . d($days_left));
        t('$today=' . d($today));
        for (my $day = $today; defined $urls{$day}; $day = $pkg->nextday($day)) {
            t("\$urls{$day}=" . d($urls{$day}));
            # Without --days ($days_left undefined) we take every
            # consecutive day the site offers.
            if (defined $days_left and $days_left-- == 0) {
                t('got to last day');
                last;
            }
            push @to_get, $urls{$day};
        }
        if (defined $days_left and $days_left > 0) {
            warn "couldn't get all of $opt_days days, only "
              . ($opt_days - $days_left) . "\n";
        }
        elsif (not @to_get) {
            warn "couldn't get any listings from the site for today or later\n";
        }
    }

    # Declare unconditionally and assign conditionally: a 'my' with a
    # statement modifier has undefined behaviour.
    my $bar;
    $bar = XMLTV::ProgressBar->new('downloading listings', scalar @to_get)
      if not $opt_quiet;

    my @listingses;
    foreach my $url (@to_get) {
        # Set error handlers.  Strange bugs if you call warn() or
        # die() inside these, at least I have seen such bugs in
        # XMLTV.pm, so I'm avoiding it here.
        #
        local $SIG{__WARN__} = sub {
            my $msg = shift;
            $msg = "warning: something's wrong" if not defined $msg;
            print STDERR "$url: $msg\n";
        };
        local $SIG{__DIE__} = sub {
            my $msg = shift;
            $msg = 'died' if not defined $msg;
            print STDERR "$url: $msg, exiting\n";
            exit(1);
        };

        my $got = $pkg->get($url);
        if (not defined $got) {
            warn 'failed to download, skipping';
            next;
        }

        my $xml = $pkg->xml_from_data($got);
        t('got XML: ' . d($xml));
        if (not defined $xml) {
            warn 'could not get XML from page, skipping';
            next;
        }

        push @listingses, XMLTV::parse($xml);
        $bar->update() if not $opt_quiet;
    }
    $bar->finish() if not $opt_quiet;

    my %w_args = ();
    if (defined $opt_output) {
        # Explicit mode argument: the file name is used literally and
        # cannot smuggle in an open mode of its own.
        require IO::File;
        my $fh = IO::File->new($opt_output, 'w');
        die "cannot write to $opt_output\n" if not $fh;
        %w_args = (OUTPUT => $fh);
    }

    if ($opt_list_channels) {
        # Exactly one listings file was requested; anything else means
        # the download or parse above did not succeed.
        die if @listingses != 1;
        my $listing = $listingses[0];
        undef $listing->[3]; # blank out programme data
        XMLTV::write_data($listing, %w_args);
    }
    else {
        XMLTV::write_data(XMLTV::cat(@listingses), %w_args);
    }
}
=item XMLTV::Grab_XML->cachables()
Returns a list of names of functions which could reasonably be
memoized between runs. This will normally be whatever function
fetches the web pages - you memoize that to save on repeated
downloads. A subclass might want to add things to this list
if it has its own way of fetching web pages.
=cut
# Class method: name the functions worth memoizing between runs.  The
# base class lists only XMLTV::Get_nice::get_nice_aux.
sub cachables( $ ) {
    my ($class) = @_;
    return 'XMLTV::Get_nice::get_nice_aux';
}
=pod
=item XMLTV::Grab_XML->remove_early_stop_times()
Checks each stop time and removes it if it's before the start time.
Argument: the XML to correct
Returns: the corrected XML
=cut
# Set once the first bad stop time has been reported, so that the
# warning is printed a single time per process.
my $warned_bad_stop_time = 0;

# Class method: drop stop="..." attributes that lie before the start
# time of the same <programme> line.
#
# Argument: the XML text to correct.
# Returns:  the corrected XML text, including any trailing newlines.
#
# The text is processed line by line.  On every line containing
# '<programme', alphabetic timezones in start/stop attributes are
# first rewritten via XMLTV::TZ::tz_to_num(); then, if start and stop
# carry the same timezone and the stop digits sort before the start
# digits, the stop attribute is removed.
sub remove_early_stop_times( $$ ) {
    my ($pkg, $xml) = @_;

    # Limit -1 keeps trailing empty fields, so trailing newlines
    # survive the split/join round trip.
    my @lines = split /\n/, $xml, -1;

    foreach my $line (@lines) {
        next if $line !~ /<programme/;

        # First change to numeric timezones.  The function is called
        # by its full name because this package imports only
        # parse_local_date from XMLTV::TZ.
        $line =~ s{(start|stop)="(\d+) ([A-Z]+)"}
                  {
                      my ($attr, $when, $zone) = ($1, $2, $3);
                      $attr . '="' . $when . ' '
                        . XMLTV::TZ::tz_to_num($zone) . '"';
                  }eg;

        # Now remove stop times before start.  Only worry about
        # cases where the timezone is the same - we hope the
        # upstream data will be fixed by the next TZ changeover.
        #
        next unless $line =~ /start="(\d+) (\S+)"/;
        my ($start, $tz) = ($1, $2);
        next unless $line =~ /stop="(\d+) \Q$tz\E"/;
        my $stop = $1;

        # String comparison of the digit strings.
        next unless $stop lt $start;

        warn "removing stop time before start time: $line"
          unless $warned_bad_stop_time++;
        $line =~ s/stop="[^"]+"\s*// or die;
    }

    return join("\n", @lines);
}
=pod
=back
=head1 AUTHOR
Ed Avis, ed@membled.com
=head1 SEE ALSO
L<perl(1)>, L<XMLTV(3)>.
=cut
1;
|