/usr/bin/getpdftext is in libcam-pdf-perl 1.60-1.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/perl -w
# Shell-compatibility shim: if a shell (rather than perl) ends up interpreting
# this file, it runs the exec below and re-launches the script under perl with
# the original arguments. Perl itself parses both lines as a single statement
# guarded by "if 0", so the eval never executes.
eval 'exec /usr/bin/perl -w -S $0 ${1+"$@"}'
if 0; # not running under some shell
package main;
use warnings;
use strict;
use CAM::PDF;
use Getopt::Long;
use Pod::Usage;
our $VERSION = '1.60';

# Every command-line switch is a plain boolean flag that starts out off.
my %opts = map { $_ => 0 } qw(check geom verbose help version);

# Getopt::Long option spec => key in %opts that receives the flag.
my %opt_key_for = (
    'g|geometry' => 'geom',
    'c|check'    => 'check',
    'v|verbose'  => 'verbose',
    'h|help'     => 'help',
    'V|version'  => 'version',
);

# Hand GetOptions a reference to each %opts slot so the flags are written
# straight into the hash.
my @option_spec =
    map { ($_ => \$opts{ $opt_key_for{$_} }) } sort keys %opt_key_for;

Getopt::Long::Configure('bundling');

# An unrecognised switch is a usage error: show the synopsis and exit 1.
if (!GetOptions(@option_spec))
{
    pod2usage(1);
}
# --help: print the complete manual page (POD verbosity 2) and exit with 0.
pod2usage(-exitstatus => 0, -verbose => 2) if $opts{help};

# --version: report the version of the loaded CAM::PDF library and exit with 0.
if ($opts{version})
{
    print "CAM::PDF v$CAM::PDF::VERSION\n";
    exit 0;
}

# The input PDF is mandatory; with no arguments left, show usage and exit 1.
pod2usage(1) unless @ARGV;
# Consume the positional arguments: the PDF to read, followed by an optional
# page list (left undefined when the user did not supply one).
my ($file, $pagelist) = splice @ARGV, 0, 2;

# Open the document; on failure report CAM::PDF's own error text and stop.
my $doc = CAM::PDF->new($file);
if (!$doc)
{
    die "$CAM::PDF::errstr\n";
}
# Process each requested page in turn. rangeToArray() is given the lowest page
# number, the document's page count and the user's page list; per this
# script's POD, an absent page list means every page is processed.
foreach my $p ($doc->rangeToArray(1, $doc->numPages(), $pagelist))
{
    if ($opts{check})
    {
        # --check mode: build and validate the page's content tree instead
        # of printing the page text.
        print "Checking page $p\n";
        my $tree = $doc->getPageContentTree($p, $opts{verbose});
        if (!$tree || !$tree->validate())
        {
            print " Failed\n";
        }

        # Only compute geometry when a tree was actually returned. Without
        # this guard a page whose content could not be parsed would abort
        # the whole run with "Can't call method ... on an undefined value"
        # instead of moving on to the next page.
        if ($opts{geom} && $tree)
        {
            $tree->computeGS();
        }
    }
    else
    {
        # Default mode: extract the page text and write it to STDOUT.
        my $str = $doc->getPageText($p, $opts{verbose});
        if (defined $str)
        {
            # asciify() is handed a reference, so it presumably rewrites
            # $str in place -- confirm against the CAM::PDF documentation.
            CAM::PDF->asciify(\$str);
            print $str;
        }
    }
}
__END__
=for stopwords getpdftext.pl
=head1 NAME
getpdftext.pl - Extracts and prints the text from one or more PDF pages
=head1 SYNOPSIS
getpdftext.pl [options] infile.pdf [<pagenums>]
Options:
-c --check just validates the page instead of printing it
-g --geometry also computes page geometry; only takes effect together with --check
-v --verbose print diagnostic messages
-h --help verbose help message
-V --version print CAM::PDF version
<pagenums> is a comma-separated list of page numbers.
Ranges like '2-6' are allowed in the list.
Example: 4-6,2,12,8-9
=head1 DESCRIPTION
Extracts all of the text from the specified PDF page(s) and prints
it to STDOUT. If no pages are specified, all pages are processed.
The C<--check> and C<--geometry> modes are distinctly different. They are
used primarily for debugging.
=head1 SEE ALSO
CAM::PDF
F<renderpdf.pl>
=head1 AUTHOR
See L<CAM::PDF>
=cut
|