/usr/lib/python2.7/dist-packages/linkcheck/plugins/parsepdf.py is in linkchecker 9.3-5.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Parse links in PDF files with pdfminer.
"""
from cStringIO import StringIO
from . import _ParserPlugin
try:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdftypes import PDFStream, PDFObjRef
from pdfminer.pdfpage import PDFPage
from pdfminer.psparser import PSException
except ImportError:
has_pdflib = False
else:
has_pdflib = True
from .. import log, LOG_PLUGIN, strformat
def search_url(obj, url_data, pageno, seen_objs):
    """Walk a PDF object recursively and register every URI found.

    Indirect references (PDFObjRef) are resolved before inspection. The
    ids of the references already followed are stored in seen_objs, so a
    reference that was visited before is skipped instead of being
    resolved again. Dictionaries, lists and PDF streams (through their
    attrs) are descended into; any other object is ignored.

    Each string stored under a 'URI' dictionary key is converted with
    strformat.unicode_safe() and handed to url_data.add_url() together
    with pageno.
    """
    if isinstance(obj, PDFObjRef):
        ref_id = obj.objid
        if ref_id in seen_objs:
            # already followed this reference: stop here to avoid
            # endless recursion
            return
        seen_objs.add(ref_id)
        obj = obj.resolve()

    if isinstance(obj, dict):
        for name, entry in obj.items():
            is_link = name == 'URI' and isinstance(entry, basestring)
            if not is_link:
                search_url(entry, url_data, pageno, seen_objs)
                continue
            # URIs should be 7bit ASCII encoded, but convert to unicode
            # anyway to be on the safe side
            # XXX this does not use an optional specified base URL
            url_data.add_url(strformat.unicode_safe(entry), page=pageno)
        return

    if isinstance(obj, list):
        for item in obj:
            search_url(item, url_data, pageno, seen_objs)
        return

    if isinstance(obj, PDFStream):
        search_url(obj.attrs, url_data, pageno, seen_objs)
class PdfParser(_ParserPlugin):
    """Parser plugin that looks for links inside PDF documents."""

    def __init__(self, config):
        """Initialize the plugin; warn when pdfminer could not be imported."""
        if not has_pdflib:
            log.warn(LOG_PLUGIN, "pdfminer not found for PdfParser plugin")
        super(PdfParser, self).__init__(config)

    def applies_to(self, url_data, pagetype=None):
        """Return True only if pdfminer is available and pagetype is 'pdf'."""
        if not has_pdflib:
            return False
        return pagetype == 'pdf'

    def check(self, url_data):
        """Parse the PDF content of url_data and collect its URLs.

        Every page is scanned through its "Contents" and "Annots"
        attributes with search_url(). A PSException raised while
        parsing is logged as a warning instead of being propagated.
        """
        # XXX user authentication from url_data
        password = ''
        # PDFParser needs a seekable file object
        stream = StringIO(url_data.get_content())
        try:
            document = PDFDocument(PDFParser(stream), password=password)
            pages = PDFPage.create_pages(document)
            # page numbers reported to url_data start at 1
            for pageno, page in enumerate(pages, start=1):
                for section in ("Contents", "Annots"):
                    if section in page.attrs:
                        # a fresh set per section: reference tracking
                        # is not shared between the two searches
                        search_url(page.attrs[section], url_data, pageno,
                                   set())
        except PSException as msg:
            # an exception without arguments has nothing useful to
            # print, so at least show the class name
            detail = msg if msg.args else repr(msg)
            log.warn(LOG_PLUGIN, "Error parsing PDF file: %s", detail)
|