/usr/lib/python2.7/dist-packages/linkcheck/plugins/parseword.py is in linkchecker 9.3-5.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2010-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Parse hyperlinks in Word files.
"""
from . import _ParserPlugin
try:
import win32com
import pythoncom
has_win32com = True
Error = pythoncom.com_error
except ImportError:
has_win32com = False
Error = StandardError
from .. import fileutil, log, LOG_PLUGIN
# Set to True once init_win32com() has completed.
_initialized = False


def init_win32com ():
    """Prepare the win32com.client gencache for use.

    Only the first call does any work; later calls return immediately
    because the module-level _initialized flag is already set.
    """
    global _initialized
    if not _initialized:
        import win32com.client
        gencache = win32com.client.gencache
        if gencache.is_readonly:
            # allow gencache to create the cached wrapper objects
            gencache.is_readonly = False
            # under py2exe the call in gencache to __init__() does not
            # happen, so Rebuild() is used to force the creation of the
            # gen_py folder.
            # Note that the python...\win32com.client.gen_py dir must not
            # exist to allow creation of the cache in %temp% for py2exe.
            # This is ensured by excluding win32com.gen_py in setup.py
            gencache.Rebuild()
        _initialized = True
def has_word ():
    """Tell whether Microsoft Word is available on the current system.

    Returns True only if win32com could be imported and the registry has
    a "Word.Application" key under HKEY_CLASSES_ROOT; False otherwise.
    """
    if has_win32com:
        # the registry module is named _winreg on Python 2
        try:
            import _winreg as winreg
        except ImportError:
            import winreg
        try:
            winreg.CloseKey(
                winreg.OpenKey(winreg.HKEY_CLASSES_ROOT, "Word.Application"))
        except (EnvironmentError, ImportError):
            return False
        return True
    return False
def constants (name):
    """Look up the win32com.client constant with the given name.

    Lets other modules read constants without importing win32com.client
    themselves.
    """
    namespace = win32com.client.constants
    return getattr(namespace, name)
def get_word_app ():
    """Open a hidden Word.Application object and return its handle.

    Returns None if Word is not available on this system.
    """
    if has_word():
        # This function is called from different threads, so the COM
        # layer is initialized on every call.
        pythoncom.CoInitialize()
        import win32com.client
        word = win32com.client.gencache.EnsureDispatch("Word.Application")
        word.Visible = False
        return word
    return None
def close_word_app (app):
    """Shut down the given Word application object by calling its Quit()."""
    quit_word = app.Quit
    quit_word()
def open_wordfile (app, filename):
    """Open filename through app.Documents.Open() and return the result.

    The file is opened read-only and invisible, is not added to the
    recent files list, and no encoding dialog is requested.
    """
    options = dict(
        ReadOnly=True,
        AddToRecentFiles=False,
        Visible=False,
        NoEncodingDialog=True,
    )
    return app.Documents.Open(filename, **options)
def close_wordfile (doc):
    """Close the given Word document object by calling its Close()."""
    close_doc = doc.Close
    close_doc()
class WordParser(_ParserPlugin):
    """Word parsing plugin.

    Opens Word documents in a Word application instance and passes every
    hyperlink found in them to url_data.add_url().
    """

    def __init__(self, config):
        """Initialize the win32com cache and check for Microsoft Word.

        A plugin warning is logged if Word is not found.
        """
        init_win32com()
        if not has_word():
            log.warn(LOG_PLUGIN, "Microsoft Word not found for WordParser plugin")
        super(WordParser, self).__init__(config)

    def applies_to(self, url_data, pagetype=None):
        """Check for Word pagetype; only True when Word is available."""
        return has_word() and pagetype == 'word'

    def check(self, url_data):
        """Parse Word data and add every hyperlink to url_data.

        The content of url_data is stored in a temporary file that is
        opened with Word. Each hyperlink is handed to url_data.add_url()
        together with its display text and line number. Exceptions of
        type Error are logged as plugin warnings instead of being raised.
        The temporary file is removed afterwards.
        """
        import os
        content = url_data.get_content()
        filename = get_temp_filename(content)
        try:
            # open word file and parse hyperlinks
            app = get_word_app()
            try:
                doc = open_wordfile(app, filename)
                if doc is None:
                    raise Error("could not open word file %r" % filename)
                try:
                    for link in doc.Hyperlinks:
                        # get_line_number() takes the document as well as
                        # the range of the hyperlink
                        line = get_line_number(doc, link.Range)
                        name = link.TextToDisplay
                        url_data.add_url(link.Address, name=name, line=line)
                finally:
                    close_wordfile(doc)
            finally:
                close_word_app(app)
        except Error as msg:
            log.warn(LOG_PLUGIN, "Error parsing word file: %s", msg)
        finally:
            # The temporary copy is not used after Word has been closed.
            try:
                os.remove(filename)
            except OSError:
                # Best effort only: a failed cleanup must not hide the
                # result of the check or replace a pending exception.
                pass
def get_line_number(doc, wrange):
    """Get line number for given range object.

    Selects wrange, then moves the selection up one line at a time until
    the reported line number stops changing, counting the moves.

    doc -- object whose Selection attribute is inspected and moved
    wrange -- range object to locate; it is selected as a side effect
    Returns the line number, counting the first line as 1.
    """
    lineno = 1
    wrange.Select()
    wdFirstCharacterLineNumber = constants("wdFirstCharacterLineNumber")
    wdGoToLine = constants("wdGoToLine")
    wdGoToPrevious = constants("wdGoToPrevious")
    # NOTE(review): Selection is looked up on doc here; in the Word object
    # model it is usually reached via the application or a window --
    # confirm that the document object exposes it.
    while True:
        curline = doc.Selection.Information(wdFirstCharacterLineNumber)
        doc.Selection.GoTo(wdGoToLine, wdGoToPrevious, Count=1, Name="")
        prevline = doc.Selection.Information(wdFirstCharacterLineNumber)
        if prevline == curline:
            # the selection did not move, so the first line is reached
            break
        # Count only moves that actually happened; incrementing before
        # the check reported line 2 for a range on the first line.
        lineno += 1
    return lineno
def get_temp_filename (content):
    """Write content to a new temporary file and return the file's name.

    The file is requested from fileutil.get_temp_file() in binary write
    mode with prefix 'lc_' and suffix '.doc'. It is closed before
    returning, even if the write fails.
    """
    handle, path = fileutil.get_temp_file(
        mode='wb', suffix='.doc', prefix='lc_')
    try:
        handle.write(content)
    finally:
        handle.close()
    return path
|