/usr/share/pyshared/PyritePublisher/plugin_WebInput.py is in pyrite-publisher 2.1.1-8.
This file is owned by root:root, with mode 0o644.
The actual contents of the file are shown below. (The module targets Python 2: htmllib, urlparse, and the urllib helper functions it uses were removed or relocated in Python 3.)
import sys, os, string, re, htmllib, urllib, urlparse, mimetypes
from pprint import pprint
from dtkplugins import InputPlugin
from dtkmain import ConversionError

class NullFormatter:
    """Formatter stand-in for htmllib.HTMLParser that silently swallows
    every formatting call; only the link/image events matter here."""
    def nothing(self, *a, **kw):
        pass
    def __getattr__(self, k):
        return self.nothing

class HTMLLinkGatherer(htmllib.HTMLParser):
    def __init__(self):
        htmllib.HTMLParser.__init__(self, NullFormatter())
        self.links = []
        self.images = []

    def anchor_bgn(self, href, name, type):
        self.links.append((href, name, type))

    def handle_image(self, src, alt, *a):
        self.images.append((src, alt))

    def __str__(self):
        # links and images are plain lists of tuples
        l = ['Links:'] + map(str, self.links)
        l = l + ['Images:'] + map(str, self.images)
        return string.join(l, '\n')

def gather_links(fn):
    p = HTMLLinkGatherer()
    p.feed(open(fn).read())
    return p
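
# A minimal usage sketch (not part of the original module): gather_links()
# parses an HTML file that has already been saved to disk, e.g. by
# urllib.urlretrieve().  The path below is hypothetical.
def _example_gather_links():
    p = gather_links('/tmp/saved_page.html')   # hypothetical local copy
    for href, name, typ in p.links:
        print "link:  ", href
    for src, alt in p.images:
        print "image: ", src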

def type_and_host(url):
    typ, rest = urllib.splittype(url)
    hostport, rest = urllib.splithost(rest)
    if hostport is None:
        host = None
    else:
        upwd, hostport = urllib.splituser(hostport)
        host, port = urllib.splitport(hostport)
    return typ, host
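
# Illustration (a small sketch, not in the original file): type_and_host()
# reduces a URL to its scheme and bare hostname, dropping any user:password
# and port components.
def _example_type_and_host():
    # prints ('http', 'www.example.com')
    print type_and_host('http://user:pw@www.example.com:8080/index.html')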

def guess_mimetype(fn, headers=None):
    try:
        mtype = headers['Content-Type']
    except (TypeError, KeyError):   # headers missing, or lacking the field
        mtype = 'application/octet-stream'
    if mtype in ['application/octet-stream', 'text/plain']:
        # generic server answer: trust the filename extension instead
        gtype, genc = mimetypes.guess_type(fn)
        return gtype
    else:
        return mtype
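
# Illustration (a small sketch, not in the original file): the server's
# Content-Type header wins unless it is one of the generic types, in which
# case the filename extension decides.
def _example_guess_mimetype():
    print guess_mimetype('report.pdf')   # no headers -> 'application/pdf'
    print guess_mimetype('page.html', {'Content-Type': 'text/plain'})  # -> 'text/html'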

class WebSpider:
    def __init__(self, base):
        self.base = base
        self.info = {}
        self.valid_url_types = ['http', 'ftp', 'gopher']  # whatever urllib handles
        self.maximum_depth = 3
        self.follow_offsite_links = 0
        self.verbose = 1
        typ, host = type_and_host(base)
        self.base_host = host
        self.pages = []

    def go(self):
        print "Retrieving", self.base
        fn, headers = urllib.urlretrieve(self.base)
        self.info[self.base] = (fn, headers)
        mtype = guess_mimetype(fn, headers)
        self.pages.append((self.base, mtype, fn, headers))
        self.process_links(fn, self.base, 2)

    def process_links(self, fn, url, level=0):
        # If we have reached maximum link depth, quit.
        if self.maximum_depth and level > self.maximum_depth:
            return
        lks = gather_links(fn)
        for link, lname, ltype in lks.links:
            nurl = urlparse.urljoin(url, link)
            # decide whether we want to follow this link
            typ, host = type_and_host(nurl)
            if typ is not None and typ not in self.valid_url_types:
                if self.verbose: print "Skipping", nurl, "due to ignored type", typ
                continue
            if host != self.base_host and not self.follow_offsite_links:
                if self.verbose: print "Skipping", nurl, "because it is an offsite link to", host
                continue
            sys.stdout.flush()
            if not self.info.has_key(nurl):
                if self.verbose: print "Retrieving", nurl, "(level %s)" % level
                try:
                    fn, headers = urllib.urlretrieve(nurl)
                except IOError:  # broken link
                    if self.verbose: print "Broken link", nurl
                    continue
                self.info[nurl] = (fn, headers)
                # if it's HTML, recurse
                mtype = guess_mimetype(fn, headers)
                self.pages.append((nurl, mtype, fn, headers))
                if mtype == 'text/html':
                    self.process_links(fn, nurl, level + 1)
                else:
                    if self.verbose: print "Not processing", nurl, "because it isn't html"
            else:
                if self.verbose: print "Not retrieving", nurl, "because it is in cache"
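
# Driving sketch (not part of the original module): the spider is normally
# constructed by the Plugin below, but it can be exercised standalone.
# The URL is hypothetical.
def _example_webspider():
    spider = WebSpider('http://www.example.com/')   # hypothetical site
    # go() enters process_links() at level 2, so a maximum_depth of 2
    # fetches the start page plus the pages it links to directly.
    spider.maximum_depth = 2
    spider.follow_offsite_links = 0   # stay on www.example.com
    spider.go()
    for url, mtype, fn, headers in spider.pages:
        print mtype, url, '->', fn    # fn is urllib's temporary local copy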

class Plugin(InputPlugin):
    name = 'WebInput'
    description = 'Retrieves a web page or site.'

    def __init__(self, *a, **kw):
        InputPlugin.__init__(self, *a, **kw)
        self._add_property('spider', 'Use web spider')
        self._add_cli_option('spider', '', 'spider',
                             'Use web spider', boolean=1)
        self.spider = 0
        self._add_property('maximum_depth', 'Maximum depth to retrieve')
        self._add_cli_option('maximum_depth', '', 'maxdepth',
                             'Maximum depth to retrieve', vtype="NUM", func=int)
        self.maximum_depth = 1
        self._add_property('follow_offsite_links', 'Follow links off site of main page')
        self._add_cli_option('follow_offsite_links', '', 'offsite-links',
                             'Follow links off site of main page', boolean=1)
        self.follow_offsite_links = 0
        self._tempdir = ''
        self._spiderobj = None
        self._base_url = None

    def handles_filename(self, fn):
        # XXX later, retrieve the file and spider if it is HTML
        # but behave like URLStream if it is not.
        if self.spider and (fn[:5] == 'http:' or
                            fn[-5:].lower() == '.html' or
                            fn[-4:].lower() == '.htm'):
            return 1

    def open_input(self, fn):
        self._base_url = fn
        self._spiderobj = WebSpider(fn)
        self.copyProperties(self._spiderobj)
        self._spiderobj.go()
        #pprint(self._spiderobj.info)
        return ['MULTIPART:web'], os.path.basename(fn)

    def close_input(self):
        urllib.urlcleanup()

    def go(self, mimetype):
        self.next.process_multipart_web(self._spiderobj.pages[:])
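
# Guarded smoke test (not in the shipped file): running the module directly
# exercises the standalone spider sketch without affecting plugin imports.
if __name__ == '__main__':
    _example_webspider()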