# serialize.py - module for (de)serializing site data
#
# Copyright (C) 2006 Arthur de Jong
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
# The files produced as output from the software do not automatically fall
# under the copyright of the software, unless explicitly stated otherwise.
"""This module is used for (de)serializing site data.
Serialization takes place to a file and can be done incrementally.
The format of the serialized data is subject to change as this
format is currently experimental. The current format
is as follows:
internal_url*=URL
internal_re*=REGEXP
external_re*=REGEXP
yanked_re*=REGEXP
[URL]
child*=URL
embed*=URL
anchor*=ANCHOR
reqanchor*=PARENTURL|ANCHOR
isfetched=BOOL
ispage=BOOL
mtime=TIME
size=SIZE
mimetype=MIMETYPE
encoding=ENCODING
title=TITLE
author=AUTHOR
status=STATUS
linkproblem*=LEV;LINKPROBLEM
pageproblem*=LEV;PROBLEM
redirectdepth=NUM
When there are section name clashes earlier sections should be
ignored. Keys with * can be specified multiple times. This denotes
a list.
"""
import re
import time
import debugio
import parsers.html
# TODO: maybe save some part of configuration
# TODO: maybe also serialize robotparsers
# TODO: maybe serialize isyanked
# pattern for matching sections
_sectionpattern = re.compile('^\[(.+)\] *$')
# pattern for matching key-value pairs
_keyvaluepattern = re.compile('^([a-z0-9_-]+) *= *(.*)$')
# pattern for matching comments
_commentpattern = re.compile('^[;#]')
# pattern for splitting comma separated list
_commapattern = re.compile(',? *("[^"]*")')
# exception class
class DeSerializeException(Exception):
    """Signal a problem while parsing some value from the
    serialized data file."""
# functions for writing data to the serialized file
def _escape(txt):
    """Return txt in a form fit for the serialized data file:
    html-escaped (including quote characters) and wrapped in
    double quotes."""
    return '"%s"' % parsers.html.htmlescape(txt, True)
def _writebool(fp, key, value):
"""Write a key/value pair displaying a boolean."""
if value is None:
fp.write('%(key)s = None\n' % locals());
elif value:
fp.write('%(key)s = True\n' % locals());
else:
fp.write('%(key)s = False\n' % locals());
def _writeint(fp, key, value):
"""Write a key/value pair displaying an integer."""
value = str(value)
fp.write('%(key)s = %(value)s\n' % locals())
def _writestring(fp, key, value):
    """Write a key/value pair holding a string or None."""
    if value is None:
        text = 'None'
    else:
        text = _escape(value)
    fp.write('%s = %s\n' % (key, text))
def _writedate(fp, key, value):
"""Write a key/value pair displaying a date value or None."""
if value:
date = time.strftime('%c %Z', time.localtime(value))
fp.write('%(key)s = %(date)s\n' % locals())
else:
fp.write('%(key)s = None\n' % locals())
def _writelist(fp, key, values):
    """Write a comma separated list of strings, each element
    quoted and html-escaped."""
    joined = ', '.join(_escape(item) for item in values)
    fp.write('%s = %s\n' % (key, joined))
# functions for reading data from the serialized file
def _unescape(txt):
    """Undo the transformation of _escape(): strip the surrounding
    double quotes and replace html entities with the characters
    they stand for.  Raises DeSerializeException when the value is
    not properly quoted."""
    if not (txt.startswith('"') and txt.endswith('"')):
        raise DeSerializeException('parse error: quotes do not match')
    return parsers.html.htmlunescape(txt[1:-1])
def _readbool(txt):
"""Interpret the string as a boolean value."""
txt = txt.lower().strip()
if txt in ('true', '1', '-1', 'yes', 'on'):
return True
elif txt in ('false', '0', 'no', 'off'):
return False
elif txt == 'none':
return None
else:
raise DeSerializeException('parse error: boolean value expected')
def _readint(txt):
"""Interpret the string as an integer value."""
if txt == 'None':
return None
return int(txt)
def _readstring(txt):
"""Transform the string read from a key/value pair
to a string that can be used."""
if txt == 'None':
return None
return _unescape(txt)
def _readdate(txt):
"""Interpret the string as a date value."""
import rfc822
date = rfc822.parsedate_tz(txt.strip())
if date is not None:
return rfc822.mktime_tz(date)
return None
def _readlist(txt):
    """Parse the string as a comma separated list of quoted
    strings and return the unescaped elements."""
    result = []
    for part in _commapattern.findall(txt):
        result.append(_readstring(part.strip()))
    return result
# general serialize and deserialize functions
def serialize_site(fp, site):
    """Write the site-wide information (internal urls and the
    internal/external/yanked url patterns) to the specified file,
    followed by an empty line."""
    for internal_url in site._internal_urls:
        _writestring(fp, 'internal_url', internal_url)
    for pattern in site._internal_res.keys():
        _writestring(fp, 'internal_re', pattern)
    for pattern in site._external_res.keys():
        _writestring(fp, 'external_re', pattern)
    for pattern in site._yanked_res.keys():
        _writestring(fp, 'yanked_re', pattern)
    fp.write('\n')
def serialize_links(fp, site):
    """Write every link known to the site to the specified file,
    one section per link."""
    for lnk in site.linkMap.values():
        serialize_link(fp, lnk)
def serialize_link(fp, link):
    """Write all stored information on the link to the specified
    file as a [url] section followed by key/value pairs and a
    terminating empty line."""
    fp.write('[%s]\n' % link.url)
    # fetch status; ispage is only meaningful once the link was fetched
    if link.isfetched:
        _writebool(fp, 'isfetched', link.isfetched)
        _writebool(fp, 'ispage', link.ispage)
    if link.mtime:
        _writedate(fp, 'mtime', link.mtime)
    if link.size:
        _writeint(fp, 'size', link.size)
    # simple string properties, skipped when empty
    for key in ('mimetype', 'encoding', 'title', 'author', 'status'):
        value = getattr(link, key)
        if value:
            _writestring(fp, key, value)
    if link.redirectdepth > 0:
        _writeint(fp, 'redirectdepth', link.redirectdepth)
    # outgoing references and anchors
    for child in link.children:
        _writestring(fp, 'child', child.url)
    for embed in link.embedded:
        _writestring(fp, 'embed', embed.url)
    for anchor in link.anchors:
        _writestring(fp, 'anchor', anchor)
    for anchor in link.reqanchors:
        for parent in link.reqanchors[anchor]:
            _writelist(fp, 'reqanchor', (parent.url, anchor))
    # recorded problems
    for problem in link.linkproblems:
        _writestring(fp, 'linkproblem', problem)
    for problem in link.pageproblems:
        _writestring(fp, 'pageproblem', problem)
    fp.write('\n')
def _deserialize_site(site, key, value):
    """Feed one site-level key/value pair into the site object."""
    debugio.debug("%s=%s" % (key, value))
    # map each recognized key to the site method that stores it
    handlers = {
        'internal_url': site.add_internal,
        'internal_re': site.add_internal_re,
        'external_re': site.add_external_re,
        'yanked_re': site.add_yanked_re,
    }
    try:
        handler = handlers[key]
    except KeyError:
        raise DeSerializeException('parse error: unrecognized key for site')
    handler(_readstring(value))
def _deserialize_link(link, key, value):
    """Feed one key/value pair into the link object."""
    link._ischanged = True
    # reqanchor is the only key holding a two-element list
    if key == 'reqanchor':
        (url, anchor) = _readlist(value)
        link.add_reqanchor(url, anchor)
        return
    # keys that append a string value through a link method
    adders = {
        'child': link.add_child,
        'embed': link.add_embed,
        'anchor': link.add_anchor,
        'linkproblem': link.add_linkproblem,
        'pageproblem': link.add_pageproblem,
    }
    if key in adders:
        adders[key](_readstring(value))
        return
    # keys that set a link attribute of the same name; the parser
    # used depends on the attribute's type (mimetype and encoding
    # are forced to plain str as in the original format)
    readers = {
        'isfetched': _readbool,
        'ispage': _readbool,
        'mtime': _readdate,
        'size': _readint,
        'mimetype': lambda v: str(_readstring(v)),
        'encoding': lambda v: str(_readstring(v)),
        'title': _readstring,
        'author': _readstring,
        'status': _readstring,
        'redirectdepth': _readint,
    }
    if key in readers:
        setattr(link, key, readers[key](value))
        return
    raise DeSerializeException('parse error: unrecognized key for link %s' % link.url)
def deserialize(fp):
    """Read serialized data from the file and construct objects from it.
    A new crawler.Site instance is returned.
    After the site has been deserialized the crawl() and postprocess()
    functions should be called to regenerate the other link attributes.
    Raises DeSerializeException on lines that cannot be parsed."""
    import crawler
    site = crawler.Site()
    # the link the current [url] section applies to; while None the
    # key/value pairs describe the site itself
    link = None
    while True:
        line = fp.readline()
        # check for end-of-file
        if not line:
            break
        # skip comments
        if _commentpattern.search(line):
            continue
        # skip empty lines
        if line.rstrip() == '':
            continue
        # find section header
        match = _sectionpattern.search(line)
        if match:
            url = match.group(1)
            link = site.get_link(url)
            debugio.info(' %s' % link.url)
            # clear some data that is annoying if we have duplicates
            link.anchors = set()
            link.linkproblems = []
            link.pageproblems = []
            continue
        # check for key-value pair
        match = _keyvaluepattern.search(line)
        if match:
            key = match.group(1)
            value = match.group(2)
            if link is None:
                _deserialize_site(site, key, value)
            else:
                _deserialize_link(link, key, value)
            continue
        # fallthrough: not a section, pair, comment or blank line
        # (error message spelling fixed: was 'unrecorgnized')
        raise DeSerializeException('parse error: unrecognized line')
    return site