/usr/share/pyshared/twill/extensions/check_links.py is in python-twill 0.9-3.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
"""
Extension functions to check all of the links on a page.
Usage:
check_links [ <pattern> ]
Make sure that all of the HTTP links on the current page can be visited
successfully. If 'pattern' is given, check only URLs that match that
regular expression.
If option 'check_links.only_collect_bad_links' is on, then all bad
links are silently collected across all calls to check_links. The
function 'report_bad_links' can then be used to report all of the links,
together with their referring pages.
"""
__all__ = ['check_links', 'report_bad_links']

DEBUG = True

import re
from twill import commands
from twill.errors import TwillAssertionError

### first, set up config options & persistent 'bad links' memory...

# BUG FIX: the original guard read the misspelled option key
# 'check_links.only_collection_bad_links', which is never set anywhere,
# so the correctly-spelled option was unconditionally reset to False on
# every import of this extension.  Probe the same key that is assigned.
if commands._options.get('check_links.only_collect_bad_links') is None:
    commands._options['check_links.only_collect_bad_links'] = False

# maps bad URL -> list of referring page URLs; persists across
# check_links calls when 'only_collect_bad_links' is on.
bad_links_dict = {}
#
# main function: 'check_links'
#
def check_links(pattern = '', visited={}):
"""
>> check_links [ <pattern> ]
Make sure that all of the HTTP links on the current page can be visited
with an HTTP response 200 (success). If 'pattern' is given, interpret
it as a regular expression that link URLs must contain in order to be
tested, e.g.
check_links http://.*\.google\.com
would check only links to google URLs. Note that because 'follow'
is used to visit the pages, the referrer URL is properly set on the
visit.
"""
from twill import commands
if DEBUG:
print 'in check_links'
OUT = commands.OUT
browser = commands.browser
#
# compile the regexp
#
regexp = None
if pattern:
regexp = re.compile(pattern)
#
# iterate over all links, collecting those that match.
#
# note that in the case of duplicate URLs, only one of the
# links is actually followed!
#
collected_urls = {}
links = list(browser._browser.links())
if not links:
if DEBUG:
print>>OUT, "no links to check!?"
return
for link in links:
url = link.absolute_url
url = url.split('#', 1)[0] # get rid of subpage pointers
if not (url.startswith('http://') or url.startswith('https://')):
if DEBUG:
print>>OUT, "url '%s' is not an HTTP link; ignoring" % (url,)
continue
if regexp:
if regexp.search(url):
collected_urls[url] = link
if DEBUG:
print>>OUT, "Gathered URL %s -- matched regexp" % (url,)
elif DEBUG:
print>>OUT, "URL %s doesn't match regexp" % (url,)
else:
collected_urls[url] = link
if DEBUG:
print>>OUT, "Gathered URL %s." % (url,)
#
# now, for each unique URL, follow the link. Trap ALL exceptions
# as failures.
#
failed = []
for link in collected_urls.values():
went = False
try:
if DEBUG:
print>>OUT, "Trying %s" % (link.absolute_url,),
if not visited.has_key(link.absolute_url):
went = True
browser.follow_link(link)
code = browser.get_code()
assert code == 200
visited[link.absolute_url] = 1
if DEBUG:
print>>OUT, '...success!'
else:
if DEBUG:
print>>OUT, ' (already visited successfully)'
except:
failed.append(link.absolute_url)
if DEBUG:
print>>OUT, '...failure ;('
if went:
browser.back()
if failed:
if commands._options['check_links.only_collect_bad_links']:
for l in failed:
refering_pages = bad_links_dict.get(l, [])
print '***', browser.get_url()
refering_pages.append(browser.get_url())
bad_links_dict[l] = refering_pages
else:
print>>OUT, '\nCould not follow %d links' % (len(failed),)
print>>OUT, '\t%s\n' % '\n\t'.join(failed)
raise TwillAssertionError("broken links on page")
def report_bad_links(fail_if_exist='+', flush_bad_links='+'):
    """
    >> report_bad_links [<fail-if-exist> [<flush-bad-links>]]

    Report every bad link accumulated across check_links runs (links are
    accumulated only while the config option
    check_links.only_collect_bad_links is set).

    If <fail-if-exist> is false (true by default) then the command will
    fail after reporting any bad links.

    If <flush-bad-links> is false (true by default) then the list of
    bad links will be retained across the function call.
    """
    global bad_links_dict

    from twill import utils
    fail_if_exist = utils.make_boolean(fail_if_exist)
    flush_bad_links = utils.make_boolean(flush_bad_links)

    from twill import commands
    OUT = commands.OUT

    if not bad_links_dict:
        print>>OUT, '\nNo bad links to report.\n'
    else:
        print>>OUT, '\nCould not follow %d links' % (len(bad_links_dict),)

        # one line per broken URL, listing every page that referred to it.
        for bad_url, origins in bad_links_dict.items():
            print>>OUT, "\t link '%s' (occurs on: %s)" % (bad_url,
                                                          ",".join(origins))

        if flush_bad_links:
            bad_links_dict = {}

        if fail_if_exist:
            raise TwillAssertionError("broken links encountered")
|