/usr/share/sumo/tools/build/buildHTMLDocs.py is in sumo-tools 0.15.0~dfsg-2.
This file is owned by root:root, with mode 0o755.
The actual contents of the file can be viewed below.
#!/usr/bin/env python
"""
@file buildHTMLDocs.py
@author Daniel Krajzewicz
@date 2011-10-20
@version $Id: buildHTMLDocs.py 11671 2012-01-07 20:14:30Z behrisch $
Converts the wiki documentation into HTML pages.

First determines what to convert: if a command line argument is given,
it is interpreted as the page to convert. Otherwise, "Special:AllPages"
is downloaded and parsed to obtain the list of all pages, which are then
converted in subsequent steps.

For each page to convert, its HTML representation is downloaded and
stripped of the wiki header/footer. Then, the image links are extracted
from the HTML page and stored temporarily, and the links themselves are
patched to point to local pages/images (if the page behind the link
exists). The page is saved into MIRROR_FOLDER/<PAGE_PATH>.

After parsing all pages, the images are downloaded and stored into
MIRROR_FOLDER/images.

After downloading all data, the title page is fetched and its content
is extracted. This content is embedded into "index.html" between the
<!-- nav begins --> / <!-- nav ends --> markers.

All pages downloaded earlier are loaded and embedded into index.html
between the <!-- content begins --> / <!-- content ends --> markers.
Then, each page is saved into HTML_FOLDER/<PAGE_PATH>. All images are
copied from MIRROR_FOLDER/images to HTML_FOLDER/images.

Copyright (C) 2011 DLR (http://www.dlr.de/) and contributors
All rights reserved
"""
import urllib, os, sys, shutil
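
# getIndex: fetch the raw wiki markup of the SUMO_User_Documentation title
# page by scraping the "wpTextbox1" textarea of its edit form; the markup is
# later turned into the navigation tree.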
def getIndex():
    f = urllib.urlopen("http://sourceforge.net/apps/mediawiki/sumo/index.php?title=SUMO_User_Documentation&action=edit")
    c = f.read()
    b = c.find('name="wpTextbox1"')
    b = c.find(">", b)+1
    e = c.find("</textarea>", b)
    return c[b:e]
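
# readParsePage: download the rendered HTML of the given wiki page, cut out
# the body between the page top and the print footer, drop the MediaWiki
# chrome and append the "last modified" note.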
def readParsePage(page):
    f = urllib.urlopen("http://sourceforge.net/apps/mediawiki/sumo/index.php?title=%s" % page)
    c = f.read()
    b = c.find("This page was last modified on")
    e = c.find("<", b)
    lastMod = c[b:e]
    b = c.find("globalWrapper")
    b = c.find('<a name="top"', b)
    e = c.find("<div class=\"printfooter\">")
    c = c[b:e]
    c = c.replace("<h3 id=\"siteSub\">From sumo</h3>", "")
    b = c.find("<div id=\"jump-to-nav\">")
    e = c.find("</div>", b)+6
    c = c[:b] + c[e:]
    c = c + '</div><hr/><div id="lastmod">' + lastMod + '</div>'
    return c
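
# patchLinks: rewrite all <a href> targets of a page: File:/Image: links point
# into the local images/ folder, wiki page links become relative *.html links,
# and everything else becomes an absolute sourceforge.net URL. Returns the
# patched page, the set of referenced images and the "../" prefix matching the
# page's folder depth.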
def patchLinks(page, name):
    images = set()
    level = len(name.split("/"))-1
    level = "../" * level
    b = page.find("<a href")
    while b>=0:
        # images
        if page[b+9:].startswith("File:") or page[b+9:].startswith("Image:"):
            images.add(page[b+9:page.find("\"",b+9)])
            e = page.find(":", b+9)+1
            page = page[:b] + level + "images/" + page[e:]
        # images/files
        elif page[b+9:].startswith("/apps/mediawiki/sumo/index.php?title=File:") or page[b+9:].startswith("/apps/mediawiki/sumo/index.php?title=Image:"):
            b2 = b
            b = page.find("title=", b)+6
            images.add(page[b:page.find("\"",b)])
            e = page.find(":", b)+1
            page = page[:b2+9] + level + "images/" + page[e:]
        # pages (HTML)
        elif page[b+9:].startswith("/apps/mediawiki/sumo/index.php"):
            e = page.find("?", b+9)+7
            e2 = page.find("\"", b+9)
            link = page[e:e2]
            if link.find("action=edit")<0:
                if link.find("#")>0:
                    link = level + link.replace("#", ".html#")
                elif link.find("#")<0 and not (link.endswith(".png") or link.endswith(".jpg") or link.endswith(".svg")):
                    link = level + link + ".html"
            page = page[:b+9] + link + page[e2:]
        else:
            page = page[:b+9] + "http://sourceforge.net/" + page[b+10:]
        b = page.find("<a href", b+1)
    return page, images, level
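
# patchImages: rewrite <img> src attributes to the local images/ folder
# (reducing thumbnail paths to the plain file name) and collect the original
# image URLs for later download.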
def patchImages(page, name):
    images = set()
    level = len(name.split("/"))-1
    level = "../" * level
    b = page.find("<img ")
    b = page.find("src", b)
    while b>=0:
        b = b + 5
        e = page.find("\"", b+2)
        add = page[b:e]
        l = add[add.rfind("/"):]
        if add.find("thumb")>=0:
            l = l[l.find("-")+1:]
        images.add(add)
        page = page[:b] + level + "images/" + l + page[e:]
        b = page.find("<img", b+1)
        b = page.find("src", b)
    page = page.replace(".svg.png", ".svg")
    return page, images
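
# parseWikiLink: extract (text, link) from one line of wiki markup, handling
# [[internal|label]] and [external label] link syntax; internal links are
# mapped to local *.html targets.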
def parseWikiLink(l):
    if l.find("[[")>=0:
        # internal link
        b = l.find("[")+2
        e = l.find("]", b)
        t = l[b:e]
        if t.find("|")<0:
            link = t
            text = t
        else:
            link, text = t.split("|")
        link = link.replace(" ", "_")
        if link.find("#")>=0:
            link = link.replace("#", ".html#")
        else:
            link = link + ".html"
    # external link
    elif l.find("[")>=0:
        b = l.find("[")+1
        e = l.find("]", b)
        t = l[b:e]
        link = t[:t.find(" ")]
        text = t[t.find(" ")+1:]
    else:
        # text
        text = l[l.find(" ")+1:]
        link = ""
    return text, link
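
# main part: mirror all wiki pages, download the referenced images, build the
# navigation from the title page and emit the final HTML documentation.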
MIRROR_FOLDER = "mirror"
HTML_FOLDER = "docs"
try: os.mkdir(MIRROR_FOLDER)
except: pass
try: os.mkdir(MIRROR_FOLDER + "/images")
except: pass
images = set()
if len(sys.argv)<2:
    p = readParsePage("Special:AllPages")
    p = p[p.find("<input type=\"submit\" value=\"Go\" />"):]
    p = p[p.find("<table "):]
    pages = p.split("<a ")
else:
    pages = ["href=?title=" + sys.argv[1] + "\""]
for p in pages:
    if(not p.startswith("href")):
        continue
    b = p.find("?title=")
    e = p.find("\"", b)
    name = p[b+7:e]
    if name.endswith(".css"):
        print "Skipping css-file %s" % name
        continue
    print "Fetching %s" % name
    c = readParsePage(name)
    if name.find("/")>0:
        try:
            os.makedirs(os.path.join(MIRROR_FOLDER, name[:name.rfind("/")]))
        except: pass
    if True:#name.find(".")<0:
        c, pi, level = patchLinks(c, name)
        for i in pi:
            images.add(i)
        c, pi = patchImages(c, name)
        for i in pi:
            images.add(i)
    name = name + ".html"
    fd = open(os.path.join(MIRROR_FOLDER, name), "w")
    fd.write(c)
    fd.close()
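
# download all referenced images; File:/Image: references are resolved via
# their description page to the real file URL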
imageFiles = []
for i in images:
    print "Fetching image %s" % i
    if i.find(":")>=0:
        f = urllib.urlopen("http://sourceforge.net/apps/mediawiki/sumo/index.php?title=%s" % i)
        c = f.read()
        b = c.find("<div class=\"fullImageLink\" id=\"file\">")
        b = c.find("href=", b)+6
        e = c.find("\"", b+1)
        f = urllib.urlopen("http://sourceforge.net/%s" % c[b:e])
        i = i[i.find(":")+1:]
    else:
        f = urllib.urlopen("http://sourceforge.net/%s" % i)
        i = i[i.rfind("/")+1:]
        if i.find("px-")>=0:
            # strip the "<size>px-" thumbnail prefix so the stored name
            # matches the references produced by patchImages
            i = i[i.find("-")+1:]
    fd = open(os.path.join(MIRROR_FOLDER, "images", i), "wb")
    fd.write(f.read())
    fd.close()
    imageFiles.append(os.path.join("images", i))
# build navigation
nav = getIndex()
lines = nav[nav.find("="):].split("\n")
level = 0
c = "<ul>\n";
hadHeader = False
for l in lines:
if len(l)==0:
continue
if l[0]=='=':
text, link = parseWikiLink(" " + l.replace("=", ""))
if hadHeader:
c = c + "</ul>\n";
spc = ' ' * (level+1)
c = c + spc + "<li>";
if link!="":
c = c + "<a href=\"" + link + "\">";
c = c + text;
if link!="":
c = c + "</a>";
c = c + "</li>\n";
hadHeader = True
level = 0
continue
if l[0].find('*')<0:
continue
text, link = parseWikiLink(l)
nLevel = l.count('*')
if level>nLevel:
c = c + ("</ul>\n" * (level-nLevel))
if level<nLevel:
c = c + ("<ul>\n" * (nLevel-level))
spc = ' ' * (nLevel+1)
#+ str(level) + "-" + str(nLevel)
c = c + spc + "<li>";
if link !="":
c = c + '<a href="' + link + '">' + text + '</a>';
else:
c = c + text;
c = c + "</li>\n";
level = nLevel
# get template and embed navigation
fd = open("index.html")
tpl = fd.read()
fd.close()
b = tpl.find("<!-- nav begins -->")
b = tpl.find(">", b)+1
e = tpl.find("<!-- nav ends -->")
tpl = tpl[:b] + c + tpl[e:]
# build HTML pages
try: os.mkdir(HTML_FOLDER)
except: pass
try: os.mkdir(HTML_FOLDER + "/images")
except: pass
for p in pages:
    if(not p.startswith("href")):
        continue
    b = p.find("?title=")
    e = p.find("\"", b)
    name = p[b+7:e]
    if name.endswith(".css"):
        print "Skipping css-file %s" % name
        continue
    name = name + ".html"
    t = os.path.join(HTML_FOLDER, name)
    fd = open(os.path.join(MIRROR_FOLDER, name))
    c = fd.read()
    fd.close()
    # determine the relative prefix for this page's folder depth
    if name.find('/')>=0:
        level = name.count("/")
    else:
        level = name.count("\\")
    level = "../" * level
    # title
    cc = tpl.replace("<title>", "<title>" + name[:name.rfind(".html")] + " - ")
    # css inclusion
    cc = cc.replace("sumo.css", level + "sumo.css")
    cc = cc.replace("logo.png", level + "logo.png")
    # links
    b = cc.find("<a href=")
    while b>=0:
        b = cc.find('"', b)
        if not cc[b+1:].startswith("http"):
            cc = cc[:b+1] + level + cc[b+1:]
        b = cc.find("<a href=", b)
    # content
    b = cc.find("<!-- content begins -->")
    e = cc.find("<!-- content ends -->", b)
    e = cc.find("<", e+1) - 1
    cc = cc[:b] + c + cc[e:]
    try: os.makedirs(os.path.split(t)[0])
    except: pass
    fd = open(t, "w")
    fd.write(cc)
    fd.close()
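
# copy all downloaded images into the HTML output folder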
for i in imageFiles:
    shutil.copy(os.path.join(MIRROR_FOLDER, i), os.path.join(HTML_FOLDER, i))