/usr/bin/html2stx

#!/usr/bin/env python
# This file is copyright (c) 2004 Aaron Swartz, copyright (c) 2004, 2005, 2006 Panu Kalliokoski
# This file is released under the GNU General Public License (GPL), version 2.

# Derived from html2text, version 2.11, by Aaron Swartz.

"""html2stx: Turn HTML into neat Stx source, stripping everything that
cannot be expressed in Stx."""

__author__ = "Panu A. Kalliokoski"
__copyright__ = "(C) 2004 Aaron Swartz; 2004, 2005 Panu Kalliokoski. GNU GPL 2."

import re, sys, urllib, htmlentitydefs, codecs, StringIO
import sgmllib
# Replace sgmllib's module-level character-reference pattern with one whose
# captured group accepts an optional leading x/X followed by hexadecimal
# digits, so hexadecimal references such as &#x2014; are matched as well.
# NOTE(review): the captured text is presumably what sgmllib hands to
# handle_charref(); charref() below expects exactly that form -- confirm
# against the sgmllib version in use.
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')

# Use Unicode characters instead of their ASCII pseudo-replacements.
# When this is false, charref() and entityref() substitute the plain-text
# stand-ins from the `unifiable` tables for the entities listed there.
UNICODE_SNOB = 0

### Entity Nonsense ###

def name2cp(k):
	"""Return the Unicode code point of the HTML entity named k.

	Raises KeyError when the entity tables do not know the name.
	"""
	# 'apos' is answered directly, without consulting the entity tables.
	if k == 'apos':
		return ord("'")

	if not hasattr(htmlentitydefs, "name2codepoint"):
		# Older Pythons (name2codepoint requires Python 2.3): entitydefs maps
		# a name either to a latin-1 character or to a "&#NNN;" reference.
		definition = htmlentitydefs.entitydefs[k]
		is_numeric_ref = definition.startswith("&#") and definition.endswith(";")
		if is_numeric_ref:
			return int(definition[2:-1]) # not in latin-1
		return ord(codecs.latin_1_decode(definition)[0])

	return htmlentitydefs.name2codepoint[k]

# Typographic entities and the plain-text stand-in used for each of them
# whenever UNICODE_SNOB is off.
unifiable = {
	'rsquo': "'",
	'lsquo': "'",
	'rdquo': '"',
	'ldquo': '"',
	'mdash': ' -- ',
	'ndash': '--',
}

# The same stand-ins keyed by code point, for numeric character references.
unifiable_n = dict([(name2cp(k), unifiable[k]) for k in unifiable])

def charref(name):
	"""Return the text that a numeric character reference stands for.

	name is the body of the reference without the leading "&#" and the
	trailing ";": decimal digits, or "x"/"X" followed by hexadecimal digits.

	Unless UNICODE_SNOB is set, code points listed in unifiable_n yield
	their plain-text stand-in; every other code point yields the
	corresponding unicode character.

	A malformed body (empty, a bare "x", hexadecimal digits without the
	"x" prefix, ...) or a code point that unichr() rejects is returned
	unchanged as literal "&#name;" text instead of raising, in the same
	way entityref() passes unknown entity names through.
	"""
	literal = "&#%s;" % name

	try:
		# name[:1] rather than name[0] so that an empty body falls through
		# to int(''), which raises the ValueError handled below.
		if name[:1] in ('x', 'X'):
			c = int(name[1:], 16)
		else:
			c = int(name)
	except ValueError:
		return literal

	if not UNICODE_SNOB and c in unifiable_n:
		return unifiable_n[c]

	try:
		return unichr(c)
	except (ValueError, OverflowError):
		# Code point outside the range this Python build can represent.
		return literal

def entityref(c):
	"""Return the text that the named entity reference c stands for.

	Unless UNICODE_SNOB is set, names listed in `unifiable` give their
	plain-text stand-in.  Other known names give the matching unicode
	character, and unknown names are handed back as literal "&name;".
	"""
	if not UNICODE_SNOB and c in unifiable:
		return unifiable[c]

	try:
		codepoint = name2cp(c)
	except KeyError:
		# Not an entity we know: leave the reference as it was written.
		return "&%s;" % c
	return unichr(codepoint)

def replaceEntities(s):
	"""Substitution callback: expand the reference captured by a regex match.

	s is a match object whose first group holds the reference body (the
	text between "&" and ";").  Bodies starting with "#" are treated as
	numeric character references, everything else as entity names.
	"""
	body = s.group(1)
	if body[0] != "#":
		return entityref(body)
	return charref(body[1:])

# Matches "&...;" references: the group captures an optional "#" and "x"/"X"
# followed by either hexadecimal digits or a word of one to eight characters.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape(s):
	"""Return s with every reference matched by r_unescape expanded."""
	expanded = r_unescape.sub(replaceEntities, s)
	return expanded
	
def fixattrs(attrs):
	"""Return attrs as a list of (name, value) pairs with values unescaped.

	attrs is a sequence of (name, value) pairs, or a false value (None or
	an empty sequence), in which case an empty list is returned.
	"""
	# Fix bug in sgmllib.py
	if not attrs:
		return []
	return [(attr[0], unescape(attr[1])) for attr in attrs]

### End Entity Nonsense ###

def hn(tag):
	"""Return the heading level for an "hN" tag name, or False otherwise.

	The level is N-1 but never less than 1, so both "h1" and "h2" give 1.
	Tags that are not an "h" followed by exactly one digit give False.
	"""
	if tag[0] != 'h' or len(tag) != 2:
		return False
	try:
		level = int(tag[1])
	except ValueError:
		# Two-letter tags such as "hr" are not headings.
		return False
	return max(level - 1, 1)

class _html2text(sgmllib.SGMLParser):
	"""SGML parser that renders the HTML fed to it as Stx source.

	Tag events are turned into Stx markup by handle_tag() and document
	text arrives through handle_data().  Both write through o(), which
	applies pending line/paragraph breaks, whitespace collapsing, line
	wrapping and indentation before passing the text to the `out`
	callable given to the constructor.
	"""

	# Width beyond which o() tries to wrap output lines.
	line_length = 72
	# Splits a string into its leading spaces and the remainder.
	begspace_re = re.compile('^( *)(.*)$')
	# Splits trailing space off a string that ends in spaces.
	endspace_re = re.compile('^(.*)( +)$')

	def __init__(self, out=sys.stdout.write):
		"""Set up the parser.

		out is a callable taking one string; it receives every piece of
		output.  Passing None collects the output in self.outtext, which
		close() returns.
		"""
		sgmllib.SGMLParser.__init__(self)
		
		if out is None: self.out = self.outtextf
		else: self.out = out
		# Collected output when out is None.  This is a byte string
		# because decode() hands encoded bytes to out(); starting from
		# u'' made any non-ASCII output fail with UnicodeDecodeError on
		# concatenation.
		self.outtext = ''
		# Stack of flags; output is suppressed while the top is true.
		self.quiet = []
		# Pending break before the next output: 0 none, 1 line break,
		# 2 paragraph break.
		self.p_p = 0
		# Number of chunks emitted by o().
		self.outcount = 0
		# Stack of the currently open list tags ("ol" / "ul").
		self.list = []
		# Whitespace held back between chunks of document text.
		self.space = ''
		# True when the most recent event was a start tag.
		self.start = 1
		# Nesting depth of blockquote/dd; each level indents by a tab.
		self.blockquote = 0
		# True while text is passed through without collapsing or wrapping.
		self.pre = 0
		# True when the last thing written ended a line.
		self.lastWasNL = 1
		# Running estimate of the current output column.
		self.column = 0
		# Codec name used by decode() for unicode chunks.
		self.charset = 'latin1'
	
	def outtextf(self, s): 
		"""Default sink used when out is None: append s to self.outtext."""
		self.outtext += s
	
	def close(self):
		"""Finish parsing, flush the final newline and return self.outtext."""
		sgmllib.SGMLParser.close(self)
		
		self.pbr()
		self.o('', 0, 'end')
		
		return self.outtext
		
	def handle_charref(self, c):
		"""Emit the text for a numeric character reference."""
		self.o(charref(c))

	def handle_entityref(self, c):
		"""Emit the text for a named entity reference."""
		self.o(entityref(c))
			
	def unknown_starttag(self, tag, attrs):
		"""Route every start tag to handle_tag()."""
		self.handle_tag(tag, attrs, 1)
	
	def unknown_endtag(self, tag):
		"""Route every end tag to handle_tag()."""
		self.handle_tag(tag, None, 0)

	def handle_tag(self, tag, attrs, start):
		"""Translate one tag event into Stx markup.

		tag is the tag name, attrs the (name, value) pairs of a start
		tag (None for an end tag) and start is true for a start tag.
		Unbalanced closing tags are tolerated: they never pop an empty
		stack nor drive the blockquote depth below zero.
		"""
		attrs = dict(fixattrs(attrs))
		if not start: self.space = ''
	
		# Headings: a run of "!" whose length is the heading level.
		if hn(tag):
			self.p()
			if start:
				self.pre = 1
				self.o(hn(tag)*"!" + ' ')
			else: self.pre = 0

		if tag in ['p', 'div']: self.p()
		
		if tag == "br" and start:
			self.o("//")
			self.pbr()
		if tag == 'hr' and start:
			self.p()
			self.o('----')
			self.p()

		# Content of these elements is suppressed.
		if tag in ["head", "style", "script"]: 
			if start: self.quiet.append(1)
			elif self.quiet: self.quiet.pop()

		# The title is emitted even though it lives inside <head>.
		if tag == 'title':
			if start:
				self.quiet.append(0)
				self.o("w_title(")
			else:
				self.o(")dnl")
				self.begin_line()
				if self.quiet: self.quiet.pop()

		if tag == 'meta' and start:
			name = attrs.get('name') or \
					attrs.get('http-equiv') or ''
			# A <meta> without a content attribute gives '' rather
			# than None, so neither the regex search nor the macros
			# below ever see None.
			content = attrs.get('content') or ''
			# These are forced out because <meta> sits inside the
			# suppressed <head>.
			if name.lower() == 'author':
				self.o("w_author(%s)dnl" % content, 0, 1)
				self.begin_line()
			elif name.lower() in ['date', 'last-modified']:
				self.o("w_date(%s)dnl" % content, 0, 1)
				self.begin_line()
			elif name.lower() == 'content-type':
				match = re.search('[Cc]harset=(.*)', content)
				if match:
					# Surrounding whitespace must not defeat
					# the lookup; unknown names mean latin1.
					declared = match.group(1).strip().upper()
					charset = {
						'ISO-8859-1':'latin1',
						'ISO-8859-15':'latin9',
						'US-ASCII':'ascii',
						'UTF-8':'utf8' }.get(declared, 'latin1')
					self.charset = charset
					self.o("w_char_coding(%s)dnl" %
						charset, 0, 1 )
					self.begin_line()
		
		if tag == "dl": self.p()
		if tag == "dt":
			if start:
				self.pre = 1
				self.pbr()
			else:
				self.o("::")
				self.pre = 0
				self.pbr()

		if tag in ["blockquote", "dd"]:
			if start: 
				if tag != "dd": self.p()
				self.blockquote += 1
			else:
				if self.blockquote > 0: self.blockquote -= 1
				if tag == "dd": self.pbr()
				else: self.p()
		
		# Inline markup: the same delimiter opens and closes.
		if tag in ['em', 'i', 'u']: self.o("_")
		if tag in ['var', 'cite', 'dfn']: self.o("/")
		if tag in ['kbd', 'samp', 'code', 'tt']: self.o("''")
		if tag == "q": self.o('"')
		if tag in ['strong', 'b']: self.o("*")
		
		if tag == "a":
			if start:
				tgt = attrs.get('href', '') 
				lbl = attrs.get('name', '')
				if lbl: self.o("w_label(%s, " % lbl)
				elif tgt and tgt[0]=='#':
					self.o("w_refer(%s, " % tgt[1:])
				else: self.o("w_link(%s, " % tgt)
			else:
				self.o(")")
		
		if tag == "img" and start:
			# The image file extension is dropped from the target.
			tgt = re.sub(r'\.(jpe?g|gif|png)$', '',
					attrs.get('src', ''))
			alt = attrs.get('alt', '')
			self.o("w_img(%s, %s)" % (tgt, alt))
		
		if tag in ["ol", "ul"]:
			if start:
				self.list.append(tag)
			elif self.list:
				self.list.pop()
			
			self.p()
		
		if tag == 'li':
			if start:
				self.pbr()
				# The current list is taken off the stack while
				# the marker is written, so the marker is
				# indented one level less than the item text.
				# NOTE(review): an <li> outside any list leaves
				# an extra "ul" on the stack -- kept as in the
				# original.
				if self.list: li = self.list.pop()
				else: li = "ul"
				if li == "ul" and len(self.list)<=1:
					self.o("- ")
				elif li == "ul": self.o("* ")
				elif li == "ol": self.o("# ")
				self.list.append(li)
			else:
				self.pbr()
		
		if tag == 'table':
			if start:
				self.p()
				self.o('w_beg(table)')
				self.pbr()
			else:
				self.pbr()
				self.o('w_end(table)')
				self.p()

		if tag == 'tr' and not start:
			self.o("//")
			self.begin_line()
		if tag in ['td', 'th'] and not start: self.o("||")
		
		if tag == "pre":
			if start:
				self.p()
				self.pre = 1
				self.o("{{{")
			else:
				if not self.lastWasNL: self.pbr()
				self.o("}}}")
				self.pre = 0
				self.p()

		self.start = start
			
	def pbr(self):
		"""Request a line break before the next output, unless a break is already pending."""
		if self.p_p == 0: self.p_p = 1

	def p(self):
		"""Request a paragraph break before the next output."""
		self.p_p = 2
	
	def begin_line(self):
		"""Start a new output line, indented for the current nesting.

		Writes a newline, one tab per blockquote level and two spaces
		per open list, then resets the column estimate accordingly.
		"""
		self.out('\n')
		self.out('\t' * self.blockquote)
		self.out('  ' * len(self.list))
		self.column = self.blockquote * 8 + len(self.list) * 2
		self.lastWasNL = 1
	
	def o(self, data, puredata=0, force=0):
		"""Write one chunk of output.

		data is the text to write.  puredata is true for document text,
		whose whitespace is collapsed and whose leading/trailing spaces
		are held back in self.space (unless in pre mode); generated
		markup passes puredata false.  force true writes even inside a
		suppressed region; the special value 'end' additionally writes
		the final newline and cancels any pending break.
		"""
		if self.quiet and self.quiet[-1] and not force: return

		if puredata and not self.pre:
			data = re.sub('\n[ \t]*', '\n', data)
			data = re.sub('\\s', ' ', data)
			sp, data = self.begspace_re.match(data).groups()
			self.space += sp
		if not data and not force: return

		space = self.space

		if force == 'end':
			# It's the end.
			self.p_p = 0
			self.out("\n")
			space = ''

		if self.p_p:
			# A paragraph break needs an empty line in between,
			# unless the current line is still empty.
			if self.p_p > 1 and not self.lastWasNL: self.out('\n')
			self.begin_line()
			space = ''

		if space and not self.start and not self.lastWasNL:
			if self.column > self.line_length: self.begin_line()
			else: self.out(space)

		if not self.pre:
			# Wrap: peel off leading pieces at a space for as long
			# as the rest would overrun the line.
			while len(data) + self.column > self.line_length:
				spl = self.split_line( data )
				if not spl: break
				line, data = spl
				self.out(self.decode(line))
				self.begin_line()

		self.p_p = 0
		if puredata and not self.pre:
			match = self.endspace_re.match(data)
			if match: data, self.space = match.groups()
			else: self.space = ''
		else: self.space = ''
		self.out(self.decode(data))
		self.column += len(data)
		if data: self.lastWasNL = data[-1] == '\n'
		self.outcount += 1

	def split_line( self, line ):
		"""Split line at a space so that the head fits the current line.

		The head is at most max(line_length - column, 15) characters
		long.  Returns a (head, tail) pair, or None when no suitable
		space exists.
		"""
		match = re.match('^(.{0,%d}) (.*)$' % max(self.line_length -
				self.column, 15), line)
		if not match: return None
		return match.groups()

	def decode(self, s):
		"""Return s ready for output: unicode is encoded, byte strings pass through.

		Characters the target charset cannot represent are replaced by
		"?" instead of aborting the conversion with UnicodeEncodeError
		(an entity such as &alpha; in a latin1 document used to do so).
		"""
		if isinstance(s, unicode):
			# NOTE(review): latin9 is encoded with the latin1 codec,
			# as in the original -- confirm whether ISO-8859-15 was
			# intended here.
			if self.charset == 'latin9':
				return s.encode('latin1', 'replace')
			return s.encode(self.charset, 'replace')
		return s

	def handle_data(self, data):
		"""Emit document text and note that a start tag was not the last event."""
		self.o(data, 1)
		self.start = 0
	
	def unknown_decl(self, data):
		"""Ignore unknown declarations."""
		pass
		
def html2text_file(html, out=sys.stdout.write):
	"""Convert the HTML string html to Stx, writing the result through out.

	out is a callable taking one string (standard output by default).
	Returns whatever the parser's close() returns.
	"""
	converter = _html2text(out)
	# The document is followed by an empty feed before closing.
	for chunk in (html, ""):
		converter.feed(chunk)
	return converter.close()

if __name__ == "__main__":
	# Convert the file named by the first command-line argument, or
	# standard input when no argument is given, writing Stx to stdout.
	if len(sys.argv) > 1:
		source = open(sys.argv[1], 'r')
		# Close the input file explicitly instead of leaving the handle
		# to the garbage collector.
		try: data = source.read()
		finally: source.close()
	else:
		data = sys.stdin.read()
	html2text_file(data)
stx2any 1.56-2 / usr / bin / html2stx