#!/usr/bin/env python
# coding=iso-8859-1
import sys
import types
import os
import re
from HTMLParser import HTMLParser
from htmlentitydefs import entitydefs
from StringIO import StringIO
# Version string shown in the man page header.  It is taken from the
# VERSION_NUMBER environment variable and is empty when that is not set.
# (A plain lookup with a default replaces the former bare ``except:``,
# which would also have hidden unrelated errors.)
version = os.environ.get('VERSION_NUMBER', "")
# Replacement text for the entity references that get special treatment;
# the parser consults this table before the standard entity definitions.
entities = dict(
    lt="<",
    gt=">",
    amp="&",
    nbsp=" ",
    copy="©",
    quot="\"",
    bull="*")
# Elements that are recorded as leaves: the parser never opens a nesting
# level for them.
single = frozenset([
    "area", "base", "basefont", "br", "col", "frame",
    "hr", "img", "input", "isindex", "link", "meta", "param"])

# Element groups used to build the content-model tables below.  The names
# appear to mirror the parameter entities of the HTML 4.01 transitional DTD.
heading = ["h1", "h2", "h3", "h4", "h5", "h6"]
fontstyle = ["tt", "i", "b", "u", "s", "strike", "big", "small"]
phrase = ["em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr",
          "acronym"]
special = ["a", "img", "applet", "object", "font", "basefont", "br", "script",
           "map", "q", "sub", "sup", "span", "bdo", "iframe"]
formctrl = ["input", "select", "textarea", "label", "button"]
# NOTE: this name shadows the builtin ``list``; it is kept so the module's
# existing names stay unchanged.  The entry "dir" used to read " dir" (with
# a leading space), which silently kept <dir> out of ``block`` and ``flow``.
list = ["ul", "ol", "dir", "menu"]
head_misc = ["script", "style", "meta", "link", "object"]
pre_exclusion = ["img", "object", "applet", "big", "small", "sub", "sup",
                 "font", "basefont"]
block = ["p", "pre", "dl", "div", "center", "noscript", "noframes",
         "blockquote", "form", "isindex", "hr", "table", "fieldset",
         "address"] + heading + list
inline = fontstyle + phrase + special + formctrl
flow = block + inline
html_content = ["head", "body"]
head_content = ["title", "isindex", "base"]
def setify(d):
    """Return a copy of mapping *d* with every value turned into a frozenset.

    ``items()`` is used instead of the Python 2-only ``iteritems()`` so the
    helper works with any mapping on both Python 2 and Python 3.
    """
    return dict((key, frozenset(val)) for key, val in d.items())
# Content model: maps an element to the elements that may appear directly
# inside it.  The parser closes open elements until the new tag is allowed.
allowed = {
    "a": inline,
    "abbr": inline,
    "acronym": inline,
    "address": inline + ["p"],
    "applet": flow + ["param"],
    "b": inline,
    "bdo": inline,
    "big": inline,
    "blockquote": flow,
    "body": flow + ["ins", "del"],
    "button": flow,
    "caption": inline,
    "center": flow,
    "cite": inline,
    "code": inline,
    "colgroup": ["col"],
    "dd": flow,
    "del": flow,
    "dfn": inline,
    "dir": ["li"],
    "div": flow,
    "dl": ["dt", "dd"],
    "dt": inline,
    "em": inline,
    "fieldset": flow + ["legend"],
    "font": inline,
    "form": flow,
    "frameset": ["frameset", "frame", "noframes"],
    "h1": inline,
    "h2": inline,
    "h3": inline,
    "h4": inline,
    "h5": inline,
    "h6": inline,
    "head": head_content + head_misc,
    "html": html_content,
    "i": inline,
    "iframe": flow,
    "ins": flow,
    "kbd": inline,
    "label": inline,
    "legend": inline,
    "li": flow,
    "map": block + ["area"],
    "menu": ["li"],
    "noframes": flow,
    "noscript": flow,
    "object": flow + ["param"],
    "ol": ["li"],
    "optgroup": ["option"],
    "option": [],
    "p": inline,
    "pre": inline,
    "q": inline,
    "s": inline,
    "samp": inline,
    "script": [],
    "select": ["optgroup", "option"],
    "small": inline,
    "span": inline,
    "strike": inline,
    "strong": inline,
    "style": [],
    "sub": inline,
    "sup": inline,
    # "tr" is listed directly to allow for rows whose enclosing <tbody>
    # is only implied (its tags may be omitted in the document)
    "table": ["caption", "col", "colgroup", "thead", "tfoot", "tbody",
              "tr"],
    "tbody": ["tr"],
    "td": flow,
    "textarea": [],
    "tfoot": ["tr"],
    "th": flow,
    "thead": ["tr"],
    "title": [],
    "tr": ["th", "td"],
    "tt": inline,
    "u": inline,
    "ul": ["li"],
    "var": inline
}
# Membership is tested for every start tag, so store the values as sets.
allowed = setify(allowed)
# Exclusions: maps an element to the elements that may not appear while it
# is open.  The parser accumulates these sets as it descends, so they apply
# at any depth below the keyed element, not just to direct children.
_excluded_lists = {
    "a": ["a"],
    "button": formctrl + ["a", "form", "isindex", "fieldset", "iframe"],
    "dir": block,
    "form": ["form"],
    "label": ["label"],
    "menu": block,
    "pre": pre_exclusion,
}
excluded = setify(_excluded_lists)
# groff templates for inline styling; "@" marks where the element's content
# goes.  These are the only tags still formatted inside a table row.
styles = dict(
    b="\\fB@\\fR",
    i="\\fI@\\fR",
    em="\\fI@\\fR",
    code="\\fC@\\fR",
    span="\\fC@\\fR",
    sup="\\u@\\d",
    hr="")

# groff templates for the remaining tags.  A plain string is just a
# template; a (template, name) pair also names the formatter state that is
# switched on while the content is rendered.
formats = dict(
    br="\n.br\n",
    h2="\n.SH @",
    h3="\n.SS @",
    h4="\n.SS @",
    dt=("\n.IP \"@\" 4m", 'no_nl'),
    dd="\n.br\n@",
    ul=("\n.RS 4n\n@\n.RE\n", 'in_ul'),
    menu=("\n.RS 4n\n@\n.RE\n", 'in_ul'),
    dir=("\n.RS 4n\n@\n.RE\n", 'in_ul'),
    ol=("\n.IP\n@\n.PP\n", 'index'),
    p="\n.PP\n@",
    pre=("\n.br\n.nf\n\\fC\n@\n\\fR\n.fi\n", 'preformat'))

# Every inline style is also a regular format.
formats.update(styles)
def is_string(x):
    """Return True if *x* is a ``str``, i.e. a piece of document text.

    ``str`` is the same object as ``types.StringType`` on Python 2, and
    unlike that alias it still exists on Python 3.
    """
    return isinstance(x, str)
def is_tuple(x):
    """Return True if *x* is a tuple, i.e. a ``(tag, content)`` element.

    ``tuple`` is the same object as ``types.TupleType`` on Python 2, and
    unlike that alias it still exists on Python 3.
    """
    return isinstance(x, tuple)
def is_list(x):
    """Return True if *x* is a list, i.e. a sequence of tree nodes.

    ``list`` here is resolved through ``type([])`` because this module
    rebinds the name ``list`` to a table of HTML list elements.
    """
    return isinstance(x, type([]))
def is_blank(s):
    """Return True if *s* is a string made only of whitespace (or empty)."""
    if not is_string(s):
        return False
    return not s.strip()
def clean(content):
    """Return the items of *content* as a new list, minus blank strings."""
    kept = []
    for node in content:
        if is_blank(node):
            continue
        kept.append(node)
    return kept
class Formatter:
    """Write a parsed HTML tree to a stream as groff man page source.

    A tree node is a string (document text), a ``(tag, content)`` tuple
    (an element) or a list of nodes.  State that depends on the enclosing
    elements is kept in ``self.style`` and saved/restored with push()/pop().
    """

    def __init__(self, stream = sys.stdout):
        """Create a formatter that writes its output to *stream*."""
        self.stream = stream
        self.style = dict(preformat = False,  # keep line indentation (<pre>)
                          in_ul = False,      # <li> gets a bullet, not a number
                          no_nl = False,      # newlines in text become spaces
                          in_table = False,   # used to refuse nested tables
                          in_tr = False,      # rendering a table row
                          index = [])         # item counters of open <ol>s
        self.stack = []
        # blanks that follow a newline are dropped outside preformatted text
        self.strip_re = re.compile("\n[ \t]+")

    def warning(self, msg):
        """Report *msg* on standard error."""
        sys.stderr.write(msg + '\n')

    def set(self, var, val):
        """Set the state variable *var* to *val*."""
        self.style[var] = val

    def get(self, var):
        """Return the current value of the state variable *var*."""
        return self.style[var]

    def push(self, **kwargs):
        """Save the current state, then apply the given overrides."""
        # The copy is shallow: the 'index' list stays shared with the saved
        # state unless an override replaces it.
        self.stack.append(self.style.copy())
        self.style.update(**kwargs)

    def pop(self):
        """Restore the state saved by the matching push()."""
        self.style = self.stack.pop()

    def show(self, s):
        """Write the string *s* to the output stream."""
        self.stream.write(s)

    def pp_with(self, content, var, val):
        """Render *content* with the state variable *var* set to *val*."""
        self.push()
        self.set(var, val)
        self.pp(content)
        self.pop()

    def fmt(self, format, content, var = None):
        """Render *content* wrapped in the template *format*.

        The text before the first "@" in *format* is written, then the
        content, then the text after it; without an "@" the content is not
        rendered at all.  *var* optionally names a state variable that is
        switched on while the content is rendered.
        """
        (pre, sep, post) = format.partition("@")
        if pre != "":
            self.show(pre)
        if sep != "":
            if var == 'index':
                # Open a new numbering level.  in_ul is cleared as well, so
                # that an <ol> nested in a bullet list numbers its items
                # instead of inheriting the bullets.
                self.push(index = self.get('index') + [0], in_ul = False)
                self.pp(content)
                self.pop()
            elif var:
                self.pp_with(content, var, True)
            else:
                self.pp(content)
        if post != "":
            self.show(post)

    def pp_li(self, content):
        """Render a list item as a bullet or as a numbered paragraph."""
        idx = self.get('index')
        bulleted = self.get('in_ul')
        if not bulleted and not idx:
            # No enclosing list at all: there is no counter to advance, so
            # fall back to a bullet rather than failing on idx[-1].
            self.warning("list item outside of a list")
            bulleted = True
        if bulleted:
            self.fmt("\n.IP \\(bu 4n\n@", content)
        else:
            # Advance the innermost counter in place; the list object is
            # the one stored in the current state.
            idx[-1] += 1
            sec = ".".join(map(str, idx))
            self.show("\n.IP \\fB%s\\fR\n" % sec)
            self.pp(content)

    def pp_title(self):
        """Write the .TH header line.

        The page name is the base name of the first command-line argument
        with ".html" removed; the document's own <title> text is not used.
        """
        self.show("\n.TH " +
                  os.path.basename(sys.argv[1]).replace(".html","") +
                  " 1 \"\" \"GRASS " +
                  version +
                  "\" \"Grass User's Manual\"")

    def pp_tr(self, content):
        """Render one table row; each cell becomes a T{ ... T} text block."""
        content = clean(content)
        self.push(in_tr = True)
        col = 0
        for item in content:
            if not is_tuple(item):
                self.warning("invalid item in table row: %s" % str(item))
                continue
            (tag, tail) = item
            if tag not in ['td', 'th']:
                self.warning("invalid tag in table row: %s" % tag)
                continue
            if col > 0:
                # fills the spacer column that pp_table() declares between
                # two data columns
                self.show("\t \t")
            self.show("T{\n")
            self.pp(tail)
            self.show("\nT}")
            col += 1
        self.show("\n")
        self.pop()

    def pp_tbody(self, content):
        """Render the rows found in *content*, descending into row groups."""
        for item in content:
            if is_tuple(item):
                (tag, tail) = item
                if tag in ['thead', 'tbody', 'tfoot']:
                    self.pp_tbody(tail)
                elif tag == 'tr':
                    self.pp_tr(tail)
                    self.show(".sp 1\n")

    def count_cols(self, content):
        """Return the largest number of non-blank items found in any row."""
        cols = 0
        for item in content:
            n = 0
            if is_blank(item):
                pass
            elif is_tuple(item):
                (tag, tail) = item
                if tag in ['thead', 'tbody', 'tfoot']:
                    n = self.count_cols(tail)
                elif tag == 'tr':
                    n = len(clean(tail))
                cols = max(cols, n)
            else:
                self.warning("invalid item in table: %s" % str(item))
        return cols

    def pp_table(self, content):
        """Render a table as a .TS/.TE block; a table without rows is skipped."""
        cols = self.count_cols(content)
        if cols == 0:
            return
        self.show("\n.TS\nexpand;\n")
        # one "lw60" data column per cell, separated by "lw1" spacer columns
        self.show(" lw1 ".join(["lw60"] * cols) + ".\n")
        self.pp_tbody(content)
        self.show("\n.TE\n")

    def pp_tag(self, tag, content):
        """Render the element *tag* with the given *content*."""
        if self.get('in_tr') and tag not in styles:
            # inside a table row only inline styles are applied; any other
            # element is reduced to its content
            self.pp(content)
        elif tag in formats:
            spec = formats[tag]
            if is_string(spec):
                self.fmt(spec, content)
            else:
                (fmt, var) = spec
                self.fmt(fmt, content, var)
        elif tag == 'table':
            if self.get('in_table'):
                self.warning("cannot handle nested tables")
                return
            self.push(in_table = True)
            self.pp_table(content)
            self.pop()
        elif tag == 'li':
            self.pp_li(content)
        elif tag == 'title':
            self.pp_title()
        else:
            # no special handling: render the content only
            self.pp(content)

    def pp_string(self, content):
        """Write text with backslashes, quotes and backquotes escaped."""
        s = content
        if self.get('no_nl'):
            s = s.replace("\n"," ")
        # The backslash must be replaced first: every later replacement
        # inserts backslashes of its own.
        s = s.replace("\\", "\\(rs")
        s = s.replace("'", "\\(cq")
        s = s.replace("\"", "\\(dq")
        s = s.replace("`", "\\(ga")
        self.show(s)

    def pp_text(self, content):
        """Write document text, dropping line indentation unless preformatted."""
        if content != "":
            if self.get('preformat'):
                self.pp_string(content)
            else:
                s = self.strip_re.sub('\n', content)
                self.pp_string(s)

    def pp_list(self, content):
        """Render every node of the list *content* in order."""
        for item in content:
            self.pp(item)

    def pp(self, content):
        """Render a tree node; anything but a list, tuple or string is ignored."""
        if is_list(content):
            self.pp_list(content)
        elif is_tuple(content):
            (head, tail) = content
            self.pp_tag(head, tail)
        elif is_string(content):
            self.pp_text(content)
class MyHTMLParser(HTMLParser):
    """Build a nested tree from HTML, closing elements the markup left open.

    After parsing, ``self.data`` holds the top-level nodes: strings for
    text, ``(tag, content_list)`` for elements and ``(tag, None)`` for the
    elements listed in ``single``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.tag_stack = []          # tags of the currently open elements
        self.excluded = frozenset()  # tags forbidden at the current position
        self.excluded_stack = []     # value of self.excluded per open element
        self.data = []               # nodes collected for the innermost element
        self.data_stack = []         # node lists of the enclosing elements

    def top(self):
        """Return the tag of the innermost open element, or None if none."""
        if not self.tag_stack:
            return None
        return self.tag_stack[-1]

    def pop(self):
        """Close the innermost open element and return its tag.

        The element is appended, as ``(tag, content)``, to the node list of
        its parent, which becomes the current list again.
        """
        self.excluded = self.excluded_stack.pop()
        data = self.data
        self.data = self.data_stack.pop()
        tag = self.tag_stack.pop()
        self.append((tag, data))
        return tag

    def push(self, tag):
        """Open the element *tag* and start collecting its content."""
        self.tag_stack.append(tag)
        self.excluded_stack.append(self.excluded)
        if tag in excluded:
            # exclusions add up, so they hold at any depth below *tag*
            self.excluded = self.excluded.union(excluded[tag])
        self.data_stack.append(self.data)
        self.data = []

    def append(self, item):
        """Add the node *item* to the content of the innermost open element."""
        self.data.append(item)

    def is_allowed(self, tag):
        """Return True if *tag* may be opened at the current position.

        A tag excluded by an open element is never allowed.  Otherwise the
        content model of the innermost open element decides.  When there is
        no model to consult (nothing is open, or the parent or the tag is an
        element the tables do not describe) the tag is accepted instead of
        raising KeyError.
        """
        if tag in self.excluded:
            return False
        parent = self.top()
        if parent not in allowed:
            return True
        if tag not in allowed and tag not in single:
            # unknown element: nothing says where it belongs, so leave the
            # surrounding elements open
            return True
        return tag in allowed[parent]

    def handle_starttag(self, tag, attrs):
        """Open *tag*, first closing open elements that cannot contain it."""
        while self.tag_stack and not self.is_allowed(tag):
            self.pop()
        if tag not in single:
            self.push(tag)
        else:
            self.append((tag,None))

    def handle_entityref(self, name):
        """Replace an entity reference by its text, or warn if unknown."""
        if name in entities:
            self.handle_data(entities[name])
        elif name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            sys.stderr.write("unrecognized entity: %s\n" % name)

    def handle_data(self, data):
        """Record a piece of document text."""
        self.append(data)

    def handle_endtag(self, tag):
        """Close *tag* together with everything still open inside it.

        An end tag without a matching open element (for instance one whose
        element was already closed implicitly, or the end tag of an element
        that is never opened such as <br>) is ignored; it used to empty the
        stack and fail with IndexError.
        """
        if tag not in self.tag_stack:
            return
        while self.pop() != tag:
            pass

    def close(self):
        """Finish parsing and close every element that is still open.

        Without this, content of elements left open at the end of the input
        would stay on the internal stacks and never reach ``self.data``.
        """
        HTMLParser.close(self)
        while self.tag_stack:
            self.pop()
if __name__ == "__main__":
    # Usage: <script> <input.html> <output file>
    if len(sys.argv) < 3:
        sys.stderr.write("Usage: %s <input.html> <output>\n" % sys.argv[0])
        sys.exit(1)

    # parse HTML
    # open() replaces the Python 2-only file(); try/finally (rather than a
    # "with" block, which very old interpreters lack) closes the handle
    # even when parsing fails.
    inf = open(sys.argv[1])
    try:
        p = MyHTMLParser()
        p.feed(inf.read())
        p.close()
    finally:
        inf.close()

    # generate groff
    sf = StringIO()
    f = Formatter(sf)
    f.pp(p.data)
    s = sf.getvalue()
    sf.close()

    # strip excess whitespace: any run of blanks containing a newline
    # collapses to a single newline, which also removes empty lines
    blank_re = re.compile("[ \t\n]*\n[ \t\n]*")
    s = blank_re.sub('\n', s)
    s = s.lstrip()

    # write groff
    outf = open(sys.argv[2], 'w')
    try:
        outf.write(s)
    finally:
        outf.close()