|
@@ -1,484 +1,20 @@
|
|
|
#!/usr/bin/env python
|
|
|
-# coding=iso-8859-1
|
|
|
import sys
|
|
|
-import types
|
|
|
-import os
|
|
|
import re
|
|
|
-from HTMLParser import HTMLParser, HTMLParseError
|
|
|
-from htmlentitydefs import entitydefs
|
|
|
+from html import HTMLParser, HTMLParseError
|
|
|
+from groff import Formatter
|
|
|
from StringIO import StringIO
|
|
|
|
|
|
# Version string read from the VERSION_NUMBER environment variable; it is
# empty when the variable is unset.  Using .get() instead of a bare
# ``except:`` means unrelated errors are no longer silently swallowed.
version = os.environ.get('VERSION_NUMBER', "")
|
|
|
-
|
|
|
# Replacement text for named character entities, keyed by entity name
# (without the surrounding "&" and ";").
entities = dict(
    lt="<",
    gt=">",
    amp="&",
    nbsp=" ",
    copy="©",
    quot="\"",
    bull="*",
)
|
|
|
|
|
|
# Elements whose start tag is optional in HTML 4.  omit() folds the children
# of these elements into every content model that admits them.
omit_start = ["body", "tbody", "head", "html"]

# Elements the parser records as childless (tag, None) nodes instead of
# opening a scope for them.
single = frozenset([
    "area", "base", "basefont", "br", "col", "frame",
    "hr", "img", "input", "isindex", "link", "meta", "param"])

# Element classes, following the entity groups of the HTML 4 DTD.
heading = ["h1", "h2", "h3", "h4", "h5", "h6"]
fontstyle = ["tt", "i", "b", "u", "s", "strike", "big", "small"]
phrase = ["em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr",
          "acronym"]
special = ["a", "img", "applet", "object", "font", "basefont", "br", "script",
           "map", "q", "sub", "sup", "span", "bdo", "iframe"]
formctrl = ["input", "select", "textarea", "label", "button"]
# NOTE: this name shadows the builtin ``list`` for the rest of the module; it
# is kept because other code may refer to it.
# The entry used to read " dir" (leading space), which kept <dir> out of
# ``block`` and ``flow`` and made the element unusable anywhere.
list = ["ul", "ol", "dir", "menu"]
head_misc = ["script", "style", "meta", "link", "object"]
pre_exclusion = ["img", "object", "applet", "big", "small", "sub", "sup",
                 "font", "basefont"]
block = ["p", "pre", "dl", "div", "center", "noscript", "noframes",
         "blockquote", "form", "isindex", "hr", "table", "fieldset",
         "address"] + heading + list
inline = fontstyle + phrase + special + formctrl
flow = block + inline
html_content = ["head", "body"]
head_content = ["title", "isindex", "base"]
|
|
|
-
|
|
|
def setify(d):
    """Return a copy of mapping *d* with every value turned into a frozenset.

    ``items()`` is used rather than ``iteritems()`` so the helper runs on
    both Python 2 and Python 3.
    """
    return dict((key, frozenset(val)) for key, val in d.items())
|
|
|
-
|
|
|
def omit(allowed, tags):
    """Extend content models with the children of the elements in *tags*.

    *allowed* maps an element name to the frozenset of child names it
    accepts.  For every element whose set contains one of *tags*, the
    children of that tag (``allowed[tag]``) are merged into the set too.
    *tags* is scanned in order against the growing set, so a tag pulled in
    by an earlier merge is itself expanded if it appears later in *tags*.

    Returns a new dict; *allowed* and its sets are left untouched.
    """
    result = {}
    # items() instead of iteritems(): works on Python 2 and Python 3 alike.
    for name, children in allowed.items():
        for tag in tags:
            if tag in children:
                children = children.union(allowed[tag])
        result[name] = children
    return result
|
|
|
-
|
|
|
# Content model of each element: element name -> names of the child elements
# it may contain.  Entries are grouped by the kind of content they accept.
allowed = {}

# Elements that contain inline content only.
allowed.update(dict.fromkeys([
    "a", "abbr", "acronym", "b", "bdo", "big", "caption", "cite", "code",
    "dfn", "dt", "em", "font", "h1", "h2", "h3", "h4", "h5", "h6", "i",
    "kbd", "label", "legend", "p", "pre", "q", "s", "samp", "small", "span",
    "strike", "strong", "sub", "sup", "tt", "u", "var"], inline))

# Elements that contain any flow (block or inline) content.
allowed.update(dict.fromkeys([
    "blockquote", "button", "center", "dd", "del", "div", "form", "iframe",
    "ins", "li", "noframes", "noscript", "td", "th"], flow))

# List containers hold list items only.
allowed.update(dict.fromkeys(["dir", "menu", "ol", "ul"], ["li"]))

# Table row groups hold rows only.
allowed.update(dict.fromkeys(["tbody", "tfoot", "thead"], ["tr"]))

# Elements with no element children.
allowed.update(dict.fromkeys(
    ["option", "script", "style", "textarea", "title"], []))

# Elements with a content model of their own.
allowed.update({
    "address": inline + ["p"],
    "applet": flow + ["param"],
    "body": flow + ["ins", "del"],
    "colgroup": ["col"],
    "dl": ["dt", "dd"],
    "fieldset": flow + ["legend"],
    "frameset": ["frameset", "frame", "noframes"],
    "head": head_content + head_misc,
    "html": html_content,
    "map": block + ["area"],
    "object": flow + ["param"],
    "optgroup": ["option"],
    "select": ["optgroup", "option"],
    "table": ["caption", "col", "colgroup", "thead", "tfoot", "tbody"],
    "tr": ["th", "td"],
    })

# Freeze the child lists into sets, then let every element that admits one
# of the omit_start elements also admit that element's children directly
# (e.g. "table" gains "tr" through "tbody").
allowed = omit(setify(allowed), omit_start)
|
|
|
-
|
|
|
# Exclusions: element name -> frozenset of elements that may not appear
# anywhere inside it, however deeply nested.
excluded = setify({
    "a": ["a"],
    "button": formctrl + ["a", "form", "isindex", "fieldset", "iframe"],
    "dir": block,
    "form": ["form"],
    "label": ["label"],
    "menu": block,
    "pre": pre_exclusion,
    })
|
|
|
-
|
|
|
# Inline styles: tag -> groff template.  "@" marks where the element's
# content is inserted; a template without "@" drops the content.
styles = {
    'b': "\\fB@\\fR",
    'i': "\\fI@\\fR",
    'em': "\\fI@\\fR",
    'code': "\\fC@\\fR",
    'span': "\\fC@\\fR",
    'sup': "\\u@\\d",
    'hr': "",
    }

# All templates: the inline styles plus the block-level ones.  A block entry
# is either a template string or a (template, style-variable) pair; the
# variable is switched on while the element's content is rendered.
formats = dict(styles)
formats.update(dict.fromkeys(
    ['ul', 'menu', 'dir'], ("\n.RS 4n\n@\n.RE\n", 'in_ul')))
formats.update({
    'br': "\n.br\n",
    'h2': "\n.SH @",
    'h3': "\n.SS @",
    'h4': "\n.SS @",
    'dt': ("\n.IP \"@\" 4m", 'no_nl'),
    'dd': "\n.br\n@",
    'ol': ("\n.IP\n@\n.PP\n", 'index'),
    'p': "\n.PP\n@",
    'pre': ("\n.br\n.nf\n\\fC\n@\n\\fR\n.fi\n", 'preformat'),
    })
|
|
|
-
|
|
|
def is_string(x):
    """Return True if *x* is a ``str``.

    ``types.StringType`` no longer exists in Python 3; in Python 2 it is the
    very same object as ``str``, so the builtin is used directly.
    """
    return isinstance(x, str)
|
|
|
-
|
|
|
def is_tuple(x):
    """Return True if *x* is a ``tuple``.

    ``types.TupleType`` no longer exists in Python 3; in Python 2 it is the
    very same object as ``tuple``, so the builtin is used directly.
    """
    return isinstance(x, tuple)
|
|
|
-
|
|
|
def is_list(x):
    """Return True if *x* is a ``list``.

    ``types.ListType`` no longer exists in Python 3; in Python 2 it is the
    very same object as ``list``, so the builtin is used directly.
    """
    return isinstance(x, [].__class__)
|
|
|
-
|
|
|
def is_blank(s):
    """Return True if *s* is a string holding nothing but whitespace.

    Non-string values are never blank.
    """
    if not isinstance(s, str):
        return False
    return s.strip() == ""
|
|
|
-
|
|
|
def clean(content):
    """Return the items of *content*, in order, minus the blank strings."""
    kept = []
    for item in content:
        if is_blank(item):
            continue
        kept.append(item)
    return kept
|
|
|
-
|
|
|
class Formatter:
    """Render a parsed HTML tree as groff man-page source.

    A tree node is one of:

    * a string  - literal text,
    * a list    - a sequence of nodes,
    * a tuple   - ``(tag, content)``, an element and its child node.

    Rendering state lives in ``self.style`` (a dict of flags plus the
    ordered-list counter) and is saved and restored with push()/pop()
    around nested elements.
    """

    def __init__(self, filename, stream = sys.stdout):
        """Create a formatter.

        filename -- path of the HTML input; its base name becomes the page
                    name in the ``.TH`` header
        stream   -- object with a ``write`` method that receives the output
        """
        self.stream = stream
        # preformat: inside <pre>, leading blanks are kept
        # in_ul:     <li> renders as a bullet
        # no_nl:     newlines are not allowed in the output (inside <dt>)
        # in_table / in_tr: inside a table / a table row
        # index:     ordered-list counters, outermost level first
        self.style = dict(preformat = False,
                          in_ul = False,
                          no_nl = False,
                          in_table = False,
                          in_tr = False,
                          index = [])
        self.stack = []
        self.strip_re = re.compile("^[ \t]+")
        self.filename = filename
        # True while the output is positioned at the beginning of a line.
        self.at_bol = True

    def warning(self, msg):
        """Write *msg* and a newline to standard error."""
        sys.stderr.write(msg + '\n')

    def set(self, var, val):
        """Set style variable *var* to *val* in the current state."""
        self.style[var] = val

    def get(self, var):
        """Return the current value of style variable *var*."""
        return self.style[var]

    def push(self, **kwargs):
        """Save the current style state, then apply *kwargs* on top of it.

        The copy is shallow: the ``index`` list object is shared with the
        saved state unless *kwargs* replaces it.
        """
        self.stack.append(self.style.copy())
        self.style.update(**kwargs)

    def pop(self):
        """Restore the style state saved by the matching push()."""
        self.style = self.stack.pop()

    def show(self, s):
        """Write *s* to the stream and track whether a line just ended."""
        self.stream.write(s)
        # An empty write leaves the line position unchanged.
        if s != '':
            self.at_bol = s.endswith('\n')

    def pp_with(self, content, var, val):
        """Render *content* with style variable *var* temporarily set to *val*."""
        self.push()
        self.set(var, val)
        self.pp(content)
        self.pop()

    def fmt(self, format, content, var = None):
        """Render *content* inside the template *format*.

        format  -- groff template; the first "@" is replaced by the rendered
                   content.  Without an "@" the content is not rendered.
        content -- tree node to render at the "@"
        var     -- optional style variable switched on for the content.
                   ``'index'`` opens a new ordered-list level instead.
        """
        if self.get('no_nl') and '\n' in format:
            self.warning("can't handle line breaks in <dt>...</dt>")
            format = "@"

        # str.partition needs Python 2.5+, so the template is split by hand.
        pieces = format.split('@', 1)
        pre = pieces[0]
        has_slot = len(pieces) > 1
        if has_slot:
            post = pieces[1]
        else:
            post = ''

        if pre != "":
            self.show(pre)
        if has_slot:
            if var == 'index':
                # A numbered list starts a fresh counter level.  in_ul is
                # cleared as well, otherwise an <ol> nested inside a <ul>
                # would inherit the flag and render its items as bullets.
                self.push(index = self.get('index') + [0], in_ul = False)
                self.pp(content)
                self.pop()
            elif var:
                self.pp_with(content, var, True)
            else:
                self.pp(content)
        if post != "":
            self.show(post)

    def pp_li(self, content):
        """Render a list item as a bullet or as a numbered entry."""
        counters = self.get('index')
        if self.get('in_ul') or not counters:
            if not self.get('in_ul'):
                # No enclosing list at all: there is no counter to advance,
                # so fall back to a bullet instead of raising IndexError.
                self.warning("<li> outside of a list; rendering as a bullet")
            self.fmt("\n.IP \\(bu 4n\n@", content)
        else:
            # Advance the innermost counter in place so that the number
            # carries over to the next item of the same list.
            counters[-1] += 1
            label = ".".join(map(str, counters))
            self.show("\n.IP \\fB%s\\fR\n" % label)
            self.pp(content)

    def pp_title(self):
        """Emit the ``.TH`` header; the element's own content is ignored."""
        page = os.path.basename(self.filename).replace(".html", "")
        self.show("\n.TH %s 1 \"\" \"GRASS %s\" \"Grass User's Manual\""
                  % (page, version))

    def pp_tr(self, content):
        """Render one table row; every cell becomes a ``T{ ... T}`` block."""
        cells = clean(content)
        self.push(in_tr = True)
        col = 0
        for item in cells:
            if not is_tuple(item):
                self.warning("invalid item in table row: %s" % str(item))
                continue
            (tag, tail) = item
            if tag not in ['td', 'th']:
                self.warning("invalid tag in table row: %s" % tag)
                continue
            if col > 0:
                self.show("\t \t")
            self.show("T{\n")
            self.pp(tail)
            self.show("\nT}")
            col += 1
        self.show("\n")
        self.pop()

    def pp_tbody(self, content):
        """Render the rows of a table, descending into row groups."""
        for item in content:
            if not is_tuple(item):
                continue
            (tag, tail) = item
            if tag in ['thead', 'tbody', 'tfoot']:
                self.pp_tbody(tail)
            elif tag == 'tr':
                self.pp_tr(tail)
                self.show(".sp 1\n")

    def count_cols(self, content):
        """Return the largest number of cells found in any row of a table."""
        cols = 0
        for item in content:
            if is_blank(item):
                continue
            if not is_tuple(item):
                self.warning("invalid item in table: %s" % str(item))
                continue
            (tag, tail) = item
            n = 0
            if tag in ['thead', 'tbody', 'tfoot']:
                n = self.count_cols(tail)
            elif tag == 'tr':
                n = len(clean(tail))
            cols = max(cols, n)
        return cols

    def pp_table(self, content):
        """Render a table as a tbl ``.TS``/``.TE`` block; skip empty tables."""
        cols = self.count_cols(content)
        if cols == 0:
            return
        self.show("\n.TS\nexpand;\n")
        self.show(" lw1 ".join(["lw60"] * cols) + ".\n")
        self.pp_tbody(content)
        self.show("\n.TE\n")

    def pp_tag(self, tag, content):
        """Render the element *tag* whose child node is *content*."""
        if self.get('in_tr') and tag not in styles:
            # Inside a table row only the inline styles are applied.
            self.pp(content)
        elif tag in formats:
            spec = formats[tag]
            if is_string(spec):
                self.fmt(spec, content)
            else:
                (template, var) = spec
                self.fmt(template, content, var)
        elif tag == 'table':
            if self.get('in_table'):
                self.warning("cannot handle nested tables")
                return
            self.push(in_table = True)
            self.pp_table(content)
            self.pop()
        elif tag == 'li':
            self.pp_li(content)
        elif tag == 'title':
            self.pp_title()
        else:
            self.pp(content)

    def pp_string(self, content):
        """Escape *content* for groff and write it out."""
        if content == "":
            return
        s = content
        if self.get('no_nl'):
            s = s.replace("\n", " ")
        # The backslash has to be escaped first; the later replacements
        # introduce backslashes of their own.
        s = s.replace("\\", "\\(rs")
        s = s.replace("'", "\\(cq")
        s = s.replace("\"", "\\(dq")
        s = s.replace("`", "\\(ga")
        s = s.replace("-", "\\-")
        # A control character at the start of a line would be read as a
        # request, so it is shielded with the zero-width "\&".
        if self.at_bol and s[0] in [".", "'"]:
            s = "\\&" + s
        self.show(s)

    def pp_text(self, content):
        """Write text line by line, trimming leading blanks outside <pre>."""
        if content == "":
            return
        for line in content.splitlines(True):
            if self.at_bol and not self.get('preformat'):
                line = self.strip_re.sub('', line)
            self.pp_string(line)

    def pp_list(self, content):
        """Render every node of the sequence *content* in order."""
        for item in content:
            self.pp(item)

    def pp(self, content):
        """Render one tree node; anything that is not a node is ignored."""
        if is_list(content):
            self.pp_list(content)
        elif is_tuple(content):
            (head, tail) = content
            self.pp_tag(head, tail)
        elif is_string(content):
            self.pp_text(content)
|
|
|
-
|
|
|
class MyHTMLParser(HTMLParser):
    """Build a nested tree from HTML, closing elements implicitly.

    The result accumulates in ``self.data`` as a list of nodes: strings for
    text, ``(tag, children)`` tuples for elements and ``(tag, None)`` for the
    childless elements listed in ``single``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Names of the currently open elements, outermost first.
        self.tag_stack = []
        # Elements forbidden at the current position, with the saved values
        # of the enclosing elements.
        self.excluded = frozenset()
        self.excluded_stack = []
        # Children collected for the innermost open element, with the saved
        # lists of the enclosing elements.
        self.data = []
        self.data_stack = []

    def top(self):
        """Return the innermost open element, or None if nothing is open."""
        if not self.tag_stack:
            return None
        return self.tag_stack[-1]

    def pop(self):
        """Close the innermost element, attach it to its parent, return its name."""
        self.excluded = self.excluded_stack.pop()
        children = self.data
        self.data = self.data_stack.pop()
        tag = self.tag_stack.pop()
        self.append((tag, children))
        return tag

    def push(self, tag):
        """Open element *tag* and start collecting its children."""
        self.tag_stack.append(tag)
        self.excluded_stack.append(self.excluded)
        if tag in excluded:
            self.excluded = self.excluded.union(excluded[tag])
        self.data_stack.append(self.data)
        self.data = []

    def append(self, item):
        """Add node *item* to the children of the innermost open element."""
        self.data.append(item)

    def is_allowed(self, tag):
        """Return True if *tag* may be a child of the innermost open element.

        An open element without an entry in ``allowed`` accepts no children.
        """
        if tag in self.excluded:
            return False
        return tag in allowed.get(self.top(), ())

    def handle_starttag(self, tag, attrs):
        """Open *tag*, first closing open elements that cannot contain it."""
        if self.tag_stack:
            while not self.is_allowed(tag):
                self.pop()
                if not self.tag_stack:
                    # No open element accepts the tag.  This used to end in
                    # a KeyError on allowed[None]; report the position
                    # instead.
                    raise HTMLParseError(
                        "<%s> is not allowed in any open element" % tag,
                        self.getpos())
        if tag in single:
            self.append((tag, None))
        else:
            self.push(tag)

    def handle_entityref(self, name):
        """Replace a named entity by its text, preferring ``entities``."""
        if name in entities:
            self.handle_data(entities[name])
        elif name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            sys.stderr.write("unrecognized entity: %s\n" % name)

    def handle_charref(self, name):
        """Report a numeric character reference; these are not converted."""
        # The message now ends with a newline, like the one written by
        # handle_entityref.
        sys.stderr.write('unsupported character reference <%s>\n' % name)

    def handle_data(self, data):
        """Record a run of text."""
        self.append(data)

    def handle_endtag(self, tag):
        """Close *tag* together with everything still open inside it."""
        if tag not in self.tag_stack:
            # Popping for a tag that is not open would unwind the whole
            # stack and then fail with an IndexError; report the position
            # instead.
            raise HTMLParseError("unmatched end tag </%s>" % tag,
                                 self.getpos())
        while self.pop() != tag:
            pass
|
|
|
-
|
|
|
def main():
|
|
|
# parse HTML
|
|
|
infile = sys.argv[1]
|
|
|
inf = file(infile)
|
|
|
- p = MyHTMLParser()
|
|
|
+ p = HTMLParser(entities)
|
|
|
for n, line in enumerate(inf):
|
|
|
try:
|
|
|
p.feed(line)
|
|
@@ -486,7 +22,7 @@ def main():
|
|
|
sys.stderr.write('%s:%d:%d: Parse error: %s\n' % (infile, err.lineno, err.offset, err.msg))
|
|
|
sys.exit(1)
|
|
|
except Exception, err:
|
|
|
- sys.stderr.write('%s:%d:0: Error (%s): %s\n' % (infile, n + 1, err.__dict__, line))
|
|
|
+ sys.stderr.write('%s:%d:0: Error (%s): %s\n' % (infile, n + 1, repr(err), line))
|
|
|
sys.exit(1)
|
|
|
p.close()
|
|
|
inf.close()
|