g.html2man.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498
  1. #!/usr/bin/env python
  2. # coding=iso-8859-1
  3. import sys
  4. import types
  5. import os
  6. import re
  7. from HTMLParser import HTMLParser, HTMLParseError
  8. from htmlentitydefs import entitydefs
  9. from StringIO import StringIO
  10. try:
  11. version = os.environ['VERSION_NUMBER']
  12. except:
  13. version = ""
  14. entities = {
  15. 'lt': "<",
  16. 'gt': ">",
  17. 'amp': "&",
  18. 'nbsp': " ",
  19. 'copy': "©",
  20. 'quot': "\"",
  21. 'bull': "*"
  22. }
  23. single = ["area", "base", "basefont", "br", "col", "frame",
  24. "hr", "img", "input", "isindex", "link", "meta", "param"]
  25. single = frozenset(single)
  26. heading = ["h1", "h2", "h3", "h4", "h5", "h6"]
  27. fontstyle = ["tt", "i", "b", "u", "s", "strike", "big", "small"]
  28. phrase = [ "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr",
  29. "acronym"]
  30. special = [ "a", "img", "applet", "object", "font", "basefont", "br", "script",
  31. "map", "q", "sub", "sup", "span", "bdo", "iframe"]
  32. formctrl = [ "input", "select", "textarea", "label", "button"]
  33. list = [ "ul", "ol", " dir", "menu"]
  34. head_misc = [ "script", "style", "meta", "link", "object"]
  35. pre_exclusion = [ "img", "object", "applet", "big", "small", "sub", "sup",
  36. "font", "basefont"]
  37. block = [ "p", "pre", "dl", "div", "center", "noscript", "noframes",
  38. "blockquote", "form", "isindex", "hr", "table", "fieldset",
  39. "address"] + heading + list
  40. inline = fontstyle + phrase + special + formctrl
  41. flow = block + inline
  42. html_content = ["head", "body"]
  43. head_content = ["title", "isindex", "base"]
  44. def setify(d):
  45. return dict([(key, frozenset(val)) for key, val in d.iteritems()])
  46. allowed = {
  47. "a": inline,
  48. "abbr": inline,
  49. "acronym": inline,
  50. "address": inline + ["p"],
  51. "applet": flow + ["param"],
  52. "b": inline,
  53. "bdo": inline,
  54. "big": inline,
  55. "blockquote": flow,
  56. "body": flow + ["ins", "del"],
  57. "button": flow,
  58. "caption": inline,
  59. "center": flow,
  60. "cite": inline,
  61. "code": inline,
  62. "colgroup": ["col"],
  63. "dd": flow,
  64. "del": flow,
  65. "dfn": inline,
  66. "dir": ["li"],
  67. "div": flow,
  68. "dl": ["dt", "dd"],
  69. "dt": inline,
  70. "em": inline,
  71. "fieldset": flow + ["legend"],
  72. "font": inline,
  73. "form": flow,
  74. "frameset": ["frameset", "frame", "noframes"],
  75. "h1": inline,
  76. "h2": inline,
  77. "h3": inline,
  78. "h4": inline,
  79. "h5": inline,
  80. "h6": inline,
  81. "head": head_content + head_misc,
  82. "html": html_content,
  83. "i": inline,
  84. "iframe": flow,
  85. "ins": flow,
  86. "kbd": inline,
  87. "label": inline,
  88. "legend": inline,
  89. "li": flow,
  90. "map": block + ["area"],
  91. "menu": ["li"],
  92. "noframes": flow,
  93. "noscript": flow,
  94. "object": flow + ["param"],
  95. "ol": ["li"],
  96. "optgroup": ["option"],
  97. "option": [],
  98. "p": inline,
  99. "pre": inline,
  100. "q": inline,
  101. "s": inline,
  102. "samp": inline,
  103. "script": [],
  104. "select": ["optgroup", "option"],
  105. "small": inline,
  106. "span": inline,
  107. "strike": inline,
  108. "strong": inline,
  109. "style": [],
  110. "sub": inline,
  111. "sup": inline,
  112. "table": ["caption", "col", "colgroup", "thead", "tfoot", "tbody",
  113. "tr"], # to allow for <table>[implied <tbody>]<tr>
  114. "tbody": ["tr"],
  115. "td": flow,
  116. "textarea": [],
  117. "tfoot": ["tr"],
  118. "th": flow,
  119. "thead": ["tr"],
  120. "title": [],
  121. "tr": ["th", "td"],
  122. "tt": inline,
  123. "u": inline,
  124. "ul": ["li"],
  125. "var": inline
  126. }
  127. allowed = setify(allowed)
  128. excluded = {
  129. "a": ["a"],
  130. "button": formctrl + ["a", "form", "isindex", "fieldset", "iframe"],
  131. "dir": block,
  132. "form": ["form"],
  133. "label": ["label"],
  134. "menu": block,
  135. "pre": pre_exclusion
  136. }
  137. excluded = setify(excluded)
  138. styles = {
  139. 'b': "\\fB@\\fR",
  140. 'i': "\\fI@\\fR",
  141. 'em': "\\fI@\\fR",
  142. 'code': "\\fC@\\fR",
  143. 'span': "\\fC@\\fR",
  144. 'sup': "\\u@\\d",
  145. 'hr': ""
  146. }
  147. formats = {
  148. 'br': "\n.br\n",
  149. 'h2': "\n.SH @",
  150. 'h3': "\n.SS @",
  151. 'h4': "\n.SS @",
  152. 'dt': ("\n.IP \"@\" 4m", 'no_nl'),
  153. 'dd': "\n.br\n@",
  154. 'ul': ("\n.RS 4n\n@\n.RE\n", 'in_ul'),
  155. 'menu': ("\n.RS 4n\n@\n.RE\n", 'in_ul'),
  156. 'dir': ("\n.RS 4n\n@\n.RE\n", 'in_ul'),
  157. 'ol': ("\n.IP\n@\n.PP\n", 'index'),
  158. 'p': "\n.PP\n@",
  159. 'pre': ("\n.br\n.nf\n\\fC\n@\n\\fR\n.fi\n", 'preformat')
  160. }
  161. formats.update(styles)
  162. def is_string(x):
  163. return isinstance(x, types.StringType)
  164. def is_tuple(x):
  165. return isinstance(x, types.TupleType)
  166. def is_list(x):
  167. return isinstance(x, types.ListType)
  168. def is_blank(s):
  169. return is_string(s) and s.strip() == ""
  170. def clean(content):
  171. return [item for item in content if not is_blank(item)]
  172. class Formatter:
  173. def __init__(self, filename, stream = sys.stdout):
  174. self.stream = stream
  175. self.style = dict(preformat = False,
  176. in_ul = False,
  177. no_nl = False,
  178. in_table = False,
  179. in_tr = False,
  180. index = [])
  181. self.stack = []
  182. self.strip_re = re.compile("^[ \t]+")
  183. self.filename = filename
  184. self.at_bol = True
  185. def warning(self, msg):
  186. sys.stderr.write(msg + '\n')
  187. def set(self, var, val):
  188. self.style[var] = val
  189. def get(self, var):
  190. return self.style[var]
  191. def push(self, **kwargs):
  192. self.stack.append(self.style.copy())
  193. self.style.update(**kwargs)
  194. def pop(self):
  195. self.style = self.stack.pop()
  196. def show(self, s):
  197. self.stream.write(s)
  198. if s != '':
  199. self.at_bol = s.endswith('\n')
  200. def pp_with(self, content, var, val):
  201. self.push()
  202. self.set(var, val)
  203. self.pp(content)
  204. self.pop()
  205. def fmt(self, format, content, var = None):
  206. # String.partition is only in 2.5+
  207. # (pre,sep,post) = format.partition("@")
  208. if self.get('no_nl') and '\n' in format:
  209. self.warning("can't handle line breaks in <dt>...</dt>")
  210. format = "@"
  211. f = format.split('@', 1)
  212. pre = f[0]
  213. if len(f) > 1:
  214. sep = '@'
  215. post = f[1]
  216. else:
  217. sep = ''
  218. post = ''
  219. if pre != "":
  220. self.show(pre)
  221. if sep != "":
  222. if var:
  223. if var == 'index':
  224. val = self.get('index') + [0]
  225. else:
  226. val = True
  227. self.pp_with(content, var, val)
  228. else:
  229. self.pp(content)
  230. if post != "":
  231. self.show(post)
  232. def pp_li(self, content):
  233. if self.get('in_ul'):
  234. self.fmt("\n.IP \(bu 4n\n@", content)
  235. else:
  236. idx = self.get('index')
  237. idx[-1] += 1
  238. sec = ".".join(map(str,idx))
  239. self.show("\n.IP \\fB%s\\fR\n" % sec)
  240. self.set('index', idx)
  241. self.pp(content)
  242. def pp_title(self):
  243. self.show("\n.TH " +
  244. os.path.basename(self.filename).replace(".html","") +
  245. " 1 \"\" \"GRASS " +
  246. version +
  247. "\" \"Grass User's Manual\"")
  248. def pp_tr(self, content):
  249. content = clean(content)
  250. self.push(in_tr = True)
  251. col = 0
  252. for item in content:
  253. if not is_tuple(item):
  254. self.warning("invalid item in table row: %s" % str(item))
  255. continue
  256. (tag, tail) = item
  257. if tag not in ['td', 'th']:
  258. self.warning("invalid tag in table row: %s" % tag)
  259. continue
  260. if col > 0:
  261. self.show("\t \t")
  262. self.show("T{\n")
  263. self.pp(tail)
  264. self.show("\nT}")
  265. col += 1
  266. self.show("\n")
  267. self.pop()
  268. def pp_tbody(self, content):
  269. for item in content:
  270. if is_tuple(item):
  271. (tag, tail) = item
  272. if tag in ['thead', 'tbody', 'tfoot']:
  273. self.pp_tbody(tail)
  274. elif tag == 'tr':
  275. self.pp_tr(tail)
  276. self.show(".sp 1\n")
  277. def count_cols(self, content):
  278. cols = 0
  279. for item in content:
  280. n = 0
  281. if is_blank(item):
  282. pass
  283. elif is_tuple(item):
  284. (tag, tail) = item
  285. if tag in ['thead', 'tbody', 'tfoot']:
  286. n = self.count_cols(tail)
  287. elif tag == 'tr':
  288. n = len(clean(tail))
  289. cols = max(cols, n)
  290. else:
  291. self.warning("invalid item in table: %s" % str(item))
  292. return cols
  293. def pp_table(self, content):
  294. cols = self.count_cols(content)
  295. if cols == 0:
  296. return
  297. self.show("\n.TS\nexpand;\n")
  298. self.show(" lw1 ".join(["lw60" for i in range(cols)]) + ".\n")
  299. self.pp_tbody(content)
  300. self.show("\n.TE\n")
  301. def pp_tag(self, tag, content):
  302. if self.get('in_tr') and tag not in styles:
  303. self.pp(content)
  304. elif tag in formats:
  305. spec = formats[tag]
  306. if is_string(spec):
  307. self.fmt(spec, content)
  308. else:
  309. (fmt, var) = spec
  310. self.fmt(fmt, content, var)
  311. elif tag == 'table':
  312. if self.get('in_table'):
  313. self.warning("cannot handle nested tables")
  314. return
  315. self.push(in_table = True)
  316. self.pp_table(content)
  317. self.pop()
  318. elif tag == 'li':
  319. self.pp_li(content)
  320. elif tag == 'title':
  321. self.pp_title()
  322. else:
  323. self.pp(content)
  324. def pp_string(self, content):
  325. if content == "":
  326. return
  327. s = content
  328. if self.get('no_nl'):
  329. s = s.replace("\n"," ")
  330. s = s.replace("\\", "\\(rs")
  331. s = s.replace("'", "\\(cq")
  332. s = s.replace("\"", "\\(dq")
  333. s = s.replace("`", "\\(ga")
  334. if self.at_bol and s[0] in [".","'"]:
  335. s = "\\&" + s
  336. self.show(s)
  337. def pp_text(self, content):
  338. if content == "":
  339. return
  340. lines = content.splitlines(True)
  341. if len(lines) != 1:
  342. for line in lines:
  343. self.pp_text(line)
  344. return
  345. else:
  346. content = lines[0]
  347. if self.at_bol and not self.get('preformat'):
  348. content = self.strip_re.sub('', content)
  349. self.pp_string(content)
  350. def pp_list(self, content):
  351. for item in content:
  352. self.pp(item)
  353. def pp(self, content):
  354. if is_list(content):
  355. self.pp_list(content)
  356. elif is_tuple(content):
  357. (head, tail) = content
  358. self.pp_tag(head, tail)
  359. elif is_string(content):
  360. self.pp_text(content)
  361. class MyHTMLParser(HTMLParser):
  362. def __init__(self):
  363. HTMLParser.__init__(self)
  364. self.tag_stack = []
  365. self.excluded = frozenset()
  366. self.excluded_stack = []
  367. self.data = []
  368. self.data_stack = []
  369. def top(self):
  370. if self.tag_stack == []:
  371. return None
  372. else:
  373. return self.tag_stack[-1]
  374. def pop(self):
  375. self.excluded = self.excluded_stack.pop()
  376. data = self.data
  377. self.data = self.data_stack.pop()
  378. tag = self.tag_stack.pop()
  379. self.append((tag, data))
  380. return tag
  381. def push(self, tag):
  382. self.tag_stack.append(tag)
  383. self.excluded_stack.append(self.excluded)
  384. if tag in excluded:
  385. self.excluded = self.excluded.union(excluded[tag])
  386. self.data_stack.append(self.data)
  387. self.data = []
  388. def append(self, item):
  389. self.data.append(item)
  390. def is_allowed(self, tag):
  391. return tag not in self.excluded and tag in allowed[self.top()]
  392. def handle_starttag(self, tag, attrs):
  393. if self.tag_stack != []:
  394. while not self.is_allowed(tag):
  395. self.pop()
  396. if tag not in single:
  397. self.push(tag)
  398. else:
  399. self.append((tag,None))
  400. def handle_entityref(self, name):
  401. if name in entities:
  402. self.handle_data(entities[name])
  403. elif name in entitydefs:
  404. self.handle_data(entitydefs[name])
  405. else:
  406. sys.stderr.write("unrecognized entity: %s\n" % name)
  407. def handle_data(self, data):
  408. self.append(data)
  409. def handle_endtag(self, tag):
  410. while True:
  411. if self.pop() == tag:
  412. break
  413. def main():
  414. # parse HTML
  415. infile = sys.argv[1]
  416. inf = file(infile)
  417. p = MyHTMLParser()
  418. for n, line in enumerate(inf):
  419. try:
  420. p.feed(line)
  421. except HTMLParseError, err:
  422. sys.stderr.write('%s:%d:%d: Parse error: %s\n' % (infile, err.lineno, err.offset, err.msg))
  423. sys.exit(1)
  424. except Exception, err:
  425. sys.stderr.write('%s:%d:0: Error (%s): %s\n' % (infile, n + 1, err.__dict__, line))
  426. sys.exit(1)
  427. p.close()
  428. inf.close()
  429. # generate groff
  430. sf = StringIO()
  431. f = Formatter(infile, sf)
  432. f.pp(p.data)
  433. s = sf.getvalue()
  434. sf.close()
  435. # strip excess whitespace
  436. blank_re = re.compile("[ \t\n]*\n([ \t]*\n)*")
  437. s = blank_re.sub('\n', s)
  438. s = s.lstrip()
  439. # write groff
  440. outf = file(sys.argv[2], 'w')
  441. outf.write(s)
  442. outf.close()
  443. if __name__ == "__main__":
  444. main()