g.html2man.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459
  1. #!/usr/bin/env python
  2. # coding=iso-8859-1
  3. import sys
  4. import types
  5. import os
  6. import re
  7. from HTMLParser import HTMLParser
  8. from htmlentitydefs import entitydefs
  9. from StringIO import StringIO
  10. try:
  11. version = os.environ['VERSION_NUMBER']
  12. except:
  13. version = ""
  14. entities = {
  15. 'lt': "<",
  16. 'gt': ">",
  17. 'amp': "&",
  18. 'nbsp': " ",
  19. 'copy': "©",
  20. 'quot': "\"",
  21. 'bull': "*"
  22. }
  23. single = ["area", "base", "basefont", "br", "col", "frame",
  24. "hr", "img", "input", "isindex", "link", "meta", "param"]
  25. single = frozenset(single)
  26. heading = ["h1", "h2", "h3", "h4", "h5", "h6"]
  27. fontstyle = ["tt", "i", "b", "u", "s", "strike", "big", "small"]
  28. phrase = [ "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr",
  29. "acronym"]
  30. special = [ "a", "img", "applet", "object", "font", "basefont", "br", "script",
  31. "map", "q", "sub", "sup", "span", "bdo", "iframe"]
  32. formctrl = [ "input", "select", "textarea", "label", "button"]
  33. list = [ "ul", "ol", " dir", "menu"]
  34. head_misc = [ "script", "style", "meta", "link", "object"]
  35. pre_exclusion = [ "img", "object", "applet", "big", "small", "sub", "sup",
  36. "font", "basefont"]
  37. block = [ "p", "pre", "dl", "div", "center", "noscript", "noframes",
  38. "blockquote", "form", "isindex", "hr", "table", "fieldset",
  39. "address"] + heading + list
  40. inline = fontstyle + phrase + special + formctrl
  41. flow = block + inline
  42. html_content = ["head", "body"]
  43. head_content = ["title", "isindex", "base"]
  44. def setify(d):
  45. return dict([(key, frozenset(val)) for key, val in d.iteritems()])
  46. allowed = {
  47. "a": inline,
  48. "abbr": inline,
  49. "acronym": inline,
  50. "address": inline + ["p"],
  51. "applet": flow + ["param"],
  52. "b": inline,
  53. "bdo": inline,
  54. "big": inline,
  55. "blockquote": flow,
  56. "body": flow + ["ins", "del"],
  57. "button": flow,
  58. "caption": inline,
  59. "center": flow,
  60. "cite": inline,
  61. "code": inline,
  62. "colgroup": ["col"],
  63. "dd": flow,
  64. "del": flow,
  65. "dfn": inline,
  66. "dir": ["li"],
  67. "div": flow,
  68. "dl": ["dt", "dd"],
  69. "dt": inline,
  70. "em": inline,
  71. "fieldset": flow + ["legend"],
  72. "font": inline,
  73. "form": flow,
  74. "frameset": ["frameset", "frame", "noframes"],
  75. "h1": inline,
  76. "h2": inline,
  77. "h3": inline,
  78. "h4": inline,
  79. "h5": inline,
  80. "h6": inline,
  81. "head": head_content + head_misc,
  82. "html": html_content,
  83. "i": inline,
  84. "iframe": flow,
  85. "ins": flow,
  86. "kbd": inline,
  87. "label": inline,
  88. "legend": inline,
  89. "li": flow,
  90. "map": block + ["area"],
  91. "menu": ["li"],
  92. "noframes": flow,
  93. "noscript": flow,
  94. "object": flow + ["param"],
  95. "ol": ["li"],
  96. "optgroup": ["option"],
  97. "option": [],
  98. "p": inline,
  99. "pre": inline,
  100. "q": inline,
  101. "s": inline,
  102. "samp": inline,
  103. "script": [],
  104. "select": ["optgroup", "option"],
  105. "small": inline,
  106. "span": inline,
  107. "strike": inline,
  108. "strong": inline,
  109. "style": [],
  110. "sub": inline,
  111. "sup": inline,
  112. "table": ["caption", "col", "colgroup", "thead", "tfoot", "tbody",
  113. "tr"], # to allow for <table>[implied <tbody>]<tr>
  114. "tbody": ["tr"],
  115. "td": flow,
  116. "textarea": [],
  117. "tfoot": ["tr"],
  118. "th": flow,
  119. "thead": ["tr"],
  120. "title": [],
  121. "tr": ["th", "td"],
  122. "tt": inline,
  123. "u": inline,
  124. "ul": ["li"],
  125. "var": inline
  126. }
  127. allowed = setify(allowed)
  128. excluded = {
  129. "a": ["a"],
  130. "button": formctrl + ["a", "form", "isindex", "fieldset", "iframe"],
  131. "dir": block,
  132. "form": ["form"],
  133. "label": ["label"],
  134. "menu": block,
  135. "pre": pre_exclusion
  136. }
  137. excluded = setify(excluded)
  138. styles = {
  139. 'b': "\\fB@\\fR",
  140. 'i': "\\fI@\\fR",
  141. 'em': "\\fI@\\fR",
  142. 'code': "\\fC@\\fR",
  143. 'span': "\\fC@\\fR",
  144. 'sup': "\\u@\\d",
  145. 'hr': ""
  146. }
  147. formats = {
  148. 'br': "\n.br\n",
  149. 'h2': "\n.SH @",
  150. 'h3': "\n.SS @",
  151. 'h4': "\n.SS @",
  152. 'dt': ("\n.IP \"@\" 4m", 'no_nl'),
  153. 'dd': "\n.br\n@",
  154. 'ul': ("\n.RS 4n\n@\n.RE\n", 'in_ul'),
  155. 'menu': ("\n.RS 4n\n@\n.RE\n", 'in_ul'),
  156. 'dir': ("\n.RS 4n\n@\n.RE\n", 'in_ul'),
  157. 'ol': ("\n.IP\n@\n.PP\n", 'index'),
  158. 'p': "\n.PP\n@",
  159. 'pre': ("\n.br\n.nf\n\\fC\n@\n\\fR\n.fi\n", 'preformat')
  160. }
  161. formats.update(styles)
  162. def is_string(x):
  163. return isinstance(x, types.StringType)
  164. def is_tuple(x):
  165. return isinstance(x, types.TupleType)
  166. def is_list(x):
  167. return isinstance(x, types.ListType)
  168. def is_blank(s):
  169. return is_string(s) and s.strip() == ""
  170. def clean(content):
  171. return [item for item in content if not is_blank(item)]
  172. class Formatter:
  173. def __init__(self, stream = sys.stdout):
  174. self.stream = stream
  175. self.style = dict(preformat = False,
  176. in_ul = False,
  177. no_nl = False,
  178. in_table = False,
  179. in_tr = False,
  180. index = [])
  181. self.stack = []
  182. self.strip_re = re.compile("\n[ \t]+")
  183. def warning(self, msg):
  184. sys.stderr.write(msg + '\n')
  185. def set(self, var, val):
  186. self.style[var] = val
  187. def get(self, var):
  188. return self.style[var]
  189. def push(self, **kwargs):
  190. self.stack.append(self.style.copy())
  191. self.style.update(**kwargs)
  192. def pop(self):
  193. self.style = self.stack.pop()
  194. def show(self, s):
  195. self.stream.write(s)
  196. def pp_with(self, content, var, val):
  197. self.push()
  198. self.set(var, val)
  199. self.pp(content)
  200. self.pop()
  201. def fmt(self, format, content, var = None):
  202. (pre,sep,post) = format.partition("@")
  203. if pre != "":
  204. self.show(pre)
  205. if sep != "":
  206. if var:
  207. if var == 'index':
  208. val = self.get('index') + [0]
  209. else:
  210. val = True
  211. self.pp_with(content, var, val)
  212. else:
  213. self.pp(content)
  214. if post != "":
  215. self.show(post)
  216. def pp_li(self, content):
  217. if self.get('in_ul'):
  218. self.fmt("\n.IP \(bu 4n\n@", content)
  219. else:
  220. idx = self.get('index')
  221. idx[-1] += 1
  222. sec = ".".join(map(str,idx))
  223. self.show("\n.IP \\fB%s\\fR\n" % sec)
  224. self.set('index', idx)
  225. self.pp(content)
  226. def pp_title(self):
  227. self.show("\n.TH " +
  228. os.path.basename(sys.argv[1]).replace(".html","") +
  229. " 1 \"\" \"GRASS " +
  230. version +
  231. "\" \"Grass User's Manual\"")
  232. def pp_tr(self, content):
  233. content = clean(content)
  234. self.push(in_tr = True)
  235. col = 0
  236. for item in content:
  237. if not is_tuple(item):
  238. self.warning("invalid item in table row: %s" % str(item))
  239. continue
  240. (tag, tail) = item
  241. if tag not in ['td', 'th']:
  242. self.warning("invalid tag in table row: %s" % tag)
  243. continue
  244. if col > 0:
  245. self.show("\t \t")
  246. self.show("T{\n")
  247. self.pp(tail)
  248. self.show("\nT}")
  249. col += 1
  250. self.show("\n")
  251. self.pop()
  252. def pp_tbody(self, content):
  253. for item in content:
  254. if is_tuple(item):
  255. (tag, tail) = item
  256. if tag in ['thead', 'tbody', 'tfoot']:
  257. self.pp_tbody(tail)
  258. elif tag == 'tr':
  259. self.pp_tr(tail)
  260. self.show(".sp 1\n")
  261. def count_cols(self, content):
  262. cols = 0
  263. for item in content:
  264. n = 0
  265. if is_blank(item):
  266. pass
  267. elif is_tuple(item):
  268. (tag, tail) = item
  269. if tag in ['thead', 'tbody', 'tfoot']:
  270. n = self.count_cols(tail)
  271. elif tag == 'tr':
  272. n = len(clean(tail))
  273. cols = max(cols, n)
  274. else:
  275. self.warning("invalid item in table: %s" % str(item))
  276. return cols
  277. def pp_table(self, content):
  278. cols = self.count_cols(content)
  279. if cols == 0:
  280. return
  281. self.show("\n.TS\nexpand;\n")
  282. self.show(" lw1 ".join(["lw60" for i in range(cols)]) + ".\n")
  283. self.pp_tbody(content)
  284. self.show("\n.TE\n")
  285. def pp_tag(self, tag, content):
  286. if self.get('in_tr') and tag not in styles:
  287. self.pp(content)
  288. elif tag in formats:
  289. spec = formats[tag]
  290. if is_string(spec):
  291. self.fmt(spec, content)
  292. else:
  293. (fmt, var) = spec
  294. self.fmt(fmt, content, var)
  295. elif tag == 'table':
  296. if self.get('in_table'):
  297. self.warning("cannot handle nested tables")
  298. return
  299. self.push(in_table = True)
  300. self.pp_table(content)
  301. self.pop()
  302. elif tag == 'li':
  303. self.pp_li(content)
  304. elif tag == 'title':
  305. self.pp_title()
  306. else:
  307. self.pp(content)
  308. def pp_string(self, content):
  309. s = content
  310. if self.get('no_nl'):
  311. s = s.replace("\n"," ")
  312. s = s.replace("\\", "\\(rs")
  313. s = s.replace("'", "\\(cq")
  314. s = s.replace("\"", "\\(dq")
  315. s = s.replace("`", "\\(ga")
  316. self.show(s)
  317. def pp_text(self, content):
  318. if content != "":
  319. if self.get('preformat'):
  320. self.pp_string(content)
  321. else:
  322. s = self.strip_re.sub('\n', content)
  323. self.pp_string(s)
  324. def pp_list(self, content):
  325. for item in content:
  326. self.pp(item)
  327. def pp(self, content):
  328. if is_list(content):
  329. self.pp_list(content)
  330. elif is_tuple(content):
  331. (head, tail) = content
  332. self.pp_tag(head, tail)
  333. elif is_string(content):
  334. self.pp_text(content)
  335. class MyHTMLParser(HTMLParser):
  336. def __init__(self):
  337. HTMLParser.__init__(self)
  338. self.tag_stack = []
  339. self.excluded = frozenset()
  340. self.excluded_stack = []
  341. self.data = []
  342. self.data_stack = []
  343. def top(self):
  344. if self.tag_stack == []:
  345. return None
  346. else:
  347. return self.tag_stack[-1]
  348. def pop(self):
  349. self.excluded = self.excluded_stack.pop()
  350. data = self.data
  351. self.data = self.data_stack.pop()
  352. tag = self.tag_stack.pop()
  353. self.append((tag, data))
  354. return tag
  355. def push(self, tag):
  356. self.tag_stack.append(tag)
  357. self.excluded_stack.append(self.excluded)
  358. if tag in excluded:
  359. self.excluded = self.excluded.union(excluded[tag])
  360. self.data_stack.append(self.data)
  361. self.data = []
  362. def append(self, item):
  363. self.data.append(item)
  364. def is_allowed(self, tag):
  365. return tag not in self.excluded and tag in allowed[self.top()]
  366. def handle_starttag(self, tag, attrs):
  367. if self.tag_stack != []:
  368. while not self.is_allowed(tag):
  369. self.pop()
  370. if tag not in single:
  371. self.push(tag)
  372. else:
  373. self.append((tag,None))
  374. def handle_entityref(self, name):
  375. if name in entities:
  376. self.handle_data(entities[name])
  377. elif name in entitydefs:
  378. self.handle_data(entitydefs[name])
  379. else:
  380. sys.stderr.write("unrecognized entity: %s\n" % name)
  381. def handle_data(self, data):
  382. self.append(data)
  383. def handle_endtag(self, tag):
  384. while True:
  385. if self.pop() == tag:
  386. break
  387. if __name__ == "__main__":
  388. # parse HTML
  389. inf = file(sys.argv[1])
  390. p = MyHTMLParser()
  391. p.feed(inf.read())
  392. p.close()
  393. inf.close()
  394. # generate groff
  395. sf = StringIO()
  396. f = Formatter(sf)
  397. f.pp(p.data)
  398. s = sf.getvalue()
  399. sf.close()
  400. # strip excess whitespace
  401. blank_re = re.compile("[ \t\n]*\n[ \t\n]*")
  402. s = blank_re.sub('\n', s)
  403. s = s.lstrip()
  404. # write groff
  405. outf = file(sys.argv[2], 'w')
  406. outf.write(s)
  407. outf.close()