g.html2man.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. #!/usr/bin/env python
  2. import sys
  3. import re
  4. from html import HTMLParser, HTMLParseError
  5. from groff import Formatter
  6. from StringIO import StringIO
  7. entities = {
  8. 'nbsp': " ",
  9. 'bull': "*"
  10. }
  11. # Remove ToC
  12. def fix(content):
  13. if isinstance(content, tuple):
  14. tag, attrs, body = content
  15. if tag == 'div' and ('class', 'toc') in attrs:
  16. return None
  17. else:
  18. return (tag, attrs, fix(body))
  19. elif isinstance(content, list):
  20. return [fixed
  21. for item in content
  22. for fixed in [fix(item)]
  23. if fixed is not None]
  24. else:
  25. return content
  26. def main():
  27. # parse HTML
  28. infile = sys.argv[1]
  29. inf = file(infile)
  30. p = HTMLParser(entities)
  31. for n, line in enumerate(inf):
  32. try:
  33. p.feed(line)
  34. except HTMLParseError as err:
  35. sys.stderr.write('%s:%d:%d: Parse error: %s\n' % (infile, err.lineno, err.offset, err.msg))
  36. sys.exit(1)
  37. except Exception as err:
  38. sys.stderr.write('%s:%d:0: Error (%s): %s\n' % (infile, n + 1, repr(err), line))
  39. sys.exit(1)
  40. p.close()
  41. inf.close()
  42. # generate groff
  43. sf = StringIO()
  44. f = Formatter(infile, sf)
  45. f.pp(fix(p.data))
  46. s = sf.getvalue()
  47. sf.close()
  48. # strip excess whitespace
  49. blank_re = re.compile("[ \t\n]*\n([ \t]*\n)*")
  50. s = blank_re.sub('\n', s)
  51. s = s.lstrip()
  52. # write groff
  53. outf = file(sys.argv[2], 'w')
  54. outf.write(s)
  55. outf.close()
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()