g.html2man.py 1.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. #!/usr/bin/env python3
  2. import sys
  3. import re
  4. from ghtml import HTMLParser, HTMLParseError
  5. from ggroff import Formatter
  6. try:
  7. # Python 2 str - bytes version
  8. from StringIO import StringIO
  9. except ImportError:
  10. # Python 3 str - unicode version
  11. from io import StringIO
  12. entities = {"nbsp": " ", "bull": "*"}
  13. # Remove ToC
  14. def fix(content):
  15. if isinstance(content, tuple):
  16. tag, attrs, body = content
  17. if tag == "div" and ("class", "toc") in attrs:
  18. return None
  19. else:
  20. return (tag, attrs, fix(body))
  21. elif isinstance(content, list):
  22. return [fixed for item in content for fixed in [fix(item)] if fixed is not None]
  23. else:
  24. return content
  25. def main():
  26. # parse HTML
  27. infile = sys.argv[1]
  28. inf = open(infile)
  29. p = HTMLParser(entities)
  30. for n, line in enumerate(inf):
  31. try:
  32. p.feed(line)
  33. except HTMLParseError as err:
  34. sys.stderr.write(
  35. "%s:%d:%d: Parse error: %s\n"
  36. % (infile, err.lineno, err.offset, err.msg)
  37. )
  38. sys.exit(1)
  39. except Exception as err:
  40. sys.stderr.write(
  41. "%s:%d:0: Error (%s): %s\n" % (infile, n + 1, repr(err), line)
  42. )
  43. sys.exit(1)
  44. p.close()
  45. inf.close()
  46. # generate groff
  47. sf = StringIO()
  48. f = Formatter(infile, sf)
  49. f.pp(fix(p.data))
  50. s = sf.getvalue()
  51. sf.close()
  52. # strip excess whitespace
  53. blank_re = re.compile("[ \t\n]*\n([ \t]*\n)*")
  54. s = blank_re.sub("\n", s)
  55. s = s.lstrip()
  56. # write groff
  57. with open(sys.argv[2], "wb") as outf:
  58. if sys.version_info.major >= 3:
  59. s = s.encode("UTF-8")
  60. outf.write(s)
  61. if __name__ == "__main__":
  62. main()