ghtml.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. from __future__ import (absolute_import, division, generators, nested_scopes,
  2. print_function, unicode_literals, with_statement)
  3. import sys
  4. try:
  5. # Python 2 import
  6. import HTMLParser as base
  7. HTMLParseError = base.HTMLParseError
  8. except:
  9. # Python 3 import
  10. import html.parser as base
  11. # TODO: this needs a better fix since HTMLParseError is actually
  12. # used including its attributes, so that actually fails
  13. # HTMLParseError is depreciated, parsing is not strict
  14. HTMLParseError = Exception
  15. try:
  16. # Python 3
  17. from html.entities import entitydefs
  18. except ImportError:
  19. # Python 2
  20. from htmlentitydefs import entitydefs
  21. __all__ = ["HTMLParser", "HTMLParseError"]
  22. omit_start = ["body", "tbody", "head", "html"]
  23. single = ["area", "base", "basefont", "br", "col", "frame",
  24. "hr", "img", "input", "isindex", "link", "meta", "param"]
  25. single = frozenset(single)
  26. heading = ["h1", "h2", "h3", "h4", "h5", "h6"]
  27. fontstyle = ["tt", "i", "b", "u", "s", "strike", "big", "small"]
  28. phrase = ["em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr",
  29. "acronym"]
  30. special = ["a", "img", "applet", "object", "font", "basefont", "br", "script",
  31. "map", "q", "sub", "sup", "span", "bdo", "iframe"]
  32. formctrl = ["input", "select", "textarea", "label", "button"]
  33. lists = ["ul", "ol", " dir", "menu"]
  34. head_misc = ["script", "style", "meta", "link", "object"]
  35. pre_exclusion = ["img", "object", "applet", "big", "small", "sub", "sup",
  36. "font", "basefont"]
  37. block = ["p", "pre", "dl", "div", "center", "noscript", "noframes",
  38. "blockquote", "form", "isindex", "hr", "table", "fieldset",
  39. "address"] + heading + lists
  40. inline = fontstyle + phrase + special + formctrl
  41. flow = block + inline
  42. html_content = ["head", "body"]
  43. head_content = ["title", "isindex", "base"]
  44. def setify(d):
  45. return dict([(key, frozenset(val)) for key, val in d.items()])
  46. def omit(allowed, tags):
  47. result = {}
  48. for k, v in allowed.items():
  49. for t in tags:
  50. if t in v:
  51. v = v.union(allowed[t])
  52. result[k] = v
  53. return result
# Content-model table: maps an element name to the child elements that may
# appear directly inside it.  The parser consults it to decide whether a new
# start tag fits in the innermost open element.
# NOTE(review): the entries look like the HTML 4.01 Transitional DTD content
# models -- confirm before editing individual rows.
allowed = {
    "a": inline,
    "abbr": inline,
    "acronym": inline,
    "address": inline + ["p"],
    "applet": flow + ["param"],
    "b": inline,
    "bdo": inline,
    "big": inline,
    "blockquote": flow,
    "body": flow + ["ins", "del"],
    "button": flow,
    "caption": inline,
    "center": flow,
    "cite": inline,
    "code": inline,
    "colgroup": ["col"],
    "dd": flow,
    "del": flow,
    "dfn": inline,
    "dir": ["li"],
    "div": flow,
    "dl": ["dt", "dd"],
    "dt": inline,
    "em": inline,
    "fieldset": flow + ["legend"],
    "font": inline,
    "form": flow,
    "frameset": ["frameset", "frame", "noframes"],
    "h1": inline,
    "h2": inline,
    "h3": inline,
    "h4": inline,
    "h5": inline,
    "h6": inline,
    "head": head_content + head_misc,
    "html": html_content,
    "i": inline,
    "iframe": flow,
    "ins": flow,
    "kbd": inline,
    "label": inline,
    "legend": inline,
    "li": flow,
    "map": block + ["area"],
    "menu": ["li"],
    "noframes": flow,
    "noscript": flow,
    "object": flow + ["param"],
    "ol": ["li"],
    "optgroup": ["option"],
    # An empty list means no child elements are accepted.
    "option": [],
    "p": inline,
    "pre": inline,
    "q": inline,
    "s": inline,
    "samp": inline,
    "script": [],
    "select": ["optgroup", "option"],
    "small": inline,
    "span": inline,
    "strike": inline,
    "strong": inline,
    "style": [],
    "sub": inline,
    "sup": inline,
    "table": ["caption", "col", "colgroup", "thead", "tfoot", "tbody"],
    "tbody": ["tr"],
    "td": flow,
    "textarea": [],
    "tfoot": ["tr"],
    "th": flow,
    "thead": ["tr"],
    "title": [],
    "tr": ["th", "td"],
    "tt": inline,
    "u": inline,
    "ul": ["li"],
    "var": inline
}
# Freeze every child list into a frozenset for membership tests.
allowed = setify(allowed)
# Wherever a tag from omit_start is an accepted child, also accept that tag's
# own children (e.g. "table" lists "tbody", so it gains "tr" as well).
allowed = omit(allowed, omit_start)
  136. excluded = {
  137. "a": ["a"],
  138. "button": formctrl + ["a", "form", "isindex", "fieldset", "iframe"],
  139. "dir": block,
  140. "form": ["form"],
  141. "label": ["label"],
  142. "menu": block,
  143. "pre": pre_exclusion
  144. }
  145. excluded = setify(excluded)
  146. class HTMLParser(base.HTMLParser):
  147. def __init__(self, entities=None):
  148. base.HTMLParser.__init__(self)
  149. self.tag_stack = []
  150. self.excluded = frozenset()
  151. self.excluded_stack = []
  152. self.data = []
  153. self.data_stack = []
  154. self.decls = []
  155. if entities:
  156. self.entities = entities
  157. else:
  158. self.entities = {}
  159. def top(self):
  160. if self.tag_stack == []:
  161. return None
  162. else:
  163. return self.tag_stack[-1][0]
  164. def pop(self):
  165. self.excluded = self.excluded_stack.pop()
  166. data = self.data
  167. self.data = self.data_stack.pop()
  168. (tag, attrs) = self.tag_stack.pop()
  169. self.append((tag, attrs, data))
  170. return tag
  171. def push(self, tag, attrs):
  172. self.tag_stack.append((tag, attrs))
  173. self.excluded_stack.append(self.excluded)
  174. if tag in excluded:
  175. self.excluded = self.excluded.union(excluded[tag])
  176. self.data_stack.append(self.data)
  177. self.data = []
  178. def append(self, item):
  179. self.data.append(item)
  180. def is_allowed(self, tag):
  181. return tag not in self.excluded and tag in allowed[self.top()]
  182. def handle_starttag(self, tag, attrs):
  183. if self.tag_stack != []:
  184. while not self.is_allowed(tag):
  185. self.pop()
  186. if tag not in single:
  187. self.push(tag, attrs)
  188. else:
  189. self.append((tag, attrs, None))
  190. def handle_entityref(self, name):
  191. if name in self.entities:
  192. self.handle_data(self.entities[name])
  193. elif name in entitydefs:
  194. self.handle_data(entitydefs[name])
  195. else:
  196. sys.stderr.write("unrecognized entity: %s\n" % name)
  197. def handle_charref(self, name):
  198. sys.stderr.write('unsupported character reference <%s>' % name)
  199. def handle_data(self, data):
  200. self.append(data)
  201. def handle_endtag(self, tag):
  202. while True:
  203. if self.pop() == tag:
  204. break
  205. def handle_decl(self, decl):
  206. self.decls.append(decl)