ghtml.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309
  1. from __future__ import (
  2. absolute_import,
  3. division,
  4. generators,
  5. nested_scopes,
  6. print_function,
  7. unicode_literals,
  8. with_statement,
  9. )
  10. import sys
  11. try:
  12. # Python 2 import
  13. import HTMLParser as base
  14. HTMLParseError = base.HTMLParseError
  15. except ImportError:
  16. # Python 3 import
  17. import html.parser as base
    # TODO: this needs a better fix. HTMLParseError is actually used,
    # including its attributes, so aliasing it to Exception fails there.
    # HTMLParseError is deprecated; parsing is no longer strict.
  21. HTMLParseError = Exception
  22. try:
  23. # Python 3
  24. from html.entities import entitydefs
  25. except ImportError:
  26. # Python 2
  27. from htmlentitydefs import entitydefs
  28. __all__ = ["HTMLParser", "HTMLParseError"]
  29. omit_start = ["body", "tbody", "head", "html"]
  30. single = [
  31. "area",
  32. "base",
  33. "basefont",
  34. "br",
  35. "col",
  36. "frame",
  37. "hr",
  38. "img",
  39. "input",
  40. "isindex",
  41. "link",
  42. "meta",
  43. "param",
  44. ]
  45. single = frozenset(single)
  46. heading = ["h1", "h2", "h3", "h4", "h5", "h6"]
  47. fontstyle = ["tt", "i", "b", "u", "s", "strike", "big", "small"]
  48. phrase = [
  49. "em",
  50. "strong",
  51. "dfn",
  52. "code",
  53. "samp",
  54. "kbd",
  55. "var",
  56. "cite",
  57. "abbr",
  58. "acronym",
  59. ]
  60. special = [
  61. "a",
  62. "img",
  63. "applet",
  64. "object",
  65. "font",
  66. "basefont",
  67. "br",
  68. "script",
  69. "map",
  70. "q",
  71. "sub",
  72. "sup",
  73. "span",
  74. "bdo",
  75. "iframe",
  76. ]
  77. formctrl = ["input", "select", "textarea", "label", "button"]
  78. lists = ["ul", "ol", " dir", "menu"]
  79. head_misc = ["script", "style", "meta", "link", "object"]
  80. pre_exclusion = [
  81. "img",
  82. "object",
  83. "applet",
  84. "big",
  85. "small",
  86. "sub",
  87. "sup",
  88. "font",
  89. "basefont",
  90. ]
  91. block = (
  92. [
  93. "p",
  94. "pre",
  95. "dl",
  96. "div",
  97. "center",
  98. "noscript",
  99. "noframes",
  100. "blockquote",
  101. "form",
  102. "isindex",
  103. "hr",
  104. "table",
  105. "fieldset",
  106. "address",
  107. ]
  108. + heading
  109. + lists
  110. )
  111. inline = fontstyle + phrase + special + formctrl
  112. flow = block + inline
  113. html_content = ["head", "body"]
  114. head_content = ["title", "isindex", "base"]
  115. def setify(d):
  116. return dict([(key, frozenset(val)) for key, val in d.items()])
  117. def omit(allowed, tags):
  118. result = {}
  119. for k, v in allowed.items():
  120. for t in tags:
  121. if t in v:
  122. v = v.union(allowed[t])
  123. result[k] = v
  124. return result
  125. allowed = {
  126. "a": inline,
  127. "abbr": inline,
  128. "acronym": inline,
  129. "address": inline + ["p"],
  130. "applet": flow + ["param"],
  131. "b": inline,
  132. "bdo": inline,
  133. "big": inline,
  134. "blockquote": flow,
  135. "body": flow + ["ins", "del"],
  136. "button": flow,
  137. "caption": inline,
  138. "center": flow,
  139. "cite": inline,
  140. "code": inline,
  141. "colgroup": ["col"],
  142. "dd": flow,
  143. "del": flow,
  144. "dfn": inline,
  145. "dir": ["li"],
  146. "div": flow,
  147. "dl": ["dt", "dd"],
  148. "dt": inline,
  149. "em": inline,
  150. "fieldset": flow + ["legend"],
  151. "font": inline,
  152. "form": flow,
  153. "frameset": ["frameset", "frame", "noframes"],
  154. "h1": inline,
  155. "h2": inline,
  156. "h3": inline,
  157. "h4": inline,
  158. "h5": inline,
  159. "h6": inline,
  160. "head": head_content + head_misc,
  161. "html": html_content,
  162. "i": inline,
  163. "iframe": flow,
  164. "ins": flow,
  165. "kbd": inline,
  166. "label": inline,
  167. "legend": inline,
  168. "li": flow,
  169. "map": block + ["area"],
  170. "menu": ["li"],
  171. "noframes": flow,
  172. "noscript": flow,
  173. "object": flow + ["param"],
  174. "ol": ["li"],
  175. "optgroup": ["option"],
  176. "option": [],
  177. "p": inline,
  178. "pre": inline,
  179. "q": inline,
  180. "s": inline,
  181. "samp": inline,
  182. "script": [],
  183. "select": ["optgroup", "option"],
  184. "small": inline,
  185. "span": inline,
  186. "strike": inline,
  187. "strong": inline,
  188. "style": [],
  189. "sub": inline,
  190. "sup": inline,
  191. "table": ["caption", "col", "colgroup", "thead", "tfoot", "tbody"],
  192. "tbody": ["tr"],
  193. "td": flow,
  194. "textarea": [],
  195. "tfoot": ["tr"],
  196. "th": flow,
  197. "thead": ["tr"],
  198. "title": [],
  199. "tr": ["th", "td"],
  200. "tt": inline,
  201. "u": inline,
  202. "ul": ["li"],
  203. "var": inline,
  204. }
  205. allowed = setify(allowed)
  206. allowed = omit(allowed, omit_start)
  207. excluded = {
  208. "a": ["a"],
  209. "button": formctrl + ["a", "form", "isindex", "fieldset", "iframe"],
  210. "dir": block,
  211. "form": ["form"],
  212. "label": ["label"],
  213. "menu": block,
  214. "pre": pre_exclusion,
  215. }
  216. excluded = setify(excluded)
  217. class HTMLParser(base.HTMLParser):
  218. def __init__(self, entities=None):
  219. base.HTMLParser.__init__(self)
  220. self.tag_stack = []
  221. self.excluded = frozenset()
  222. self.excluded_stack = []
  223. self.data = []
  224. self.data_stack = []
  225. self.decls = []
  226. if entities:
  227. self.entities = entities
  228. else:
  229. self.entities = {}
  230. def top(self):
  231. if self.tag_stack == []:
  232. return None
  233. else:
  234. return self.tag_stack[-1][0]
  235. def pop(self):
  236. self.excluded = self.excluded_stack.pop()
  237. data = self.data
  238. self.data = self.data_stack.pop()
  239. (tag, attrs) = self.tag_stack.pop()
  240. self.append((tag, attrs, data))
  241. return tag
  242. def push(self, tag, attrs):
  243. self.tag_stack.append((tag, attrs))
  244. self.excluded_stack.append(self.excluded)
  245. if tag in excluded:
  246. self.excluded = self.excluded.union(excluded[tag])
  247. self.data_stack.append(self.data)
  248. self.data = []
  249. def append(self, item):
  250. self.data.append(item)
  251. def is_allowed(self, tag):
  252. return tag not in self.excluded and tag in allowed[self.top()]
  253. def handle_starttag(self, tag, attrs):
  254. if self.tag_stack != []:
  255. while not self.is_allowed(tag):
  256. self.pop()
  257. if tag not in single:
  258. self.push(tag, attrs)
  259. else:
  260. self.append((tag, attrs, None))
  261. def handle_entityref(self, name):
  262. if name in self.entities:
  263. self.handle_data(self.entities[name])
  264. elif name in entitydefs:
  265. self.handle_data(entitydefs[name])
  266. else:
  267. sys.stderr.write("unrecognized entity: %s\n" % name)
  268. def handle_charref(self, name):
  269. sys.stderr.write("unsupported character reference <%s>" % name)
  270. def handle_data(self, data):
  271. self.append(data)
  272. def handle_endtag(self, tag):
  273. while True:
  274. if self.pop() == tag:
  275. break
  276. def handle_decl(self, decl):
  277. self.decls.append(decl)