html.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. import sys
  2. import HTMLParser as base
  3. import htmlentitydefs
# Re-export the base parser's exception type under this module's namespace.
HTMLParseError = base.HTMLParseError
# Names exported by `from <this module> import *`.
__all__ = ["HTMLParser", "HTMLParseError"]
  6. omit_start = ["body", "tbody", "head", "html"]
  7. single = ["area", "base", "basefont", "br", "col", "frame",
  8. "hr", "img", "input", "isindex", "link", "meta", "param"]
  9. single = frozenset(single)
  10. heading = ["h1", "h2", "h3", "h4", "h5", "h6"]
  11. fontstyle = ["tt", "i", "b", "u", "s", "strike", "big", "small"]
  12. phrase = [ "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr",
  13. "acronym"]
  14. special = [ "a", "img", "applet", "object", "font", "basefont", "br", "script",
  15. "map", "q", "sub", "sup", "span", "bdo", "iframe"]
  16. formctrl = [ "input", "select", "textarea", "label", "button"]
  17. lists = [ "ul", "ol", " dir", "menu"]
  18. head_misc = [ "script", "style", "meta", "link", "object"]
  19. pre_exclusion = [ "img", "object", "applet", "big", "small", "sub", "sup",
  20. "font", "basefont"]
  21. block = [ "p", "pre", "dl", "div", "center", "noscript", "noframes",
  22. "blockquote", "form", "isindex", "hr", "table", "fieldset",
  23. "address"] + heading + lists
  24. inline = fontstyle + phrase + special + formctrl
  25. flow = block + inline
  26. html_content = ["head", "body"]
  27. head_content = ["title", "isindex", "base"]
  28. def setify(d):
  29. return dict([(key, frozenset(val)) for key, val in d.iteritems()])
  30. def omit(allowed, tags):
  31. result = {}
  32. for k, v in allowed.iteritems():
  33. for t in tags:
  34. if t in v:
  35. v = v.union(allowed[t])
  36. result[k] = v
  37. return result
  38. allowed = {
  39. "a": inline,
  40. "abbr": inline,
  41. "acronym": inline,
  42. "address": inline + ["p"],
  43. "applet": flow + ["param"],
  44. "b": inline,
  45. "bdo": inline,
  46. "big": inline,
  47. "blockquote": flow,
  48. "body": flow + ["ins", "del"],
  49. "button": flow,
  50. "caption": inline,
  51. "center": flow,
  52. "cite": inline,
  53. "code": inline,
  54. "colgroup": ["col"],
  55. "dd": flow,
  56. "del": flow,
  57. "dfn": inline,
  58. "dir": ["li"],
  59. "div": flow,
  60. "dl": ["dt", "dd"],
  61. "dt": inline,
  62. "em": inline,
  63. "fieldset": flow + ["legend"],
  64. "font": inline,
  65. "form": flow,
  66. "frameset": ["frameset", "frame", "noframes"],
  67. "h1": inline,
  68. "h2": inline,
  69. "h3": inline,
  70. "h4": inline,
  71. "h5": inline,
  72. "h6": inline,
  73. "head": head_content + head_misc,
  74. "html": html_content,
  75. "i": inline,
  76. "iframe": flow,
  77. "ins": flow,
  78. "kbd": inline,
  79. "label": inline,
  80. "legend": inline,
  81. "li": flow,
  82. "map": block + ["area"],
  83. "menu": ["li"],
  84. "noframes": flow,
  85. "noscript": flow,
  86. "object": flow + ["param"],
  87. "ol": ["li"],
  88. "optgroup": ["option"],
  89. "option": [],
  90. "p": inline,
  91. "pre": inline,
  92. "q": inline,
  93. "s": inline,
  94. "samp": inline,
  95. "script": [],
  96. "select": ["optgroup", "option"],
  97. "small": inline,
  98. "span": inline,
  99. "strike": inline,
  100. "strong": inline,
  101. "style": [],
  102. "sub": inline,
  103. "sup": inline,
  104. "table": ["caption", "col", "colgroup", "thead", "tfoot", "tbody"],
  105. "tbody": ["tr"],
  106. "td": flow,
  107. "textarea": [],
  108. "tfoot": ["tr"],
  109. "th": flow,
  110. "thead": ["tr"],
  111. "title": [],
  112. "tr": ["th", "td"],
  113. "tt": inline,
  114. "u": inline,
  115. "ul": ["li"],
  116. "var": inline
  117. }
  118. allowed = setify(allowed)
  119. allowed = omit(allowed, omit_start)
  120. excluded = {
  121. "a": ["a"],
  122. "button": formctrl + ["a", "form", "isindex", "fieldset", "iframe"],
  123. "dir": block,
  124. "form": ["form"],
  125. "label": ["label"],
  126. "menu": block,
  127. "pre": pre_exclusion
  128. }
  129. excluded = setify(excluded)
  130. class HTMLParser(base.HTMLParser):
  131. def __init__(self, entities = None):
  132. base.HTMLParser.__init__(self)
  133. self.tag_stack = []
  134. self.excluded = frozenset()
  135. self.excluded_stack = []
  136. self.data = []
  137. self.data_stack = []
  138. self.decls = []
  139. if entities:
  140. self.entities = entities
  141. else:
  142. self.entities = {}
  143. def top(self):
  144. if self.tag_stack == []:
  145. return None
  146. else:
  147. return self.tag_stack[-1][0]
  148. def pop(self):
  149. self.excluded = self.excluded_stack.pop()
  150. data = self.data
  151. self.data = self.data_stack.pop()
  152. (tag, attrs) = self.tag_stack.pop()
  153. self.append((tag, attrs, data))
  154. return tag
  155. def push(self, tag, attrs):
  156. self.tag_stack.append((tag, attrs))
  157. self.excluded_stack.append(self.excluded)
  158. if tag in excluded:
  159. self.excluded = self.excluded.union(excluded[tag])
  160. self.data_stack.append(self.data)
  161. self.data = []
  162. def append(self, item):
  163. self.data.append(item)
  164. def is_allowed(self, tag):
  165. return tag not in self.excluded and tag in allowed[self.top()]
  166. def handle_starttag(self, tag, attrs):
  167. if self.tag_stack != []:
  168. while not self.is_allowed(tag):
  169. self.pop()
  170. if tag not in single:
  171. self.push(tag, attrs)
  172. else:
  173. self.append((tag, attrs, None))
  174. def handle_entityref(self, name):
  175. if name in self.entities:
  176. self.handle_data(self.entities[name])
  177. elif name in htmlentitydefs.entitydefs:
  178. self.handle_data(htmlentitydefs.entitydefs[name])
  179. else:
  180. sys.stderr.write("unrecognized entity: %s\n" % name)
  181. def handle_charref(self, name):
  182. sys.stderr.write('unsupported character reference <%s>' % name);
  183. def handle_data(self, data):
  184. self.append(data)
  185. def handle_endtag(self, tag):
  186. while True:
  187. if self.pop() == tag:
  188. break
  189. def handle_decl(self, decl):
  190. self.decls.append(decl)