1 from __future__ import absolute_import, division, unicode_literals
2 from pip._vendor.six import text_type
8 from functools import reduce
12 from ..constants import voidElements, booleanAttributes, spaceCharacters
13 from ..constants import rcdataElements, entities, xmlEntities
15 from xml.sax.saxutils import escape
# The constants module supplies the space characters as an iterable; the
# serializer concatenates them with other delimiter characters, so collapse
# them into a single string once.
spaceCharacters = "".join(spaceCharacters)

try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    # Without pluggable codec error handlers, unencodable characters are a
    # hard error.
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    # Maps a code point (int) to the entity name used to write it when the
    # target encoding cannot represent the character.
    encode_entity_map = {}
    # True when one str element can hold a non-BMP code point; otherwise
    # such characters are stored as a two-unit surrogate pair.
    is_ucs4 = len("\U0010FFFF") == 1
    for k, v in list(entities.items()):
        # skip multi-character entities
        if ((is_ucs4 and len(v) > 1) or
                (not is_ucs4 and len(v) > 2)):
            continue
        if v != "&":
            if len(v) == 2:
                v = utils.surrogatePairToCodepoint(v)
            else:
                v = ord(v)
            if v not in encode_entity_map or k.islower():
                # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
                encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        """Codec error handler that writes unencodable text as HTML entities.

        For a ``UnicodeEncodeError`` or ``UnicodeTranslateError`` every
        character in ``exc.object[exc.start:exc.end]`` is replaced by a named
        entity from ``encode_entity_map`` when one exists, and by a
        hexadecimal numeric character reference otherwise. Any other
        exception is delegated to ``xmlcharrefreplace_errors``.

        Returns the ``(replacement, resume_position)`` tuple expected from a
        codec error handler.
        """
        if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            return xmlcharrefreplace_errors(exc)

        # First pass: turn the failing slice into code points, merging
        # surrogate pairs into one code point.
        codepoints = []
        skip = False
        for i, c in enumerate(exc.object[exc.start:exc.end]):
            if skip:
                # Second half of a surrogate pair handled on the previous
                # iteration.
                skip = False
                continue
            index = i + exc.start
            if utils.isSurrogatePair(exc.object[index:min(exc.end, index + 2)]):
                codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
                skip = True
            else:
                codepoint = ord(c)
            codepoints.append(codepoint)

        # Second pass: emit a reference for each code point.
        res = []
        for cp in codepoints:
            e = encode_entity_map.get(cp)
            if e:
                res.append("&")
                res.append(e)
                # Some entity names are stored without the terminating
                # semicolon; always emit it.
                if not e.endswith(";"):
                    res.append(";")
            else:
                res.append("&#x%s;" % (hex(cp)[2:]))
        return ("".join(res), exc.end)

    register_error(unicode_encode_errors, htmlentityreplace_errors)
76 class HTMLSerializer(object):
78 # attribute quoting options
79 quote_attr_values = False
81 use_best_quote_char = True
84 omit_optional_tags = True
85 minimize_boolean_attributes = True
86 use_trailing_solidus = False
87 space_before_trailing_solidus = True
90 escape_lt_in_attrs = False
92 resolve_entities = True
94 # miscellaneous options
95 inject_meta_charset = True
96 strip_whitespace = False
99 options = ("quote_attr_values", "quote_char", "use_best_quote_char",
100 "minimize_boolean_attributes", "use_trailing_solidus",
101 "space_before_trailing_solidus", "omit_optional_tags",
102 "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
103 "escape_rcdata", "resolve_entities", "sanitize")
105 def __init__(self, **kwargs):
106 """Initialize HTMLSerializer.
108 Keyword options (default given first unless specified) include:
110 inject_meta_charset=True|False
111 Whether it insert a meta element to define the character set of the
113 quote_attr_values=True|False
114 Whether to quote attribute values that don't require quoting
115 per HTML5 parsing rules.
117 Use given quote character for attribute quoting. Default is to
118 use double quote unless attribute value contains a double quote,
119 in which case single quotes are used instead.
120 escape_lt_in_attrs=False|True
121 Whether to escape < in attribute values.
122 escape_rcdata=False|True
123 Whether to escape characters that need to be escaped within normal
124 elements within rcdata elements such as style.
125 resolve_entities=True|False
126 Whether to resolve named character entities that appear in the
127 source tree. The XML predefined entities < > & " '
128 are unaffected by this setting.
129 strip_whitespace=False|True
130 Whether to remove semantically meaningless whitespace. (This
131 compresses all whitespace to a single space except within pre.)
132 minimize_boolean_attributes=True|False
133 Shortens boolean attributes to give just the attribute value,
134 for example <input disabled="disabled"> becomes <input disabled>.
135 use_trailing_solidus=False|True
136 Includes a close-tag slash at the end of the start tag of void
137 elements (empty elements whose end tag is forbidden). E.g. <hr/>.
138 space_before_trailing_solidus=True|False
139 Places a space immediately before the closing slash in a tag
140 using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
142 Strip all unsafe or unknown constructs from output.
143 See `html5lib user documentation`_
144 omit_optional_tags=True|False
145 Omit start/end tags that are optional.
147 .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
149 if 'quote_char' in kwargs:
150 self.use_best_quote_char = False
151 for attr in self.options:
152 setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
156 def encode(self, string):
157 assert(isinstance(string, text_type))
159 return string.encode(self.encoding, unicode_encode_errors)
163 def encodeStrict(self, string):
164 assert(isinstance(string, text_type))
166 return string.encode(self.encoding, "strict")
170 def serialize(self, treewalker, encoding=None):
171 self.encoding = encoding
174 if encoding and self.inject_meta_charset:
175 from ..filters.inject_meta_charset import Filter
176 treewalker = Filter(treewalker, encoding)
177 # XXX: WhitespaceFilter should be used before OptionalTagFilter
178 # for maximum efficiently of this latter filter
179 if self.strip_whitespace:
180 from ..filters.whitespace import Filter
181 treewalker = Filter(treewalker)
183 from ..filters.sanitizer import Filter
184 treewalker = Filter(treewalker)
185 if self.omit_optional_tags:
186 from ..filters.optionaltags import Filter
187 treewalker = Filter(treewalker)
188 for token in treewalker:
190 if type == "Doctype":
191 doctype = "<!DOCTYPE %s" % token["name"]
193 if token["publicId"]:
194 doctype += ' PUBLIC "%s"' % token["publicId"]
195 elif token["systemId"]:
197 if token["systemId"]:
198 if token["systemId"].find('"') >= 0:
199 if token["systemId"].find("'") >= 0:
200 self.serializeError(_("System identifer contains both single and double quote characters"))
204 doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
207 yield self.encodeStrict(doctype)
209 elif type in ("Characters", "SpaceCharacters"):
210 if type == "SpaceCharacters" or in_cdata:
211 if in_cdata and token["data"].find("</") >= 0:
212 self.serializeError(_("Unexpected </ in CDATA"))
213 yield self.encode(token["data"])
215 yield self.encode(escape(token["data"]))
217 elif type in ("StartTag", "EmptyTag"):
219 yield self.encodeStrict("<%s" % name)
220 if name in rcdataElements and not self.escape_rcdata:
223 self.serializeError(_("Unexpected child element of a CDATA element"))
224 for (attr_namespace, attr_name), attr_value in token["data"].items():
225 # TODO: Add namespace support here
228 yield self.encodeStrict(' ')
230 yield self.encodeStrict(k)
231 if not self.minimize_boolean_attributes or \
232 (k not in booleanAttributes.get(name, tuple())
233 and k not in booleanAttributes.get("", tuple())):
234 yield self.encodeStrict("=")
235 if self.quote_attr_values or not v:
238 quote_attr = reduce(lambda x, y: x or (y in v),
239 spaceCharacters + ">\"'=", False)
240 v = v.replace("&", "&")
241 if self.escape_lt_in_attrs:
242 v = v.replace("<", "<")
244 quote_char = self.quote_char
245 if self.use_best_quote_char:
246 if "'" in v and '"' not in v:
248 elif '"' in v and "'" not in v:
250 if quote_char == "'":
251 v = v.replace("'", "'")
253 v = v.replace('"', """)
254 yield self.encodeStrict(quote_char)
256 yield self.encodeStrict(quote_char)
259 if name in voidElements and self.use_trailing_solidus:
260 if self.space_before_trailing_solidus:
261 yield self.encodeStrict(" /")
263 yield self.encodeStrict("/")
264 yield self.encode(">")
266 elif type == "EndTag":
268 if name in rcdataElements:
271 self.serializeError(_("Unexpected child element of a CDATA element"))
272 yield self.encodeStrict("</%s>" % name)
274 elif type == "Comment":
276 if data.find("--") >= 0:
277 self.serializeError(_("Comment contains --"))
278 yield self.encodeStrict("<!--%s-->" % token["data"])
280 elif type == "Entity":
283 if not key in entities:
284 self.serializeError(_("Entity %s not recognized" % name))
285 if self.resolve_entities and key not in xmlEntities:
289 yield self.encodeStrict(data)
292 self.serializeError(token["data"])
294 def render(self, treewalker, encoding=None):
296 return b"".join(list(self.serialize(treewalker, encoding)))
298 return "".join(list(self.serialize(treewalker)))
300 def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
301 # XXX The idea is to make data mandatory.
302 self.errors.append(data)
class SerializeError(Exception):
    """Error in serialized tree.

    Declared as a class deriving from ``Exception`` so that it can be raised
    and caught. Declaring it with ``def`` would only create a function whose
    single parameter happens to be called ``Exception``.
    """