08b60dfcc451e3499f5cea3d2e6369f47c1da68f
[sdc/sdc-distribution-client.git] /
1 from __future__ import absolute_import, division, unicode_literals
2 from pip._vendor.six import text_type
3
4 import gettext
5 _ = gettext.gettext
6
7 try:
8     from functools import reduce
9 except ImportError:
10     pass
11
12 from ..constants import voidElements, booleanAttributes, spaceCharacters
13 from ..constants import rcdataElements, entities, xmlEntities
14 from .. import utils
15 from xml.sax.saxutils import escape
16
17 spaceCharacters = "".join(spaceCharacters)
18
try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    # No pluggable codec error handlers: unencodable text simply raises.
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    # Maps a code point to the entity name used to write it out.
    encode_entity_map = {}
    # True when a character outside the BMP occupies a single code unit.
    is_ucs4 = len("\U0010FFFF") == 1
    for k, v in entities.items():
        # Skip entities that expand to more than one character (two code
        # units are allowed on narrow builds, where they form a surrogate
        # pair), and never map the ampersand itself.
        if len(v) > (1 if is_ucs4 else 2) or v == "&":
            continue
        v = utils.surrogatePairToCodepoint(v) if len(v) == 2 else ord(v)
        # Prefer lower-case names: &lt; over &LT;, and similarly for
        # &amp;, &gt;, etc.
        if v not in encode_entity_map or k.islower():
            encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        """Codec error handler that writes unencodable text as HTML entities.

        For encode/translate errors, each offending code point is replaced
        by its named entity from ``encode_entity_map`` when one exists and
        by a hexadecimal character reference otherwise.  Any other error
        type is delegated to ``xmlcharrefreplace_errors``.

        Returns the ``(replacement, resume_position)`` pair expected from a
        codec error handler.
        """
        if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            return xmlcharrefreplace_errors(exc)

        # First pass: turn the failing slice into code points, folding
        # surrogate pairs into the single code point they stand for.
        failed = exc.object[exc.start:exc.end]
        codepoints = []
        offset = 0
        while offset < len(failed):
            index = exc.start + offset
            if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
                codepoints.append(
                    utils.surrogatePairToCodepoint(exc.object[index:index + 2]))
                offset += 2
            else:
                codepoints.append(ord(failed[offset]))
                offset += 1

        # Second pass: spell every code point as an entity reference.
        pieces = []
        for cp in codepoints:
            entity_name = encode_entity_map.get(cp)
            if not entity_name:
                pieces.append("&#x%s;" % (hex(cp)[2:]))
                continue
            pieces.append("&")
            pieces.append(entity_name)
            if not entity_name.endswith(";"):
                pieces.append(";")
        return ("".join(pieces), exc.end)

    register_error(unicode_encode_errors, htmlentityreplace_errors)

    del register_error
74
75
class HTMLSerializer(object):
    """Turn a stream of treewalker tokens into HTML markup.

    Tokens are dicts carrying a ``"type"`` key (``Doctype``, ``Characters``,
    ``SpaceCharacters``, ``StartTag``, ``EmptyTag``, ``EndTag``, ``Comment``
    or ``Entity``).  ``serialize`` yields the markup piece by piece and
    ``render`` joins those pieces into a single string or bytes object.
    Problems found while serializing are collected in ``self.errors``.
    """

    # attribute quoting options
    quote_attr_values = False
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # Output encoding of the current/last serialize() call.  Defined here so
    # that encode()/encodeStrict() also work before serialize() has run.
    encoding = None

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "minimize_boolean_attributes", "use_trailing_solidus",
               "space_before_trailing_solidus", "omit_optional_tags",
               "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
               "escape_rcdata", "resolve_entities", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether to insert a meta element to define the character set of
          the document.
        quote_attr_values=True|False
          Whether to quote attribute values that don't require quoting
          per HTML5 parsing rules.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.  Passing this
          option switches off ``use_best_quote_char`` unless that option
          is passed explicitly as well.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute name,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.

        Keyword arguments that are not listed in ``options`` are ignored.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        # An explicit quote character disables automatic quote selection;
        # the loop below still lets use_best_quote_char=... override this.
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` for output, replacing unencodable characters.

        Returns ``string`` unchanged when no output encoding is set;
        otherwise returns bytes produced with the module's
        ``unicode_encode_errors`` error handler.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, unicode_encode_errors)
        return string

    def encodeStrict(self, string):
        """Encode ``string`` for output, failing on unencodable characters.

        Used for markup where a character reference is not applied as a
        fallback (tag names, attribute names, comments, doctypes).  Returns
        ``string`` unchanged when no output encoding is set.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        return string

    def serialize(self, treewalker, encoding=None):
        """Yield the serialized form of ``treewalker`` piece by piece.

        ``treewalker`` is an iterable of token dicts.  When ``encoding`` is
        given the yielded pieces are bytes in that encoding, otherwise they
        are text.  ``self.errors`` is reset when iteration starts and is
        filled with the problems found along the way.
        """
        self.encoding = encoding
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            from ..filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from ..filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from ..filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from ..filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    # Quote with whichever character the identifier does
                    # not contain; report it when it contains both.
                    if '"' in token["systemId"]:
                        if "'" in token["systemId"]:
                            self.serializeError(_("System identifer contains both single and double quote characters"))
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and raw-text content are written verbatim.
                    if in_cdata and "</" in token["data"]:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                for (attr_namespace, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(' ')

                    yield self.encodeStrict(k)
                    # A boolean attribute being minimized is written as its
                    # bare name; everything else gets "=value".
                    if not self.minimize_boolean_attributes or \
                        (k not in booleanAttributes.get(name, tuple())
                         and k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict("=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            # Quotes are needed as soon as the value holds
                            # whitespace or one of > " ' =
                            quote_attr = any(char in v for char in
                                             spaceCharacters + ">\"'=")
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs:
                            v = v.replace("<", "&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                # Pick the quote the value does not contain
                                # so that nothing has to be escaped.
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if "--" in data:
                    self.serializeError(_("Comment contains --"))
                yield self.encodeStrict("<!--%s-->" % data)

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    # Format after the catalogue lookup so that the message
                    # template itself is what gets translated.
                    self.serializeError(_("Entity %s not recognized") % name)
                # An unknown entity cannot be resolved; it is written back
                # as a literal reference instead of failing on the lookup.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize ``treewalker`` and return the result in one piece.

        Returns bytes when ``encoding`` is given and text otherwise.
        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization problem.

        ``data`` is appended to ``self.errors``; when ``self.strict`` is
        true the problem is additionally raised as ``SerializeError``.
        """
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError(data)
305
306
class SerializeError(Exception):
    """Error in serialized tree"""
    pass