gerrit.onap Code Review - sdc/sdc-distribution-client.git/blob

   1 from __future__ import absolute_import, division, unicode_literals
   2 from pip._vendor.six import text_type
   3
   4 import gettext
   5 _ = gettext.gettext
   6
   7 try:
   8     from functools import reduce
   9 except ImportError:
  10     pass
  11
  12 from ..constants import voidElements, booleanAttributes, spaceCharacters
  13 from ..constants import rcdataElements, entities, xmlEntities
  14 from .. import utils
  15 from xml.sax.saxutils import escape
  16
# Join the imported collection of whitespace characters into a single string
# so it can be concatenated with other characters further down this module.
spaceCharacters = "".join(spaceCharacters)
  18
  19 try:
  20     from codecs import register_error, xmlcharrefreplace_errors
  21 except ImportError:
  22     unicode_encode_errors = "strict"
  23 else:
  24     unicode_encode_errors = "htmlentityreplace"
  25
  26     encode_entity_map = {}
  27     is_ucs4 = len("\U0010FFFF") == 1
  28     for k, v in list(entities.items()):
  29         # skip multi-character entities
  30         if ((is_ucs4 and len(v) > 1) or
  31                 (not is_ucs4 and len(v) > 2)):
  32             continue
  33         if v != "&":
  34             if len(v) == 2:
  35                 v = utils.surrogatePairToCodepoint(v)
  36             else:
  37                 v = ord(v)
  38             if not v in encode_entity_map or k.islower():
  39                 # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
  40                 encode_entity_map[v] = k
  41
  42     def htmlentityreplace_errors(exc):
  43         if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
  44             res = []
  45             codepoints = []
  46             skip = False
  47             for i, c in enumerate(exc.object[exc.start:exc.end]):
  48                 if skip:
  49                     skip = False
  50                     continue
  51                 index = i + exc.start
  52                 if utils.isSurrogatePair(exc.object[index:min([exc.end, index + 2])]):
  53                     codepoint = utils.surrogatePairToCodepoint(exc.object[index:index + 2])
  54                     skip = True
  55                 else:
  56                     codepoint = ord(c)
  57                 codepoints.append(codepoint)
  58             for cp in codepoints:
  59                 e = encode_entity_map.get(cp)
  60                 if e:
  61                     res.append("&")
  62                     res.append(e)
  63                     if not e.endswith(";"):
  64                         res.append(";")
  65                 else:
  66                     res.append("&#x%s;" % (hex(cp)[2:]))
  67             return ("".join(res), exc.end)
  68         else:
  69             return xmlcharrefreplace_errors(exc)
  70
  71     register_error(unicode_encode_errors, htmlentityreplace_errors)
  72
  73     del register_error
  74
  75
  76 class HTMLSerializer(object):
  77
  78     # attribute quoting options
  79     quote_attr_values = False
  80     quote_char = '"'
  81     use_best_quote_char = True
  82
  83     # tag syntax options
  84     omit_optional_tags = True
  85     minimize_boolean_attributes = True
  86     use_trailing_solidus = False
  87     space_before_trailing_solidus = True
  88
  89     # escaping options
  90     escape_lt_in_attrs = False
  91     escape_rcdata = False
  92     resolve_entities = True
  93
  94     # miscellaneous options
  95     inject_meta_charset = True
  96     strip_whitespace = False
  97     sanitize = False
  98
  99     options = ("quote_attr_values", "quote_char", "use_best_quote_char",
 100                "minimize_boolean_attributes", "use_trailing_solidus",
 101                "space_before_trailing_solidus", "omit_optional_tags",
 102                "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
 103                "escape_rcdata", "resolve_entities", "sanitize")
 104
 105     def __init__(self, **kwargs):
 106         """Initialize HTMLSerializer.
 107
 108         Keyword options (default given first unless specified) include:
 109
 110         inject_meta_charset=True|False
 111           Whether it insert a meta element to define the character set of the
 112           document.
 113         quote_attr_values=True|False
 114           Whether to quote attribute values that don't require quoting
 115           per HTML5 parsing rules.
 116         quote_char=u'"'|u"'"
 117           Use given quote character for attribute quoting. Default is to
 118           use double quote unless attribute value contains a double quote,
 119           in which case single quotes are used instead.
 120         escape_lt_in_attrs=False|True
 121           Whether to escape < in attribute values.
 122         escape_rcdata=False|True
 123           Whether to escape characters that need to be escaped within normal
 124           elements within rcdata elements such as style.
 125         resolve_entities=True|False
 126           Whether to resolve named character entities that appear in the
 127           source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
 128           are unaffected by this setting.
 129         strip_whitespace=False|True
 130           Whether to remove semantically meaningless whitespace. (This
 131           compresses all whitespace to a single space except within pre.)
 132         minimize_boolean_attributes=True|False
 133           Shortens boolean attributes to give just the attribute value,
 134           for example <input disabled="disabled"> becomes <input disabled>.
 135         use_trailing_solidus=False|True
 136           Includes a close-tag slash at the end of the start tag of void
 137           elements (empty elements whose end tag is forbidden). E.g. <hr/>.
 138         space_before_trailing_solidus=True|False
 139           Places a space immediately before the closing slash in a tag
 140           using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
 141         sanitize=False|True
 142           Strip all unsafe or unknown constructs from output.
 143           See `html5lib user documentation`_
 144         omit_optional_tags=True|False
 145           Omit start/end tags that are optional.
 146
 147         .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
 148         """
 149         if 'quote_char' in kwargs:
 150             self.use_best_quote_char = False
 151         for attr in self.options:
 152             setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
 153         self.errors = []
 154         self.strict = False
 155
 156     def encode(self, string):
 157         assert(isinstance(string, text_type))
 158         if self.encoding:
 159             return string.encode(self.encoding, unicode_encode_errors)
 160         else:
 161             return string
 162
 163     def encodeStrict(self, string):
 164         assert(isinstance(string, text_type))
 165         if self.encoding:
 166             return string.encode(self.encoding, "strict")
 167         else:
 168             return string
 169
 170     def serialize(self, treewalker, encoding=None):
 171         self.encoding = encoding
 172         in_cdata = False
 173         self.errors = []
 174         if encoding and self.inject_meta_charset:
 175             from ..filters.inject_meta_charset import Filter
 176             treewalker = Filter(treewalker, encoding)
 177         # XXX: WhitespaceFilter should be used before OptionalTagFilter
 178         # for maximum efficiently of this latter filter
 179         if self.strip_whitespace:
 180             from ..filters.whitespace import Filter
 181             treewalker = Filter(treewalker)
 182         if self.sanitize:
 183             from ..filters.sanitizer import Filter
 184             treewalker = Filter(treewalker)
 185         if self.omit_optional_tags:
 186             from ..filters.optionaltags import Filter
 187             treewalker = Filter(treewalker)
 188         for token in treewalker:
 189             type = token["type"]
 190             if type == "Doctype":
 191                 doctype = "<!DOCTYPE %s" % token["name"]
 192
 193                 if token["publicId"]:
 194                     doctype += ' PUBLIC "%s"' % token["publicId"]
 195                 elif token["systemId"]:
 196                     doctype += " SYSTEM"
 197                 if token["systemId"]:
 198                     if token["systemId"].find('"') >= 0:
 199                         if token["systemId"].find("'") >= 0:
 200                             self.serializeError(_("System identifer contains both single and double quote characters"))
 201                         quote_char = "'"
 202                     else:
 203                         quote_char = '"'
 204                     doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)
 205
 206                 doctype += ">"
 207                 yield self.encodeStrict(doctype)
 208
 209             elif type in ("Characters", "SpaceCharacters"):
 210                 if type == "SpaceCharacters" or in_cdata:
 211                     if in_cdata and token["data"].find("</") >= 0:
 212                         self.serializeError(_("Unexpected </ in CDATA"))
 213                     yield self.encode(token["data"])
 214                 else:
 215                     yield self.encode(escape(token["data"]))
 216
 217             elif type in ("StartTag", "EmptyTag"):
 218                 name = token["name"]
 219                 yield self.encodeStrict("<%s" % name)
 220                 if name in rcdataElements and not self.escape_rcdata:
 221                     in_cdata = True
 222                 elif in_cdata:
 223                     self.serializeError(_("Unexpected child element of a CDATA element"))
 224                 for (attr_namespace, attr_name), attr_value in token["data"].items():
 225                     # TODO: Add namespace support here
 226                     k = attr_name
 227                     v = attr_value
 228                     yield self.encodeStrict(' ')
 229
 230                     yield self.encodeStrict(k)
 231                     if not self.minimize_boolean_attributes or \
 232                         (k not in booleanAttributes.get(name, tuple())
 233                          and k not in booleanAttributes.get("", tuple())):
 234                         yield self.encodeStrict("=")
 235                         if self.quote_attr_values or not v:
 236                             quote_attr = True
 237                         else:
 238                             quote_attr = reduce(lambda x, y: x or (y in v),
 239                                                 spaceCharacters + ">\"'=", False)
 240                         v = v.replace("&", "&amp;")
 241                         if self.escape_lt_in_attrs:
 242                             v = v.replace("<", "&lt;")
 243                         if quote_attr:
 244                             quote_char = self.quote_char
 245                             if self.use_best_quote_char:
 246                                 if "'" in v and '"' not in v:
 247                                     quote_char = '"'
 248                                 elif '"' in v and "'" not in v:
 249                                     quote_char = "'"
 250                             if quote_char == "'":
 251                                 v = v.replace("'", "&#39;")
 252                             else:
 253                                 v = v.replace('"', "&quot;")
 254                             yield self.encodeStrict(quote_char)
 255                             yield self.encode(v)
 256                             yield self.encodeStrict(quote_char)
 257                         else:
 258                             yield self.encode(v)
 259                 if name in voidElements and self.use_trailing_solidus:
 260                     if self.space_before_trailing_solidus:
 261                         yield self.encodeStrict(" /")
 262                     else:
 263                         yield self.encodeStrict("/")
 264                 yield self.encode(">")
 265
 266             elif type == "EndTag":
 267                 name = token["name"]
 268                 if name in rcdataElements:
 269                     in_cdata = False
 270                 elif in_cdata:
 271                     self.serializeError(_("Unexpected child element of a CDATA element"))
 272                 yield self.encodeStrict("</%s>" % name)
 273
 274             elif type == "Comment":
 275                 data = token["data"]
 276                 if data.find("--") >= 0:
 277                     self.serializeError(_("Comment contains --"))
 278                 yield self.encodeStrict("<!--%s-->" % token["data"])
 279
 280             elif type == "Entity":
 281                 name = token["name"]
 282                 key = name + ";"
 283                 if not key in entities:
 284                     self.serializeError(_("Entity %s not recognized" % name))
 285                 if self.resolve_entities and key not in xmlEntities:
 286                     data = entities[key]
 287                 else:
 288                     data = "&%s;" % name
 289                 yield self.encodeStrict(data)
 290
 291             else:
 292                 self.serializeError(token["data"])
 293
 294     def render(self, treewalker, encoding=None):
 295         if encoding:
 296             return b"".join(list(self.serialize(treewalker, encoding)))
 297         else:
 298             return "".join(list(self.serialize(treewalker)))
 299
 300     def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
 301         # XXX The idea is to make data mandatory.
 302         self.errors.append(data)
 303         if self.strict:
 304             raise SerializeError
 305
 306
 307 def SerializeError(Exception):
 308     """Error in serialized tree"""
 309     pass