35d08efaa6145719f8daad533d03df6188e2d2e4
[sdc/sdc-distribution-client.git] /
1 """Module for supporting the lxml.etree library. The idea here is to use as much
2 of the native library as possible, without using fragile hacks like custom element
3 names that break between releases. The downside of this is that we cannot represent
4 all possible trees; specifically the following are known to cause problems:
5
6 Text or comments as siblings of the root element
7 Doctypes with no name
8
9 When any of these things occur, we emit a DataLossWarning
10 """
11
12 from __future__ import absolute_import, division, unicode_literals
13
14 import warnings
15 import re
16 import sys
17
18 from . import _base
19 from ..constants import DataLossWarning
20 from .. import constants
21 from . import etree as etree_builders
22 from .. import ihatexml
23
24 import lxml.etree as etree
25
26
# Module-wide switch read by TreeBuilder.getDocument: when true the whole
# ElementTree is returned, otherwise only its root element.
fullTree = True
# Splits a "{namespace}localname" string into its namespace and local name.
tag_regexp = re.compile("{([^}]*)}(.*)")

# The .tag value carried by lxml comment nodes; serializers compare an
# element's tag against it to recognise comments.
comment_type = etree.Comment("asd").tag
31
32
class DocumentType(object):
    """Plain record of the three doctype fields handed over by the builder."""

    def __init__(self, name, publicId, systemId):
        # The values are stored untouched; no validation happens here.
        self.name, self.publicId, self.systemId = name, publicId, systemId
38
39
class Document(object):
    """Stand-in for the document node, backed by an lxml ElementTree.

    ``_elementTree`` starts out as ``None`` and must be assigned before
    ``appendChild`` is used. ``_childNodes`` is the list exposed through
    the read-only ``childNodes`` property; ``appendChild`` does not add to
    it.
    """

    def __init__(self):
        self._elementTree = None
        self._childNodes = []

    def appendChild(self, element):
        """Attach ``element._element`` as the last sibling after the root.

        ``addnext`` inserts directly after the node it is called on, so
        calling it on the root every time would place each new node in front
        of the ones appended earlier. Walking to the last existing sibling
        first keeps appended nodes in the order they were appended.
        """
        last = self._elementTree.getroot()
        following = last.getnext()
        while following is not None:
            last = following
            following = last.getnext()
        last.addnext(element._element)

    def _getChildNodes(self):
        """Return the internal list of child wrappers."""
        return self._childNodes

    childNodes = property(_getChildNodes)
52
53
def testSerializer(element):
    """Render a tree, fragment or text node as an indented text dump.

    Every line after the ``#document`` / ``#document-fragment`` header
    starts with ``|`` followed by indentation. Three kinds of input are
    told apart by duck typing:

    * an object with ``getroot`` (a full tree): the doctype, if an internal
      DTD is present, followed by the root element and all of its siblings;
    * a ``str``/``bytes`` value: emitted as a quoted text line;
    * any other object without a ``tag`` attribute: treated as an iterable
      fragment whose items are serialized in turn.

    Element and attribute names are passed through
    ``InfosetFilter.fromXmlName`` before being printed, and namespaced
    names are printed as ``prefix localname`` using ``constants.prefixes``.

    :arg element: the tree, fragment, element or text to serialize
    :returns: the dump as a single newline-joined string
    :raises KeyError: if a namespace is missing from ``constants.prefixes``
    """
    rv = []
    infosetFilter = ihatexml.InfosetFilter()

    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                # Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
                # Rewind to the first top-level node so that nodes placed
                # before the root are emitted too, then walk forwards.
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
            elif isinstance(element, str) or isinstance(element, bytes):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info.major == 2
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
        else:
            assert isinstance(element, etree._Element)
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (' ' * indent,
                                       infosetFilter.fromXmlName(element.tag)))

            if hasattr(element, "attrib"):
                attributes = []
                for name, value in element.attrib.items():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
                        attr_string = "%s %s" % (prefix, name)
                    else:
                        attr_string = infosetFilter.fromXmlName(name)
                    attributes.append((attr_string, value))

                # Attributes are printed in sorted order, one per line.
                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))

            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            # The tail belongs to the parent's content, so it is printed at
            # the element's own indentation rather than its children's.
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
    serializeElement(element, 0)

    return "\n".join(rv)
135
136
def tostring(element):
    """Serialize an element and its child nodes to a string.

    An object without a ``tag`` attribute is treated as a full tree: its
    doctype is written when an internal DTD is present, followed by the
    root element. Only the root returned by ``getroot()`` is visited in
    that case; its siblings are not walked. Tags, attribute names,
    attribute values and text are written exactly as stored, with no
    escaping applied.

    :arg element: a tree or an element to serialize
    :returns: the concatenated markup as one string
    """
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif element.tag == comment_type:
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        # Text following the node is written after the node itself; trees
        # have no ``tail`` attribute, hence the hasattr guard.
        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)
180
181
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that assembles the parsed document as an lxml tree.

    The element and comment wrapper classes are created per instance in
    ``__init__`` because they close over that instance's ``InfosetFilter``,
    which coerces names, attribute names, character data and comment data
    before they are handed to lxml.

    Comments seen before the root element exists are buffered in
    ``initial_comments`` and attached by ``insertRoot``; from then on
    ``insertComment`` is rebound to ``insertCommentMain``.
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document
    implementation = etree

    def __init__(self, namespaceHTMLElements, fullTree=False):
        """Create the per-instance wrapper classes and initialise the base.

        :arg namespaceHTMLElements: stored on the instance and forwarded to
            the base class constructor
        :arg fullTree: forwarded to ``etree_builders.getETreeModule``
        """
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
        self.namespaceHTMLElements = namespaceHTMLElements

        def coerceAttributeKey(key):
            # Tuple keys carry the local name at index 1 and the namespace
            # at index 2; they are stored as "{namespace}localname".
            if isinstance(key, tuple):
                return "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
            return infosetFilter.coerceAttribute(key)

        class Attributes(dict):
            """Dict that mirrors every entry onto the wrapped lxml element."""

            def __init__(self, element, value=None):
                # ``None`` instead of a shared mutable ``{}`` default.
                if value is None:
                    value = {}
                self._element = element
                dict.__init__(self, value)
                for key, item in self.items():
                    self._element._element.attrib[coerceAttributeKey(key)] = item

            def __setitem__(self, key, value):
                dict.__setitem__(self, key, value)
                self._element._element.attrib[coerceAttributeKey(key)] = value

        class Element(builder.Element):
            """Element wrapper that coerces names and text for lxml."""

            def __init__(self, name, namespace):
                name = infosetFilter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = infosetFilter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                # Undo the coercion so callers see the original name.
                return infosetFilter.fromXmlName(self._name)

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, attributes):
                self._attributes = Attributes(self, attributes)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = infosetFilter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def appendChild(self, child):
                builder.Element.appendChild(self, child)

        class Comment(builder.Comment):
            """Comment wrapper that coerces its data for lxml."""

            def __init__(self, data):
                data = infosetFilter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = infosetFilter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        # Use the coercing subclass defined above. Previously the plain
        # builder.Comment was installed, leaving the subclass unused and
        # comment data uncoerced.
        self.commentClass = Comment
        # self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        """Reset the builder to its pre-root state."""
        _base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        """Return the module-level ``testSerializer`` dump of ``element``."""
        return testSerializer(element)

    def getDocument(self):
        """Return the built document.

        NOTE(review): this reads the module-level ``fullTree`` flag, not the
        ``fullTree`` argument given to ``__init__``; confirm that is
        intended before changing it, as it decides whether callers receive
        the ElementTree or its root element.
        """
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        """Return the first open element's content as a flat list.

        The list holds the element's leading text (if any), its child
        elements, and its tail text (if any), in that order.
        """
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(list(element))
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        """Record the doctype described by ``token`` for ``insertRoot``.

        A doctype without a name is dropped with a ``DataLossWarning``; a
        name that changes under coercion is kept in coerced form, also with
        a ``DataLossWarning``.
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name:
            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
            self.doctype = None
        else:
            coercedName = self.infosetFilter.coerceElement(name)
            if coercedName != name:
                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)

            doctype = self.doctypeClass(coercedName, publicId, systemId)
            self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        """Buffer a comment token that arrives before the root exists."""
        self.initial_comments.append(data)

    def insertCommentMain(self, data, parent=None):
        """Insert a comment once the root exists, warning on data loss."""
        if (parent == self.document and
                self.document._elementTree.getroot()[-1].tag == comment_type):
            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
        super(TreeBuilder, self).insertComment(data, parent)

    def insertRoot(self, token):
        """Create the document root"""
        # Because of the way libxml2 works, it doesn't seem to be possible to
        # alter information like the doctype after the tree has been parsed.
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal
        docStr = ""
        if self.doctype:
            assert self.doctype.name
            docStr += "<!DOCTYPE %s" % self.doctype.name
            if (self.doctype.publicId is not None or
                    self.doctype.systemId is not None):
                docStr += (' PUBLIC "%s" ' %
                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
                if self.doctype.systemId:
                    sysid = self.doctype.systemId
                    if sysid.find("'") >= 0 and sysid.find('"') >= 0:
                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                        sysid = sysid.replace("'", 'U00027')
                    # Quote with whichever quote character the id lacks.
                    if sysid.find("'") >= 0:
                        docStr += '"%s"' % sysid
                    else:
                        docStr += "'%s'" % sysid
                else:
                    docStr += "''"
            docStr += ">"
            if self.doctype.name != token["name"]:
                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
        # Placeholder root element; it is renamed further down.
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
        root = etree.fromstring(docStr)

        # Append the initial comments, coercing their data in the same way
        # the Comment wrapper class does for comments inserted later.
        for comment_token in self.initial_comments:
            comment_data = self.infosetFilter.coerceComment(comment_token["data"])
            root.addprevious(etree.Comment(comment_data))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Give the root element the right name
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        if namespace is None:
            etree_tag = name
        else:
            etree_tag = "{%s}%s" % (namespace, name)
        root.tag = etree_tag

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name, namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = self.insertCommentMain