1 """Module for supporting the lxml.etree library. The idea here is to use as much
2 of the native library as possible, without using fragile hacks like custom element
3 names that break between releases. The downside of this is that we cannot represent
4 all possible trees; specifically the following are known to cause problems:
6 Text or comments as siblings of the root element
9 When any of these things occur, we emit a DataLossWarning
12 from __future__ import absolute_import, division, unicode_literals
19 from ..constants import DataLossWarning
20 from .. import constants
21 from . import etree as etree_builders
22 from .. import ihatexml
24 import lxml.etree as etree
# Matches a name written as "{namespace}local": group 1 captures the text
# between the braces, group 2 captures everything after the closing brace.
28 tag_regexp = re.compile("{([^}]*)}(.*)")
# The ``tag`` value of an lxml comment node, taken from a throwaway comment so
# that other nodes' tags can be compared against it.
30 comment_type = etree.Comment("asd").tag
class DocumentType(object):
    """Plain record holding the three parts of a doctype declaration.

    The tree builder stores one of these and later reads ``name``,
    ``publicId`` and ``systemId`` back when it builds the document prolog.
    """

    def __init__(self, name, publicId, systemId):
        """Store the doctype name and its public and system identifiers."""
        # ``name`` must be kept: the builder asserts on and formats
        # ``doctype.name`` when it creates the root element.
        self.name = name
        self.publicId = publicId
        self.systemId = systemId
class Document(object):
    """Wrapper pairing an lxml ElementTree with the builder's child list.

    ``_elementTree`` is filled in by the tree builder once the root element
    exists; ``_childNodes`` holds the builder-level wrappers of the
    document's children and is exposed read-only as ``childNodes``.
    """

    def __init__(self):
        self._elementTree = None
        # Created per instance so documents never share one child list.
        self._childNodes = []

    def appendChild(self, element):
        """Add ``element`` after the root element and all of its siblings.

        ``addnext`` inserts directly after the node it is called on, so it is
        called on the last existing sibling rather than on the root itself;
        otherwise successive appends would end up in reverse order.
        """
        root = self._elementTree.getroot()
        last = root
        for last in root.itersiblings():
            pass
        last.addnext(element._element)

    def _getChildNodes(self):
        return self._childNodes

    childNodes = property(_getChildNodes)
def testSerializer(element):
    """Render a tree, fragment or element in the "|"-prefixed test format.

    Each node becomes one line starting with "|" followed by indentation
    (two spaces per nesting level).  A full tree is headed by "#document",
    a fragment (any other object without a ``tag``) by "#document-fragment".

    :arg element: an lxml ElementTree, an lxml element, or an iterable
        fragment whose items are elements and/or text strings
    :returns: the serialized lines joined with newlines
    """
    rv = []
    infosetFilter = ihatexml.InfosetFilter()

    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                # Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
                # Rewind to the first top-level node (comments may precede
                # the root), then walk every top-level sibling in order.
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
            elif isinstance(element, (str, bytes)):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info.major == 2
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
        else:
            assert isinstance(element, etree._Element)
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                # Namespaced tag: print "<prefix local>" using the known
                # prefix for that namespace.
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (' ' * indent,
                                       infosetFilter.fromXmlName(element.tag)))

            if hasattr(element, "attrib"):
                attributes = []
                for name, value in element.attrib.items():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
                        attr_string = "%s %s" % (prefix, name)
                    else:
                        attr_string = infosetFilter.fromXmlName(name)
                    attributes.append((attr_string, value))

                # Sorted so the output does not depend on attribute order.
                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))

            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            for child in element:
                serializeElement(child, indent + 2)
            # Tail text belongs to the parent, so it is printed at this
            # element's own indentation level.
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))

    serializeElement(element, 0)

    return "\n".join(rv)
def tostring(element):
    """Serialize an element and its child nodes to a string.

    Accepts either an lxml ElementTree (anything without a ``tag``), in which
    case the internal DTD, if any, is emitted before the root element, or a
    single element/comment node.  Text and attribute values are written
    verbatim; no escaping is applied.

    :returns: the concatenated markup
    """
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            # Whole-tree case: optional doctype, then the root element.
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif element.tag == comment_type:
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        # Tail text follows the node's own markup (applies to comments and
        # elements alike; trees have no ``tail`` attribute).
        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that produces native lxml.etree trees.

    Names, text and comments are passed through an ``ihatexml.InfosetFilter``
    so that they are acceptable to lxml; constructs lxml cannot represent
    trigger a ``DataLossWarning``.
    """
    documentClass = Document
    doctypeClass = DocumentType
    # Both are assigned per instance in __init__, because the concrete
    # classes close over that instance's infoset filter.
    elementClass = None
    commentClass = None
    fragmentClass = Document
    implementation = etree

    def __init__(self, namespaceHTMLElements, fullTree=False):
        """Create the per-instance element/comment classes and initialise.

        :arg namespaceHTMLElements: forwarded to the base tree builder
        :arg fullTree: forwarded to ``etree_builders.getETreeModule``
        """
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
        self.namespaceHTMLElements = namespaceHTMLElements

        def coerceAttributeKey(key):
            # Tuple keys are namespaced: index 1 is used as the local name
            # and index 2 as the namespace, giving "{namespace}local".
            if isinstance(key, tuple):
                return "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
            return infosetFilter.coerceAttribute(key)

        class Attributes(dict):
            """dict that mirrors every item into the wrapped lxml element."""

            def __init__(self, element, value=None):
                self._element = element
                dict.__init__(self, {} if value is None else value)
                for key, item in self.items():
                    self._element._element.attrib[coerceAttributeKey(key)] = item

            def __setitem__(self, key, value):
                dict.__setitem__(self, key, value)
                self._element._element.attrib[coerceAttributeKey(key)] = value

        class Element(builder.Element):
            """Element wrapper that coerces names and text for lxml."""

            def __init__(self, name, namespace):
                name = infosetFilter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = infosetFilter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                # Undo the coercion so callers see the original name.
                return infosetFilter.fromXmlName(self._name)

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, attributes):
                self._attributes = Attributes(self, attributes)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = infosetFilter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def appendChild(self, child):
                builder.Element.appendChild(self, child)

        class Comment(builder.Comment):
            """Comment wrapper that coerces its data for lxml."""

            def __init__(self, data):
                data = infosetFilter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = infosetFilter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        # Register the coercing subclass; registering builder.Comment here
        # would leave the Comment class above unused and comment data
        # unfiltered.
        self.commentClass = Comment
        # self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        """Reset builder state; comments are buffered until a root exists."""
        _base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        """Return the test-format serialization of ``element``."""
        return testSerializer(element)

    def getDocument(self):
        """Return the built ElementTree, or only its root element."""
        # NOTE(review): ``fullTree`` here is assumed to be a module-level
        # flag defined outside the visible source; the constructor argument
        # of the same name is not stored on the instance -- confirm.
        if fullTree:
            return self.document._elementTree
        return self.document._elementTree.getroot()

    def getFragment(self):
        """Return the first open element's content as a flat list.

        The list holds the element's leading text (if any), its child
        elements, and its tail text (if any).
        """
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(list(element))
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        """Record the doctype from ``token`` for use by ``insertRoot``.

        A doctype with an empty name is dropped with a ``DataLossWarning``;
        a name that has to be coerced is kept in coerced form, also with a
        warning.
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name:
            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
            self.doctype = None
        else:
            coercedName = self.infosetFilter.coerceElement(name)
            if coercedName != name:
                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)

            doctype = self.doctypeClass(coercedName, publicId, systemId)
            self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        """Buffer a comment token seen before the root element exists."""
        self.initial_comments.append(data)

    def insertCommentMain(self, data, parent=None):
        """Insert a comment once the root exists, warning on data loss."""
        if parent == self.document:
            root = self.document._elementTree.getroot()
            # Guard against a childless root before indexing its last child.
            if len(root) and root[-1].tag == comment_type:
                warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
        super(TreeBuilder, self).insertComment(data, parent)

    def insertRoot(self, token):
        """Create the document root"""
        # Because of the way libxml2 works, it doesn't seem to be possible to
        # alter information like the doctype after the tree has been parsed.
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal
        docStr = ""
        if self.doctype:
            assert self.doctype.name
            docStr += "<!DOCTYPE %s" % self.doctype.name
            if (self.doctype.publicId is not None or
                    self.doctype.systemId is not None):
                docStr += (' PUBLIC "%s" ' %
                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
                if self.doctype.systemId:
                    sysid = self.doctype.systemId
                    if sysid.find("'") >= 0 and sysid.find('"') >= 0:
                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                        sysid = sysid.replace("'", 'U00027')
                    # Quote with whichever quote character the id lacks.
                    if sysid.find("'") >= 0:
                        docStr += '"%s"' % sysid
                    else:
                        docStr += "'%s'" % sysid
                else:
                    docStr += "''"
            docStr += ">"
            if self.doctype.name != token["name"]:
                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
        # Placeholder root; it is renamed to the real tag below.
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
        root = etree.fromstring(docStr)

        # Append the initial comments, going through commentClass so their
        # data is coerced the same way as every other comment:
        for comment_token in self.initial_comments:
            comment = self.commentClass(comment_token["data"])
            root.addprevious(comment._element)

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Give the root element the right name
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        if namespace is None:
            etree_tag = name
        else:
            etree_tag = "{%s}%s" % (namespace, name)
        root.tag = etree_tag

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name, namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = self.insertCommentMain