You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

370 lines
14 KiB

13 years ago
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Doctypes with no name
When any of these things occur, we emit a DataLossWarning
"""
12 years ago
from __future__ import absolute_import, division, unicode_literals
import warnings
import re
import sys
from . import _base
from ..constants import DataLossWarning
from .. import constants
from . import etree as etree_builders
from .. import ihatexml
import lxml.etree as etree
# Module-level switch read by TreeBuilder.getDocument: when true the whole
# ElementTree is returned, otherwise only its root element.
fullTree = True
# Splits a "{namespace}localname" string into (namespace, localname).
tag_regexp = re.compile("{([^}]*)}(.*)")
# The .tag value lxml gives to comment nodes; element.tag is compared with it
# to recognise comments. Only .tag is read here, the comment text is unused.
comment_type = etree.Comment("asd").tag
13 years ago
class DocumentType(object):
    """Plain record holding the three parts of a doctype declaration."""

    def __init__(self, name, publicId, systemId):
        # Values are stored exactly as given; nothing is validated or
        # coerced at this point.
        self.name, self.publicId, self.systemId = name, publicId, systemId
12 years ago
13 years ago
class Document(object):
    """Document wrapper pairing an lxml ElementTree with a child-node list.

    ``_elementTree`` starts out as ``None`` and must be assigned before
    ``appendChild`` is used; ``_childNodes`` is exposed read-only through
    the ``childNodes`` property.
    """

    def __init__(self):
        self._elementTree = None
        self._childNodes = []

    def appendChild(self, element):
        """Add ``element`` after everything that already follows the root.

        ``element`` is a wrapper object whose ``_element`` attribute is the
        underlying lxml node. The node is attached as a sibling of the root
        element, not as one of its children.
        """
        root = self._elementTree.getroot()
        # Walk to the last node that follows the root. Always calling
        # root.addnext() would place each new node directly after the root,
        # so successive appends would end up in reverse document order.
        last = root
        for last in root.itersiblings():
            pass
        last.addnext(element._element)

    def _getChildNodes(self):
        return self._childNodes

    childNodes = property(_getChildNodes)
12 years ago
13 years ago
def testSerializer(element):
    """Render a tree, fragment or element as the indented test-suite format.

    ``element`` may be an lxml ElementTree (anything with ``getroot``), a
    fragment (an iterable of nodes and strings), a bare string, or a single
    element/comment node. Every output line starts with ``|`` followed by
    the indentation; the lines are joined with newlines and returned.
    """
    rv = []
    infosetFilter = ihatexml.InfosetFilter()

    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                # Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
                # Rewind to the first node that precedes the root (e.g. a
                # leading comment), then emit every top-level node in order.
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent + 2)
                    next_element = next_element.getnext()
            elif isinstance(element, str) or isinstance(element, bytes):
                # Text in a fragment
                assert isinstance(element, str) or sys.version_info.major == 2
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                # Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent + 2)
        elif element.tag == comment_type:
            rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
        else:
            assert isinstance(element, etree._Element)
            # NOTE(review): element tags are matched with
            # etree_builders.tag_regexp while attribute names below use this
            # module's tag_regexp; presumably the same pattern -- confirm.
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                # Raises KeyError for a namespace missing from
                # constants.prefixes.
                prefix = constants.prefixes[ns]
                rv.append("|%s<%s %s>" % (' ' * indent, prefix,
                                          infosetFilter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>" % (' ' * indent,
                                       infosetFilter.fromXmlName(element.tag)))

            if hasattr(element, "attrib"):
                attributes = []
                for name, value in element.attrib.items():
                    nsmatch = tag_regexp.match(name)
                    if nsmatch is not None:
                        ns, name = nsmatch.groups()
                        name = infosetFilter.fromXmlName(name)
                        prefix = constants.prefixes[ns]
                        attr_string = "%s %s" % (prefix, name)
                    else:
                        attr_string = infosetFilter.fromXmlName(name)
                    attributes.append((attr_string, value))

                # Sorted so the output does not depend on attribute order.
                for name, value in sorted(attributes):
                    rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))

            if element.text:
                rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            # Tail text follows the element, so it is written at the
            # element's own indentation level rather than its children's.
            if hasattr(element, "tail") and element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))

    serializeElement(element, 0)

    return "\n".join(rv)
12 years ago
13 years ago
def tostring(element):
    """Serialize an element and its child nodes to a string

    ``element`` may be an lxml ElementTree (no ``tag`` attribute) or a
    single element/comment node. Tag names, attribute values and text are
    written out as-is; no escaping is applied.
    """
    rv = []

    def serializeElement(element):
        if not hasattr(element, "tag"):
            # Whole tree: emit the doctype (if any), then the root element.
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())

        elif element.tag == comment_type:
            rv.append("<!--%s-->" % (element.text,))

        else:
            # This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>" % (element.tag,))
            else:
                attr = " ".join(["%s=\"%s\"" % (name, value)
                                 for name, value in element.attrib.items()])
                rv.append("<%s %s>" % (element.tag, attr))
            if element.text:
                rv.append(element.text)

            for child in element:
                serializeElement(child)

            rv.append("</%s>" % (element.tag,))

        # Text that follows the node; tree objects have no ``tail``.
        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)

    serializeElement(element)

    # The former ``finalText`` local was never assigned anything but None,
    # and its guarded branch held a format string with one placeholder for
    # two arguments; both were dead code and have been removed.
    return "".join(rv)
12 years ago
13 years ago
class TreeBuilder(_base.TreeBuilder):
    """Tree builder that stores the parsed document in an lxml tree.

    Names, text and comments are passed through an ``ihatexml.InfosetFilter``
    before they reach lxml. Comments seen before the root element are
    queued and attached when ``insertRoot`` creates the tree.
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None
    commentClass = None
    fragmentClass = Document
    implementation = etree

    def __init__(self, namespaceHTMLElements, fullTree=False):
        """Build the per-instance element and comment wrapper classes.

        ``fullTree`` is forwarded to ``etree_builders.getETreeModule``.
        """
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        infosetFilter = self.infosetFilter = ihatexml.InfosetFilter()
        self.namespaceHTMLElements = namespaceHTMLElements

        def coerceAttributeKey(key):
            # Tuple keys are namespaced: index 2 becomes the "{...}" part
            # and index 1 the (coerced) local name. Anything else is taken
            # as a plain attribute name.
            if isinstance(key, tuple):
                return "{%s}%s" % (key[2],
                                   infosetFilter.coerceAttribute(key[1]))
            return infosetFilter.coerceAttribute(key)

        class Attributes(dict):
            """Dict that mirrors every entry onto the wrapped lxml element."""

            def __init__(self, element, value=None):
                # ``None`` replaces the former mutable ``{}`` default.
                self._element = element
                dict.__init__(self, {} if value is None else value)
                for key, item in self.items():
                    self._element._element.attrib[
                        coerceAttributeKey(key)] = item

            def __setitem__(self, key, value):
                dict.__setitem__(self, key, value)
                self._element._element.attrib[coerceAttributeKey(key)] = value

        class Element(builder.Element):
            def __init__(self, name, namespace):
                name = infosetFilter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = infosetFilter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return infosetFilter.fromXmlName(self._name)

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, attributes):
                self._attributes = Attributes(self, attributes)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = infosetFilter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def appendChild(self, child):
                builder.Element.appendChild(self, child)

        class Comment(builder.Comment):
            def __init__(self, data):
                data = infosetFilter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = infosetFilter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        # Use the coercing subclass defined above. This was previously set
        # to builder.Comment, which left the subclass unused and comment
        # text unfiltered.
        self.commentClass = Comment
        # self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        """Return to the pre-root state: comments are queued, no doctype."""
        _base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        """Return the ElementTree, or just its root element.

        NOTE(review): the choice is driven by the module-level ``fullTree``
        flag, not by the ``fullTree`` argument given to ``__init__`` --
        confirm this is intended.
        """
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        """Return the first open element's text, children and tail as a list."""
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(list(element))
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        """Record the doctype from ``token`` for later use by ``insertRoot``.

        Emits a ``DataLossWarning`` when the name is empty (the doctype is
        then dropped) or when the name had to be coerced.
        """
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name:
            warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
            self.doctype = None
        else:
            coercedName = self.infosetFilter.coerceElement(name)
            if coercedName != name:
                warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)

            doctype = self.doctypeClass(coercedName, publicId, systemId)
            self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        """Queue a comment token seen before the root element exists."""
        self.initial_comments.append(data)

    def insertCommentMain(self, data, parent=None):
        """Insert a comment once the root exists, warning on known data loss."""
        if (parent == self.document and
                self.document._elementTree.getroot()[-1].tag == comment_type):
            warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
        super(TreeBuilder, self).insertComment(data, parent)

    def insertRoot(self, token):
        """Create the document root"""
        # Because of the way libxml2 works, it doesn't seem to be possible to
        # alter information like the doctype after the tree has been parsed.
        # Therefore we need to use the built-in parser to create our initial
        # tree, after which we can add elements like normal
        docStr = ""
        if self.doctype:
            assert self.doctype.name
            docStr += "<!DOCTYPE %s" % self.doctype.name
            if (self.doctype.publicId is not None or
                    self.doctype.systemId is not None):
                docStr += (' PUBLIC "%s" ' %
                           (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
                if self.doctype.systemId:
                    sysid = self.doctype.systemId
                    if sysid.find("'") >= 0 and sysid.find('"') >= 0:
                        warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
                        sysid = sysid.replace("'", 'U00027')
                    # Quote with whichever quote character the id lacks.
                    if sysid.find("'") >= 0:
                        docStr += '"%s"' % sysid
                    else:
                        docStr += "'%s'" % sysid
                else:
                    docStr += "''"
            docStr += ">"
            if self.doctype.name != token["name"]:
                warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
        # Placeholder root; its tag is replaced with the real name below.
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
        root = etree.fromstring(docStr)

        # Append the initial comments. Their text goes through the same
        # infoset coercion as comments inserted later, so text that lxml
        # would refuse is adjusted rather than passed to it verbatim.
        for comment_token in self.initial_comments:
            comment_text = self.infosetFilter.coerceComment(
                comment_token["data"])
            root.addprevious(etree.Comment(comment_text))

        # Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Give the root element the right name
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        if namespace is None:
            etree_tag = name
        else:
            etree_tag = "{%s}%s" % (namespace, name)
        root.tag = etree_tag

        # Add the root element to the internal child/open data structures
        root_element = self.elementClass(name, namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        # Reset to the default insert comment function
        self.insertComment = self.insertCommentMain