You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

378 lines
13 KiB

12 years ago
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from ..constants import scopingElements, tableInsertModeElements, namespaces
13 years ago
# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, object elements, and marquees.
Marker = None
12 years ago
listElementsMap = {
None: (frozenset(scopingElements), False),
"button": (frozenset(scopingElements | set([(namespaces["html"], "button")])), False),
"list": (frozenset(scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")])), False),
"table": (frozenset([(namespaces["html"], "html"),
(namespaces["html"], "table")]), False),
"select": (frozenset([(namespaces["html"], "optgroup"),
(namespaces["html"], "option")]), True)
}
13 years ago
class Node(object):
def __init__(self, name):
"""Node representing an item in the tree.
name - The tag name associated with the node
parent - The parent of the current node (or None for the document node)
12 years ago
value - The value of the current node (applies to text nodes and
13 years ago
comments
attributes - a dict holding name, value pairs for attributes of the node
12 years ago
childNodes - a list of child nodes of the current node. This must
13 years ago
include all elements but not necessarily other node types
_flags - A list of miscellaneous flags that can be set on the node
"""
self.name = name
self.parent = None
self.value = None
self.attributes = {}
self.childNodes = []
self._flags = []
12 years ago
def __str__(self):
attributesStr = " ".join(["%s=\"%s\"" % (name, value)
for name, value in
self.attributes.items()])
13 years ago
if attributesStr:
12 years ago
return "<%s %s>" % (self.name, attributesStr)
13 years ago
else:
12 years ago
return "<%s>" % (self.name)
13 years ago
def __repr__(self):
return "<%s>" % (self.name)
def appendChild(self, node):
"""Insert node as a child of the current node
"""
raise NotImplementedError
def insertText(self, data, insertBefore=None):
12 years ago
"""Insert data as text in the current node, positioned before the
13 years ago
start of node insertBefore or to the end of the node's text.
"""
raise NotImplementedError
def insertBefore(self, node, refNode):
12 years ago
"""Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of
13 years ago
the current node"""
raise NotImplementedError
def removeChild(self, node):
"""Remove node from the children of the current node
"""
raise NotImplementedError
def reparentChildren(self, newParent):
12 years ago
"""Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the
13 years ago
text in the correct way
"""
12 years ago
# XXX - should this method be made more general?
13 years ago
for child in self.childNodes:
newParent.appendChild(child)
self.childNodes = []
def cloneNode(self):
"""Return a shallow copy of the current node i.e. a node with the same
name and attributes but with no parent or child nodes
"""
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text, false otherwise
"""
raise NotImplementedError
12 years ago
13 years ago
class ActiveFormattingElements(list):
def append(self, node):
equalCount = 0
if node != Marker:
for element in self[::-1]:
if element == Marker:
break
if self.nodesEqual(element, node):
equalCount += 1
if equalCount == 3:
self.remove(element)
break
list.append(self, node)
def nodesEqual(self, node1, node2):
if not node1.nameTuple == node2.nameTuple:
return False
12 years ago
13 years ago
if not node1.attributes == node2.attributes:
return False
12 years ago
13 years ago
return True
12 years ago
13 years ago
class TreeBuilder(object):
"""Base treebuilder implementation
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
"""
12 years ago
# Document class
13 years ago
documentClass = None
12 years ago
# The class to use for creating a node
13 years ago
elementClass = None
12 years ago
# The class to use for creating comments
13 years ago
commentClass = None
12 years ago
# The class to use for creating doctypes
13 years ago
doctypeClass = None
12 years ago
# Fragment class
13 years ago
fragmentClass = None
def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
self.reset()
12 years ago
13 years ago
def reset(self):
self.openElements = []
self.activeFormattingElements = ActiveFormattingElements()
12 years ago
# XXX - rename these to headElement, formElement
13 years ago
self.headPointer = None
self.formPointer = None
self.insertFromTable = False
self.document = self.documentClass()
def elementInScope(self, target, variant=None):
12 years ago
# If we pass a node in we match that. if we pass a string
# match any node with that name
13 years ago
exactNode = hasattr(target, "nameTuple")
listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements):
if (node.name == target and not exactNode or
12 years ago
node == target and exactNode):
13 years ago
return True
12 years ago
elif (invert ^ (node.nameTuple in listElements)):
13 years ago
return False
12 years ago
assert False # We should never reach this point
13 years ago
def reconstructActiveFormattingElements(self):
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
if not self.activeFormattingElements:
return
# Step 2 and step 3: we start with the last element. So i is -1.
i = len(self.activeFormattingElements) - 1
entry = self.activeFormattingElements[i]
if entry == Marker or entry in self.openElements:
return
# Step 6
while entry != Marker and entry not in self.openElements:
if i == 0:
12 years ago
# This will be reset to 0 below
13 years ago
i = -1
break
i -= 1
# Step 5: let entry be one earlier in the list.
entry = self.activeFormattingElements[i]
while True:
# Step 7
i += 1
# Step 8
entry = self.activeFormattingElements[i]
12 years ago
clone = entry.cloneNode() # Mainly to get a new copy of the attributes
13 years ago
# Step 9
12 years ago
element = self.insertElement({"type": "StartTag",
"name": clone.name,
"namespace": clone.namespace,
"data": clone.attributes})
13 years ago
# Step 10
self.activeFormattingElements[i] = element
# Step 11
if element == self.activeFormattingElements[-1]:
break
def clearActiveFormattingElements(self):
entry = self.activeFormattingElements.pop()
while self.activeFormattingElements and entry != Marker:
entry = self.activeFormattingElements.pop()
def elementInActiveFormattingElements(self, name):
"""Check if an element exists between the end of the active
formatting elements and the last marker. If it does, return it, else
return false"""
for item in self.activeFormattingElements[::-1]:
# Check for Marker first because if it's a Marker it doesn't have a
# name attribute.
if item == Marker:
break
elif item.name == name:
return item
return False
def insertRoot(self, token):
element = self.createElement(token)
self.openElements.append(element)
self.document.appendChild(element)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype)
def insertComment(self, token, parent=None):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(self.commentClass(token["data"]))
12 years ago
13 years ago
def createElement(self, token):
"""Create an element but don't insert it anywhere"""
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
return element
def _getInsertFromTable(self):
return self._insertFromTable
def _setInsertFromTable(self, value):
"""Switch the function used to insert an element from the
normal one to the misnested table one and back again"""
self._insertFromTable = value
if value:
self.insertElement = self.insertElementTable
else:
self.insertElement = self.insertElementNormal
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
12 years ago
13 years ago
def insertElementNormal(self, token):
name = token["name"]
12 years ago
assert isinstance(name, text_type), "Element %s not unicode" % name
13 years ago
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
self.openElements[-1].appendChild(element)
self.openElements.append(element)
return element
def insertElementTable(self, token):
12 years ago
"""Create an element and insert it into the tree"""
13 years ago
element = self.createElement(token)
if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(token)
else:
12 years ago
# We should be in the InTable mode. This means we want to do
# special magic element rearranging
13 years ago
parent, insertBefore = self.getTableMisnestedNodePosition()
if insertBefore is None:
parent.appendChild(element)
else:
parent.insertBefore(element, insertBefore)
self.openElements.append(element)
return element
def insertText(self, data, parent=None):
"""Insert text data."""
if parent is None:
parent = self.openElements[-1]
if (not self.insertFromTable or (self.insertFromTable and
12 years ago
self.openElements[-1].name
13 years ago
not in tableInsertModeElements)):
parent.insertText(data)
else:
# We should be in the InTable mode. This means we want to do
# special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
parent.insertText(data, insertBefore)
12 years ago
13 years ago
def getTableMisnestedNodePosition(self):
"""Get the foster parent element, and sibling to insert before
(or None) when inserting a misnested table node"""
# The foster parent element is the one which comes before the most
# recently opened table element
# XXX - this is really inelegant
12 years ago
lastTable = None
13 years ago
fosterParent = None
insertBefore = None
for elm in self.openElements[::-1]:
if elm.name == "table":
lastTable = elm
break
if lastTable:
# XXX - we should really check that this parent is actually a
# node here
if lastTable.parent:
fosterParent = lastTable.parent
insertBefore = lastTable
else:
fosterParent = self.openElements[
self.openElements.index(lastTable) - 1]
else:
fosterParent = self.openElements[0]
return fosterParent, insertBefore
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
12 years ago
and name != exclude):
13 years ago
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):
"Return the final tree"
return self.document
12 years ago
13 years ago
def getFragment(self):
"Return the final fragment"
12 years ago
# assert self.innerHTML
13 years ago
fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment)
return fragment
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests
node - the node from which to start serializing"""
raise NotImplementedError