You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

77 lines
3.3 KiB

13 years ago
"""A collection of modules for building different kinds of tree from
HTML documents.
To create a treebuilder for a new type of tree, you need to
implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
treebuilders._base.Node (although comment nodes have a different
signature for their constructor, see treebuilders.etree.Comment).
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
"""
12 years ago
from __future__ import absolute_import, division, unicode_literals
from ..utils import default_etree
13 years ago
# Cache of TreeBuilder classes keyed by lower-cased tree type name.
# getTreeBuilder only stores the "lxml" builder here; the "dom" and "etree"
# builders are cached inside their own submodules.
treeBuilderCache = {}


def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are:

               "dom" - A generic builder for DOM implementations, defaulting to
                       a xml.dom.minidom based implementation.
               "etree" - A generic builder for tree implementations exposing an
                         ElementTree-like interface, defaulting to
                         xml.etree.cElementTree if available and
                         xml.etree.ElementTree if not.
               "lxml" - A etree-based builder for lxml.etree, handling
                        limitations of lxml's implementation.

    implementation - (Currently applies to the "etree" and "dom" tree types). A
                     module implementing the tree type e.g.
                     xml.etree.ElementTree or xml.etree.cElementTree.

    Any extra keyword arguments are forwarded to the "dom" / "etree" module
    factory; they are not used for "lxml".

    Returns the TreeBuilder class for the requested tree type.

    Raises ValueError if treeType is not one of the supported names."""
    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
            from . import dom
            # Come up with a sane default (pref. from the stdlib)
            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
            # NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "lxml":
            from . import etree_lxml
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
            from . import etree
            if implementation is None:
                implementation = default_etree
            # NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
        else:
            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
    # Reaching this point means the key is present: it was either cached by an
    # earlier call or stored by the "lxml" branch above.
    return treeBuilderCache[treeType]