Merge branch 'feature/UpdateBS4' into develop

6 years ago · f7136b1cdd
9 changed files with 550 additions and 284 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -1,6 +1,7 @@
 ### 0.21.0 (2019-xx-xx xx:xx:xx UTC)
 * Update attr 19.2.0.dev0 (de84609) to 19.2.0.dev0 (154b4e5)
 * Update Beautiful Soup 4.7.1 (r497) to 4.8.0 (r526)
 * Update Certifi 2019.03.09 (401100f) to 2019.06.16 (84dc766)
 * Update DiskCache library 3.1.1 (2649ac9) to 4.0.0 (2c79bb9)
 * Update feedparser 5.2.1 (2b11c80) to 5.2.1 (cbe18d0)
--- a/lib/bs4/init.py
+++ b/lib/bs4/init.py
@ -18,7 +18,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.7.1"
+__version__ = "4.8.0"
 __copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"
@ -63,7 +63,7 @@ class BeautifulSoup(Tag):
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
-      endData(containerClass=NavigableString) # Ends the current data node
+      endData(containerClass) # Ends the current data node
    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
@ -78,14 +78,14 @@ class BeautifulSoup(Tag):
    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
+   
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
-                 **kwargs):
+                 element_classes=None, **kwargs):
        """Constructor.
        :param markup: A string or a file-like object representing
@ -98,8 +98,10 @@ class BeautifulSoup(Tag):
        name a specific parser, so that Beautiful Soup gives you the
        same results across platforms and virtual environments.
-        :param builder: A specific TreeBuilder to use instead of looking one
+        :param builder: A TreeBuilder subclass to instantiate (or
-        up based on `features`. You shouldn't need to use this.
+        instance to use) instead of looking one up based on
        `features`. You only need to use this if you've implemented a
        custom TreeBuilder.
        :param parse_only: A SoupStrainer. Only parts of the document
        matching the SoupStrainer will be considered. This is useful
@ -115,14 +117,26 @@ class BeautifulSoup(Tag):
        the document's encoding but you know Beautiful Soup's guess is
        wrong.
        :param element_classes: A dictionary mapping BeautifulSoup
        classes like Tag and NavigableString to other classes you'd
        like to be instantiated instead as the parse tree is
        built. This is useful for using subclasses to modify the
        default behavior of Tag or NavigableString.
        :param kwargs: For backwards compatibility purposes, the
        constructor accepts certain keyword arguments used in
        Beautiful Soup 3. None of these arguments do anything in
-        Beautiful Soup 4 and there's no need to actually pass keyword
+        Beautiful Soup 4; they will result in a warning and then be ignored.
-        arguments into the constructor.
+
        Apart from this, any keyword arguments passed into the BeautifulSoup
        constructor are propagated to the TreeBuilder constructor. This
        makes it possible to configure a TreeBuilder beyond saying
        which one to use.
        """
        if 'convertEntities' in kwargs:
            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
@ -177,13 +191,19 @@ class BeautifulSoup(Tag):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None
-        if len(kwargs) > 0:
+        self.element_classes = element_classes or dict()
-            arg = kwargs.keys().pop()
+
-            raise TypeError(
+        # We need this information to track whether or not the builder
-                "__init__() got an unexpected keyword argument '%s'" % arg)
+        # was specified well enough that we can omit the 'you need to
-
+        # specify a parser' warning.
-        if builder is None:
+        original_builder = builder
-            original_features = features
+        original_features = features
        if isinstance(builder, type):
            # A builder class was passed in; it needs to be instantiated.
            builder_class = builder
            builder = None
        elif builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
@ -194,9 +214,16 @@ class BeautifulSoup(Tag):
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
-            builder = builder_class()
+
-            if not (original_features == builder.NAME or
+        # At this point either we have a TreeBuilder instance in
-                    original_features in builder.ALTERNATE_NAMES):
+        # builder, or we have a builder_class that we can instantiate
        # with the remaining **kwargs.
        if builder is None:
            builder = builder_class(**kwargs)
            if not original_builder and not (
                    original_features == builder.NAME or
                    original_features in builder.ALTERNATE_NAMES
            ):
                if builder.is_xml:
                    markup_type = "XML"
                else:
@ -231,7 +258,10 @@ class BeautifulSoup(Tag):
                        markup_type=markup_type
                    )
                    warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
-
+        else:
            if kwargs:
                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
@ -272,6 +302,8 @@ class BeautifulSoup(Tag):
                    ' Beautiful Soup.' % markup)
            self._check_markup_is_url(markup)
        rejections = []
        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
@ -279,10 +311,18 @@ class BeautifulSoup(Tag):
            self.reset()
            try:
                self._feed()
                success = True
                break
-            except ParserRejectedMarkup:
+            except ParserRejectedMarkup as e:
                rejections.append(e)
                pass
        if not success:
            other_exceptions = [unicode(e) for e in rejections]
            raise ParserRejectedMarkup(
                u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
            )
        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
@ -355,13 +395,20 @@ class BeautifulSoup(Tag):
        self.preserve_whitespace_tag_stack = []
        self.pushTag(self)
-    def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new tag associated with this soup."""
        kwattrs.update(attrs)
-        return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
+        return self.element_classes.get(Tag, Tag)(
            None, self.builder, name, namespace, nsprefix, kwattrs,
            sourceline=sourceline, sourcepos=sourcepos
        )
-    def new_string(self, s, subclass=NavigableString):
+    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this soup."""
        subclass = subclass or self.element_classes.get(
            NavigableString, NavigableString
        )
        return subclass(s)
    def insert_before(self, successor):
@ -388,7 +435,17 @@ class BeautifulSoup(Tag):
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)
-    def endData(self, containerClass=NavigableString):
+    def endData(self, containerClass=None):
        # Default container is NavigableString.
        containerClass = containerClass or NavigableString
        # The user may want us to instantiate some alias for the
        # container class.
        containerClass = self.element_classes.get(
            containerClass, containerClass
        )
        if self.current_data:
            current_data = u''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
@ -509,7 +566,8 @@ class BeautifulSoup(Tag):
        return most_recently_popped
-    def handle_starttag(self, name, namespace, nsprefix, attrs):
+    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                        sourcepos=None):
        """Push a start tag on to the stack.
        If this method returns None, the tag was rejected by the
@ -526,8 +584,11 @@ class BeautifulSoup(Tag):
                 or not self.parse_only.search_tag(name, attrs))):
            return None
-        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
+        tag = self.element_classes.get(Tag, Tag)(
-                  self.currentTag, self._most_recent_element)
+            self, self.builder, name, namespace, nsprefix, attrs,
            self.currentTag, self._most_recent_element,
            sourceline=sourceline, sourcepos=sourcepos
        )
        if tag is None:
            return tag
        if self._most_recent_element is not None:
--- a/lib/bs4/builder/init.py
+++ b/lib/bs4/builder/init.py
@ -7,7 +7,6 @@ import sys
 from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    HTMLAwareEntitySubstitution,
    nonwhitespace_re
    )
@ -90,18 +89,58 @@ class TreeBuilder(object):
    is_xml = False
    picklable = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.
    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
-    cdata_list_attributes = {}
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {}
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
    USE_DEFAULT = object()
-    def __init__(self):
+    # Most parsers don't keep track of line numbers.
    TRACKS_LINE_NUMBERS = False
    def __init__(self, multi_valued_attributes=USE_DEFAULT,
                 preserve_whitespace_tags=USE_DEFAULT,
                 store_line_numbers=USE_DEFAULT):
        """Constructor.
        :param multi_valued_attributes: If this is set to None, the
        TreeBuilder will not turn any values for attributes like
        'class' into lists. Setting this to a dictionary will
        customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
        for an example.
        Internally, these are called "CDATA list attributes", but that
        probably doesn't make sense to an end-user, so the argument name
        is `multi_valued_attributes`.
        :param preserve_whitespace_tags: A list of tags to treat
        the way <pre> tags are treated in HTML. Tags in this list
        will have their whitespace preserved.
        :param store_line_numbers: If the parser keeps track of the
        line numbers and positions of the original markup, that
        information will, by default, be stored in each corresponding
        `Tag` object. You can turn this off by passing
        store_line_numbers=False. If the parser you're using doesn't 
        keep track of this information, then setting store_line_numbers=True
        will do nothing.
        """
        self.soup = None
-
+        if multi_valued_attributes is self.USE_DEFAULT:
            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
        self.cdata_list_attributes = multi_valued_attributes
        if preserve_whitespace_tags is self.USE_DEFAULT:
            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if store_line_numbers == self.USE_DEFAULT:
            store_line_numbers = self.TRACKS_LINE_NUMBERS
        self.store_line_numbers = store_line_numbers
    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.
@ -131,13 +170,13 @@ class TreeBuilder(object):
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags
-        
+    
    def feed(self, markup):
        """Placeholder that always raises NotImplementedError.

        :param markup: Unused here; the method raises before touching it.
        :raises NotImplementedError: Unconditionally.
        """
        raise NotImplementedError()
    def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
+                       document_declared_encoding=None, exclude_encodings=None):
-        return markup, None, None, False
+        yield markup, None, None, False
    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.
@ -237,7 +276,6 @@ class HTMLTreeBuilder(TreeBuilder):
    Such as which tags are empty-element tags.
    """
    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
    empty_element_tags = set([
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
@ -259,7 +297,7 @@ class HTMLTreeBuilder(TreeBuilder):
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
-    cdata_list_attributes = {
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
@ -276,6 +314,8 @@ class HTMLTreeBuilder(TreeBuilder):
        "output" : ["for"],
        }
    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
    def set_up_substitutions(self, tag):
        # We are only interested in <meta> tags
        if tag.name != 'meta':
@ -323,8 +363,15 @@ def register_treebuilders_from(module):
            this_module.builder_registry.register(obj)
 class ParserRejectedMarkup(Exception):
-    pass
+    def __init__(self, message_or_exception):
-
+        """Explain why the parser rejected the given markup, either
        with a textual explanation or another exception.
        """
        if isinstance(message_or_exception, Exception):
            e = message_or_exception
            message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
        super(ParserRejectedMarkup, self).__init__(message_or_exception)
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@ -45,6 +45,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
    features = [NAME, PERMISSIVE, HTML_5, HTML]
    # html5lib can tell us which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True
    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        # Store the user-specified encoding for use later on.
@ -62,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-
+        self.underlying_builder.parser = parser
        extra_kwargs = dict()
        if not isinstance(markup, unicode):
            if new_html5lib:
@ -70,7 +74,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
            else:
                extra_kwargs['encoding'] = self.user_specified_encoding
        doc = parser.parse(markup, **extra_kwargs)
-
+        
        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
@ -84,10 +88,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
                # with other tree builders.
                original_encoding = original_encoding.name
            doc.original_encoding = original_encoding
-
+        self.underlying_builder.parser = None
    def create_treebuilder(self, namespaceHTMLElements):
        self.underlying_builder = TreeBuilderForHtml5lib(
-            namespaceHTMLElements, self.soup)
+            namespaceHTMLElements, self.soup,
            store_line_numbers=self.store_line_numbers
        )
        return self.underlying_builder
    def test_fragment_to_document(self, fragment):
@ -96,15 +103,26 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
 class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
-
+    
-    def __init__(self, namespaceHTMLElements, soup=None):
+    def __init__(self, namespaceHTMLElements, soup=None,
                 store_line_numbers=True, **kwargs):
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
-            self.soup = BeautifulSoup("", "html.parser")
+            # TODO: Why is the parser 'html.parser' here? To avoid an
            # infinite loop?
            self.soup = BeautifulSoup(
                "", "html.parser", store_line_numbers=store_line_numbers,
                **kwargs
            )
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
        # This will be set later to an html5lib.html5parser.HTMLParser
        # object, which we can use to track the current line number.
        self.parser = None
        self.store_line_numbers = store_line_numbers
    def documentClass(self):
        """Reset this builder's soup and return an Element wrapping it.

        The soup object is passed to Element twice, followed by None,
        exactly as the Element constructor is called elsewhere in this
        class.
        """
        soup = self.soup
        soup.reset()
        return Element(soup, soup, None)
@ -118,7 +136,16 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
        self.soup.object_was_parsed(doctype)
    def elementClass(self, name, namespace):
-        tag = self.soup.new_tag(name, namespace)
+        kwargs = {}
        if self.parser and self.store_line_numbers:
            # This represents the point immediately after the end of the
            # tag. We don't know when the tag started, but we do know
            # where it ended -- the character just before this one.
            sourceline, sourcepos = self.parser.tokenizer.stream.position()
            kwargs['sourceline'] = sourceline
            kwargs['sourcepos'] = sourcepos-1
        tag = self.soup.new_tag(name, namespace, **kwargs)
        return Element(tag, self.soup, namespace)
    def commentClass(self, data):
@ -126,6 +153,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
    def fragmentClass(self):
        """Replace this builder's soup with a new, empty one named
        "[document_fragment]" and return an Element wrapping it.
        """
        # Imported here rather than at module level.
        from bs4 import BeautifulSoup
        # TODO: Why is the parser 'html.parser' here? To avoid an
        # infinite loop?
        self.soup = fragment_soup = BeautifulSoup("", "html.parser")
        fragment_soup.name = "[document_fragment]"
        return Element(fragment_soup, fragment_soup, None)
@ -199,7 +228,7 @@ class AttrList(object):
    def __setitem__(self, name, value):
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
-        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        list_attr = self.element.cdata_list_attributes
        if (name in list_attr['*']
            or (self.element.name in list_attr
                and name in list_attr[self.element.name])):
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -99,7 +99,11 @@ class BeautifulSoupHTMLParser(HTMLParser):
            attr_dict[key] = value
            attrvalue = '""'
        #print "START", name
-        tag = self.soup.handle_starttag(name, None, None, attr_dict)
+        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
@ -214,12 +218,19 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]
-    def __init__(self, *args, **kwargs):
+    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True
    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        parser_kwargs = parser_kwargs or {}
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
-            kwargs['strict'] = False
+            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
-            kwargs['convert_charrefs'] = False
+            parser_kwargs['convert_charrefs'] = False
-        self.parser_args = (args, kwargs)
+        self.parser_args = (parser_args, parser_kwargs)
    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@ -57,6 +57,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.
@ -94,7 +100,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser
-    def __init__(self, parser=None, empty_element_tags=None):
+    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
@ -103,6 +109,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
@ -168,7 +175,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)
    def close(self):
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
@ -287,7 +294,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)
    def test_fragment_to_document(self, fragment):
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@ -22,6 +22,8 @@ try:
    #  PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        """Ask cchardet which encoding `s` is in.

        Returns None without consulting cchardet when `s` is already a
        Unicode string; otherwise returns the 'encoding' entry of
        cchardet's detection result.
        """
        if not isinstance(s, unicode):
            return cchardet.detect(s)['encoding']
        return None
 except ImportError:
    try:
@ -30,6 +32,8 @@ except ImportError:
        #  PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            """Ask chardet which encoding `s` is in.

            Unicode strings are not passed to chardet; None is returned
            for them. For anything else the 'encoding' entry of
            chardet's detection result is returned.
            """
            already_unicode = isinstance(s, unicode)
            return None if already_unicode else chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
@ -44,10 +48,19 @@ try:
 except ImportError:
    pass
-xml_encoding_re = re.compile(
+# Build bytestring and Unicode versions of regular expressions for finding
-    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
+# a declared encoding inside an XML or HTML document.
-html_meta_re = re.compile(
+xml_encoding = u'^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
-    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
 encoding_res = dict()
 encoding_res[bytes] = {
    'html' : re.compile(html_meta.encode("ascii"), re.I),
    'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
 }
 encoding_res[unicode] = {
    'html' : re.compile(html_meta, re.I),
    'xml' : re.compile(xml_encoding, re.I)
 }
 class EntitySubstitution(object):
@ -57,15 +70,24 @@ class EntitySubstitution(object):
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
-        for codepoint, name in list(codepoint2name.items()):
+
        # &apos; is an XHTML entity and an HTML 5 entity, but not an HTML 4
        # entity. We don't want to use it, but we want to recognize it on the way in.
        #
        # TODO: Ideally we would be able to recognize all HTML 5 named
        # entities, but that's a little tricky.
        extra = [(39, 'apos')]
        for codepoint, name in list(codepoint2name.items()) + extra:
            character = unichr(codepoint)
-            if codepoint != 34:
+            if codepoint not in (34, 39):
                # There's no point in turning the quotation mark into
-                # &quot;, unless it happens within an attribute value, which
+                # &quot; or the single quote into &apos;, unless it
-                # is handled elsewhere.
+                # happens within an attribute value, which is handled
                # elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
-            # But we do want to turn &quot; into the quotation mark.
+            # But we do want to recognize those entities on the way in and
            # convert them to Unicode characters.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
@ -310,14 +332,22 @@ class EncodingDetector:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[unicode]
        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
-        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
-            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
-            declared_encoding = declared_encoding_match.groups()[0].decode(
+            declared_encoding = declared_encoding_match.groups()[0]
                'ascii', 'replace')
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@ -16,7 +16,11 @@ except ImportError, e:
        'The soupsieve package is not installed. CSS selectors cannot be used.'
    )
-from bs4.dammit import EntitySubstitution
+from bs4.formatter import (
    Formatter,
    HTMLFormatter,
    XMLFormatter,
 )
 DEFAULT_OUTPUT_ENCODING = "utf-8"
 PY3K = (sys.version_info[0] > 2)
@ -42,6 +46,11 @@ def _alias(attr):
 class NamespacedAttribute(unicode):
    def __new__(cls, prefix, name, namespace=None):
        if not name:
            # This is the default namespace. Its name "has no value"
            # per https://www.w3.org/TR/xml-names/#defaulting
            name = None
        if name is None:
            obj = unicode.__new__(cls, prefix)
        elif prefix is None:
@ -99,138 +108,71 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)
-class HTMLAwareEntitySubstitution(EntitySubstitution):
+    
-
+class PageElement(object):
-    """Entity substitution rules that are aware of some HTML quirks.
+    """Contains the navigational information for some part of the page
-
+    (either a tag or a piece of text)"""
-    Specifically, the contents of <script> and <style> tags should not
+   
-    undergo entity substitution.
+    def setup(self, parent=None, previous_element=None, next_element=None,
-
+              previous_sibling=None, next_sibling=None):
-    Incoming NavigableString objects are checked to see if they're the
+        """Sets up the initial relations between this element and
-    direct children of a <script> or <style> tag.
+        other elements."""
-    """
+        self.parent = parent
    cdata_containing_tags = set(["script", "style"])
    preformatted_tags = set(["pre"])
    preserve_whitespace_tags = set(['pre', 'textarea'])
    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        if (isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags):
            # Do nothing.
            return ns
        # Substitute.
        return f(ns)
    @classmethod
    def substitute_html(cls, ns):
        return cls._substitute_if_appropriate(
            ns, EntitySubstitution.substitute_html)
-    @classmethod
+        self.previous_element = previous_element
-    def substitute_xml(cls, ns):
+        if previous_element is not None:
-        return cls._substitute_if_appropriate(
+            self.previous_element.next_element = self
            ns, EntitySubstitution.substitute_xml)
-class Formatter(object):
+        self.next_element = next_element
-    """Contains information about how to format a parse tree."""
+        if self.next_element is not None:
-    
+            self.next_element.previous_element = self
    # By default, represent void elements as <tag/> rather than <tag>
    void_element_close_prefix = '/'
    def substitute_entities(self, *args, **kwargs):
        """Transform certain characters into named entities."""
        raise NotImplementedError()
 class HTMLFormatter(Formatter):
    """The default HTML formatter."""
    def substitute(self, *args, **kwargs):
        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
 class MinimalHTMLFormatter(Formatter):
    """A minimal HTML formatter."""
    def substitute(self, *args, **kwargs):
        return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
 class HTML5Formatter(HTMLFormatter):
    """An HTML formatter that omits the slash in a void tag."""
    void_element_close_prefix = None
-class XMLFormatter(Formatter):
+        self.next_sibling = next_sibling
-    """Substitute only the essential XML entities."""
+        if self.next_sibling is not None:
-    def substitute(self, *args, **kwargs):
+            self.next_sibling.previous_sibling = self
        return EntitySubstitution.substitute_xml(*args, **kwargs)
-class HTMLXMLFormatter(Formatter):
+        if (previous_sibling is None
-    """Format XML using HTML rules."""
+            and self.parent is not None and self.parent.contents):
-    def substitute(self, *args, **kwargs):
+            previous_sibling = self.parent.contents[-1]
        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-    
+        self.previous_sibling = previous_sibling
-class PageElement(object):
+        if previous_sibling is not None:
-    """Contains the navigational information for some part of the page
+            self.previous_sibling.next_sibling = self
    (either a tag or a piece of text)"""
-    # There are five possible values for the "formatter" argument passed in
+    def format_string(self, s, formatter):
    # to methods like encode() and prettify():
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
    # "html5" - The same as "html", but empty void tags are represented as
    #   <tag> rather than <tag/>
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
    # A callable function - it will be called on every string that needs to undergo entity substitution.
    # A Formatter instance - Formatter.substitute(string) will be called on every string that
    #  needs to undergo entity substitution.
    #
    # In an HTML document, the default "html", "html5", and "minimal"
    # functions will leave the contents of <script> and <style> tags
    # alone. For an XML document, all tags will be given the same
    # treatment.
    HTML_FORMATTERS = {
        "html" : HTMLFormatter(),
        "html5" : HTML5Formatter(),
        "minimal" : MinimalHTMLFormatter(),
        None : None
        }
    XML_FORMATTERS = {
        "html" : HTMLXMLFormatter(),
        "minimal" : XMLFormatter(),
        None : None
        }
    def format_string(self, s, formatter='minimal'):
        """Format the given string using the given formatter."""
        if isinstance(formatter, basestring):
            formatter = self._formatter_for_name(formatter)
        if formatter is None:
-            output = s
+            return s
-        else:
+        if not isinstance(formatter, Formatter):
-            if isinstance(formatter, Callable):
+            formatter = self.formatter_for_name(formatter)
-                # Backwards compatibility -- you used to pass in a formatting method.
+        output = formatter.substitute(s)
                output = formatter(s)
            else:
                output = formatter.substitute(s)
        return output
    def formatter_for_name(self, formatter):
        """Resolve `formatter` to a Formatter object.

        :param formatter: One of three things: a Formatter object,
         which is returned unchanged; a callable, which becomes the
         entity substitution hook of a new XMLFormatter or
         HTMLFormatter; or a registry key (such as a string), which
         is looked up in the REGISTRY of XMLFormatter or
         HTMLFormatter. The XML class is chosen when self._is_xml
         is true, the HTML class otherwise.
        """
        if isinstance(formatter, Formatter):
            return formatter
        formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
        if callable(formatter):
            return formatter_class(entity_substitution=formatter)
        return formatter_class.REGISTRY[formatter]
    @property
    def _is_xml(self):
        """Is this element part of an XML tree or an HTML tree?
-        This is used when mapping a formatter name ("minimal") to an
+        This is used in formatter_for_name, when deciding whether an
-        appropriate function (one that performs entity-substitution on
+        XMLFormatter or HTMLFormatter is more appropriate. It can be
        the contents of <script> and <style> tags, or not). It can be
        inefficient, but it should be called very rarely.
        """
        if self.known_xml is not None:
@ -248,46 +190,13 @@ class PageElement(object):
            return getattr(self, 'is_xml', False)
        return self.parent._is_xml
    def _formatter_for_name(self, name):
        "Look up a formatter function based on its name and the tree."
        if self._is_xml:
            return self.XML_FORMATTERS.get(name, XMLFormatter())
        else:
            return self.HTML_FORMATTERS.get(name, HTMLFormatter())
    def setup(self, parent=None, previous_element=None, next_element=None,
              previous_sibling=None, next_sibling=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self
        self.next_element = next_element
        if self.next_element is not None:
            self.next_element.previous_element = self
        self.next_sibling = next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self
        if (previous_sibling is None
            and self.parent is not None and self.parent.contents):
            previous_sibling = self.parent.contents[-1]
        self.previous_sibling = previous_sibling
        if previous_sibling is not None:
            self.previous_sibling.next_sibling = self
    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3
    def replace_with(self, replace_with):
        if self.parent is None:
            raise ValueError(
-                "Cannot replace one element with another when the"
+                "Cannot replace one element with another when the "
                "element to be replaced is not part of a tree.")
        if replace_with is self:
            return
@ -742,6 +651,7 @@ class NavigableString(unicode, PageElement):
                    self.__class__.__name__, attr))
    def output_ready(self, formatter="minimal"):
        """Run the string through the provided formatter."""
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX
@ -760,10 +670,12 @@ class PreformattedString(NavigableString):
    but the return value will be ignored.
    """
-    def output_ready(self, formatter="minimal"):
+    def output_ready(self, formatter=None):
-        """CData strings are passed into the formatter.
+        """CData strings are passed into the formatter, purely
-        But the return value is ignored."""
+        for any side effects. The return value is ignored.
-        self.format_string(self, formatter)
+        """
        if formatter is not None:
            ignore = self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX
 class CData(PreformattedString):
@ -817,7 +729,10 @@ class Tag(PageElement):
    def __init__(self, parser=None, builder=None, name=None, namespace=None,
                 prefix=None, attrs=None, parent=None, previous=None,
-                 is_xml=None):
+                 is_xml=None, sourceline=None, sourcepos=None,
                 can_be_empty_element=None, cdata_list_attributes=None,
                 preserve_whitespace_tags=None
    ):
        "Basic constructor."
        if parser is None:
@ -831,14 +746,10 @@ class Tag(PageElement):
        self.name = name
        self.namespace = namespace
        self.prefix = prefix
-        if builder is not None:
+        if ((not builder or builder.store_line_numbers)
-            preserve_whitespace_tags = builder.preserve_whitespace_tags
+            and (sourceline is not None or sourcepos is not None)):
-        else:
+            self.sourceline = sourceline
-            if is_xml:
+            self.sourcepos = sourcepos        
                preserve_whitespace_tags = []
            else:
                preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
        self.preserve_whitespace_tags = preserve_whitespace_tags
        if attrs is None:
            attrs = {}
        elif attrs:
@ -861,12 +772,33 @@ class Tag(PageElement):
        self.setup(parent, previous)
        self.hidden = False
-        # Set up any substitutions, such as the charset in a META tag.
+        if builder is None:
-        if builder is not None:
+            # In the absence of a TreeBuilder, use whatever values were
            # passed in here. They're probably None, unless this is a copy of some
            # other tag.
            self.can_be_empty_element = can_be_empty_element
            self.cdata_list_attributes = cdata_list_attributes
            self.preserve_whitespace_tags = preserve_whitespace_tags
        else:
            # Set up any substitutions for this tag, such as the charset in a META tag.
            builder.set_up_substitutions(self)
            # Ask the TreeBuilder whether this tag might be an empty-element tag.
            self.can_be_empty_element = builder.can_be_empty_element(name)
-        else:
+
-            self.can_be_empty_element = False
+            # Keep track of the list of attributes of this tag that
            # might need to be treated as a list.
            #
            # For performance reasons, we store the whole data structure
            # rather than asking the question of every tag. Asking would
            # require building a new data structure every time, and
            # (unlike can_be_empty_element), we almost never need
            # to check this.
            self.cdata_list_attributes = builder.cdata_list_attributes
            # Keep track of the names that might cause this tag to be treated as a
            # whitespace-preserved tag.
            self.preserve_whitespace_tags = builder.preserve_whitespace_tags
    parserClass = _alias("parser_class")  # BS3
@ -874,8 +806,14 @@ class Tag(PageElement):
        """A copy of a Tag is a new Tag, unconnected to the parse tree.
        Its contents are a copy of the old Tag's contents.
        """
-        clone = type(self)(None, self.builder, self.name, self.namespace,
+        clone = type(self)(
-                           self.prefix, self.attrs, is_xml=self._is_xml)
+            None, self.builder, self.name, self.namespace,
            self.prefix, self.attrs, is_xml=self._is_xml,
            sourceline=self.sourceline, sourcepos=self.sourcepos,
            can_be_empty_element=self.can_be_empty_element,
            cdata_list_attributes=self.cdata_list_attributes,
            preserve_whitespace_tags=self.preserve_whitespace_tags
        )
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
        for child in self.contents:
@ -981,6 +919,43 @@ class Tag(PageElement):
            for element in self.contents[:]:
                element.extract()
    def smooth(self):
        """Merge runs of adjacent strings among this element's children.

        Child tags are smoothed recursively. Every pair of neighboring
        NavigableString children is replaced by one NavigableString
        holding their concatenation; PreformattedString objects are
        never merged. This makes pretty-printed output look more
        natural following a lot of operations that modified the tree.
        """
        # Remember the index of the first member of every pair that
        # has to be merged, instead of copying self.contents: in most
        # cases very few strings are affected.
        merge_at = []
        for index, child in enumerate(self.contents):
            if isinstance(child, Tag):
                # Smooth the subtree first.
                child.smooth()
            if index + 1 == len(self.contents):
                # Nothing follows the last child, so it cannot be the
                # first member of a pair.
                continue
            neighbor = self.contents[index + 1]
            both_strings = (
                isinstance(child, NavigableString)
                and isinstance(neighbor, NavigableString)
            )
            either_preformatted = (
                isinstance(child, PreformattedString)
                or isinstance(neighbor, PreformattedString)
            )
            if both_strings and not either_preformatted:
                merge_at.append(index)

        # Work from the end towards the start so that removing items
        # from .contents does not shift the positions still to be
        # processed.
        for index in reversed(merge_at):
            first = self.contents[index]
            second = self.contents[index + 1]
            second.extract()
            first.replace_with(NavigableString(first + second))
    def index(self, element):
        """
        Find the index of a child by identity, not value. Avoids issues with
@ -1115,14 +1090,6 @@ class Tag(PageElement):
        u = self.decode(indent_level, encoding, formatter)
        return u.encode(encoding, errors)
    def _should_pretty_print(self, indent_level):
        """Should this tag be pretty-printed?"""
        return (
            indent_level is not None
            and self.name not in self.preserve_whitespace_tags
        )
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
@ -1136,30 +1103,32 @@ class Tag(PageElement):
           encoding.
        """
-        # First off, turn a string formatter into a Formatter object. This
+        # First off, turn a non-Formatter `formatter` into a Formatter
-        # will stop the lookup from happening over and over again.
+        # object. This will stop the lookup from happening over and
-        if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
+        # over again.
-            formatter = self._formatter_for_name(formatter)
+        if not isinstance(formatter, Formatter):
            formatter = self.formatter_for_name(formatter)
        attributes = formatter.attributes(self)
        attrs = []
-        if self.attrs:
+        for key, val in attributes:
-            for key, val in sorted(self.attrs.items()):
+            if val is None:
-                if val is None:
+                decoded = key
-                    decoded = key
+            else:
-                else:
+                if isinstance(val, list) or isinstance(val, tuple):
-                    if isinstance(val, list) or isinstance(val, tuple):
+                    val = ' '.join(val)
-                        val = ' '.join(val)
+                elif not isinstance(val, basestring):
-                    elif not isinstance(val, basestring):
+                    val = unicode(val)
-                        val = unicode(val)
+                elif (
                    elif (
                        isinstance(val, AttributeValueWithCharsetSubstitution)
-                        and eventual_encoding is not None):
+                        and eventual_encoding is not None
-                        val = val.encode(eventual_encoding)
+                ):
-
+                    val = val.encode(eventual_encoding)
-                    text = self.format_string(val, formatter)
+
-                    decoded = (
+                text = formatter.attribute_value(val)
-                        unicode(key) + '='
+                decoded = (
-                        + EntitySubstitution.quoted_attribute_value(text))
+                    unicode(key) + '='
-                attrs.append(decoded)
+                    + formatter.quoted_attribute_value(text))
            attrs.append(decoded)
        close = ''
        closeTag = ''
@ -1168,9 +1137,7 @@ class Tag(PageElement):
            prefix = self.prefix + ":"
        if self.is_empty_element:
-            close = ''
+            close = formatter.void_element_close_prefix or ''
            if isinstance(formatter, Formatter):
                close = formatter.void_element_close_prefix or close
        else:
            closeTag = '</%s%s>' % (prefix, self.name)
@ -1185,7 +1152,8 @@ class Tag(PageElement):
        else:
            indent_contents = None
        contents = self.decode_contents(
-            indent_contents, eventual_encoding, formatter)
+            indent_contents, eventual_encoding, formatter
        )
        if self.hidden:
            # This is the 'document root' object.
@ -1217,6 +1185,16 @@ class Tag(PageElement):
            s = ''.join(s)
        return s
    def _should_pretty_print(self, indent_level):
        """Should this tag be pretty-printed?"""
        return (
            indent_level is not None
            and (
                not self.preserve_whitespace_tags
                or self.name not in self.preserve_whitespace_tags
            )
        )
    def prettify(self, encoding=None, formatter="minimal"):
        if encoding is None:
            return self.decode(True, formatter=formatter)
@ -1232,19 +1210,19 @@ class Tag(PageElement):
           indented this many spaces.
        :param eventual_encoding: The tag is destined to be
-           encoded into this encoding. This method is _not_
+           encoded into this encoding. decode_contents() is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.
-        :param formatter: The output formatter responsible for converting
+        :param formatter: A Formatter object, or a string naming one of
-           entities to Unicode characters.
+            the standard Formatters.
        """
        # First off, turn a string formatter into a Formatter object. This
        # will stop the lookup from happening over and over again.
-        if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
+        if not isinstance(formatter, Formatter):
-            formatter = self._formatter_for_name(formatter)
+            formatter = self.formatter_for_name(formatter)
        pretty_print = (indent_level is not None)
        s = []
@ -1255,16 +1233,19 @@ class Tag(PageElement):
            elif isinstance(c, Tag):
                s.append(c.decode(indent_level, eventual_encoding,
                                  formatter))
-            if text and indent_level and not self.name == 'pre':
+            preserve_whitespace = (
                self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
            )
            if text and indent_level and not preserve_whitespace:
                text = text.strip()
            if text:
-                if pretty_print and not self.name == 'pre':
+                if pretty_print and not preserve_whitespace:
                    s.append(" " * (indent_level - 1))
                s.append(text)
-                if pretty_print and not self.name == 'pre':
+                if pretty_print and not preserve_whitespace:
                    s.append("\n")
        return ''.join(s)
-
+       
    def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
--- a/lib/bs4/formatter.py
+++ b/lib/bs4/formatter.py
@ -0,0 +1,99 @@
 from bs4.dammit import EntitySubstitution
 class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.
    """

    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    # The values of `language` that _default() distinguishes between.
    HTML = 'html'
    XML = 'xml'

    # Values used for constructor arguments that were left as None
    # when the language is anything other than XML.
    HTML_DEFAULTS = dict(
        cdata_containing_tags=set(["script", "style"]),
    )

    def _default(self, language, value, kwarg):
        """Pick the effective value of a constructor argument.

        :param language: The language the formatter is being built for.
        :param value: The value the caller supplied, or None.
        :param kwarg: The name of the argument, used as the key into
         HTML_DEFAULTS.
        :return: `value` if it is not None; an empty set if `language`
         is XML; otherwise the HTML_DEFAULTS entry for `kwarg`.
        """
        if value is not None:
            return value
        if language == self.XML:
            return set()
        return self.HTML_DEFAULTS[kwarg]

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
    ):
        """
        :param language: The markup language being output; compared
         against Formatter.XML when choosing defaults.
        :param entity_substitution: A callable invoked by substitute()
         on each string that needs entity substitution, or None to
         leave strings untouched.
        :param void_element_close_prefix: By default, represent void
         elements as <tag/> rather than <tag>
        :param cdata_containing_tags: Names of tags whose string
         children are exempt from entity substitution. None selects
         the default for `language` (see _default()).
        """
        self.language = language
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )

    def substitute(self, ns):
        """Process a string that needs to undergo entity substitution.

        :param ns: The string to process.
        :return: `ns` unchanged when no entity substitution hook is
         set, or when `ns` is a NavigableString whose parent tag is
         named in cdata_containing_tags; otherwise the result of
         calling the hook on `ns`.
        """
        if not self.entity_substitution:
            return ns
        # Imported here rather than at module level because bs4.element
        # imports this module. The absolute form matches the
        # `from bs4.dammit import ...` import used by this file and does
        # not rely on implicit relative imports.
        from bs4.element import NavigableString
        if (isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags):
            # Do nothing.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value):
        """Process the value of an attribute.

        :param value: The attribute value as a string.
        :return: The result of running `value` through substitute().
        """
        return self.substitute(value)

    def attributes(self, tag):
        """Reorder a tag's attributes however you want.

        :param tag: An object whose `attrs` attribute is a dictionary.
        :return: The (key, value) pairs of tag.attrs as a sorted list.
        """
        return sorted(tag.attrs.items())
 class HTMLFormatter(Formatter):
    """A Formatter whose language is always Formatter.HTML."""

    # Maps a formatter name to a ready-made HTMLFormatter instance.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a Formatter for HTML.

        All arguments are forwarded to Formatter.__init__ after the
        language, which is fixed to self.HTML.
        """
        super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
 class XMLFormatter(Formatter):
    """A Formatter whose language is always Formatter.XML."""

    # Maps a formatter name to a ready-made formatter instance.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a Formatter for XML.

        All arguments are forwarded to Formatter.__init__ after the
        language, which is fixed to self.XML.
        """
        super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
 # Set up aliases for the default formatters.
 #
 # "html" substitutes entities with EntitySubstitution.substitute_html.
 HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
 )
 # "html5" is the same as "html", but has no void element close prefix,
 # so void elements are written as <tag> rather than <tag/>.
 HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None
 )
 # "minimal" substitutes entities with EntitySubstitution.substitute_xml.
 HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
 )
 # None is the null formatter: no entity substitution at all.
 HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
 )
 XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
 )
 XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
 )
 # The null formatter for XML. This must be an XMLFormatter so that its
 # language is Formatter.XML; wrapping one Formatter in another would
 # pass a Formatter instance as the language and select HTML defaults.
 XMLFormatter.REGISTRY[None] = XMLFormatter(
    entity_substitution=None
 )