Merge branch 'feature/UpdateBS4' into develop

6 years ago · f7136b1cdd
9 changed files with 550 additions and 284 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@ -1,6 +1,7 @@
 ### 0.21.0 (2019-xx-xx xx:xx:xx UTC)

 * Update attr 19.2.0.dev0 (de84609) to 19.2.0.dev0 (154b4e5)
+* Update Beautiful Soup 4.7.1 (r497) to 4.8.0 (r526)
 * Update Certifi 2019.03.09 (401100f) to 2019.06.16 (84dc766)
 * Update DiskCache library 3.1.1 (2649ac9) to 4.0.0 (2c79bb9)
 * Update feedparser 5.2.1 (2b11c80) to 5.2.1 (cbe18d0)
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@ -18,7 +18,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 """

 __author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.7.1"
+__version__ = "4.8.0"
 __copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
 # Use of this source code is governed by the MIT license.
 __license__ = "MIT"
@ -63,7 +63,7 @@ class BeautifulSoup(Tag):
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
-      endData(containerClass=NavigableString) # Ends the current data node
+      endData(containerClass) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
@ -78,14 +78,14 @@ class BeautifulSoup(Tag):
    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
+   
    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, exclude_encodings=None,
-                 **kwargs):
+                 element_classes=None, **kwargs):
        """Constructor.

        :param markup: A string or a file-like object representing
@ -98,8 +98,10 @@ class BeautifulSoup(Tag):
        name a specific parser, so that Beautiful Soup gives you the
        same results across platforms and virtual environments.

-        :param builder: A specific TreeBuilder to use instead of looking one
-        up based on `features`. You shouldn't need to use this.
+        :param builder: A TreeBuilder subclass to instantiate (or
+        instance to use) instead of looking one up based on
+        `features`. You only need to use this if you've implemented a
+        custom TreeBuilder.

        :param parse_only: A SoupStrainer. Only parts of the document
        matching the SoupStrainer will be considered. This is useful
@ -115,14 +117,26 @@ class BeautifulSoup(Tag):
        the document's encoding but you know Beautiful Soup's guess is
        wrong.

+        :param element_classes: A dictionary mapping BeautifulSoup
+        classes like Tag and NavigableString to other classes you'd
+        like to be instantiated instead as the parse tree is
+        built. This is useful for using subclasses to modify the
+        default behavior of Tag or NavigableString.
+
        :param kwargs: For backwards compatibility purposes, the
        constructor accepts certain keyword arguments used in
        Beautiful Soup 3. None of these arguments do anything in
-        Beautiful Soup 4 and there's no need to actually pass keyword
-        arguments into the constructor.
+        Beautiful Soup 4; they will result in a warning and then be ignored.
+
+        Apart from this, any keyword arguments passed into the BeautifulSoup
+        constructor are propagated to the TreeBuilder constructor. This
+        makes it possible to configure a TreeBuilder beyond saying
+        which one to use.
+
        """

        if 'convertEntities' in kwargs:
+            del kwargs['convertEntities']
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
@ -177,13 +191,19 @@ class BeautifulSoup(Tag):
            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
            from_encoding = None

-        if len(kwargs) > 0:
-            arg = kwargs.keys().pop()
-            raise TypeError(
-                "__init__() got an unexpected keyword argument '%s'" % arg)
-
-        if builder is None:
-            original_features = features
+        self.element_classes = element_classes or dict()
+
+        # We need this information to track whether or not the builder
+        # was specified well enough that we can omit the 'you need to
+        # specify a parser' warning.
+        original_builder = builder
+        original_features = features
+            
+        if isinstance(builder, type):
+            # A builder class was passed in; it needs to be instantiated.
+            builder_class = builder
+            builder = None
+        elif builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
@ -194,9 +214,16 @@ class BeautifulSoup(Tag):
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
-            builder = builder_class()
-            if not (original_features == builder.NAME or
-                    original_features in builder.ALTERNATE_NAMES):
+
+        # At this point either we have a TreeBuilder instance in
+        # builder, or we have a builder_class that we can instantiate
+        # with the remaining **kwargs.
+        if builder is None:
+            builder = builder_class(**kwargs)
+            if not original_builder and not (
+                    original_features == builder.NAME or
+                    original_features in builder.ALTERNATE_NAMES
+            ):
                if builder.is_xml:
                    markup_type = "XML"
                else:
@ -231,7 +258,10 @@ class BeautifulSoup(Tag):
                        markup_type=markup_type
                    )
                    warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
-
+        else:
+            if kwargs:
+                warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
+                    
        self.builder = builder
        self.is_xml = builder.is_xml
        self.known_xml = self.is_xml
@ -272,6 +302,8 @@ class BeautifulSoup(Tag):
                    ' Beautiful Soup.' % markup)
            self._check_markup_is_url(markup)

+        rejections = []
+        success = False
        for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
@ -279,10 +311,18 @@ class BeautifulSoup(Tag):
            self.reset()
            try:
                self._feed()
+                success = True
                break
-            except ParserRejectedMarkup:
+            except ParserRejectedMarkup as e:
+                rejections.append(e)
                pass

+        if not success:
+            other_exceptions = [unicode(e) for e in rejections]
+            raise ParserRejectedMarkup(
+                u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
+            )
+
        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
@ -355,13 +395,20 @@ class BeautifulSoup(Tag):
        self.preserve_whitespace_tag_stack = []
        self.pushTag(self)

-    def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
+    def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
+                sourceline=None, sourcepos=None, **kwattrs):
        """Create a new tag associated with this soup."""
        kwattrs.update(attrs)
-        return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
+        return self.element_classes.get(Tag, Tag)(
+            None, self.builder, name, namespace, nsprefix, kwattrs,
+            sourceline=sourceline, sourcepos=sourcepos
+        )

-    def new_string(self, s, subclass=NavigableString):
+    def new_string(self, s, subclass=None):
        """Create a new NavigableString associated with this soup."""
+        subclass = subclass or self.element_classes.get(
+            NavigableString, NavigableString
+        )
        return subclass(s)

    def insert_before(self, successor):
@ -388,7 +435,17 @@ class BeautifulSoup(Tag):
        if tag.name in self.builder.preserve_whitespace_tags:
            self.preserve_whitespace_tag_stack.append(tag)

-    def endData(self, containerClass=NavigableString):
+    def endData(self, containerClass=None):
+
+        # Default container is NavigableString.
+        containerClass = containerClass or NavigableString
+
+        # The user may want us to instantiate some alias for the
+        # container class.
+        containerClass = self.element_classes.get(
+            containerClass, containerClass
+        )
+        
        if self.current_data:
            current_data = u''.join(self.current_data)
            # If whitespace is not preserved, and this string contains
@ -509,7 +566,8 @@ class BeautifulSoup(Tag):

        return most_recently_popped

-    def handle_starttag(self, name, namespace, nsprefix, attrs):
+    def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
+                        sourcepos=None):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
@ -526,8 +584,11 @@ class BeautifulSoup(Tag):
                 or not self.parse_only.search_tag(name, attrs))):
            return None

-        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
-                  self.currentTag, self._most_recent_element)
+        tag = self.element_classes.get(Tag, Tag)(
+            self, self.builder, name, namespace, nsprefix, attrs,
+            self.currentTag, self._most_recent_element,
+            sourceline=sourceline, sourcepos=sourcepos
+        )
        if tag is None:
            return tag
        if self._most_recent_element is not None:
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@ -7,7 +7,6 @@ import sys
 from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
-    HTMLAwareEntitySubstitution,
    nonwhitespace_re
    )

@ -90,18 +89,58 @@ class TreeBuilder(object):

    is_xml = False
    picklable = False
-    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.
    
    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
-    cdata_list_attributes = {}
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {}

+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
+    
+    USE_DEFAULT = object()

-    def __init__(self):
+    # Most parsers don't keep track of line numbers.
+    TRACKS_LINE_NUMBERS = False
+    
+    def __init__(self, multi_valued_attributes=USE_DEFAULT,
+                 preserve_whitespace_tags=USE_DEFAULT,
+                 store_line_numbers=USE_DEFAULT):
+        """Constructor.
+
+        :param multi_valued_attributes: If this is set to None, the
+        TreeBuilder will not turn any values for attributes like
+        'class' into lists. Setting this to a dictionary will
+        customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
+        for an example.
+
+        Internally, these are called "CDATA list attributes", but that
+        probably doesn't make sense to an end-user, so the argument name
+        is `multi_valued_attributes`.
+
+        :param preserve_whitespace_tags: A list of tags to treat
+        the way <pre> tags are treated in HTML. Tags in this list
+        will have their whitespace preserved.
+
+        :param store_line_numbers: If the parser keeps track of the
+        line numbers and positions of the original markup, that
+        information will, by default, be stored in each corresponding
+        `Tag` object. You can turn this off by passing
+        store_line_numbers=False. If the parser you're using doesn't 
+        keep track of this information, then setting store_line_numbers=True
+        will do nothing.
+        """
        self.soup = None
-
+        if multi_valued_attributes is self.USE_DEFAULT:
+            multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
+        self.cdata_list_attributes = multi_valued_attributes
+        if preserve_whitespace_tags is self.USE_DEFAULT:
+            preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
+        self.preserve_whitespace_tags = preserve_whitespace_tags
+        if store_line_numbers == self.USE_DEFAULT:
+            store_line_numbers = self.TRACKS_LINE_NUMBERS
+        self.store_line_numbers = store_line_numbers
+        
    def initialize_soup(self, soup):
        """The BeautifulSoup object has been initialized and is now
        being associated with the TreeBuilder.
@ -131,13 +170,13 @@ class TreeBuilder(object):
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags
-        
+    
    def feed(self, markup):
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
-        return markup, None, None, False
+                       document_declared_encoding=None, exclude_encodings=None):
+        yield markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.
@ -237,7 +276,6 @@ class HTMLTreeBuilder(TreeBuilder):
    Such as which tags are empty-element tags.
    """

-    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
    empty_element_tags = set([
        # These are from HTML5.
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
@ -259,7 +297,7 @@ class HTMLTreeBuilder(TreeBuilder):
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
-    cdata_list_attributes = {
+    DEFAULT_CDATA_LIST_ATTRIBUTES = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" :  ['rel', 'rev'],
@ -276,6 +314,8 @@ class HTMLTreeBuilder(TreeBuilder):
        "output" : ["for"],
        }

+    DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
+    
    def set_up_substitutions(self, tag):
        # We are only interested in <meta> tags
        if tag.name != 'meta':
@ -323,8 +363,15 @@ def register_treebuilders_from(module):
            this_module.builder_registry.register(obj)

 class ParserRejectedMarkup(Exception):
-    pass
-
+    def __init__(self, message_or_exception):
+        """Explain why the parser rejected the given markup, either
+        with a textual explanation or another exception.
+        """
+        if isinstance(message_or_exception, Exception):
+            e = message_or_exception
+            message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
+        super(ParserRejectedMarkup, self).__init__(message_or_exception)
+            
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@ -45,6 +45,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):

    features = [NAME, PERMISSIVE, HTML_5, HTML]

+    # html5lib can tell us which line number and position in the
+    # original file is the source of an element.
+    TRACKS_LINE_NUMBERS = True
+    
    def prepare_markup(self, markup, user_specified_encoding,
                       document_declared_encoding=None, exclude_encodings=None):
        # Store the user-specified encoding for use later on.
@ -62,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-
+        self.underlying_builder.parser = parser
        extra_kwargs = dict()
        if not isinstance(markup, unicode):
            if new_html5lib:
@ -70,7 +74,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
            else:
                extra_kwargs['encoding'] = self.user_specified_encoding
        doc = parser.parse(markup, **extra_kwargs)
-
+        
        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
@ -84,10 +88,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
                # with other tree builders.
                original_encoding = original_encoding.name
            doc.original_encoding = original_encoding
-
+        self.underlying_builder.parser = None
+            
    def create_treebuilder(self, namespaceHTMLElements):
        self.underlying_builder = TreeBuilderForHtml5lib(
-            namespaceHTMLElements, self.soup)
+            namespaceHTMLElements, self.soup,
+            store_line_numbers=self.store_line_numbers
+        )
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
@ -96,15 +103,26 @@ class HTML5TreeBuilder(HTMLTreeBuilder):


 class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
-
-    def __init__(self, namespaceHTMLElements, soup=None):
+    
+    def __init__(self, namespaceHTMLElements, soup=None,
+                 store_line_numbers=True, **kwargs):
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
-            self.soup = BeautifulSoup("", "html.parser")
+            # TODO: Why is the parser 'html.parser' here? To avoid an
+            # infinite loop?
+            self.soup = BeautifulSoup(
+                "", "html.parser", store_line_numbers=store_line_numbers,
+                **kwargs
+            )
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

+        # This will be set later to an html5lib.html5parser.HTMLParser
+        # object, which we can use to track the current line number.
+        self.parser = None
+        self.store_line_numbers = store_line_numbers
+        
    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)
@ -118,7 +136,16 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
-        tag = self.soup.new_tag(name, namespace)
+        kwargs = {}
+        if self.parser and self.store_line_numbers:
+            # This represents the point immediately after the end of the
+            # tag. We don't know when the tag started, but we do know
+            # where it ended -- the character just before this one.
+            sourceline, sourcepos = self.parser.tokenizer.stream.position()
+            kwargs['sourceline'] = sourceline
+            kwargs['sourcepos'] = sourcepos-1
+        tag = self.soup.new_tag(name, namespace, **kwargs)
+
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
@ -126,6 +153,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):

    def fragmentClass(self):
        from bs4 import BeautifulSoup
+        # TODO: Why is the parser 'html.parser' here? To avoid an
+        # infinite loop?
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)
@ -199,7 +228,7 @@ class AttrList(object):
    def __setitem__(self, name, value):
        # If this attribute is a multi-valued attribute for this element,
        # turn its value into a list.
-        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        list_attr = self.element.cdata_list_attributes
        if (name in list_attr['*']
            or (self.element.name in list_attr
                and name in list_attr[self.element.name])):
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -99,7 +99,11 @@ class BeautifulSoupHTMLParser(HTMLParser):
            attr_dict[key] = value
            attrvalue = '""'
        #print "START", name
-        tag = self.soup.handle_starttag(name, None, None, attr_dict)
+        sourceline, sourcepos = self.getpos()
+        tag = self.soup.handle_starttag(
+            name, None, None, attr_dict, sourceline=sourceline,
+            sourcepos=sourcepos
+        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
@ -214,12 +218,19 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

-    def __init__(self, *args, **kwargs):
+    # The html.parser knows which line number and position in the
+    # original file is the source of an element.
+    TRACKS_LINE_NUMBERS = True
+    
+    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
+        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
+        parser_args = parser_args or []
+        parser_kwargs = parser_kwargs or {}
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
-            kwargs['strict'] = False
+            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
-            kwargs['convert_charrefs'] = False
-        self.parser_args = (args, kwargs)
+            parser_kwargs['convert_charrefs'] = False
+        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@ -57,6 +57,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):

    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

+    # NOTE: If we parsed Element objects and looked at .sourceline,
+    # we'd be able to see the line numbers from the original document.
+    # But instead we build an XMLParser or HTMLParser object to serve
+    # as the target of parse messages, and those messages don't include
+    # line numbers.
+    
    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.
@ -94,7 +100,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            parser = parser(target=self, strip_cdata=False, encoding=encoding)
        return parser

-    def __init__(self, parser=None, empty_element_tags=None):
+    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
@ -103,6 +109,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
+        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
        
    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
@ -168,7 +175,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)

    def close(self):
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
@ -287,7 +294,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
            self.parser.feed(markup)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
-            raise ParserRejectedMarkup(str(e))
+            raise ParserRejectedMarkup(e)


    def test_fragment_to_document(self, fragment):
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@ -22,6 +22,8 @@ try:
    #  PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
+        if isinstance(s, unicode):
+            return None
        return cchardet.detect(s)['encoding']
 except ImportError:
    try:
@ -30,6 +32,8 @@ except ImportError:
        #  PyPI package: chardet
        import chardet
        def chardet_dammit(s):
+            if isinstance(s, unicode):
+                return None
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
@ -44,10 +48,19 @@ try:
 except ImportError:
    pass

-xml_encoding_re = re.compile(
-    '^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
-html_meta_re = re.compile(
-    '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
+# Build bytestring and Unicode versions of regular expressions for finding
+# a declared encoding inside an XML or HTML document.
+xml_encoding = u'^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
+html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
+encoding_res = dict()
+encoding_res[bytes] = {
+    'html' : re.compile(html_meta.encode("ascii"), re.I),
+    'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
+}
+encoding_res[unicode] = {
+    'html' : re.compile(html_meta, re.I),
+    'xml' : re.compile(xml_encoding, re.I)
+}

 class EntitySubstitution(object):

@ -57,15 +70,24 @@ class EntitySubstitution(object):
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
-        for codepoint, name in list(codepoint2name.items()):
+
+        # &apos; is an XHTML entity and an HTML 5 entity, but not an HTML 4
+        # entity. We don't want to use it, but we want to recognize it on the way in.
+        #
+        # TODO: Ideally we would be able to recognize all HTML 5 named
+        # entities, but that's a little tricky.
+        extra = [(39, 'apos')]
+        for codepoint, name in list(codepoint2name.items()) + extra:
            character = unichr(codepoint)
-            if codepoint != 34:
+            if codepoint not in (34, 39):
                # There's no point in turning the quotation mark into
-                # &quot;, unless it happens within an attribute value, which
-                # is handled elsewhere.
+                # &quot; or the single quote into &apos;, unless it
+                # happens within an attribute value, which is handled
+                # elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
-            # But we do want to turn &quot; into the quotation mark.
+            # But we do want to recognize those entities on the way in and
+            # convert them to Unicode characters.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
@ -310,14 +332,22 @@ class EncodingDetector:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

+        if isinstance(markup, bytes):
+            res = encoding_res[bytes]
+        else:
+            res = encoding_res[unicode]
+
+        xml_re = res['xml']
+        html_re = res['html']
        declared_encoding = None
-        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
-            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
-            declared_encoding = declared_encoding_match.groups()[0].decode(
-                'ascii', 'replace')
+            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
+            if isinstance(declared_encoding, bytes):
+                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None

--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
@ -16,7 +16,11 @@ except ImportError, e:
        'The soupsieve package is not installed. CSS selectors cannot be used.'
    )

-from bs4.dammit import EntitySubstitution
+from bs4.formatter import (
+    Formatter,
+    HTMLFormatter,
+    XMLFormatter,
+)

 DEFAULT_OUTPUT_ENCODING = "utf-8"
 PY3K = (sys.version_info[0] > 2)
@ -42,6 +46,11 @@ def _alias(attr):
 class NamespacedAttribute(unicode):

    def __new__(cls, prefix, name, namespace=None):
+        if not name:
+            # This is the default namespace. Its name "has no value"
+            # per https://www.w3.org/TR/xml-names/#defaulting
+            name = None
+
        if name is None:
            obj = unicode.__new__(cls, prefix)
        elif prefix is None:
@ -99,138 +108,71 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)

-class HTMLAwareEntitySubstitution(EntitySubstitution):
-
-    """Entity substitution rules that are aware of some HTML quirks.
-
-    Specifically, the contents of <script> and <style> tags should not
-    undergo entity substitution.
-
-    Incoming NavigableString objects are checked to see if they're the
-    direct children of a <script> or <style> tag.
-    """
-
-    cdata_containing_tags = set(["script", "style"])
-
-    preformatted_tags = set(["pre"])
-
-    preserve_whitespace_tags = set(['pre', 'textarea'])
-
-    @classmethod
-    def _substitute_if_appropriate(cls, ns, f):
-        if (isinstance(ns, NavigableString)
-            and ns.parent is not None
-            and ns.parent.name in cls.cdata_containing_tags):
-            # Do nothing.
-            return ns
-        # Substitute.
-        return f(ns)
-
-    @classmethod
-    def substitute_html(cls, ns):
-        return cls._substitute_if_appropriate(
-            ns, EntitySubstitution.substitute_html)
+    
+class PageElement(object):
+    """Contains the navigational information for some part of the page
+    (either a tag or a piece of text)"""
+   
+    def setup(self, parent=None, previous_element=None, next_element=None,
+              previous_sibling=None, next_sibling=None):
+        """Sets up the initial relations between this element and
+        other elements."""
+        self.parent = parent

-    @classmethod
-    def substitute_xml(cls, ns):
-        return cls._substitute_if_appropriate(
-            ns, EntitySubstitution.substitute_xml)
+        self.previous_element = previous_element
+        if previous_element is not None:
+            self.previous_element.next_element = self

-class Formatter(object):
-    """Contains information about how to format a parse tree."""
-    
-    # By default, represent void elements as <tag/> rather than <tag>
-    void_element_close_prefix = '/'
-
-    def substitute_entities(self, *args, **kwargs):
-        """Transform certain characters into named entities."""
-        raise NotImplementedError()
-
-class HTMLFormatter(Formatter):
-    """The default HTML formatter."""
-    def substitute(self, *args, **kwargs):
-        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
-
-class MinimalHTMLFormatter(Formatter):
-    """A minimal HTML formatter."""
-    def substitute(self, *args, **kwargs):
-        return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
-    
-class HTML5Formatter(HTMLFormatter):
-    """An HTML formatter that omits the slash in a void tag."""
-    void_element_close_prefix = None
+        self.next_element = next_element
+        if self.next_element is not None:
+            self.next_element.previous_element = self

-class XMLFormatter(Formatter):
-    """Substitute only the essential XML entities."""
-    def substitute(self, *args, **kwargs):
-        return EntitySubstitution.substitute_xml(*args, **kwargs)
+        self.next_sibling = next_sibling
+        if self.next_sibling is not None:
+            self.next_sibling.previous_sibling = self

-class HTMLXMLFormatter(Formatter):
-    """Format XML using HTML rules."""
-    def substitute(self, *args, **kwargs):
-        return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
+        if (previous_sibling is None
+            and self.parent is not None and self.parent.contents):
+            previous_sibling = self.parent.contents[-1]

-    
-class PageElement(object):
-    """Contains the navigational information for some part of the page
-    (either a tag or a piece of text)"""
+        self.previous_sibling = previous_sibling
+        if previous_sibling is not None:
+            self.previous_sibling.next_sibling = self

-    # There are five possible values for the "formatter" argument passed in
-    # to methods like encode() and prettify():
-    #
-    # "html" - All Unicode characters with corresponding HTML entities
-    #   are converted to those entities on output.
-    # "html5" - The same as "html", but empty void tags are represented as
-    #   <tag> rather than <tag/>
-    # "minimal" - Bare ampersands and angle brackets are converted to
-    #   XML entities: &amp; &lt; &gt;
-    # None - The null formatter. Unicode characters are never
-    #   converted to entities.  This is not recommended, but it's
-    #   faster than "minimal".
-    # A callable function - it will be called on every string that needs to undergo entity substitution.
-    # A Formatter instance - Formatter.substitute(string) will be called on every string that
-    #  needs to undergo entity substitution.
-    #
-
-    # In an HTML document, the default "html", "html5", and "minimal"
-    # functions will leave the contents of <script> and <style> tags
-    # alone. For an XML document, all tags will be given the same
-    # treatment.
-
-    HTML_FORMATTERS = {
-        "html" : HTMLFormatter(),
-        "html5" : HTML5Formatter(),
-        "minimal" : MinimalHTMLFormatter(),
-        None : None
-        }
-
-    XML_FORMATTERS = {
-        "html" : HTMLXMLFormatter(),
-        "minimal" : XMLFormatter(),
-        None : None
-        }
-
-    def format_string(self, s, formatter='minimal'):
+    def format_string(self, s, formatter):
        """Format the given string using the given formatter."""
-        if isinstance(formatter, basestring):
-            formatter = self._formatter_for_name(formatter)
        if formatter is None:
-            output = s
-        else:
-            if isinstance(formatter, Callable):
-                # Backwards compatibility -- you used to pass in a formatting method.
-                output = formatter(s)
-            else:
-                output = formatter.substitute(s)
+            # None means "no formatting": hand the string back untouched.
+            return s
+        # Anything that is not already a Formatter is resolved into
+        # one through formatter_for_name() before being used.
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)
+        output = formatter.substitute(s)
        return output

+    def formatter_for_name(self, formatter):
+        """Resolve `formatter` into a Formatter object.
+
+        :param formatter: One of:
+          * a Formatter instance, which is returned unchanged;
+          * a callable, which becomes the entity_substitution hook of
+            a newly created XMLFormatter or HTMLFormatter;
+          * any other value, which is used as a key into the REGISTRY
+            of XMLFormatter or HTMLFormatter.
+
+        XMLFormatter is chosen when self._is_xml is true, and
+        HTMLFormatter otherwise.
+
+        :return: A Formatter object.
+        :raise KeyError: If `formatter` is neither a Formatter nor a
+          callable, and is not a key of the chosen REGISTRY.
+        """
+        if isinstance(formatter, Formatter):
+            return formatter
+        formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
+        if not callable(formatter):
+            return formatter_class.REGISTRY[formatter]
+        return formatter_class(entity_substitution=formatter)
+
    @property
    def _is_xml(self):
        """Is this element part of an XML tree or an HTML tree?

-        This is used when mapping a formatter name ("minimal") to an
-        appropriate function (one that performs entity-substitution on
-        the contents of <script> and <style> tags, or not). It can be
+        This is used in formatter_for_name, when deciding whether an
+        XMLFormatter or HTMLFormatter is more appropriate. It can be
        inefficient, but it should be called very rarely.
        """
        if self.known_xml is not None:
@ -248,46 +190,13 @@ class PageElement(object):
            return getattr(self, 'is_xml', False)
        return self.parent._is_xml

-    def _formatter_for_name(self, name):
-        "Look up a formatter function based on its name and the tree."
-        if self._is_xml:
-            return self.XML_FORMATTERS.get(name, XMLFormatter())
-        else:
-            return self.HTML_FORMATTERS.get(name, HTMLFormatter())
-
-    def setup(self, parent=None, previous_element=None, next_element=None,
-              previous_sibling=None, next_sibling=None):
-        """Sets up the initial relations between this element and
-        other elements."""
-        self.parent = parent
-
-        self.previous_element = previous_element
-        if previous_element is not None:
-            self.previous_element.next_element = self
-
-        self.next_element = next_element
-        if self.next_element is not None:
-            self.next_element.previous_element = self
-
-        self.next_sibling = next_sibling
-        if self.next_sibling is not None:
-            self.next_sibling.previous_sibling = self
-
-        if (previous_sibling is None
-            and self.parent is not None and self.parent.contents):
-            previous_sibling = self.parent.contents[-1]
-
-        self.previous_sibling = previous_sibling
-        if previous_sibling is not None:
-            self.previous_sibling.next_sibling = self
-
    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
        if self.parent is None:
            raise ValueError(
-                "Cannot replace one element with another when the"
+                "Cannot replace one element with another when the "
                "element to be replaced is not part of a tree.")
        if replace_with is self:
            return
@ -742,6 +651,7 @@ class NavigableString(unicode, PageElement):
                    self.__class__.__name__, attr))

    def output_ready(self, formatter="minimal"):
+        """Run the string through the provided formatter.
+
+        :param formatter: Passed to format_string() together with
+            this string.
+        :return: The formatted string with this class's PREFIX put in
+            front of it and its SUFFIX appended.
+        """
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX

@ -760,10 +670,12 @@ class PreformattedString(NavigableString):
    but the return value will be ignored.
    """

-    def output_ready(self, formatter="minimal"):
-        """CData strings are passed into the formatter.
-        But the return value is ignored."""
-        self.format_string(self, formatter)
+    def output_ready(self, formatter=None):
+        """Return this string wrapped in PREFIX and SUFFIX, unformatted.
+
+        The string is still passed into the formatter, purely for any
+        side effects. The formatter's return value is ignored.
+
+        :param formatter: Anything format_string() accepts, or None
+            to skip the formatter call altogether.
+        :return: PREFIX + this string, unmodified + SUFFIX.
+        """
+        if formatter is not None:
+            # Called for side effects only; the result is deliberately
+            # discarded rather than bound to a throwaway name.
+            self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX

 class CData(PreformattedString):
@ -817,7 +729,10 @@ class Tag(PageElement):

    def __init__(self, parser=None, builder=None, name=None, namespace=None,
                 prefix=None, attrs=None, parent=None, previous=None,
-                 is_xml=None):
+                 is_xml=None, sourceline=None, sourcepos=None,
+                 can_be_empty_element=None, cdata_list_attributes=None,
+                 preserve_whitespace_tags=None
+    ):
        "Basic constructor."

        if parser is None:
@ -831,14 +746,10 @@ class Tag(PageElement):
        self.name = name
        self.namespace = namespace
        self.prefix = prefix
-        if builder is not None:
-            preserve_whitespace_tags = builder.preserve_whitespace_tags
-        else:
-            if is_xml:
-                preserve_whitespace_tags = []
-            else:
-                preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
-        self.preserve_whitespace_tags = preserve_whitespace_tags
+        if ((not builder or builder.store_line_numbers)
+            and (sourceline is not None or sourcepos is not None)):
+            self.sourceline = sourceline
+            self.sourcepos = sourcepos        
        if attrs is None:
            attrs = {}
        elif attrs:
@ -861,12 +772,33 @@ class Tag(PageElement):
        self.setup(parent, previous)
        self.hidden = False

-        # Set up any substitutions, such as the charset in a META tag.
-        if builder is not None:
+        if builder is None:
+            # In the absence of a TreeBuilder, use whatever values were
+            # passed in here. They're probably None, unless this is a copy of some
+            # other tag.
+            self.can_be_empty_element = can_be_empty_element
+            self.cdata_list_attributes = cdata_list_attributes
+            self.preserve_whitespace_tags = preserve_whitespace_tags
+        else:
+            # Set up any substitutions for this tag, such as the charset in a META tag.
            builder.set_up_substitutions(self)
+
+            # Ask the TreeBuilder whether this tag might be an empty-element tag.
            self.can_be_empty_element = builder.can_be_empty_element(name)
-        else:
-            self.can_be_empty_element = False
+
+            # Keep track of the list of attributes of this tag that
+            # might need to be treated as a list.
+            #
+            # For performance reasons, we store the whole data structure
+            # rather than asking the question of every tag. Asking would
+            # require building a new data structure every time, and
+            # (unlike can_be_empty_element), we almost never need
+            # to check this.
+            self.cdata_list_attributes = builder.cdata_list_attributes
+
+            # Keep track of the names that might cause this tag to be treated as a
+            # whitespace-preserved tag.
+            self.preserve_whitespace_tags = builder.preserve_whitespace_tags
            
    parserClass = _alias("parser_class")  # BS3

@ -874,8 +806,14 @@ class Tag(PageElement):
        """A copy of a Tag is a new Tag, unconnected to the parse tree.
        Its contents are a copy of the old Tag's contents.
        """
-        clone = type(self)(None, self.builder, self.name, self.namespace,
-                           self.prefix, self.attrs, is_xml=self._is_xml)
+        clone = type(self)(
+            None, self.builder, self.name, self.namespace,
+            self.prefix, self.attrs, is_xml=self._is_xml,
+            sourceline=self.sourceline, sourcepos=self.sourcepos,
+            can_be_empty_element=self.can_be_empty_element,
+            cdata_list_attributes=self.cdata_list_attributes,
+            preserve_whitespace_tags=self.preserve_whitespace_tags
+        )
        for attr in ('can_be_empty_element', 'hidden'):
            setattr(clone, attr, getattr(self, attr))
        for child in self.contents:
@ -981,6 +919,43 @@ class Tag(PageElement):
            for element in self.contents[:]:
                element.extract()

+    def smooth(self):
+        """Consolidate runs of adjacent strings among this element's children.
+
+        Every Tag child is smoothed recursively. In addition, each
+        pair of neighboring NavigableString children, neither of which
+        is a PreformattedString, is replaced by one NavigableString
+        holding their concatenation.
+
+        This makes pretty-printed output look more natural following a
+        lot of operations that modified the tree.
+        """
+        # Recursively smooth the child tags.
+        for child in self.contents:
+            if isinstance(child, Tag):
+                child.smooth()
+
+        def is_plain_string(node):
+            return (isinstance(node, NavigableString)
+                    and not isinstance(node, PreformattedString))
+
+        # Record the position of the first member of every pair that
+        # needs consolidating, rather than copying self.contents; in
+        # most cases very few strings will be affected.
+        pair_starts = []
+        for position in range(len(self.contents) - 1):
+            first = self.contents[position]
+            second = self.contents[position + 1]
+            if is_plain_string(first) and is_plain_string(second):
+                pair_starts.append(position)
+
+        # Work from the end backwards, so that removing an item from
+        # .contents never shifts a position still waiting its turn.
+        for position in reversed(pair_starts):
+            first = self.contents[position]
+            second = self.contents[position + 1]
+            second.extract()
+            first.replace_with(NavigableString(first + second))
+
    def index(self, element):
        """
        Find the index of a child by identity, not value. Avoids issues with
@ -1115,14 +1090,6 @@ class Tag(PageElement):
        u = self.decode(indent_level, encoding, formatter)
        return u.encode(encoding, errors)

-    def _should_pretty_print(self, indent_level):
-        """Should this tag be pretty-printed?"""
-
-        return (
-            indent_level is not None
-            and self.name not in self.preserve_whitespace_tags
-        )
-
    def decode(self, indent_level=None,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
@ -1136,30 +1103,32 @@ class Tag(PageElement):
           encoding.
        """

-        # First off, turn a string formatter into a Formatter object. This
-        # will stop the lookup from happening over and over again.
-        if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
-            formatter = self._formatter_for_name(formatter)
+        # First off, turn a non-Formatter `formatter` into a Formatter
+        # object. This will stop the lookup from happening over and
+        # over again.
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)
+        attributes = formatter.attributes(self)
        attrs = []
-        if self.attrs:
-            for key, val in sorted(self.attrs.items()):
-                if val is None:
-                    decoded = key
-                else:
-                    if isinstance(val, list) or isinstance(val, tuple):
-                        val = ' '.join(val)
-                    elif not isinstance(val, basestring):
-                        val = unicode(val)
-                    elif (
+        for key, val in attributes:
+            if val is None:
+                decoded = key
+            else:
+                if isinstance(val, list) or isinstance(val, tuple):
+                    val = ' '.join(val)
+                elif not isinstance(val, basestring):
+                    val = unicode(val)
+                elif (
                        isinstance(val, AttributeValueWithCharsetSubstitution)
-                        and eventual_encoding is not None):
-                        val = val.encode(eventual_encoding)
-
-                    text = self.format_string(val, formatter)
-                    decoded = (
-                        unicode(key) + '='
-                        + EntitySubstitution.quoted_attribute_value(text))
-                attrs.append(decoded)
+                        and eventual_encoding is not None
+                ):
+                    val = val.encode(eventual_encoding)
+
+                text = formatter.attribute_value(val)
+                decoded = (
+                    unicode(key) + '='
+                    + formatter.quoted_attribute_value(text))
+            attrs.append(decoded)
        close = ''
        closeTag = ''

@ -1168,9 +1137,7 @@ class Tag(PageElement):
            prefix = self.prefix + ":"

        if self.is_empty_element:
-            close = ''
-            if isinstance(formatter, Formatter):
-                close = formatter.void_element_close_prefix or close
+            close = formatter.void_element_close_prefix or ''
        else:
            closeTag = '</%s%s>' % (prefix, self.name)

@ -1185,7 +1152,8 @@ class Tag(PageElement):
        else:
            indent_contents = None
        contents = self.decode_contents(
-            indent_contents, eventual_encoding, formatter)
+            indent_contents, eventual_encoding, formatter
+        )

        if self.hidden:
            # This is the 'document root' object.
@ -1217,6 +1185,16 @@ class Tag(PageElement):
            s = ''.join(s)
        return s

+    def _should_pretty_print(self, indent_level):
+        """Decide whether this tag should be pretty-printed.
+
+        :param indent_level: The requested indentation level, or None.
+        :return: False if indent_level is None, or if this tag's name
+            is listed in self.preserve_whitespace_tags. True otherwise,
+            including when preserve_whitespace_tags is empty or None.
+        """
+        if indent_level is None:
+            return False
+        preserved = self.preserve_whitespace_tags
+        if not preserved:
+            return True
+        return self.name not in preserved
+
    def prettify(self, encoding=None, formatter="minimal"):
        if encoding is None:
            return self.decode(True, formatter=formatter)
@ -1232,19 +1210,19 @@ class Tag(PageElement):
           indented this many spaces.

        :param eventual_encoding: The tag is destined to be
-           encoded into this encoding. This method is _not_
+           encoded into this encoding. decode_contents() is _not_
           responsible for performing that encoding. This information
           is passed in so that it can be substituted in if the
           document contains a <META> tag that mentions the document's
           encoding.

-        :param formatter: The output formatter responsible for converting
-           entities to Unicode characters.
+        :param formatter: A Formatter object, or a string naming one of
+            the standard Formatters.
        """
        # First off, turn a string formatter into a Formatter object. This
        # will stop the lookup from happening over and over again.
-        if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
-            formatter = self._formatter_for_name(formatter)
+        if not isinstance(formatter, Formatter):
+            formatter = self.formatter_for_name(formatter)

        pretty_print = (indent_level is not None)
        s = []
@ -1255,16 +1233,19 @@ class Tag(PageElement):
            elif isinstance(c, Tag):
                s.append(c.decode(indent_level, eventual_encoding,
                                  formatter))
-            if text and indent_level and not self.name == 'pre':
+            preserve_whitespace = (
+                self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
+            )
+            if text and indent_level and not preserve_whitespace:
                text = text.strip()
            if text:
-                if pretty_print and not self.name == 'pre':
+                if pretty_print and not preserve_whitespace:
                    s.append(" " * (indent_level - 1))
                s.append(text)
-                if pretty_print and not self.name == 'pre':
+                if pretty_print and not preserve_whitespace:
                    s.append("\n")
        return ''.join(s)
-
+       
    def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
--- a/lib/bs4/formatter.py
+++ b/lib/bs4/formatter.py
@ -0,0 +1,99 @@
+from bs4.dammit import EntitySubstitution
+
+class Formatter(EntitySubstitution):
+    """Describes a strategy to use when outputting a parse tree to a string.
+
+    Some parts of this strategy come from the distinction between
+    HTML4, HTML5, and XML. Others are configurable by the user.
+    """
+    # Registries of XML and HTML formatters.
+    XML_FORMATTERS = {}
+    HTML_FORMATTERS = {}
+
+    # Values for the `language` constructor argument.
+    HTML = 'html'
+    XML = 'xml'
+
+    # Fallback values, keyed by constructor argument name, used when
+    # that argument is None and the language is not XML.
+    HTML_DEFAULTS = dict(
+        cdata_containing_tags=set(["script", "style"]),
+    )
+
+    def _default(self, language, value, kwarg):
+        """Pick the effective value of a constructor argument.
+
+        :param language: The language the formatter is created for.
+        :param value: The value the caller supplied, or None.
+        :param kwarg: The argument's name; a key of HTML_DEFAULTS.
+        :return: `value` if it is not None. Otherwise a new empty set
+            when `language` is Formatter.XML, or the HTML_DEFAULTS
+            entry for `kwarg` for any other language.
+        """
+        if value is not None:
+            return value
+        if language == self.XML:
+            return set()
+        return self.HTML_DEFAULTS[kwarg]
+
+    def __init__(
+            self, language=None, entity_substitution=None,
+            void_element_close_prefix='/', cdata_containing_tags=None,
+    ):
+        """Constructor.
+
+        :param language: Formatter.HTML or Formatter.XML. It selects
+            the default used for arguments that are left as None.
+        :param entity_substitution: A function that is called with a
+            string needing entity substitution and returns the
+            replacement. If it is None, substitute() returns its
+            argument unchanged.
+        :param void_element_close_prefix: By default, represent void
+            elements as <tag/> rather than <tag>
+        :param cdata_containing_tags: Names of the tags whose direct
+            NavigableString children are exempt from entity
+            substitution. None selects the default for `language`.
+        """
+        self.language = language
+        self.entity_substitution = entity_substitution
+        self.void_element_close_prefix = void_element_close_prefix
+        self.cdata_containing_tags = self._default(
+            language, cdata_containing_tags, 'cdata_containing_tags'
+        )
+
+    def substitute(self, ns):
+        """Process a string that needs to undergo entity substitution.
+
+        :param ns: The string to process.
+        :return: `ns` unchanged when no entity_substitution function
+            is configured, or when `ns` is a NavigableString whose
+            parent's name is in cdata_containing_tags. Otherwise the
+            result of calling entity_substitution on `ns`.
+        """
+        if not self.entity_substitution:
+            return ns
+        # Imported here rather than at module level because
+        # bs4.element itself imports this module. The absolute form
+        # matches the bs4.dammit import at the top of this file and
+        # does not rely on Python 2's implicit relative imports.
+        from bs4.element import NavigableString
+        if (isinstance(ns, NavigableString)
+            and ns.parent is not None
+            and ns.parent.name in self.cdata_containing_tags):
+            # Do nothing.
+            return ns
+        # Substitute.
+        return self.entity_substitution(ns)
+
+    def attribute_value(self, value):
+        """Process the value of an attribute.
+
+        :param value: The attribute value, as a string.
+        :return: The result of passing `value` to substitute().
+        """
+        return self.substitute(value)
+
+    def attributes(self, tag):
+        """Reorder a tag's attributes however you want.
+
+        :param tag: An object whose `attrs` is a dictionary.
+        :return: The (name, value) pairs of tag.attrs, as a sorted
+            list.
+        """
+        return sorted(tag.attrs.items())
+
+   
+class HTMLFormatter(Formatter):
+    """A Formatter whose language is always Formatter.HTML."""
+
+    # HTMLFormatter objects, looked up by key.
+    REGISTRY = {}
+
+    def __init__(self, *args, **kwargs):
+        """Pass every argument on to Formatter.__init__, with
+        self.HTML inserted in front as the language."""
+        language = self.HTML
+        super(HTMLFormatter, self).__init__(language, *args, **kwargs)
+
+    
+class XMLFormatter(Formatter):
+    """A Formatter whose language is always Formatter.XML."""
+
+    # Formatter objects for XML output, looked up by key.
+    REGISTRY = {}
+
+    def __init__(self, *args, **kwargs):
+        """Pass every argument on to Formatter.__init__, with
+        self.XML inserted in front as the language."""
+        language = self.XML
+        super(XMLFormatter, self).__init__(language, *args, **kwargs)
+
+
+# Set up aliases for the default formatters. Each REGISTRY maps the
+# key a caller may pass as `formatter` to a ready-made Formatter.
+HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+# Same substitution as "html", but with no void element close prefix.
+HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html,
+    void_element_close_prefix=None
+)
+HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+# The None key selects a formatter that performs no substitution.
+HTMLFormatter.REGISTRY[None] = HTMLFormatter(
+    entity_substitution=None
+)
+XMLFormatter.REGISTRY["html"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_html
+)
+XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
+    entity_substitution=EntitySubstitution.substitute_xml
+)
+# Built as an XMLFormatter like the other entries of this registry.
+# Wrapping one Formatter inside another would hand a Formatter object
+# to the `language` parameter.
+XMLFormatter.REGISTRY[None] = XMLFormatter(
+    entity_substitution=None
+)