
Library update

pull/460/merge
Ruud 13 years ago
parent
commit 02855c7b9c
  1. couchpotato/core/providers/nzb/mysterbin/main.py (10 changed lines)
  2. couchpotato/core/providers/nzb/nzbclub/main.py (2 changed lines)
  3. couchpotato/core/providers/nzb/nzbindex/main.py (2 changed lines)
  4. couchpotato/core/providers/torrent/kickasstorrents/main.py (12 changed lines)
  5. couchpotato/core/providers/trailer/hdtrailers/main.py (14 changed lines)
  6. couchpotato/core/providers/userscript/allocine/main.py (2 changed lines)
  7. couchpotato/core/providers/userscript/rottentomatoes/main.py (2 changed lines)
  8. libs/BeautifulSoup.py (2015 changed lines)
  9. libs/bs4/__init__.py (355 changed lines)
  10. libs/bs4/builder/__init__.py (307 changed lines)
  11. libs/bs4/builder/_html5lib.py (222 changed lines)
  12. libs/bs4/builder/_htmlparser.py (244 changed lines)
  13. libs/bs4/builder/_lxml.py (179 changed lines)
  14. libs/bs4/dammit.py (792 changed lines)
  15. libs/bs4/element.py (1347 changed lines)
  16. libs/bs4/testing.py (515 changed lines)
  17. libs/certifi/__init__.py (1 changed line)
  18. libs/certifi/cacert.pem (3338 changed lines)
  19. libs/certifi/core.py (19 changed lines)
  20. libs/guessit/ISO-3166-1_utf8.txt (0 changed lines)
  21. libs/guessit/ISO-639-2_utf-8.txt (0 changed lines)
  22. libs/guessit/__init__.py (2 changed lines)
  23. libs/guessit/country.py (4 changed lines)
  24. libs/guessit/date.py (0 changed lines)
  25. libs/guessit/fileutils.py (7 changed lines)
  26. libs/guessit/guess.py (0 changed lines)
  27. libs/guessit/hash_ed2k.py (0 changed lines)
  28. libs/guessit/hash_mpc.py (0 changed lines)
  29. libs/guessit/language.py (81 changed lines)
  30. libs/guessit/matcher.py (0 changed lines)
  31. libs/guessit/matchtree.py (0 changed lines)
  32. libs/guessit/patterns.py (6 changed lines)
  33. libs/guessit/slogging.py (0 changed lines)
  34. libs/guessit/textutils.py (22 changed lines)
  35. libs/guessit/transfo/__init__.py (0 changed lines)
  36. libs/guessit/transfo/guess_bonus_features.py (0 changed lines)
  37. libs/guessit/transfo/guess_date.py (0 changed lines)
  38. libs/guessit/transfo/guess_episode_info_from_position.py (0 changed lines)
  39. libs/guessit/transfo/guess_episodes_rexps.py (19 changed lines)
  40. libs/guessit/transfo/guess_filetype.py (3 changed lines)
  41. libs/guessit/transfo/guess_language.py (0 changed lines)
  42. libs/guessit/transfo/guess_movie_title_from_position.py (0 changed lines)
  43. libs/guessit/transfo/guess_properties.py (0 changed lines)
  44. libs/guessit/transfo/guess_release_group.py (0 changed lines)
  45. libs/guessit/transfo/guess_video_rexps.py (0 changed lines)
  46. libs/guessit/transfo/guess_weak_episodes_rexps.py (0 changed lines)
  47. libs/guessit/transfo/guess_website.py (0 changed lines)
  48. libs/guessit/transfo/guess_year.py (0 changed lines)
  49. libs/guessit/transfo/post_process.py (0 changed lines)
  50. libs/guessit/transfo/split_explicit_groups.py (0 changed lines)
  51. libs/guessit/transfo/split_on_dash.py (0 changed lines)
  52. libs/guessit/transfo/split_path_components.py (0 changed lines)
  53. libs/html5lib/__init__.py (17 changed lines)
  54. libs/html5lib/constants.py (3085 changed lines)
  55. libs/html5lib/filters/__init__.py (0 changed lines)
  56. libs/html5lib/filters/_base.py (10 changed lines)
  57. libs/html5lib/filters/formfiller.py (127 changed lines)
  58. libs/html5lib/filters/inject_meta_charset.py (62 changed lines)
  59. libs/html5lib/filters/lint.py (88 changed lines)
  60. libs/html5lib/filters/optionaltags.py (202 changed lines)
  61. libs/html5lib/filters/sanitizer.py (8 changed lines)
  62. libs/html5lib/filters/whitespace.py (41 changed lines)
  63. libs/html5lib/html5parser.py (2733 changed lines)
  64. libs/html5lib/ihatexml.py (177 changed lines)
  65. libs/html5lib/inputstream.py (782 changed lines)
  66. libs/html5lib/sanitizer.py (258 changed lines)
  67. libs/html5lib/serializer/__init__.py (17 changed lines)
  68. libs/html5lib/serializer/htmlserializer.py (312 changed lines)
  69. libs/html5lib/serializer/xhtmlserializer.py (9 changed lines)
  70. libs/html5lib/tokenizer.py (1744 changed lines)
  71. libs/html5lib/treebuilders/__init__.py (96 changed lines)
  72. libs/html5lib/treebuilders/_base.py (377 changed lines)
  73. libs/html5lib/treebuilders/dom.py (291 changed lines)
  74. libs/html5lib/treebuilders/etree.py (344 changed lines)
  75. libs/html5lib/treebuilders/etree_lxml.py (336 changed lines)
  76. libs/html5lib/treebuilders/simpletree.py (256 changed lines)
  77. libs/html5lib/treebuilders/soup.py (236 changed lines)
  78. libs/html5lib/treewalkers/__init__.py (52 changed lines)
  79. libs/html5lib/treewalkers/_base.py (176 changed lines)
  80. libs/html5lib/treewalkers/dom.py (41 changed lines)
  81. libs/html5lib/treewalkers/etree.py (141 changed lines)
  82. libs/html5lib/treewalkers/genshistream.py (70 changed lines)
  83. libs/html5lib/treewalkers/lxmletree.py (186 changed lines)
  84. libs/html5lib/treewalkers/pulldom.py (60 changed lines)
  85. libs/html5lib/treewalkers/simpletree.py (78 changed lines)
  86. libs/html5lib/treewalkers/soup.py (60 changed lines)
  87. libs/html5lib/utils.py (175 changed lines)
  88. libs/oauthlib/__init__.py (0 changed lines)
  89. libs/oauthlib/common.py (155 changed lines)
  90. libs/oauthlib/oauth1/__init__.py (13 changed lines)
  91. libs/oauthlib/oauth1/rfc5849/__init__.py (350 changed lines)
  92. libs/oauthlib/oauth1/rfc5849/parameters.py (134 changed lines)
  93. libs/oauthlib/oauth1/rfc5849/signature.py (501 changed lines)
  94. libs/oauthlib/oauth1/rfc5849/utils.py (141 changed lines)
  95. libs/oauthlib/oauth2/__init__.py (13 changed lines)
  96. libs/oauthlib/oauth2/draft25/__init__.py (14 changed lines)
  97. libs/oauthlib/oauth2/draft25/tokens.py (131 changed lines)
  98. libs/oauthlib/oauth2/draft25/utils.py (128 changed lines)
  99. libs/subliminal/api.py (17 changed lines)
  100. libs/subliminal/async.py (14 changed lines)

couchpotato/core/providers/nzb/mysterbin/main.py (10 changed lines)

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
from couchpotato.core.event import fireEvent
from couchpotato.core.helpers.encoding import toUnicode, tryUrlencode, \
simplifyString
@@ -49,21 +49,21 @@ class Mysterbin(NZBProvider):
try:
html = BeautifulSoup(data)
resultable = html.find('table', attrs = {'class':'t'})
-for result in resultable.findAll('tr'):
+for result in resultable.find_all('tr'):
try:
myster_id = result.find('input', attrs = {'class': 'check4nzb'})['value']
# Age
age = ''
-for temp in result.find('td', attrs = {'class': 'cdetail'}).findAll(text = True):
+for temp in result.find('td', attrs = {'class': 'cdetail'}).find_all(text = True):
if 'days' in temp:
age = tryInt(temp.split(' ')[0])
break
# size
size = None
-for temp in result.find('div', attrs = {'class': 'cdetail'}).findAll(text = True):
+for temp in result.find('div', attrs = {'class': 'cdetail'}).find_all(text = True):
if 'gb' in temp.lower() or 'mb' in temp.lower() or 'kb' in temp.lower():
size = self.parseSize(temp)
break
@@ -74,7 +74,7 @@ class Mysterbin(NZBProvider):
new = {
'id': myster_id,
-'name': ''.join(result.find('span', attrs = {'class': 'cname'}).findAll(text = True)),
+'name': ''.join(result.find('span', attrs = {'class': 'cname'}).find_all(text = True)),
'type': 'nzb',
'provider': self.getName(),
'age': age,
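
Note on the hunks above: bs4 renames the camelCase search methods to snake_case (findAll becomes find_all), which is what most of the provider changes in this commit amount to. A minimal sketch of the rename, using placeholder markup rather than the provider's real search results:

from bs4 import BeautifulSoup

html = BeautifulSoup('<table class="t"><tr><td>3 days</td></tr></table>')
resultable = html.find('table', attrs = {'class': 't'})
for result in resultable.find_all('tr'):          # BS3 spelling: resultable.findAll('tr')
    print ''.join(result.find_all(text = True))   # BS3 spelling: findAll(text = True)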

couchpotato/core/providers/nzb/nzbclub/main.py (2 changed lines)

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
from couchpotato.core.event import fireEvent
from couchpotato.core.helpers.encoding import toUnicode, tryUrlencode, \
simplifyString

couchpotato/core/providers/nzb/nzbindex/main.py (2 changed lines)

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
from couchpotato.core.event import fireEvent
from couchpotato.core.helpers.encoding import toUnicode, tryUrlencode, \
simplifyString

couchpotato/core/providers/torrent/kickasstorrents/main.py (12 changed lines)

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
from couchpotato.core.event import fireEvent
from couchpotato.core.helpers.variable import tryInt, getTitle
from couchpotato.core.logger import CPLog
@@ -47,14 +47,14 @@ class KickAssTorrents(TorrentProvider):
try:
html = BeautifulSoup(data)
resultdiv = html.find('div', attrs = {'class':'tabs'})
-for result in resultdiv.findAll('div', recursive = False):
+for result in resultdiv.find_all('div', recursive = False):
if result.get('id').lower() not in cat_ids:
continue
try:
-for temp in result.findAll('tr'):
+for temp in result.find_all('tr'):
if temp['class'] is 'firstr' or not temp.get('id'):
continue
@@ -68,15 +68,15 @@ class KickAssTorrents(TorrentProvider):
}
nr = 0
-for td in temp.findAll('td'):
+for td in temp.find_all('td'):
column_name = table_order[nr]
if column_name:
if column_name is 'name':
-link = td.find('div', {'class': 'torrentname'}).findAll('a')[1]
+link = td.find('div', {'class': 'torrentname'}).find_all('a')[1]
new['id'] = temp.get('id')[-8:]
new['name'] = link.text
-new['url'] = td.findAll('a', 'idownload')[1]['href']
+new['url'] = td.find_all('a', 'idownload')[1]['href']
if new['url'][:2] == '//':
new['url'] = 'http:%s' % new['url']
new['score'] = 20 if td.find('a', 'iverif') else 0
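
One behavioural note for this file: bs4 parses multi-valued attributes such as class into lists (see cdata_list_attributes in libs/bs4/builder/__init__.py below), so the untouched context line temp['class'] is 'firstr' now compares a list against a string, by identity at that, and cannot match. A minimal sketch of the difference and of the usual membership test, with made-up row markup:

from bs4 import BeautifulSoup

row = BeautifulSoup('<table><tr class="firstr odd"><td>x</td></tr></table>').tr
print row['class']                    # ['firstr', 'odd'] in bs4; u'firstr odd' in BeautifulSoup 3
if 'firstr' in row.get('class', []):  # equality/membership instead of the 'is' identity check
    pass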

couchpotato/core/providers/trailer/hdtrailers/main.py (14 changed lines)

@@ -1,4 +1,4 @@
-from BeautifulSoup import SoupStrainer, BeautifulSoup
+from bs4 import SoupStrainer, BeautifulSoup
from couchpotato.core.helpers.encoding import tryUrlencode
from couchpotato.core.helpers.variable import mergeDicts, getTitle
from couchpotato.core.logger import CPLog
@@ -51,13 +51,13 @@ class HDTrailers(TrailerProvider):
try:
tables = SoupStrainer('div')
-html = BeautifulSoup(data, parseOnlyThese = tables)
-result_table = html.findAll('h2', text = re.compile(movie_name))
+html = BeautifulSoup(data, parse_only = tables)
+result_table = html.find_all('h2', text = re.compile(movie_name))
for h2 in result_table:
if 'trailer' in h2.lower():
parent = h2.parent.parent.parent
-trailerLinks = parent.findAll('a', text = re.compile('480p|720p|1080p'))
+trailerLinks = parent.find_all('a', text = re.compile('480p|720p|1080p'))
try:
for trailer in trailerLinks:
results[trailer].insert(0, trailer.parent['href'])
@@ -74,11 +74,11 @@ class HDTrailers(TrailerProvider):
results = {'480p':[], '720p':[], '1080p':[]}
try:
tables = SoupStrainer('table')
-html = BeautifulSoup(data, parseOnlyThese = tables)
+html = BeautifulSoup(data, parse_only = tables)
result_table = html.find('table', attrs = {'class':'bottomTable'})
-for tr in result_table.findAll('tr'):
+for tr in result_table.find_all('tr'):
trtext = str(tr).lower()
if 'clips' in trtext:
break
@@ -86,7 +86,7 @@ class HDTrailers(TrailerProvider):
nr = 0
if 'trailer' not in tr.find('span', 'standardTrailerName').text.lower():
continue
-resolutions = tr.findAll('td', attrs = {'class':'bottomTableResolution'})
+resolutions = tr.find_all('td', attrs = {'class':'bottomTableResolution'})
for res in resolutions:
results[str(res.a.contents[0])].insert(0, res.a['href'])
nr += 1
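
SoupStrainer itself survives the migration unchanged; only the constructor keyword that applies it was renamed from parseOnlyThese to parse_only, as the hunks above show (the html5lib builder ignores parse_only and warns, per libs/bs4/builder/_html5lib.py below). A minimal sketch with placeholder markup:

from bs4 import BeautifulSoup, SoupStrainer

tables = SoupStrainer('table')
html = BeautifulSoup('<p>skipped</p><table class="bottomTable"><tr></tr></table>',
                     parse_only = tables)   # BS3 spelling: parseOnlyThese = tables
print html.find('table', attrs = {'class': 'bottomTable'})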

couchpotato/core/providers/userscript/allocine/main.py (2 changed lines)

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
from couchpotato.core.providers.userscript.base import UserscriptBase
class AlloCine(UserscriptBase):

couchpotato/core/providers/userscript/rottentomatoes/main.py (2 changed lines)

@@ -1,4 +1,4 @@
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
from couchpotato.core.event import fireEvent
from couchpotato.core.providers.userscript.base import UserscriptBase

libs/BeautifulSoup.py (2015 changed lines)

File diff suppressed because it is too large

libs/bs4/__init__.py (355 changed lines)

@@ -0,0 +1,355 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.
Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.0"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import re
import warnings
from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
NavigableString,
PageElement,
ProcessingInstruction,
ResultSet,
SoupStrainer,
Tag,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
This class defines the basic interface called by the tree builders.
These methods will be called by the parser:
reset()
feed(markup)
The tree builder may call these methods from its feed() implementation:
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
endData(containerClass=NavigableString) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
'data' events, and "done with data" events.
If you encounter an empty-element tag (aka a self-closing tag,
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if 'convertEntities' in kwargs:
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
"to Unicode characters.")
if 'markupMassage' in kwargs:
del kwargs['markupMassage']
warnings.warn(
"BS4 does not respect the markupMassage argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for any necessary markup massage.")
if 'smartQuotesTo' in kwargs:
del kwargs['smartQuotesTo']
warnings.warn(
"BS4 does not respect the smartQuotesTo argument to the "
"BeautifulSoup constructor. Smart quotes are always converted "
"to Unicode characters.")
if 'selfClosingTags' in kwargs:
del kwargs['selfClosingTags']
warnings.warn(
"BS4 does not respect the selfClosingTags argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for understanding self-closing tags.")
if 'isHTML' in kwargs:
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
"BeautifulSoup constructor. You can pass in features='html' "
"or features='xml' to get a builder capable of handling "
"one or the other.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name))
value = kwargs[old_name]
del kwargs[old_name]
return value
return None
parse_only = parse_only or deprecated_argument(
"parseOnlyThese", "parse_only")
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
raise ValueError(
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
self.builder = builder
self.is_xml = builder.is_xml
self.builder.soup = self
self.parse_only = parse_only
self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
(self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) = (
self.builder.prepare_markup(markup, from_encoding))
try:
self._feed()
except StopParsing:
pass
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
self.builder.soup = None
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
self.builder.feed(self.markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
def reset(self):
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
self.currentData = []
self.currentTag = None
self.tagStack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s):
"""Create a new NavigableString associated with this soup."""
navigable = NavigableString(s)
navigable.setup()
return navigable
def insert_before(self, successor):
raise ValueError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
raise ValueError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
tag = self.tagStack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
#print "Push", tag.name
if self.currentTag:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
if self.currentData:
currentData = u''.join(self.currentData)
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
not set([tag.name for tag in self.tagStack]).intersection(
self.builder.preserve_whitespace_tags)):
if '\n' in currentData:
currentData = '\n'
else:
currentData = ' '
self.currentData = []
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
not self.parse_only.search(currentData)):
return
o = containerClass(currentData)
self.object_was_parsed(o)
def object_was_parsed(self, o):
"""Add an object to the parse tree."""
o.setup(self.currentTag, self.previous_element)
if self.previous_element:
self.previous_element.next_element = o
self.previous_element = o
self.currentTag.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instance of
the given tag."""
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
numPops = 0
mostRecentTag = None
for i in range(len(self.tagStack) - 1, 0, -1):
if (name == self.tagStack[i].name
and nsprefix == self.tagStack[i].nsprefix):
numPops = len(self.tagStack) - i
break
if not inclusivePop:
numPops = numPops - 1
for i in range(0, numPops):
mostRecentTag = self.popTag()
return mostRecentTag
def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
# print "Start tag %s: %s" % (name, attrs)
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
and (self.parse_only.text
or not self.parse_only.search_tag(name, attrs))):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self.previous_element)
if tag is None:
return tag
if self.previous_element:
self.previous_element.next_element = tag
self.previous_element = tag
self.pushTag(tag)
return tag
def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
self.currentData.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
if self.is_xml:
# Print the XML declaration
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
else:
prefix = u''
if not pretty_print:
indent_level = None
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""
def __init__(self, *args, **kwargs):
kwargs['features'] = 'xml'
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.')
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
pass
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print soup.prettify()
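
As the constructor above shows, the features argument (defaulting to DEFAULT_BUILDER_FEATURES = ['html', 'fast']) is passed to builder_registry.lookup to pick a tree builder. A minimal usage sketch of both entry points:

from bs4 import BeautifulSoup

# Default lookup: ['html', 'fast'] selects lxml when installed, falling back
# to html5lib and then to the stdlib HTMLParser builder (see builder/__init__.py).
soup = BeautifulSoup('<p>Hello <b>world</b></p>')
print soup.p.b.string

# Explicit feature request; 'xml' requires the lxml builder.
xml_soup = BeautifulSoup('<doc><item>1</item></doc>', features = 'xml')
print xml_soup.item.string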

libs/bs4/builder/__init__.py (307 changed lines)

@@ -0,0 +1,307 @@
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
whitespace_re
)
__all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class TreeBuilderRegistry(object):
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
"""Register a treebuilder based on its advertised features."""
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
if len(self.builders) == 0:
# There are no builders at all.
return None
if len(features) == 0:
# They didn't ask for any features. Give them the most
# recently registered builder.
return self.builders[0]
# Go down the list of features in order, and eliminate any builders
# that don't match every feature.
features = list(features)
features.reverse()
candidates = None
candidate_set = None
while len(features) > 0:
feature = features.pop()
we_have_the_feature = self.builders_for_feature.get(feature, [])
if len(we_have_the_feature) > 0:
if candidates is None:
candidates = we_have_the_feature
candidate_set = set(candidates)
else:
# Eliminate any candidates that don't have this feature.
candidate_set = candidate_set.intersection(
set(we_have_the_feature))
# The only valid candidates are the ones in candidate_set.
# Go through the original list of candidates and pick the first one
# that's in candidate_set.
if candidate_set is None:
return None
for candidate in candidates:
if candidate in candidate_set:
return candidate
return None
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
features = []
is_xml = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
def __init__(self):
self.soup = None
def reset(self):
pass
def can_be_empty_element(self, tag_name):
"""Might a tag with this name be an empty-element tag?
The final markup may or may not actually present this tag as
self-closing.
For instance: an HTMLBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLBuilder.empty_element_tags). This means an empty <p> tag
will be presented as "<p></p>", not "<p />".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no contents.
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
be left alone.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
return markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
Different parsers do this differently. For instance, lxml
introduces an empty <head> tag, and html5lib
doesn't. Abstracting this away lets us write simple tests
which run HTML fragments through the parser and compare the
results against other HTML fragments.
This method should not be used outside of tests.
"""
return fragment
def set_up_substitutions(self, tag):
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
"""Replaces class="foo bar" with class=["foo", "bar"]
Modifies its input in place.
"""
if self.cdata_list_attributes:
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in dict(attrs):
# Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS
# classes. Split it into a list.
value = attrs[cdata_list_attr]
values = whitespace_re.split(value)
attrs[cdata_list_attr] = values
return attrs
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
def feed(self, markup):
raise NotImplementedError()
def close(self):
pass
def startElement(self, name, attrs):
attrs = dict((key[1], value) for key, value in list(attrs.items()))
#print "Start %s, %r" % (name, attrs)
self.soup.handle_starttag(name, attrs)
def endElement(self, name):
#print "End %s" % name
self.soup.handle_endtag(name)
def startElementNS(self, nsTuple, nodeName, attrs):
# Throw away (ns, nodeName) for now.
self.startElement(nodeName, attrs)
def endElementNS(self, nsTuple, nodeName):
# Throw away (ns, nodeName) for now.
self.endElement(nodeName)
#handler.endElementNS((ns, node.nodeName), node.nodeName)
def startPrefixMapping(self, prefix, nodeValue):
# Ignore the prefix for now.
pass
def endPrefixMapping(self, prefix):
# Ignore the prefix for now.
# handler.endPrefixMapping(prefix)
pass
def characters(self, content):
self.soup.handle_data(content)
def startDocument(self):
pass
def endDocument(self):
pass
class HTMLTreeBuilder(TreeBuilder):
"""This TreeBuilder knows facts about HTML.
Such as which tags are empty-element tags.
"""
preserve_whitespace_tags = set(['pre', 'textarea'])
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
# 'foo' and 'bar', not the single value 'foo bar'. When we
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
cdata_list_attributes = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
"td" : ["headers"],
"th" : ["headers"],
"td" : ["headers"],
"form" : ["accept-charset"],
"object" : ["archive"],
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
"area" : ["rel"],
"icon" : ["sizes"],
"iframe" : ["sandbox"],
"output" : ["for"],
}
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
return False
http_equiv = tag.get('http-equiv')
content = tag.get('content')
charset = tag.get('charset')
# We are interested in <meta> tags that say what encoding the
# document was originally in. This means HTML 5-style <meta>
# tags that provide the "charset" attribute. It also means
# HTML 4-style <meta> tags that provide the "content"
# attribute and have "http-equiv" set to "content-type".
#
# In both cases we will replace the value of the appropriate
# attribute with a standin object that can take on any
# encoding.
meta_encoding = None
if charset is not None:
# HTML 5 style:
# <meta charset="utf8">
meta_encoding = charset
tag['charset'] = CharsetMetaAttributeValue(charset)
elif (content is not None and http_equiv is not None
and http_equiv.lower() == 'content-type'):
# HTML 4 style:
# <meta http-equiv="content-type" content="text/html; charset=utf8">
tag['content'] = ContentMetaAttributeValue(content)
return (meta_encoding is not None)
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules['bs4.builder']
for name in module.__all__:
obj = getattr(module, name)
if issubclass(obj, TreeBuilder):
setattr(this_module, name, obj)
this_module.__all__.append(name)
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
try:
from . import _lxml
register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
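
The registry above resolves a feature list to the most recently registered builder that advertises every requested feature (a feature no builder advertises is simply skipped). A small sketch of querying it directly:

from bs4.builder import builder_registry

# lookup() returns a class, not an instance, and None when nothing matches.
builder_class = builder_registry.lookup('html', 'fast')
if builder_class is not None:
    print builder_class.features   # e.g. ['lxml', 'html', 'fast', 'permissive']
    builder = builder_class()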

libs/bs4/builder/_html5lib.py (222 changed lines)

@@ -0,0 +1,222 @@
__all__ = [
'HTML5TreeBuilder',
]
import warnings
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
return markup, None, None, False
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
self.soup, namespaceHTMLElements)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
self.soup.reset()
return Element(self.soup, self.soup, None)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
from bs4 import BeautifulSoup  # only bs4.element names are imported at module level, so import here
self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
def appendChild(self, node):
# XXX This code is not covered by the BS4 tests.
self.soup.append(node.element)
def getDocument(self):
return self.soup
def getFragment(self):
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
class AttrList(object):
def __init__(self, element):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
"set attr", name, value
self.element[name] = value
def items(self):
return list(self.attrs.items())
def keys(self):
return list(self.attrs.keys())
def __len__(self):
return len(self.attrs)
def __getitem__(self, name):
return self.attrs[name]
def __contains__(self, name):
return name in list(self.attrs.keys())
class Element(html5lib.treebuilders._base.Node):
def __init__(self, element, soup, namespace):
html5lib.treebuilders._base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# Concatenate new text onto old text node
# XXX This has O(n^2) performance, for input like
# "a</a>a</a>a</a>..."
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + node.element)
old_element.replace_with(new_element)
else:
self.element.append(node.element)
node.parent = self
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes is not None and len(attributes) > 0:
converted_attributes = []
for name, value in list(attributes.items()):
if isinstance(name, tuple):
new_name = NamespacedAttribute(*name)
del attributes[name]
attributes[new_name] = value
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
for name, value in attributes.items():
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
#
# The Tag constructor called this method when the Tag was created,
# but we just set/changed the attributes, so call it again.
self.soup.builder.set_up_substitutions(self.element)
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
old_node = self.element.contents[index-1]
new_str = self.soup.new_string(old_node + node.element)
old_node.replace_with(new_str)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
node.element.extract()
def reparentChildren(self, newParent):
while self.element.contents:
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(
Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(
TextNode(child, self.soup))
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
def hasContent(self):
return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
html5lib.treebuilders._base.Node.__init__(self, None)
self.element = element
self.soup = soup
def cloneNode(self):
raise NotImplementedError
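
This builder registers under the features ['html5lib', 'permissive', 'html5', 'html'], so requesting it by name is enough. A minimal sketch, assuming html5lib is installed:

from bs4 import BeautifulSoup

# html5lib repairs malformed markup the way browsers do; note the warning in
# feed() above: parse_only is ignored by this builder.
soup = BeautifulSoup('<p>unclosed <b>markup', 'html5lib')
print soup.p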

libs/bs4/builder/_htmlparser.py (244 changed lines)

@@ -0,0 +1,244 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
__all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import (
HTMLParser,
HTMLParseError,
)
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
major > 3
or (major == 3 and minor > 2)
or (major == 3 and minor == 2 and release >= 3))
from bs4.element import (
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
HTML,
HTMLTreeBuilder,
STRICT,
)
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
# XXX namespace
self.soup.handle_starttag(name, None, None, dict(attrs))
def handle_endtag(self, name):
self.soup.handle_endtag(name)
def handle_data(self, data):
self.soup.handle_data(data)
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
else:
real_name = int(name)
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
data = "&%s;" % name
self.handle_data(data)
def handle_comment(self, data):
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
else:
cls = Declaration
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(cls)
def handle_pi(self, data):
self.soup.endData()
if data.endswith("?") and data.lower().startswith("xml"):
# "An XHTML processing instruction using the trailing '?'
# will cause the '?' to be included in data." - HTMLParser
# docs.
#
# Strip the question mark so we don't end up with two
# question marks.
data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
features = [HTML, STRICT, HTMLPARSER]
def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT:
kwargs['strict'] = False
self.parser_args = (args, kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True
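
The stdlib builder registers under HTMLPARSER = 'html.parser' and, where the constructor accepts it, passes strict=False. A minimal sketch of selecting it explicitly:

from bs4 import BeautifulSoup

# No external dependency, but the least forgiving of the bundled builders;
# sufficiently bad markup raises HTMLParseError (see feed() above).
soup = BeautifulSoup('<p>simple, well-formed markup</p>', 'html.parser')
print soup.p.string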

libs/bs4/builder/_lxml.py (179 changed lines)

@@ -0,0 +1,179 @@
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
FAST,
HTML,
HTMLTreeBuilder,
PERMISSIVE,
TreeBuilder,
XML)
from bs4.dammit import UnicodeDammit
LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
@property
def default_parser(self):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
return etree.XMLParser(target=self, strip_cdata=False, recover=True)
def __init__(self, parser=None, empty_element_tags=None):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
if parser is None:
# Use the default parser.
parser = self.default_parser
if isinstance(parser, collections.Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
self.nsmaps = None
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
if tag[0] == '{':
return tuple(tag[1:].split('}', 1))
else:
return (None, tag)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
if isinstance(markup, basestring):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
self.parser.feed(data)
while data != '':
# Now call feed() on the rest of the data, chunk by chunk.
data = markup.read(self.CHUNK_SIZE)
if data != '':
self.parser.feed(data)
self.parser.close()
def close(self):
self.nsmaps = None
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(nsmap) == 0 and self.nsmaps != None:
# There are no new namespaces for this tag, but namespaces
# are in play, so we need a separate tag stack to know
# when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
if self.nsmaps is None:
self.nsmaps = []
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
for prefix, namespace in nsmap.items():
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
namespace, name = self._getNsTag(name)
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
namespace, name = self._getNsTag(name)
nsprefix = None
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if self.nsmaps != None:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
if len(self.nsmaps) == 0:
# Namespaces are no longer in play, so don't bother keeping
# track of the namespace stack.
self.nsmaps = None
def pi(self, target, data):
pass
def data(self, content):
self.soup.handle_data(content)
def doctype(self, name, pubid, system):
self.soup.endData()
doctype = Doctype.for_name_and_ids(name, pubid, system)
self.soup.object_was_parsed(doctype)
def comment(self, content):
"Handle comments as Comment objects."
self.soup.endData()
self.soup.handle_data(content)
self.soup.endData(Comment)
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE]
is_xml = False
@property
def default_parser(self):
return etree.HTMLParser
def feed(self, markup):
self.parser.feed(markup)
self.parser.close()
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment
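
Two builders live in this file: LXMLTreeBuilderForXML under the 'lxml'/'xml' features and LXMLTreeBuilder for HTML. A minimal sketch of each, assuming lxml is installed:

from bs4 import BeautifulSoup

xml_soup = BeautifulSoup('<root><item>1</item></root>', 'xml')
print xml_soup.item.string   # XML builder: no <html>/<body> wrapper is added

html_soup = BeautifulSoup('<p>text', 'lxml')
print html_soup.p            # lxml's HTML parser repairs and wraps the fragment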

libs/bs4/dammit.py (792 changed lines)

@@ -0,0 +1,792 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
This class forces XML data into a standard format (usually to UTF-8 or
Unicode). It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""
import codecs
from htmlentitydefs import codepoint2name
import re
import warnings
# Autodetects character encodings. Very useful.
# Download from http://chardet.feedparser.org/
# or 'apt-get install python-chardet'
# or 'easy_install chardet'
try:
import chardet
#import chardet.constants
#chardet.constants._debug = 1
except ImportError:
chardet = None
# Available from http://cjkpython.i18n.org/.
try:
import iconv_codec
except ImportError:
pass
xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
"""Substitute XML or HTML entities for the corresponding characters."""
def _populate_class_variables():
lookup = {}
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint)
if codepoint != 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
characters_for_re.append(character)
lookup[character] = name
# But we do want to turn &quot; into the quotation mark.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
CHARACTER_TO_XML_ENTITY = {
"'": "apos",
'"': "quot",
"&": "amp",
"<": "lt",
">": "gt",
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
return "&%s;" % entity
@classmethod
def _substitute_xml_entity(cls, matchobj):
"""Used with a regular expression to substitute the
appropriate XML entity for an XML special character."""
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
@classmethod
def quoted_attribute_value(self, value):
"""Make a value into a quoted XML attribute, possibly escaping it.
Most strings will be quoted using double quotes.
Bob's Bar -> "Bob's Bar"
If a string contains double quotes, it will be quoted using
single quotes.
Welcome to "my bar" -> 'Welcome to "my bar"'
If a string contains both single and double quotes, the
double quotes will be escaped, and the string will be quoted
using double quotes.
Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
"""
quote_with = '"'
if '"' in value:
if "'" in value:
# The string contains both single and double
# quotes. Turn the double quotes into
# entities. We quote the double quotes rather than
# the single quotes because the entity name is
# "&quot;" whether this is HTML or XML. If we
# quoted the single quotes, we'd have to decide
# between &apos; and &squot;.
replace_with = "&quot;"
value = value.replace('"', replace_with)
else:
# There are double quotes but no single quotes.
# We can use single quotes to quote the attribute.
quote_with = "'"
return quote_with + value + quote_with
@classmethod
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity definition will
become &amp;.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets, and ampersands that aren't part of
# entities.
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
@classmethod
def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.
This differs from data.encode(encoding, 'xmlcharrefreplace')
in that the goal is to make the result more readable (to those
with ASCII displays) rather than to recover from
errors. There's absolutely nothing wrong with a UTF-8 string
containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
character with "&eacute;" will make it more readable to some
people.
"""
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
cls._substitute_html_entity, s)
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
windows-1252, can replace MS smart quotes with their HTML or XML
equivalents."""
# This dictionary maps commonly seen values for "charset" in HTML
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
CHARSET_ALIASES = {"macintosh": "mac-roman",
"x-sjis": "shift-jis"}
ENCODINGS_WITH_SMART_QUOTES = [
"windows-1252",
"iso-8859-1",
"iso-8859-2",
]
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False):
self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
if markup == '' or isinstance(markup, unicode):
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
new_markup, document_encoding, sniffed_encoding = \
self._detectEncoding(markup, is_html)
self.markup = new_markup
u = None
if new_markup != markup:
# _detectEncoding modified the markup, then converted it to
# Unicode and then to UTF-8. So convert it from UTF-8.
u = self._convert_from("utf8")
self.original_encoding = sniffed_encoding
if not u:
for proposed_encoding in (
override_encodings + [document_encoding, sniffed_encoding]):
if proposed_encoding is not None:
u = self._convert_from(proposed_encoding)
if u:
break
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
u = self._convert_from(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252:
if not u:
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convert_from(proposed_encoding)
if u:
break
# As an absolute last resort, try the encodings again with
# character replacement.
if not u:
for proposed_encoding in (
override_encodings + [
document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
if proposed_encoding != "ascii":
u = self._convert_from(proposed_encoding, "replace")
if u is not None:
warnings.warn(
UnicodeWarning(
"Some characters could not be decoded, and were "
"replaced with REPLACEMENT CHARACTER."))
self.contains_replacement_characters = True
break
# We could at this point force it to ASCII, but that would
# destroy so much data that I think giving up is better
self.unicode_markup = u
if not u:
self.original_encoding = None
def _sub_ms_char(self, match):
"""Changes a MS smart quote character to an XML or HTML
entity, or an ASCII character."""
orig = match.group(1)
if self.smart_quotes_to == 'ascii':
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
else:
sub = self.MS_CHARS.get(orig)
if type(sub) == tuple:
if self.smart_quotes_to == 'xml':
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
sub = '&'.encode() + sub[0].encode() + ';'.encode()
else:
sub = sub.encode()
return sub
def _convert_from(self, proposed, errors="strict"):
proposed = self.find_codec(proposed)
if not proposed or (proposed, errors) in self.tried_encodings:
return None
self.tried_encodings.append((proposed, errors))
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if (self.smart_quotes_to is not None
and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
smart_quotes_re = b"([\x80-\x9f])"
smart_quotes_compiled = re.compile(smart_quotes_re)
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
try:
#print "Trying to convert document to %s (errors=%s)" % (
# proposed, errors)
u = self._to_unicode(markup, proposed, errors)
self.markup = u
self.original_encoding = proposed
except Exception as e:
#print "That didn't work!"
#print e
return None
#print "Correct encoding: %s" % proposed
return self.markup
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
encoding = 'utf-8'
data = data[3:]
elif data[:4] == '\x00\x00\xfe\xff':
encoding = 'utf-32be'
data = data[4:]
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
newdata = unicode(data, encoding, errors)
return newdata
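# Example of the BOM handling above (a sketch): under Python 2,
# _to_unicode(b'\xef\xbb\xbfhi', 'ascii') detects the UTF-8 BOM,
# overrides the proposed encoding with 'utf-8', strips the three BOM
# bytes, and returns u'hi'.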
def _detectEncoding(self, xml_data, is_html=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
if xml_data[:4] == b'\x4c\x6f\xa7\x94':
# EBCDIC
xml_data = self._ebcdic_to_ascii(xml_data)
elif xml_data[:4] == b'\x00\x3c\x00\x3f':
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
and (xml_data[2:4] != b'\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
(xml_data[2:4] != b'\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == b'\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
else:
sniffed_xml_encoding = 'ascii'
except Exception:
xml_encoding_match = None
xml_encoding_match = xml_encoding_re.match(xml_data)
if not xml_encoding_match and is_html:
xml_encoding_match = html_meta_re.search(xml_data)
if xml_encoding_match is not None:
xml_encoding = xml_encoding_match.groups()[0].decode(
'ascii').lower()
if is_html:
self.declared_html_encoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
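# Sniffing example (a sketch): a document that starts with the bytes
# b'\xff\xfe<\x00h\x00t\x00m\x00l\x00>\x00' hits the "UTF-16LE with BOM"
# branch above, so _detectEncoding returns the markup transcoded to
# UTF-8 with sniffed_xml_encoding == 'utf-16le'.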
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
or (charset and self._codec(charset.replace("-", "_"))) \
or charset
def _codec(self, charset):
if not charset:
return charset
codec = None
try:
codecs.lookup(charset)
codec = charset
except (LookupError, ValueError):
pass
return codec
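# Lookup example (a sketch): find_codec('utf-8') succeeds directly via
# codecs.lookup(); a spelling the codecs module rejects is retried with
# '-' removed and with '-' replaced by '_' before the raw charset is
# returned unchanged as a last resort.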
EBCDIC_TO_ASCII_MAP = None
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
250,251,252,253,254,255)
import string
c.EBCDIC_TO_ASCII_MAP = string.maketrans(
''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
# A partial mapping of Windows-1252 "smart" characters (bytes \x80-\x9f,
# which are undefined in ISO-Latin-1) to HTML entities/XML numeric entities.
MS_CHARS = {b'\x80': ('euro', '20AC'),
b'\x81': ' ',
b'\x82': ('sbquo', '201A'),
b'\x83': ('fnof', '192'),
b'\x84': ('bdquo', '201E'),
b'\x85': ('hellip', '2026'),
b'\x86': ('dagger', '2020'),
b'\x87': ('Dagger', '2021'),
b'\x88': ('circ', '2C6'),
b'\x89': ('permil', '2030'),
b'\x8A': ('Scaron', '160'),
b'\x8B': ('lsaquo', '2039'),
b'\x8C': ('OElig', '152'),
b'\x8D': '?',
b'\x8E': ('#x17D', '17D'),
b'\x8F': '?',
b'\x90': '?',
b'\x91': ('lsquo', '2018'),
b'\x92': ('rsquo', '2019'),
b'\x93': ('ldquo', '201C'),
b'\x94': ('rdquo', '201D'),
b'\x95': ('bull', '2022'),
b'\x96': ('ndash', '2013'),
b'\x97': ('mdash', '2014'),
b'\x98': ('tilde', '2DC'),
b'\x99': ('trade', '2122'),
b'\x9a': ('scaron', '161'),
b'\x9b': ('rsaquo', '203A'),
b'\x9c': ('oelig', '153'),
b'\x9d': '?',
b'\x9e': ('#x17E', '17E'),
b'\x9f': ('Yuml', '178'),}
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
MS_CHARS_TO_ASCII = {
b'\x80' : 'EUR',
b'\x81' : ' ',
b'\x82' : ',',
b'\x83' : 'f',
b'\x84' : ',,',
b'\x85' : '...',
b'\x86' : '+',
b'\x87' : '++',
b'\x88' : '^',
b'\x89' : '%',
b'\x8a' : 'S',
b'\x8b' : '<',
b'\x8c' : 'OE',
b'\x8d' : '?',
b'\x8e' : 'Z',
b'\x8f' : '?',
b'\x90' : '?',
b'\x91' : "'",
b'\x92' : "'",
b'\x93' : '"',
b'\x94' : '"',
b'\x95' : '*',
b'\x96' : '-',
b'\x97' : '--',
b'\x98' : '~',
b'\x99' : '(TM)',
b'\x9a' : 's',
b'\x9b' : '>',
b'\x9c' : 'oe',
b'\x9d' : '?',
b'\x9e' : 'z',
b'\x9f' : 'Y',
b'\xa0' : ' ',
b'\xa1' : '!',
b'\xa2' : 'c',
b'\xa3' : 'GBP',
b'\xa4' : '$', #This approximation is especially parochial--this is the
#generic currency symbol.
b'\xa5' : 'YEN',
b'\xa6' : '|',
b'\xa7' : 'S',
b'\xa8' : '..',
b'\xa9' : '',
b'\xaa' : '(th)',
b'\xab' : '<<',
b'\xac' : '!',
b'\xad' : ' ',
b'\xae' : '(R)',
b'\xaf' : '-',
b'\xb0' : 'o',
b'\xb1' : '+-',
b'\xb2' : '2',
b'\xb3' : '3',
b'\xb4' : "'",
b'\xb5' : 'u',
b'\xb6' : 'P',
b'\xb7' : '*',
b'\xb8' : ',',
b'\xb9' : '1',
b'\xba' : '(th)',
b'\xbb' : '>>',
b'\xbc' : '1/4',
b'\xbd' : '1/2',
b'\xbe' : '3/4',
b'\xbf' : '?',
b'\xc0' : 'A',
b'\xc1' : 'A',
b'\xc2' : 'A',
b'\xc3' : 'A',
b'\xc4' : 'A',
b'\xc5' : 'A',
b'\xc6' : 'AE',
b'\xc7' : 'C',
b'\xc8' : 'E',
b'\xc9' : 'E',
b'\xca' : 'E',
b'\xcb' : 'E',
b'\xcc' : 'I',
b'\xcd' : 'I',
b'\xce' : 'I',
b'\xcf' : 'I',
b'\xd0' : 'D',
b'\xd1' : 'N',
b'\xd2' : 'O',
b'\xd3' : 'O',
b'\xd4' : 'O',
b'\xd5' : 'O',
b'\xd6' : 'O',
b'\xd7' : '*',
b'\xd8' : 'O',
b'\xd9' : 'U',
b'\xda' : 'U',
b'\xdb' : 'U',
b'\xdc' : 'U',
b'\xdd' : 'Y',
b'\xde' : 'b',
b'\xdf' : 'B',
b'\xe0' : 'a',
b'\xe1' : 'a',
b'\xe2' : 'a',
b'\xe3' : 'a',
b'\xe4' : 'a',
b'\xe5' : 'a',
b'\xe6' : 'ae',
b'\xe7' : 'c',
b'\xe8' : 'e',
b'\xe9' : 'e',
b'\xea' : 'e',
b'\xeb' : 'e',
b'\xec' : 'i',
b'\xed' : 'i',
b'\xee' : 'i',
b'\xef' : 'i',
b'\xf0' : 'o',
b'\xf1' : 'n',
b'\xf2' : 'o',
b'\xf3' : 'o',
b'\xf4' : 'o',
b'\xf5' : 'o',
b'\xf6' : 'o',
b'\xf7' : '/',
b'\xf8' : 'o',
b'\xf9' : 'u',
b'\xfa' : 'u',
b'\xfb' : 'u',
b'\xfc' : 'u',
b'\xfd' : 'y',
b'\xfe' : 'b',
b'\xff' : 'y',
}
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
0x80 : b'\xe2\x82\xac', # €
0x82 : b'\xe2\x80\x9a', # ‚
0x83 : b'\xc6\x92', # ƒ
0x84 : b'\xe2\x80\x9e', # „
0x85 : b'\xe2\x80\xa6', # …
0x86 : b'\xe2\x80\xa0', # †
0x87 : b'\xe2\x80\xa1', # ‡
0x88 : b'\xcb\x86', # ˆ
0x89 : b'\xe2\x80\xb0', # ‰
0x8a : b'\xc5\xa0', # Š
0x8b : b'\xe2\x80\xb9', # ‹
0x8c : b'\xc5\x92', # Œ
0x8e : b'\xc5\xbd', # Ž
0x91 : b'\xe2\x80\x98', # ‘
0x92 : b'\xe2\x80\x99', # ’
0x93 : b'\xe2\x80\x9c', # “
0x94 : b'\xe2\x80\x9d', # ”
0x95 : b'\xe2\x80\xa2', # •
0x96 : b'\xe2\x80\x93', # –
0x97 : b'\xe2\x80\x94', # —
0x98 : b'\xcb\x9c', # ˜
0x99 : b'\xe2\x84\xa2', # ™
0x9a : b'\xc5\xa1', # š
0x9b : b'\xe2\x80\xba', # ›
0x9c : b'\xc5\x93', # œ
0x9e : b'\xc5\xbe', # ž
0x9f : b'\xc5\xb8', # Ÿ
0xa0 : b'\xc2\xa0', #  
0xa1 : b'\xc2\xa1', # ¡
0xa2 : b'\xc2\xa2', # ¢
0xa3 : b'\xc2\xa3', # £
0xa4 : b'\xc2\xa4', # ¤
0xa5 : b'\xc2\xa5', # ¥
0xa6 : b'\xc2\xa6', # ¦
0xa7 : b'\xc2\xa7', # §
0xa8 : b'\xc2\xa8', # ¨
0xa9 : b'\xc2\xa9', # ©
0xaa : b'\xc2\xaa', # ª
0xab : b'\xc2\xab', # «
0xac : b'\xc2\xac', # ¬
0xad : b'\xc2\xad', # ­
0xae : b'\xc2\xae', # ®
0xaf : b'\xc2\xaf', # ¯
0xb0 : b'\xc2\xb0', # °
0xb1 : b'\xc2\xb1', # ±
0xb2 : b'\xc2\xb2', # ²
0xb3 : b'\xc2\xb3', # ³
0xb4 : b'\xc2\xb4', # ´
0xb5 : b'\xc2\xb5', # µ
0xb6 : b'\xc2\xb6', # ¶
0xb7 : b'\xc2\xb7', # ·
0xb8 : b'\xc2\xb8', # ¸
0xb9 : b'\xc2\xb9', # ¹
0xba : b'\xc2\xba', # º
0xbb : b'\xc2\xbb', # »
0xbc : b'\xc2\xbc', # ¼
0xbd : b'\xc2\xbd', # ½
0xbe : b'\xc2\xbe', # ¾
0xbf : b'\xc2\xbf', # ¿
0xc0 : b'\xc3\x80', # À
0xc1 : b'\xc3\x81', # Á
0xc2 : b'\xc3\x82', # Â
0xc3 : b'\xc3\x83', # Ã
0xc4 : b'\xc3\x84', # Ä
0xc5 : b'\xc3\x85', # Å
0xc6 : b'\xc3\x86', # Æ
0xc7 : b'\xc3\x87', # Ç
0xc8 : b'\xc3\x88', # È
0xc9 : b'\xc3\x89', # É
0xca : b'\xc3\x8a', # Ê
0xcb : b'\xc3\x8b', # Ë
0xcc : b'\xc3\x8c', # Ì
0xcd : b'\xc3\x8d', # Í
0xce : b'\xc3\x8e', # Î
0xcf : b'\xc3\x8f', # Ï
0xd0 : b'\xc3\x90', # Ð
0xd1 : b'\xc3\x91', # Ñ
0xd2 : b'\xc3\x92', # Ò
0xd3 : b'\xc3\x93', # Ó
0xd4 : b'\xc3\x94', # Ô
0xd5 : b'\xc3\x95', # Õ
0xd6 : b'\xc3\x96', # Ö
0xd7 : b'\xc3\x97', # ×
0xd8 : b'\xc3\x98', # Ø
0xd9 : b'\xc3\x99', # Ù
0xda : b'\xc3\x9a', # Ú
0xdb : b'\xc3\x9b', # Û
0xdc : b'\xc3\x9c', # Ü
0xdd : b'\xc3\x9d', # Ý
0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à
0xe1 : b'\xc3\xa1', # á
0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä
0xe5 : b'\xc3\xa5', # å
0xe6 : b'\xc3\xa6', # æ
0xe7 : b'\xc3\xa7', # ç
0xe8 : b'\xc3\xa8', # è
0xe9 : b'\xc3\xa9', # é
0xea : b'\xc3\xaa', # ê
0xeb : b'\xc3\xab', # ë
0xec : b'\xc3\xac', # ì
0xed : b'\xc3\xad', # í
0xee : b'\xc3\xae', # î
0xef : b'\xc3\xaf', # ï
0xf0 : b'\xc3\xb0', # ð
0xf1 : b'\xc3\xb1', # ñ
0xf2 : b'\xc3\xb2', # ò
0xf3 : b'\xc3\xb3', # ó
0xf4 : b'\xc3\xb4', # ô
0xf5 : b'\xc3\xb5', # õ
0xf6 : b'\xc3\xb6', # ö
0xf7 : b'\xc3\xb7', # ÷
0xf8 : b'\xc3\xb8', # ø
0xf9 : b'\xc3\xb9', # ù
0xfa : b'\xc3\xba', # ú
0xfb : b'\xc3\xbb', # û
0xfc : b'\xc3\xbc', # ü
0xfd : b'\xc3\xbd', # ý
0xfe : b'\xc3\xbe', # þ
}
MULTIBYTE_MARKERS_AND_SIZES = [
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
]
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
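# Example: the lead byte 0xe2 falls in the E0-EF range above, so a
# scanner can skip 3 bytes -- the length of a UTF-8 sequence such as
# b'\xe2\x80\x99' (RIGHT SINGLE QUOTATION MARK).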
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
embedded_encoding="windows-1252"):
"""Fix characters from one encoding embedded in some other encoding.
Currently the only situation supported is Windows-1252 (or its
subset ISO-8859-1), embedded in UTF-8.
The input must be a bytestring. If you've already converted
the document to Unicode, you're too late.
The output is a bytestring in which `embedded_encoding`
characters have been converted to their `main_encoding`
equivalents.
"""
if embedded_encoding.replace('_', '-').lower() not in (
'windows-1252', 'windows_1252'):
raise NotImplementedError(
"Windows-1252 and ISO-8859-1 are the only currently supported "
"embedded encodings.")
if main_encoding.lower() not in ('utf8', 'utf-8'):
raise NotImplementedError(
"UTF-8 is the only currently supported main encoding.")
byte_chunks = []
chunk_start = 0
pos = 0
while pos < len(in_bytes):
byte = in_bytes[pos]
if not isinstance(byte, int):
# Python 2.x
byte = ord(byte)
if (byte >= cls.FIRST_MULTIBYTE_MARKER
and byte <= cls.LAST_MULTIBYTE_MARKER):
# This is the start of a UTF-8 multibyte character. Skip
# to the end.
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
if byte >= start and byte <= end:
pos += size
break
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
# We found a Windows-1252 character!
# Save the string up to this point as a chunk.
byte_chunks.append(in_bytes[chunk_start:pos])
# Now translate the Windows-1252 character into UTF-8
# and add it as another, one-byte chunk.
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
pos += 1
chunk_start = pos
else:
# Go on to the next character.
pos += 1
if chunk_start == 0:
# The string is unchanged.
return in_bytes
else:
# Store the final chunk.
byte_chunks.append(in_bytes[chunk_start:])
return b''.join(byte_chunks)
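# Usage sketch for detwingle() (assuming the enclosing class is
# UnicodeDammit, as in upstream Beautiful Soup 4):
#
#   snowman = u'\N{SNOWMAN}'.encode('utf8')              # valid UTF-8
#   quote = b'\x92'                                      # Windows-1252 quote
#   fixed = UnicodeDammit.detwingle(snowman + quote)
#   assert fixed == snowman + b'\xe2\x80\x99'            # all UTF-8 now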

1347
libs/bs4/element.py

File diff suppressed because it is too large

515
libs/bs4/testing.py

@ -0,0 +1,515 @@
"""Helper classes for tests."""
import copy
import functools
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
return default_builder()
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
return self.default_builder.test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
obj = BeautifulSoup(to_parse, builder=builder)
if compare_parsed_to is None:
compare_parsed_to = to_parse
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
Any HTML treebuilder, present or future, should be able to pass
these tests. With invalid markup, there's room for interpretation,
and different parsers can handle it differently. But with the
markup in these tests, there's not much room for interpretation.
"""
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
# Make sure a Doctype object was created.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment):
"""Generate and parse a document with the given doctype."""
doctype = '<!DOCTYPE %s>' % doctype_fragment
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype, soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
self.assertDoctypeHandled("html")
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
def test_system_doctype(self):
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
def test_real_xhtml_document(self):
"""A real XHTML document should come out more or less the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
This is important because the builder is part of a
BeautifulSoup object, and we want to be able to copy that.
"""
copy.deepcopy(self.default_builder)
def test_p_tag_is_never_empty_element(self):
"""A <p> tag is never designated as an empty-element tag.
Even if the markup shows it as an empty-element tag, it
shouldn't be presented that way.
"""
soup = self.soup("<p/>")
self.assertFalse(soup.p.is_empty_element)
self.assertEqual(str(soup.p), "<p></p>")
def test_unclosed_tags_get_closed(self):
"""A tag that's not closed by the end of the document should be closed.
This applies to all tags except empty-element tags.
"""
self.assertSoupEquals("<p>", "<p></p>")
self.assertSoupEquals("<b>", "<b></b>")
self.assertSoupEquals("<br>", "<br/>")
def test_br_is_always_empty_element_tag(self):
"""A <br> tag is designated as an empty-element tag.
Some parsers treat <br></br> as one <br/> tag, some parsers as
two tags, but it should always be an empty-element tag.
"""
soup = self.soup("<br></br>")
self.assertTrue(soup.br.is_empty_element)
self.assertEqual(str(soup.br), "<br/>")
def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>")
def test_comment(self):
# Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>"
self.assertSoupEquals(markup)
soup = self.soup(markup)
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>")
self.assertSoupEquals("<textarea> woo </textarea>")
def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely."""
b_tag = "<b>Inside a B tag</b>"
self.assertSoupEquals(b_tag)
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
self.assertSoupEquals(nested_b_tag)
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
self.assertSoupEquals(double_nested_b_tag)
def test_nested_block_level_elements(self):
"""Block elements can be nested."""
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
blockquote = soup.blockquote
self.assertEqual(blockquote.p.b.string, 'Foo')
self.assertEqual(blockquote.b.string, 'Foo')
def test_correctly_nested_tables(self):
"""One table can go inside another one."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tr><td>Here\'s another table:'
'<table id="2"><tr><td>foo</td></tr></table>'
'</td></tr></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self):
self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose
data."""
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
soup = self.soup(markup)
self.assertEqual(markup, soup.encode())
html = soup.html
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
self.assertEqual(
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
self.assertEqual(
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
def test_multivalued_attribute_value_becomes_list(self):
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
# weird, so we run these tests separately for every tree builder
# to detect any differences between them.
#
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
strainer = SoupStrainer("b")
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
parse_only=strainer)
self.assertEqual(soup.decode(), "<b>bold</b>")
def test_single_quote_attribute_values_become_double_quotes(self):
self.assertSoupEquals("<foo attr='bar'></foo>",
'<foo attr="bar"></foo>')
def test_attribute_values_with_nested_quotes_are_left_alone(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
self.assertSoupEquals(text)
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
soup = self.soup(text)
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
self.assertSoupEquals(
soup.foo.decode(),
"""<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
def test_ampersand_in_attribute_value_gets_escaped(self):
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
'<this is="really messed up &amp; stuff"></this>')
self.assertSoupEquals(
'<a href="http://example.org?a=1&b=2;3">foo</a>',
'<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
def test_entities_in_strings_converted_during_parsing(self):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
# Microsoft smart quotes are converted to Unicode characters during
# parsing.
quote = b"<p>\x91Foo\x92</p>"
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
def test_real_iso_latin_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML.
soup = self.soup(iso_latin_html)
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
# Ta-da!
self.assertEqual(result, expected)
def test_real_shift_jis_document(self):
# Smoke test to make sure the parser can handle a document in
# Shift-JIS encoding, without choking.
shift_jis_html = (
b'<html><head></head><body><pre>'
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
b'</pre></body></html>')
unicode_html = shift_jis_html.decode("shift-jis")
soup = self.soup(unicode_html)
# Make sure the parse tree is correctly encoded to various
# encodings.
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
def test_real_hebrew_document(self):
# A real-world test to make sure we can convert ISO-8859-8 (a
# Hebrew encoding) to UTF-8.
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
soup = self.soup(
hebrew_document, from_encoding="iso8859-8")
self.assertEqual(soup.original_encoding, 'iso8859-8')
self.assertEqual(
soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8"))
def test_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta content="text/html; charset=x-sjis" '
'http-equiv="Content-type"/>')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
content = parsed_meta['content']
self.assertEqual('text/html; charset=x-sjis', content)
# But that value is actually a ContentMetaAttributeValue object.
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
# For the rest of the story, see TestSubstitutions in
# test_tree.py.
def test_html5_style_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', id="encoding")
charset = parsed_meta['charset']
self.assertEqual('x-sjis', charset)
# But that value is actually a CharsetMetaAttributeValue object.
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
class XMLTreeBuilderSmokeTest(object):
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode("latin1"),
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
def test_large_xml_document(self):
"""A large XML document should come out the same as it went in."""
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+ b'0' * (2**12)
+ b'</root>')
soup = self.soup(markup)
self.assertEqual(soup.encode("utf-8"), markup)
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
self.assertSoupEquals("<p>", "<p/>")
self.assertSoupEquals("<p>foo</p>")
def test_namespaces_are_preserved(self):
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
soup = self.soup(markup)
root = soup.root
self.assertEqual("http://example.com/", root['xmlns:a'])
self.assertEqual("http://example.net/", root['xmlns:b'])
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
def test_real_xhtml_document(self):
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
# XHTML documents in any particular way.
pass
def test_html_tags_have_namespace(self):
markup = "<a>"
soup = self.soup(markup)
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
def test_svg_tags_have_namespace(self):
markup = '<svg><circle/></svg>'
soup = self.soup(markup)
namespace = "http://www.w3.org/2000/svg"
self.assertEqual(namespace, soup.svg.namespace)
self.assertEqual(namespace, soup.circle.namespace)
def test_mathml_tags_have_namespace(self):
markup = '<math><msqrt>5</msqrt></math>'
soup = self.soup(markup)
namespace = 'http://www.w3.org/1998/Math/MathML'
self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace)
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
return None
def decorator(test_item):
if condition:
return nothing
else:
return test_item
return decorator

1
libs/certifi/__init__.py

@ -0,0 +1 @@
from .core import where

3338
libs/certifi/cacert.pem

File diff suppressed because it is too large

19
libs/certifi/core.py

@ -0,0 +1,19 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
certifi.py
~~~~~~~~~~
This module returns the installation location of cacert.pem.
"""
import os
def where():
f = os.path.split(__file__)[0]
return os.path.join(f, 'cacert.pem')
if __name__ == '__main__':
print(where())

0
libs/guessit/ISO-3166-1_utf8.txt

0
libs/guessit/ISO-639-2_utf-8.txt

2
libs/guessit/__init__.py

@ -18,7 +18,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__version__ = '0.4'
__version__ = '0.5-dev'
__all__ = ['Guess', 'Language',
'guess_file_info', 'guess_video_info',
'guess_movie_info', 'guess_episode_info']

4
libs/guessit/country.py

@ -20,6 +20,7 @@
from __future__ import unicode_literals
from guessit import fileutils
from guessit.textutils import to_unicode
import logging
log = logging.getLogger(__name__)
@ -66,7 +67,8 @@ class Country(object):
"""
def __init__(self, country, strict=False):
self.alpha3 = country_to_alpha3.get(country.lower())
country = to_unicode(country.strip().lower())
self.alpha3 = country_to_alpha3.get(country)
if self.alpha3 is None and strict:
msg = 'The given string "%s" could not be identified as a country'

0
libs/guessit/date.py

7
libs/guessit/fileutils.py

@ -29,6 +29,7 @@ def split_path(path):
If the given path was an absolute path, the first element will always be:
- the '/' root folder on Unix systems
- the drive letter on Windows systems (eg: r'C:\')
- the mount point '\\' on Windows systems (eg: r'\\host\share')
>>> split_path('/usr/bin/smewt')
['/', 'usr', 'bin', 'smewt']
@ -36,12 +37,6 @@ def split_path(path):
>>> split_path('relative_path/to/my_folder/')
['relative_path', 'to', 'my_folder']
>>> split_path(r'C:\Program Files\Smewt\smewt.exe')
['C:\\', 'Program Files', 'Smewt', 'smewt.exe']
>>> split_path(r'Documents and Settings\User\config\\')
['Documents and Settings', 'User', 'config']
"""
result = []
while True:

0
libs/guessit/guess.py

0
libs/guessit/hash_ed2k.py

0
libs/guessit/hash_mpc.py

81
libs/guessit/language.py

@ -21,11 +21,13 @@
from __future__ import unicode_literals
from guessit import fileutils
from guessit.country import Country
from guessit.textutils import to_unicode
import re
import logging
__all__ = [ 'is_iso_language', 'is_language', 'lang_set', 'Language',
'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'search_language' ]
'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED',
'search_language' ]
log = logging.getLogger(__name__)
@ -46,14 +48,21 @@ _iso639_contents = _iso639_contents[1:]
language_matrix = [ l.strip().split('|')
for l in _iso639_contents.strip().split('\n') ]
language_matrix += [ [ 'unk', '', 'un', 'Unknown', 'inconnu' ] ]
# update information in the language matrix
language_matrix += [['mol', '', 'mo', 'Moldavian', 'moldave'],
['ass', '', '', 'Assyrian', 'assyrien']]
# remove unused languages that shadow other common ones with a non-official form
for lang in language_matrix:
# remove unused languages that shadow other common ones with a non-official form
if (lang[2] == 'se' or # Northern Sami shadows Swedish
lang[2] == 'br'): # Breton shadows Brazilian
language_matrix.remove(lang)
lang[2] = ''
# add missing information
if lang[0] == 'und':
lang[2] = 'un'
if lang[0] == 'srp':
lang[1] = 'scc' # from OpenSubtitles
lng3 = frozenset(l[0] for l in language_matrix if l[0])
@ -87,12 +96,17 @@ lng_fr_name_to_lng3 = dict((fr_name.lower(), l[0])
# contains a list of exceptions: strings that should be parsed as a language
# but which are not in an ISO form
lng_exceptions = { 'gr': ('gre', None),
lng_exceptions = { 'unknown': ('und', None),
'inconnu': ('und', None),
'unk': ('und', None),
'un': ('und', None),
'gr': ('gre', None),
'greek': ('gre', None),
'esp': ('spa', None),
'español': ('spa', None),
'se': ('swe', None),
'po': ('pt', 'br'),
'pb': ('pt', 'br'),
'pob': ('pt', 'br'),
'br': ('pt', 'br'),
'brazilian': ('pt', 'br'),
@ -101,7 +115,8 @@ lng_exceptions = { 'gr': ('gre', None),
'ua': ('ukr', None),
'cn': ('chi', None),
'chs': ('chi', None),
'jp': ('jpn', None)
'jp': ('jpn', None),
'scr': ('hrv', None)
}
@ -130,6 +145,11 @@ class Language(object):
You can also distinguish languages for specific countries, such as
Portuguese and Brazilian Portuguese.
There are various properties on the language object that give you the
representation of the language for a specific usage, such as .alpha3
to get the ISO 3-letter code, or .opensubtitles to get the OpenSubtitles
language code.
>>> Language('fr')
Language(French)
@ -146,16 +166,19 @@ class Language(object):
True
>>> Language('zz', strict=False).english_name
u'Unknown'
u'Undetermined'
>>> Language('pt(br)').opensubtitles
u'pob'
"""
_with_country_regexp = re.compile('(.*)\((.*)\)')
_with_country_regexp2 = re.compile('(.*)-(.*)')
def __init__(self, language, country=None, strict=False):
language = language.strip().lower()
if isinstance(language, str):
language = language.decode('utf-8')
with_country = Language._with_country_regexp.match(language)
def __init__(self, language, country=None, strict=False, scheme=None):
language = to_unicode(language.strip().lower())
with_country = (Language._with_country_regexp.match(language) or
Language._with_country_regexp2.match(language))
if with_country:
self.lang = Language(with_country.group(1)).lang
self.country = Country(with_country.group(2))
@ -164,6 +187,18 @@ class Language(object):
self.lang = None
self.country = Country(country) if country else None
# first look for scheme specific languages
if scheme == 'opensubtitles':
if language == 'br':
self.lang = 'bre'
return
elif language == 'se':
self.lang = 'sme'
return
elif scheme is not None:
log.warning('Unrecognized scheme: "%s" - Proceeding with standard one' % scheme)
# look for ISO language codes
if len(language) == 2:
self.lang = lng2_to_lng3.get(language)
elif len(language) == 3:
@ -174,6 +209,7 @@ class Language(object):
self.lang = (lng_en_name_to_lng3.get(language) or
lng_fr_name_to_lng3.get(language))
# general language exceptions
if self.lang is None and language in lng_exceptions:
lang, country = lng_exceptions[language]
self.lang = Language(lang).alpha3
@ -186,7 +222,7 @@ class Language(object):
if self.lang is None:
log.debug(msg)
self.lang = 'unk'
self.lang = 'und'
@property
def alpha2(self):
@ -208,6 +244,20 @@ class Language(object):
def french_name(self):
return lng3_to_lng_fr_name[self.lang]
@property
def opensubtitles(self):
if self.lang == 'por' and self.country and self.country.alpha2 == 'br':
return 'pob'
elif self.lang in ['gre', 'srp']:
return self.alpha3term
return self.alpha3
@property
def tmdb(self):
if self.country:
return '%s-%s' % (self.alpha2, self.country.alpha2.upper())
return self.alpha2
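# Example of the two scheme-specific codes above (a sketch):
# Language('pt(br)').opensubtitles evaluates to 'pob' (as in the class
# docstring), while Language('pt(br)').tmdb evaluates to 'pt-BR'.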
def __hash__(self):
return hash(self.lang)
@ -227,7 +277,7 @@ class Language(object):
return not self == other
def __nonzero__(self):
return self.lang != 'unk'
return self.lang != 'und'
def __unicode__(self):
if self.country:
@ -245,7 +295,8 @@ class Language(object):
return 'Language(%s)' % self.english_name
ALL_LANGUAGES = frozenset(Language(lng) for lng in lng_all_names) - frozenset([Language('unk')])
UNDETERMINED = Language('und')
ALL_LANGUAGES = frozenset(Language(lng) for lng in lng_all_names) - frozenset([UNDETERMINED])
ALL_LANGUAGES_NAMES = lng_all_names
def search_language(string, lang_filter=None):

0
libs/guessit/matcher.py

0
libs/guessit/matchtree.py

6
libs/guessit/patterns.py

@ -40,10 +40,10 @@ episode_rexps = [ # ... Season 2 ...
(r'saison (?P<season>[0-9]+)', 1.0, (0, 0)),
# ... s02e13 ...
(r'[Ss](?P<season>[0-9]{1,2}).{,3}[EeXx](?P<episodeNumber>[0-9]{1,2})[^0-9]', 1.0, (0, -1)),
(r'[Ss](?P<season>[0-9]{1,2}).{,3}(?P<episodeNumber>(?:[EeXx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
# ... 2x13 ...
(r'[^0-9](?P<season>[0-9]{1,2})x(?P<episodeNumber>[0-9]{2})[^0-9]', 0.8, (1, -1)),
(r'[^0-9](?P<season>[0-9]{1,2})(?P<episodeNumber>(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)),
# ... s02 ...
#(sep + r's(?P<season>[0-9]{1,2})' + sep, 0.6, (1, -1)),
@ -61,7 +61,7 @@ weak_episode_rexps = [ # ... 213 or 0106 ...
(sep + r'(?P<episodeNumber>[0-9]{1,4})' + sep, (1, -1)),
# ... 2x13 ...
(sep + r'[^0-9](?P<season>[0-9]{1,2})\.(?P<episodeNumber>[0-9]{2})[^0-9]' + sep, (1, -1)),
(sep + r'[^0-9](?P<season>[0-9]{1,2})\.(?P<episodeNumber>[0-9]{1,2})[^0-9]' + sep, (1, -1)),
# ... e13 ... for a mini-series without a season number
(r'e(?P<episodeNumber>[0-9]{1,4})[^0-9]', (0, -1)),

0
libs/guessit/slogging.py

22
libs/guessit/textutils.py

@ -19,6 +19,7 @@
#
from guessit.patterns import sep
import unicodedata
import copy
# string-related functions
@ -70,6 +71,7 @@ def to_utf8(o):
return [ to_utf8(i) for i in o ]
elif isinstance(o, dict):
# need to do it like that to handle Guess instances correctly
# FIXME: why is that necessary?
result = copy.deepcopy(o)
for key, value in o.items():
result[to_utf8(key)] = to_utf8(value)
@ -78,6 +80,26 @@ def to_utf8(o):
else:
return o
def to_unicode(o):
"""Convert all strings found in the given object to normalized
unicode strings, using the UTF-8 codec if needed."""
if isinstance(o, unicode):
return unicodedata.normalize('NFC', o)
if isinstance(o, str):
return unicodedata.normalize('NFC', o.decode('utf-8'))
elif isinstance(o, list):
return [ to_unicode(i) for i in o ]
elif isinstance(o, dict):
# need to do it like that to handle Guess instances correctly
result = copy.deepcopy(o)
for key, value in o.items():
result[to_unicode(key)] = to_unicode(value)
return result
else:
return o
def levenshtein(a, b):
if not a:

0
libs/guessit/transfo/__init__.py

0
libs/guessit/transfo/guess_bonus_features.py

0
libs/guessit/transfo/guess_date.py

0
libs/guessit/transfo/guess_episode_info_from_position.py

19
libs/guessit/transfo/guess_episodes_rexps.py

@ -26,14 +26,31 @@ import logging
log = logging.getLogger(__name__)
def number_list(s):
return re.sub('[^0-9]+', ' ', s).split()
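# Example (a sketch): number_list('02e01e02') returns ['02', '01', '02'];
# below, the first entry becomes episodeNumber and, when more than one
# number is present, the full list becomes episodeList.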
def guess_episodes_rexps(string):
for rexp, confidence, span_adjust in episode_rexps:
match = re.search(rexp, string, re.IGNORECASE)
if match:
return (Guess(match.groupdict(), confidence=confidence),
result = (Guess(match.groupdict(), confidence=confidence),
(match.start() + span_adjust[0],
match.end() + span_adjust[1]))
# episodes which have a season > 25 are most likely errors
# (Simpsons is at 23!)
if int(result[0].get('season', 0)) > 25:
continue
# decide whether we have only a single episode number or an
# episode list
if result[0].get('episodeNumber'):
eplist = number_list(result[0]['episodeNumber'])
result[0].set('episodeNumber', int(eplist[0]), confidence=confidence)
if len(eplist) > 1:
result[0].set('episodeList', map(int, eplist), confidence=confidence)
return result
return None, None

3
libs/guessit/transfo/guess_filetype.py

@ -87,6 +87,9 @@ def guess_filetype(filename, filetype):
upgrade_episode()
break
if 'tvu.org.ru' in filename:
upgrade_episode()
# if no episode info found, assume it's a movie
upgrade_movie()

0
libs/guessit/transfo/guess_language.py

0
libs/guessit/transfo/guess_movie_title_from_position.py

0
libs/guessit/transfo/guess_properties.py

0
libs/guessit/transfo/guess_release_group.py

0
libs/guessit/transfo/guess_video_rexps.py

0
libs/guessit/transfo/guess_weak_episodes_rexps.py

0
libs/guessit/transfo/guess_website.py

0
libs/guessit/transfo/guess_year.py

0
libs/guessit/transfo/post_process.py

0
libs/guessit/transfo/split_explicit_groups.py

0
libs/guessit/transfo/split_on_dash.py

0
libs/guessit/transfo/split_path_components.py

17
libs/html5lib/__init__.py

@ -0,0 +1,17 @@
"""
HTML parsing library based on the WHATWG "HTML5"
specification. The parser is designed to be compatible with existing
HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.
Example usage:
import html5lib
f = open("my_document.html")
tree = html5lib.parse(f)
"""
__version__ = "0.95-dev"
from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
from treewalkers import getTreeWalker
from serializer import serialize

3085
libs/html5lib/constants.py

File diff suppressed because it is too large

0
libs/html5lib/filters/__init__.py

10
libs/html5lib/filters/_base.py

@ -0,0 +1,10 @@
class Filter(object):
def __init__(self, source):
self.source = source
def __iter__(self):
return iter(self.source)
def __getattr__(self, name):
return getattr(self.source, name)

127
libs/html5lib/filters/formfiller.py

@ -0,0 +1,127 @@
#
# The goal is to finally have a form filler where you pass data for
# each form, using the algorithm for "Seeding a form with initial values"
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
import _base
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class SimpleFilter(_base.Filter):
def __init__(self, source, fieldStorage):
_base.Filter.__init__(self, source)
self.fieldStorage = fieldStorage
def __iter__(self):
field_indices = {}
state = None
field_name = None
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"].lower()
if name == "input":
field_name = None
field_type = None
input_value_index = -1
input_checked_index = -1
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == u"name":
field_name = v.strip(spaceCharacters)
elif n == u"type":
field_type = v.strip(spaceCharacters)
elif n == u"checked":
input_checked_index = i
elif n == u"value":
input_value_index = i
value_list = self.fieldStorage.getlist(field_name)
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if field_type in (u"checkbox", u"radio"):
if value_list:
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
token["data"].append((u"checked", u""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
elif field_type not in (u"button", u"submit", u"reset"):
if input_value_index >= 0:
token["data"][input_value_index] = (u"value", value)
else:
token["data"].append((u"value", value))
field_indices[field_name] = field_index + 1
field_type = None
field_name = None
elif name == "textarea":
field_type = "textarea"
field_name = dict((token["data"])[::-1])["name"]
elif name == "select":
field_type = "select"
attributes = dict(token["data"][::-1])
field_name = attributes.get("name")
is_select_multiple = "multiple" in attributes
is_selected_option_found = False
elif field_type == "select" and field_name and name == "option":
option_selected_index = -1
option_value = None
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == "selected":
option_selected_index = i
elif n == "value":
option_value = v.strip(spaceCharacters)
if option_value is None:
raise NotImplementedError("<option>s without a value= attribute")
else:
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
token["data"].append((u"selected", u""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
del token["data"][option_selected_index]
elif field_type is not None and field_name and type == "EndTag":
name = token["name"].lower()
if name == field_type:
if name == "textarea":
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
yield {"type": "Characters", "data": value}
field_indices[field_name] = field_index + 1
field_name = None
elif name == "option" and field_type == "select":
pass # TODO: part of "option without value= attribute" processing
elif field_type == "textarea":
continue # ignore token
yield token

62
libs/html5lib/filters/inject_meta_charset.py

@ -0,0 +1,62 @@
import _base
class Filter(_base.Filter):
def __init__(self, source, encoding):
_base.Filter.__init__(self, source)
self.encoding = encoding
def __iter__(self):
state = "pre_head"
meta_found = (self.encoding is None)
pending = []
for token in _base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == u"head":
state = "in_head"
elif type == "EmptyTag":
if token["name"].lower() == u"meta":
# replace charset with actual encoding
has_http_equiv_content_type = False
for (namespace,name),value in token["data"].iteritems():
if namespace != None:
continue
elif name.lower() == u'charset':
token["data"][(namespace,name)] = self.encoding
meta_found = True
break
elif name == u'http-equiv' and value.lower() == u'content-type':
has_http_equiv_content_type = True
else:
if has_http_equiv_content_type and (None, u"content") in token["data"]:
token["data"][(None, u"content")] = u'text/html; charset=%s' % self.encoding
meta_found = True
elif token["name"].lower() == u"head" and not meta_found:
# insert meta into empty head
yield {"type": "StartTag", "name": u"head",
"data": token["data"]}
yield {"type": "EmptyTag", "name": u"meta",
"data": {(None, u"charset"): self.encoding}}
yield {"type": "EndTag", "name": u"head"}
meta_found = True
continue
elif type == "EndTag":
if token["name"].lower() == u"head" and pending:
# insert meta into head (if necessary) and flush pending queue
yield pending.pop(0)
if not meta_found:
yield {"type": "EmptyTag", "name": u"meta",
"data": {(None, u"charset"): self.encoding}}
while pending:
yield pending.pop(0)
meta_found = True
state = "post_head"
if state == "in_head":
pending.append(token)
else:
yield token

88
libs/html5lib/filters/lint.py

@ -0,0 +1,88 @@
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class LintError(Exception): pass
class Filter(_base.Filter):
def __iter__(self):
open_elements = []
contentModelFlag = "PCDATA"
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
if type == "StartTag" and name in voidElements:
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
elif type == "EmptyTag" and name not in voidElements:
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
if not isinstance(name, unicode):
raise LintError(_("Attribute name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty attribute name"))
if not isinstance(value, unicode):
raise LintError(_("Attribute value is not a string: %r") % value)
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
contentModelFlag = "RCDATA"
elif name == "plaintext":
contentModelFlag = "PLAINTEXT"
elif type == "EndTag":
name = token["name"]
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
if name in voidElements:
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
start_name = open_elements.pop()
if start_name != name:
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
contentModelFlag = "PCDATA"
elif type == "Comment":
if contentModelFlag != "PCDATA":
raise LintError(_("Comment not in PCDATA content model flag"))
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
if not isinstance(data, unicode):
raise LintError(_("Attribute name is not a string: %r") % data)
if not data:
raise LintError(_(u"%s token with empty data") % type)
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"):
pass
else:
raise LintError(_(u"Unknown token type: %s") % type)
yield token

202
libs/html5lib/filters/optionaltags.py

@ -0,0 +1,202 @@
import _base
class Filter(_base.Filter):
def slider(self):
previous1 = previous2 = None
for token in self.source:
if previous1 is not None:
yield previous2, previous1, token
previous2 = previous1
previous1 = token
yield previous2, previous1, None
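# Example (a sketch): for a token stream [a, b, c], slider() yields
# (None, a, b), (a, b, c) and (b, c, None), handing __iter__ each token
# together with its immediate neighbours.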
def __iter__(self):
for previous, token, next in self.slider():
type = token["type"]
if type == "StartTag":
if (token["data"] or
not self.is_optional_start(token["name"], previous, next)):
yield token
elif type == "EndTag":
if not self.is_optional_end(token["name"], next):
yield token
else:
yield token
def is_optional_start(self, tagname, previous, next):
type = next and next["type"] or None
if tagname == 'html':
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname == 'head':
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
# XXX: we also omit the start tag if the head element is empty
if type in ("StartTag", "EmptyTag"):
return True
elif type == "EndTag":
return next["name"] == "head"
elif tagname == 'body':
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return next["name"] not in ('script', 'style')
else:
return True
elif tagname == 'colgroup':
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceded by another colgroup element whose
# end tag has been omitted.
if type in ("StartTag", "EmptyTag"):
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return next["name"] == "col"
else:
return False
elif tagname == 'tbody':
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == "StartTag":
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous['type'] == 'EndTag' and \
previous['name'] in ('tbody','thead','tfoot'):
return False
return next["name"] == 'tr'
else:
return False
return False
def is_optional_end(self, tagname, next):
type = next and next["type"] or None
if tagname in ('html', 'head', 'body'):
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname in ('li', 'optgroup', 'tr'):
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] == tagname
else:
return type == "EndTag" or type is None
elif tagname in ('dt', 'dd'):
# A dt element's end tag may be omitted if the dt element is
# immediately followed by another dt element or a dd element.
# A dd element's end tag may be omitted if the dd element is
# immediately followed by another dd element or a dt element,
# or if there is no more content in the parent element.
if type == "StartTag":
return next["name"] in ('dt', 'dd')
elif tagname == 'dd':
return type == "EndTag" or type is None
else:
return False
elif tagname == 'p':
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, article, aside,
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
# nav, ol, p, pre, section, table, or ul, element, or if
# there is no more content in the parent element.
if type in ("StartTag", "EmptyTag"):
return next["name"] in ('address', 'article', 'aside',
'blockquote', 'datagrid', 'dialog',
'dir', 'div', 'dl', 'fieldset', 'footer',
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'header', 'hr', 'menu', 'nav', 'ol',
'p', 'pre', 'section', 'table', 'ul')
else:
return type == "EndTag" or type is None
elif tagname == 'option':
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if it is immediately followed by an <code>optgroup</code>
# element, or if there is no more content in the parent
# element.
if type == "StartTag":
return next["name"] in ('option', 'optgroup')
else:
return type == "EndTag" or type is None
elif tagname in ('rt', 'rp'):
# An rt element's end tag may be omitted if the rt element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
# An rp element's end tag may be omitted if the rp element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('rt', 'rp')
else:
return type == "EndTag" or type is None
elif tagname == 'colgroup':
# A colgroup element's end tag may be omitted if the colgroup
# element is not immediately followed by a space character or
# a comment.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we also look for an immediately following colgroup
# element. See is_optional_start.
return next["name"] != 'colgroup'
else:
return True
elif tagname in ('thead', 'tbody'):
# A thead element's end tag may be omitted if the thead element
# is immediately followed by a tbody or tfoot element.
# A tbody element's end tag may be omitted if the tbody element
# is immediately followed by a tbody or tfoot element, or if
# there is no more content in the parent element.
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == "StartTag":
return next["name"] in ['tbody', 'tfoot']
elif tagname == 'tbody':
return type == "EndTag" or type is None
else:
return False
elif tagname == 'tfoot':
# A tfoot element's end tag may be omitted if the tfoot element
# is immediately followed by a tbody element, or if there is no
# more content in the parent element.
# XXX: we never omit the end tag when the following element is
# a tbody. See is_optional_start.
if type == "StartTag":
return next["name"] == 'tbody'
else:
return type == "EndTag" or type is None
elif tagname in ('td', 'th'):
# A td element's end tag may be omitted if the td element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
# A th element's end tag may be omitted if the th element is
# immediately followed by a td or th element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('td', 'th')
else:
return type == "EndTag" or type is None
return False
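
# Usage sketch (illustrative, not part of the library): this filter is what
# the serializer applies when omit_optional_tags is on (its default), so
# implied end tags such as </li> stay implied. Assumes the "simpletree"
# builder used elsewhere in this release; the output shown is indicative.
#
#   import html5lib
#   from html5lib.serializer import serialize
#   doc = html5lib.parse("<ul><li>one<li>two</ul>", treebuilder="simpletree")
#   serialize(doc, tree="simpletree")   # -> u'<ul><li>one<li>two</ul>'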

8
libs/html5lib/filters/sanitizer.py

@@ -0,0 +1,8 @@
import _base
from html5lib.sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
def __iter__(self):
for token in _base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token: yield token

41
libs/html5lib/filters/whitespace.py

@@ -0,0 +1,41 @@
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
import re
import _base
from html5lib.constants import rcdataElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
class Filter(_base.Filter):
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
def __iter__(self):
preserve = 0
for token in _base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag" \
and (preserve or token["name"] in self.spacePreserveElements):
preserve += 1
elif type == "EndTag" and preserve:
preserve -= 1
elif not preserve and type == "SpaceCharacters" and token["data"]:
                # Testing token["data"] above avoids introducing spaces where there were none
token["data"] = u" "
elif not preserve and type == "Characters":
token["data"] = collapse_spaces(token["data"])
yield token
def collapse_spaces(text):
return SPACES_REGEX.sub(' ', text)
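
# Quick check (illustrative): runs of whitespace collapse to a single space;
# the serializer enables this filter through its strip_whitespace option.
assert collapse_spaces(u"one \t\n  two") == u"one two"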

2733
libs/html5lib/html5parser.py

File diff suppressed because it is too large

177
libs/html5lib/ihatexml.py

@@ -0,0 +1,177 @@
import re
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
letter = " | ".join([baseChar, ideographic])
#Without the ":" that full XML names would also allow
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender])
nameFirst = " | ".join([letter, "_"])
reChar = re.compile(r"#x([\dA-F]{4})")
reCharRange = re.compile(r"\[#x([\dA-F]{4})-#x([\dA-F]{4})\]")
def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")]
rv = []
for item in charRanges:
foundMatch = False
for regexp in (reChar, reCharRange):
match = regexp.match(item)
if match is not None:
rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1:
rv[-1] = rv[-1]*2
foundMatch = True
break
if not foundMatch:
assert len(item) == 1
rv.append([ord(item)] * 2)
rv = normaliseCharList(rv)
return rv
def normaliseCharList(charList):
charList = sorted(charList)
for item in charList:
assert item[1] >= item[0]
rv = []
i = 0
while i < len(charList):
j = 1
rv.append(charList[i])
while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i+j][1]
j += 1
i += j
return rv
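
# Worked example (illustrative): adjacent or overlapping ranges are merged,
# so [A-F][G-Z] becomes the single range [A-Z].
assert normaliseCharList([[0x41, 0x46], [0x47, 0x5A], [0x61, 0x64]]) == \
    [[0x41, 0x5A], [0x61, 0x64]]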
#We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)
def missingRanges(charList):
rv = []
    if charList[0][0] != 0:
rv.append([0, charList[0][0] - 1])
for i, item in enumerate(charList[:-1]):
rv.append([item[1]+1, charList[i+1][0] - 1])
if charList[-1][1] != max_unicode:
rv.append([charList[-1][1] + 1, max_unicode])
return rv
def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
rv.append(escapeRegexp(unichr(item[0])))
else:
rv.append(escapeRegexp(unichr(item[0])) + "-" +
escapeRegexp(unichr(item[1])))
return "[%s]"%"".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, "\\" + char)
    return string
#output from the above
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
class InfosetFilter(object):
    replacementRegexp = re.compile(r"U[\dA-F]{5}")
def __init__(self, replaceChars = None,
dropXmlnsLocalName = False,
dropXmlnsAttrNs = False,
preventDoubleDashComments = False,
preventDashAtCommentEnd = False,
replaceFormFeedCharacters = True):
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
self.preventDoubleDashComments = preventDoubleDashComments
self.preventDashAtCommentEnd = preventDashAtCommentEnd
self.replaceFormFeedCharacters = replaceFormFeedCharacters
self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
#Need a datalosswarning here
return None
elif (self.dropXmlnsAttrNs and
namespace == "http://www.w3.org/2000/xmlns/"):
return None
else:
return self.toXmlName(name)
def coerceElement(self, name, namespace=None):
return self.toXmlName(name)
def coerceComment(self, data):
if self.preventDoubleDashComments:
while "--" in data:
data = data.replace("--", "- -")
return data
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
data = data.replace("\x0C", " ")
#Other non-xml characters
return data
def toXmlName(self, name):
nameFirst = name[0]
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput
def getReplacementCharacter(self, char):
if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
return replacement
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):
name = name.replace(item, self.unescapeChar(item))
return name
def escapeChar(self, char):
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
self.replaceCache[char] = replacement
return replacement
def unescapeChar(self, charcode):
return unichr(int(charcode[1:], 16))
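
# Round-trip sketch (illustrative): characters that may not appear in an XML
# name are escaped as "U" plus five hex digits, reversibly.
_f = InfosetFilter()
assert _f.toXmlName(u"123foo") == u"U0003123foo"   # a name cannot start with a digit
assert _f.fromXmlName(u"U0003123foo") == u"123foo"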

782
libs/html5lib/inputstream.py

@@ -0,0 +1,782 @@
import codecs
import re
import types
import sys
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings, ReparseException
import utils
#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
class BufferedStream:
"""Buffering for streams that do not have buffering of their own
The buffer is implemented as a list of chunks on the assumption that
joining many strings will be slow since it is O(n**2)
"""
def __init__(self, stream):
self.stream = stream
self.buffer = []
self.position = [-1,0] #chunk number, offset
def tell(self):
pos = 0
for chunk in self.buffer[:self.position[0]]:
pos += len(chunk)
pos += self.position[1]
return pos
def seek(self, pos):
assert pos < self._bufferedBytes()
offset = pos
i = 0
while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
i += 1
self.position = [i, offset]
def read(self, bytes):
if not self.buffer:
return self._readStream(bytes)
elif (self.position[0] == len(self.buffer) and
self.position[1] == len(self.buffer[-1])):
return self._readStream(bytes)
else:
return self._readFromBuffer(bytes)
def _bufferedBytes(self):
return sum([len(item) for item in self.buffer])
def _readStream(self, bytes):
data = self.stream.read(bytes)
self.buffer.append(data)
self.position[0] += 1
self.position[1] = len(data)
return data
def _readFromBuffer(self, bytes):
remainingBytes = bytes
rv = []
bufferIndex = self.position[0]
bufferOffset = self.position[1]
while bufferIndex < len(self.buffer) and remainingBytes != 0:
assert remainingBytes > 0
bufferedData = self.buffer[bufferIndex]
if remainingBytes <= len(bufferedData) - bufferOffset:
bytesToRead = remainingBytes
self.position = [bufferIndex, bufferOffset + bytesToRead]
else:
bytesToRead = len(bufferedData) - bufferOffset
self.position = [bufferIndex, len(bufferedData)]
bufferIndex += 1
            rv.append(bufferedData[bufferOffset:
                                   bufferOffset + bytesToRead])
remainingBytes -= bytesToRead
bufferOffset = 0
if remainingBytes:
rv.append(self._readStream(remainingBytes))
return "".join(rv)
class HTMLInputStream:
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
incorrect byte-sequences and also provides column and line tracking.
"""
_defaultChunkSize = 10240
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by html5lib.
source can be either a file-object, local filename or a string.
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
parseMeta - Look for a <meta> element containing encoding information
"""
#Craziness
if len(u"\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile(u"[\uD800-\uDFFF]")
else:
self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile(u"([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
# List of where new lines occur
self.newLines = [0]
self.charEncoding = (codecName(encoding), "certain")
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
self.rawStream = self.openStream(source)
# Encoding Information
#Number of bytes to use when looking for a meta element with
#encoding information
self.numBytesMeta = 512
        #Number of bytes to use when detecting encoding with chardet
self.numBytesChardet = 100
#Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
#Detect encoding iff no explicit "transport level" encoding is supplied
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
self.reset()
def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
self.errors = []
# number of (complete) lines in previous chunks
self.prevNumLines = 0
# number of columns in the last line of the previous chunk
self.prevNumCols = 0
#Deal with CR LF and surrogates split over chunk boundaries
self._bufferedCharacter = None
def openStream(self, source):
"""Produces a file object from source.
source can be either a file object, local filename or a string.
"""
# Already a file object
if hasattr(source, 'read'):
stream = source
else:
# Otherwise treat source as a string and convert to a file object
if isinstance(source, unicode):
source = source.encode('utf-8')
self.charEncoding = ("utf-8", "certain")
try:
from io import BytesIO
except:
# 2to3 converts this line to: from io import StringIO
from cStringIO import StringIO as BytesIO
stream = BytesIO(source)
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
stream is sys.stdin):
stream = BufferedStream(stream)
return stream
def detectEncoding(self, parseMeta=True, chardet=True):
#First look for a BOM
#This will also read past the BOM if present
encoding = self.detectBOM()
confidence = "certain"
#If there is no BOM need to look for meta elements with encoding
#information
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
confidence = "tentative"
        #Guess with chardet, if available
if encoding is None and chardet:
confidence = "tentative"
try:
from chardet.universaldetector import UniversalDetector
buffers = []
detector = UniversalDetector()
while not detector.done:
buffer = self.rawStream.read(self.numBytesChardet)
if not buffer:
break
buffers.append(buffer)
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
confidence="tentative"
encoding = self.defaultEncoding
#Substitute for equivalent encodings:
encodingSub = {"iso-8859-1":"windows-1252"}
if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()]
return encoding, confidence
def changeEncoding(self, newEncoding):
newEncoding = codecName(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
if newEncoding is None:
return
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain")
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the
encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
}
# Go to beginning of file and read in 4 bytes
string = self.rawStream.read(4)
# Try detecting the BOM using bytes from the string
encoding = bomDict.get(string[:3]) # UTF-8
seek = 3
if not encoding:
# Need to detect UTF-32 before UTF-16
encoding = bomDict.get(string) # UTF-32
seek = 4
if not encoding:
encoding = bomDict.get(string[:2]) # UTF-16
seek = 2
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
return encoding
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
buffer = self.rawStream.read(self.numBytesMeta)
parser = EncodingParser(buffer)
self.rawStream.seek(0)
encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"
return encoding
def _position(self, offset):
chunk = self.chunk
nLines = chunk.count(u'\n', 0, offset)
positionLine = self.prevNumLines + nLines
lastLinePos = chunk.rfind(u'\n', 0, offset)
if lastLinePos == -1:
positionColumn = self.prevNumCols + offset
else:
positionColumn = offset - (lastLinePos + 1)
return (positionLine, positionColumn)
def position(self):
"""Returns (line, col) of the current position in the stream."""
line, col = self._position(self.chunkOffset)
return (line+1, col)
def char(self):
""" Read one character from the stream or queue if available. Return
EOF when EOF is reached.
"""
# Read a new chunk from the input stream if necessary
if self.chunkOffset >= self.chunkSize:
if not self.readChunk():
return EOF
chunkOffset = self.chunkOffset
char = self.chunk[chunkOffset]
self.chunkOffset = chunkOffset + 1
return char
def readChunk(self, chunkSize=None):
if chunkSize is None:
chunkSize = self._defaultChunkSize
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
data = self.dataStream.read(chunkSize)
#Deal with CR LF and surrogates broken across chunks
if self._bufferedCharacter:
data = self._bufferedCharacter + data
self._bufferedCharacter = None
elif not data:
# We have no more data, bye-bye stream
return False
if len(data) > 1:
lastv = ord(data[-1])
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF:
self._bufferedCharacter = data[-1]
data = data[:-1]
self.reportCharacterErrors(data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub(u"\ufffd", data)
data = data.replace(u"\r\n", u"\n")
data = data.replace(u"\r", u"\n")
self.chunk = data
self.chunkSize = len(data)
return True
def characterErrorsUCS4(self, data):
for i in xrange(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
#Someone picked the wrong compile option
#You lose
skip = False
for match in invalid_unicode_re.finditer(data):
            if skip:
                # only the single trailing low surrogate is skipped
                skip = False
                continue
codepoint = ord(match.group())
pos = match.start()
#Pretty sure there should be endianness issues here
if utils.isSurrogatePair(data[pos:pos+2]):
#We have a surrogate pair!
char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
pos == len(data) - 1):
self.errors.append("invalid-codepoint")
else:
skip = False
self.errors.append("invalid-codepoint")
def charsUntil(self, characters, opposite = False):
""" Returns a string of characters from the stream up to but not
including any character in 'characters' or EOF. 'characters' must be
a container that supports the 'in' method and iteration over its
characters.
"""
# Use a cache of regexps to find the required characters
try:
chars = charsUntilRegEx[(characters, opposite)]
except KeyError:
if __debug__:
for c in characters:
assert(ord(c) < 128)
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = u"^%s" % regex
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
rv = []
while True:
# Find the longest matching prefix
m = chars.match(self.chunk, self.chunkOffset)
if m is None:
# If nothing matched, and it wasn't because we ran out of chunk,
# then stop
if self.chunkOffset != self.chunkSize:
break
else:
end = m.end()
# If not the whole chunk matched, return everything
# up to the part that didn't match
if end != self.chunkSize:
rv.append(self.chunk[self.chunkOffset:end])
self.chunkOffset = end
break
# If the whole remainder of the chunk matched,
# use it all and read the next chunk
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
r = u"".join(rv)
return r
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
if char is not None:
if self.chunkOffset == 0:
# unget is called quite rarely, so it's a good idea to do
# more work here if it saves a bit of work in the frequently
# called char and charsUntil.
# So, just prepend the ungotten character onto the current
# chunk:
self.chunk = char + self.chunk
self.chunkSize += 1
else:
self.chunkOffset -= 1
assert self.chunk[self.chunkOffset] == char
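
# Usage sketch (illustrative): the stream consumes a BOM, reports the
# detected encoding, and hands out decoded characters; values shown are
# what the code above produces for this input.
#
#   stream = HTMLInputStream("\xef\xbb\xbfhello")
#   stream.charEncoding        # -> ("utf-8", "certain")
#   stream.char()              # -> u"h"; the BOM itself is skipped
#   stream.charsUntil("o")     # -> u"ell", stops before the "o"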
class EncodingBytes(str):
"""String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is
raised"""
    def __new__(cls, value):
        return str.__new__(cls, value.lower())
def __init__(self, value):
self._position=-1
def __iter__(self):
return self
def next(self):
p = self._position = self._position + 1
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
return self[p]
def previous(self):
p = self._position
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
self._position = p = p - 1
return self[p]
def setPosition(self, position):
if self._position >= len(self):
raise StopIteration
self._position = position
def getPosition(self):
if self._position >= len(self):
raise StopIteration
if self._position >= 0:
return self._position
else:
return None
position = property(getPosition, setPosition)
def getCurrentByte(self):
return self[self.position]
currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharactersBytes):
"""Skip past a list of characters"""
p = self.position # use property for the error-checking
while p < len(self):
c = self[p]
if c not in chars:
self._position = p
return c
p += 1
self._position = p
return None
def skipUntil(self, chars):
p = self.position
while p < len(self):
c = self[p]
if c in chars:
self._position = p
return c
p += 1
self._position = p
return None
def matchBytes(self, bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
p = self.position
data = self[p:p+len(bytes)]
rv = data.startswith(bytes)
if rv:
self.position += len(bytes)
return rv
def jumpTo(self, bytes):
"""Look for the next sequence of bytes matching a given sequence. If
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
# XXX: This is ugly, but I can't see a nicer way to fix this.
if self._position == -1:
self._position = 0
self._position += (newPosition + len(bytes)-1)
return True
else:
raise StopIteration
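
# Quick check (illustrative): input is lower-cased on construction and the
# position advances through the byte-matching helpers.
_b = EncodingBytes("<META charset=UTF-8>")
assert _b == "<meta charset=utf-8>"
assert _b.jumpTo("charset")
assert _b.currentByte == "t"             # lands on the last byte of the match
assert _b.next() == "="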
class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements"""
def __init__(self, data):
"""string - the data to work on for encoding detection"""
self.data = EncodingBytes(data)
self.encoding = None
def getEncoding(self):
methodDispatch = (
("<!--",self.handleComment),
("<meta",self.handleMeta),
("</",self.handlePossibleEndTag),
("<!",self.handleOther),
("<?",self.handleOther),
("<",self.handlePossibleStartTag))
for byte in self.data:
keepParsing = True
for key, method in methodDispatch:
if self.data.matchBytes(key):
try:
keepParsing = method()
break
except StopIteration:
keepParsing=False
break
if not keepParsing:
break
return self.encoding
def handleComment(self):
"""Skip over comments"""
return self.data.jumpTo("-->")
def handleMeta(self):
if self.data.currentByte not in spaceCharactersBytes:
            #if we have <meta not followed by a space, just keep going
return True
#We have a valid meta element we want to search for attributes
while True:
#Try to find the next attribute after the current position
attr = self.getAttribute()
if attr is None:
return True
else:
if attr[0] == "charset":
tentativeEncoding = attr[1]
codec = codecName(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
elif attr[0] == "content":
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
codec = codecName(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
def handlePossibleStartTag(self):
return self.handlePossibleTag(False)
def handlePossibleEndTag(self):
self.data.next()
return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag):
data = self.data
if data.currentByte not in asciiLettersBytes:
#If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to
#handleOther
if endTag:
data.previous()
self.handleOther()
return True
c = data.skipUntil(spacesAngleBrackets)
if c == "<":
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
data.previous()
else:
#Read all attributes
attr = self.getAttribute()
while attr is not None:
attr = self.getAttribute()
return True
def handleOther(self):
return self.data.jumpTo(">")
def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
data = self.data
# Step 1 (skip chars)
c = data.skip(spaceCharactersBytes | frozenset("/"))
# Step 2
if c in (">", None):
return None
# Step 3
attrName = []
attrValue = []
#Step 4 attribute name
while True:
if c == "=" and attrName:
break
elif c in spaceCharactersBytes:
#Step 6!
c = data.skip()
c = data.next()
break
elif c in ("/", ">"):
return "".join(attrName), ""
elif c in asciiUppercaseBytes:
attrName.append(c.lower())
            elif c is None:
return None
else:
attrName.append(c)
#Step 5
c = data.next()
#Step 7
if c != "=":
data.previous()
return "".join(attrName), ""
#Step 8
data.next()
#Step 9
c = data.skip()
#Step 10
if c in ("'", '"'):
#10.1
quoteChar = c
while True:
#10.2
c = data.next()
#10.3
if c == quoteChar:
data.next()
return "".join(attrName), "".join(attrValue)
#10.4
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
#10.5
else:
attrValue.append(c)
elif c == ">":
return "".join(attrName), ""
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.append(c)
# Step 11
while True:
c = data.next()
if c in spacesAngleBrackets:
return "".join(attrName), "".join(attrValue)
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.append(c)
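
# Usage sketch (illustrative): the mini parser scans raw bytes for a usable
# charset declaration without running the full parser; note codecName is
# defined later in this module, so this only works after import completes.
#
#   parser = EncodingParser('<html><meta charset="utf-8"><body>')
#   parser.getEncoding()       # -> "utf-8"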
class ContentAttrParser(object):
def __init__(self, data):
self.data = data
def parse(self):
try:
#Check if the attr name is charset
#otherwise return
self.data.jumpTo("charset")
self.data.position += 1
self.data.skip()
if not self.data.currentByte == "=":
#If there is no = sign keep looking for attrs
return None
self.data.position += 1
self.data.skip()
#Look for an encoding between matching quote marks
if self.data.currentByte in ('"', "'"):
quoteMark = self.data.currentByte
self.data.position += 1
oldPosition = self.data.position
if self.data.jumpTo(quoteMark):
return self.data[oldPosition:self.data.position]
else:
return None
else:
#Unquoted value
oldPosition = self.data.position
try:
self.data.skipUntil(spaceCharactersBytes)
return self.data[oldPosition:self.data.position]
except StopIteration:
#Return the whole remaining value
return self.data[oldPosition:]
except StopIteration:
return None
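
# Quick check (illustrative): pulls the charset out of a content attribute
# such as the one in
# <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">.
_cp = ContentAttrParser(EncodingBytes("text/html; charset=ISO-8859-1"))
assert _cp.parse() == "iso-8859-1"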
def codecName(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if (encoding is not None and type(encoding) in types.StringTypes):
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
else:
return None

258
libs/html5lib/sanitizer.py

@@ -0,0 +1,258 @@
import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer
from constants import tokenTypes
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area',
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button',
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup',
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn',
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset',
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1',
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins',
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter',
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option',
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select',
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong',
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot',
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video']
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
'munderover', 'none']
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse',
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis',
'background', 'balance', 'bgcolor', 'bgproperties', 'border',
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding',
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff',
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color',
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords',
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default',
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers',
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace',
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing',
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend',
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method',
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open',
'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg',
'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min',
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan',
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start',
'step', 'style', 'summary', 'suppress', 'tabindex', 'target',
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap',
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml',
'width', 'wrap', 'xml:lang']
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
'xlink:type', 'xmlns', 'xmlns:xlink']
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx',
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill',
'fill-opacity', 'fill-rule', 'font-family', 'font-size',
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from',
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging',
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end',
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits',
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset',
'opacity', 'orient', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount',
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart',
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color',
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness',
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
'transform', 'type', 'u1', 'u2', 'underline-position',
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y',
'y1', 'y2', 'zoomAndPan']
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end',
'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor',
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter',
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref',
'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
'white-space', 'width']
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
'transparent', 'underline', 'white', 'yellow']
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
'stroke-opacity']
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
'ssh', 'sftp', 'rtsp', 'afs' ]
# subclasses may define their own versions of these constants
allowed_elements = acceptable_elements + mathml_elements + svg_elements
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
allowed_css_properties = acceptable_css_properties
allowed_css_keywords = acceptable_css_keywords
allowed_svg_properties = acceptable_svg_properties
allowed_protocols = acceptable_protocols
    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
#
# sanitize_html('<script> do_nasty_stuff() </script>')
# => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
# accommodate filters which use token_type differently
token_type = token["type"]
if token_type in tokenTypes.keys():
token_type = tokenTypes[token_type]
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements:
if token.has_key("data"):
attrs = dict([(name,val) for name,val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri:
if not attrs.has_key(attr):
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
#remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace(u"\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if attrs.has_key('style'):
attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name,val] for name,val in attrs.items()]
return token
else:
if token_type == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token.get("selfClosing"):
token["data"]=token["data"][:-1] + "/>"
if token["type"] in tokenTypes.keys():
token["type"] = "Characters"
else:
token["type"] = tokenTypes["Characters"]
del token["name"]
return token
elif token_type == tokenTypes["Comment"]:
pass
else:
return token
def sanitize_css(self, style):
# disallow urls
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
if not value: continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background','border','margin',
'padding']:
for keyword in value.split():
                    if keyword not in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
break
else:
clean.append(prop + ': ' + value + ';')
elif prop.lower() in self.allowed_svg_properties:
clean.append(prop + ': ' + value + ';')
return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False, parser=None):
#Change case matching defaults as we only output lowercase html anyway
#This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName, parser=parser)
def __iter__(self):
for token in HTMLTokenizer.__iter__(self):
token = self.sanitize_token(token)
if token:
yield token
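
# Usage sketch (illustrative): the usual entry point is a parser built with
# this tokenizer, e.g.
#
#   import html5lib
#   p = html5lib.HTMLParser(tokenizer=HTMLSanitizer)
#   p.parse('<a href="javascript:alert(1)">$100</a>')
#
# which drops the javascript: href while keeping the link text, as in the
# examples above.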

17
libs/html5lib/serializer/__init__.py

@@ -0,0 +1,17 @@
from html5lib import treewalkers
from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer
def serialize(input, tree="simpletree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
if format == "html":
s = HTMLSerializer(**serializer_opts)
elif format == "xhtml":
s = XHTMLSerializer(**serializer_opts)
else:
raise ValueError, "type must be either html or xhtml"
return s.render(walker(input), encoding)
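
# Usage sketch (illustrative): render a parsed tree back to markup; with an
# encoding the result is a byte string, otherwise unicode. Importing here at
# module level would be circular, so shown commented out.
#
#   import html5lib
#   doc = html5lib.parse("<p>Hello", treebuilder="simpletree")
#   serialize(doc, tree="simpletree")                    # -> u'<p>Hello'
#   serialize(doc, tree="simpletree", encoding="utf-8")  # -> '<p>Hello' (str)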

312
libs/html5lib/serializer/htmlserializer.py

@@ -0,0 +1,312 @@
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements, entities, xmlEntities
from html5lib import utils
from xml.sax.saxutils import escape
spaceCharacters = u"".join(spaceCharacters)
try:
from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
unicode_encode_errors = "strict"
else:
unicode_encode_errors = "htmlentityreplace"
from html5lib.constants import entities
encode_entity_map = {}
is_ucs4 = len(u"\U0010FFFF") == 1
for k, v in entities.items():
#skip multi-character entities
if ((is_ucs4 and len(v) > 1) or
(not is_ucs4 and len(v) > 2)):
continue
if v != "&":
if len(v) == 2:
v = utils.surrogatePairToCodepoint(v)
else:
try:
v = ord(v)
except:
print v
raise
if not v in encode_entity_map or k.islower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append("&#x%s;"%(hex(cp)[2:]))
return (u"".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
register_error(unicode_encode_errors, htmlentityreplace_errors)
del register_error
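
# Behaviour sketch (illustrative): with the handler registered above,
# encoding falls back to named entities where one exists, otherwise to
# numeric character references.
if unicode_encode_errors == "htmlentityreplace":
    assert u"caf\u00e9".encode("ascii", unicode_encode_errors) == "caf&eacute;"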
class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = False
quote_char = u'"'
use_best_quote_char = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
resolve_entities = True
# miscellaneous options
inject_meta_charset = True
strip_whitespace = False
sanitize = False
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata", "resolve_entities", "sanitize")
def __init__(self, **kwargs):
"""Initialize HTMLSerializer.
Keyword options (default given first unless specified) include:
inject_meta_charset=True|False
          Whether to insert a meta element defining the character set of the
          document.
quote_attr_values=True|False
Whether to quote attribute values that don't require quoting
per HTML5 parsing rules.
quote_char=u'"'|u"'"
Use given quote character for attribute quoting. Default is to
use double quote unless attribute value contains a double quote,
in which case single quotes are used instead.
escape_lt_in_attrs=False|True
Whether to escape < in attribute values.
escape_rcdata=False|True
Whether to escape characters that need to be escaped within normal
elements within rcdata elements such as style.
resolve_entities=True|False
Whether to resolve named character entities that appear in the
source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
are unaffected by this setting.
strip_whitespace=False|True
Whether to remove semantically meaningless whitespace. (This
compresses all whitespace to a single space except within pre.)
minimize_boolean_attributes=True|False
Shortens boolean attributes to give just the attribute value,
for example <input disabled="disabled"> becomes <input disabled>.
use_trailing_solidus=False|True
Includes a close-tag slash at the end of the start tag of void
elements (empty elements whose end tag is forbidden). E.g. <hr/>.
space_before_trailing_solidus=True|False
Places a space immediately before the closing slash in a tag
using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
sanitize=False|True
Strip all unsafe or unknown constructs from output.
See `html5lib user documentation`_
omit_optional_tags=True|False
Omit start/end tags that are optional.
.. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
"""
if kwargs.has_key('quote_char'):
self.use_best_quote_char = False
for attr in self.options:
setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
self.errors = []
self.strict = False
def encode(self, string):
assert(isinstance(string, unicode))
if self.encoding:
return string.encode(self.encoding, unicode_encode_errors)
else:
return string
def encodeStrict(self, string):
assert(isinstance(string, unicode))
if self.encoding:
return string.encode(self.encoding, "strict")
else:
return string
def serialize(self, treewalker, encoding=None):
self.encoding = encoding
in_cdata = False
self.errors = []
if encoding and self.inject_meta_charset:
from html5lib.filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of the latter filter
if self.strip_whitespace:
from html5lib.filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from html5lib.filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags:
from html5lib.filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker:
type = token["type"]
if type == "Doctype":
doctype = u"<!DOCTYPE %s" % token["name"]
if token["publicId"]:
doctype += u' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += u" SYSTEM"
if token["systemId"]:
if token["systemId"].find(u'"') >= 0:
if token["systemId"].find(u"'") >= 0:
self.serializeError(_("System identifer contains both single and double quote characters"))
quote_char = u"'"
else:
quote_char = u'"'
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += u">"
yield self.encodeStrict(doctype)
elif type in ("Characters", "SpaceCharacters"):
if type == "SpaceCharacters" or in_cdata:
if in_cdata and token["data"].find("</") >= 0:
self.serializeError(_("Unexpected </ in CDATA"))
yield self.encode(token["data"])
else:
yield self.encode(escape(token["data"]))
elif type in ("StartTag", "EmptyTag"):
name = token["name"]
yield self.encodeStrict(u"<%s" % name)
if name in rcdataElements and not self.escape_rcdata:
in_cdata = True
elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element"))
attributes = []
for (attr_namespace,attr_name),attr_value in sorted(token["data"].items()):
#TODO: Add namespace support here
k = attr_name
v = attr_value
yield self.encodeStrict(u' ')
yield self.encodeStrict(k)
if not self.minimize_boolean_attributes or \
(k not in booleanAttributes.get(name, tuple()) \
and k not in booleanAttributes.get("", tuple())):
yield self.encodeStrict(u"=")
if self.quote_attr_values or not v:
quote_attr = True
else:
quote_attr = reduce(lambda x,y: x or (y in v),
spaceCharacters + u">\"'=", False)
v = v.replace(u"&", u"&amp;")
if self.escape_lt_in_attrs: v = v.replace(u"<", u"&lt;")
if quote_attr:
quote_char = self.quote_char
if self.use_best_quote_char:
if u"'" in v and u'"' not in v:
quote_char = u'"'
elif u'"' in v and u"'" not in v:
quote_char = u"'"
if quote_char == u"'":
v = v.replace(u"'", u"&#39;")
else:
v = v.replace(u'"', u"&quot;")
yield self.encodeStrict(quote_char)
yield self.encode(v)
yield self.encodeStrict(quote_char)
else:
yield self.encode(v)
if name in voidElements and self.use_trailing_solidus:
if self.space_before_trailing_solidus:
yield self.encodeStrict(u" /")
else:
yield self.encodeStrict(u"/")
yield self.encode(u">")
elif type == "EndTag":
name = token["name"]
if name in rcdataElements:
in_cdata = False
elif in_cdata:
self.serializeError(_("Unexpected child element of a CDATA element"))
yield self.encodeStrict(u"</%s>" % name)
elif type == "Comment":
data = token["data"]
if data.find("--") >= 0:
self.serializeError(_("Comment contains --"))
yield self.encodeStrict(u"<!--%s-->" % token["data"])
elif type == "Entity":
name = token["name"]
key = name + ";"
if key not in entities:
self.serializeError(_("Entity %s not recognized") % name)
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = u"&%s;" % name
yield self.encodeStrict(data)
else:
self.serializeError(token["data"])
def render(self, treewalker, encoding=None):
if encoding:
return "".join(list(self.serialize(treewalker, encoding)))
else:
return u"".join(list(self.serialize(treewalker)))
def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
# XXX The idea is to make data mandatory.
self.errors.append(data)
if self.strict:
raise SerializeError
class SerializeError(Exception):
"""Error in serialized tree"""
pass
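# Illustrative usage sketch (not part of the upstream module): the serializer
# is driven by a tree walker over a parsed document; the option values here
# are examples only (see the docstring above for the full list).
def _example_serialize():
    import html5lib
    from html5lib import treewalkers
    doc = html5lib.parse("<p class=test>Hello<br>world", treebuilder="simpletree")
    walker = treewalkers.getTreeWalker("simpletree")
    serializer = HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
    # render() joins the token stream produced by serialize() into one string
    return serializer.render(walker(doc))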

9
libs/html5lib/serializer/xhtmlserializer.py

@ -0,0 +1,9 @@
from htmlserializer import HTMLSerializer
class XHTMLSerializer(HTMLSerializer):
quote_attr_values = True
minimize_boolean_attributes = False
use_trailing_solidus = True
escape_lt_in_attrs = True
omit_optional_tags = False
escape_rcdata = True
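# Illustrative sketch (not part of the module): with the defaults overridden
# above, void elements gain a trailing solidus (<br> becomes <br />, since
# space_before_trailing_solidus is inherited as True) and boolean attributes
# stay fully spelled out.
def _example_xhtml():
    import html5lib
    from html5lib import treewalkers
    doc = html5lib.parse("<p>a<br>b", treebuilder="simpletree")
    walker = treewalkers.getTreeWalker("simpletree")
    return XHTMLSerializer().render(walker(doc))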

1744
libs/html5lib/tokenizer.py

File diff suppressed because it is too large

96
libs/html5lib/treebuilders/__init__.py

@ -0,0 +1,96 @@
"""A collection of modules for building different kinds of tree from
HTML documents.
To create a treebuilder for a new type of tree, you need to
implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
treebuilders._base.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing the node and its children serialized according
to the format used in the unit tests.
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""
treeBuilderCache = {}
import sys
def getTreeBuilder(treeType, implementation=None, **kwargs):
"""Get a TreeBuilder class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported
values are "simpletree", "dom", "etree", "lxml" and "beautifulsoup"
"simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms.
"dom" - A generic builder for DOM implementations, defaulting to
a xml.dom.minidom based implementation for the sake of
backwards compatibility (as releases up until 0.10 had a
builder called "dom" that was a minidom implemenation).
"etree" - A generic builder for tree implementations exposing an
elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree).
"beautifulsoup" - Beautiful soup (if installed)
implementation - (Currently applies to the "etree" and "dom" tree types). A
module implementing the tree type e.g.
xml.etree.ElementTree or lxml.etree."""
treeType = treeType.lower()
if treeType not in treeBuilderCache:
if treeType == "dom":
import dom
# XXX: Keep backwards compatibility by using minidom if no implementation is given
if implementation is None:
from xml.dom import minidom
implementation = minidom
# XXX: NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "simpletree":
import simpletree
treeBuilderCache[treeType] = simpletree.TreeBuilder
elif treeType == "beautifulsoup":
import soup
treeBuilderCache[treeType] = soup.TreeBuilder
elif treeType == "lxml":
import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree":
# Come up with a sane default
if implementation is None:
try:
import xml.etree.cElementTree as ET
except ImportError:
try:
import xml.etree.ElementTree as ET
except ImportError:
try:
import cElementTree as ET
except ImportError:
import elementtree.ElementTree as ET
implementation = ET
import etree
# NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
else:
raise ValueError("""Unrecognised treebuilder "%s" """%treeType)
return treeBuilderCache.get(treeType)
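# Illustrative sketch: getTreeBuilder is normally consumed through the
# parser's `tree` argument; "etree" here is just an example value.
def _example_get_tree_builder():
    import html5lib
    parser = html5lib.HTMLParser(tree=getTreeBuilder("etree"))
    return parser.parse("<title>t</title><p>body text")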

377
libs/html5lib/treebuilders/_base.py

@ -0,0 +1,377 @@
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, object elements, and marquees.
Marker = None
class Node(object):
def __init__(self, name):
"""Node representing an item in the tree.
name - The tag name associated with the node
parent - The parent of the current node (or None for the document node)
value - The value of the current node (applies to text nodes and
comments)
attributes - a dict holding name, value pairs for attributes of the node
childNodes - a list of child nodes of the current node. This must
include all elements but not necessarily other node types
_flags - A list of miscellaneous flags that can be set on the node
"""
self.name = name
self.parent = None
self.value = None
self.attributes = {}
self.childNodes = []
self._flags = []
def __unicode__(self):
attributesStr = " ".join(["%s=\"%s\""%(name, value)
for name, value in
self.attributes.iteritems()])
if attributesStr:
return "<%s %s>"%(self.name,attributesStr)
else:
return "<%s>"%(self.name)
def __repr__(self):
return "<%s>" % (self.name)
def appendChild(self, node):
"""Insert node as a child of the current node
"""
raise NotImplementedError
def insertText(self, data, insertBefore=None):
"""Insert data as text in the current node, positioned before the
start of node insertBefore or to the end of the node's text.
"""
raise NotImplementedError
def insertBefore(self, node, refNode):
"""Insert node as a child of the current node, before refNode in the
list of child nodes. Raises ValueError if refNode is not a child of
the current node"""
raise NotImplementedError
def removeChild(self, node):
"""Remove node from the children of the current node
"""
raise NotImplementedError
def reparentChildren(self, newParent):
"""Move all the children of the current node to newParent.
This is needed so that trees that don't store text as nodes move the
text in the correct way
"""
#XXX - should this method be made more general?
for child in self.childNodes:
newParent.appendChild(child)
self.childNodes = []
def cloneNode(self):
"""Return a shallow copy of the current node i.e. a node with the same
name and attributes but with no parent or child nodes
"""
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text, false otherwise
"""
raise NotImplementedError
class ActiveFormattingElements(list):
def append(self, node):
equalCount = 0
if node != Marker:
for element in self[::-1]:
if element == Marker:
break
if self.nodesEqual(element, node):
equalCount += 1
if equalCount == 3:
self.remove(element)
break
list.append(self, node)
def nodesEqual(self, node1, node2):
if not node1.nameTuple == node2.nameTuple:
return False
if not node1.attributes == node2.attributes:
return False
return True
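# (Illustrative note: append() above implements HTML5's "Noah's Ark" clause:
# at most three entries equal in name, namespace and attributes may sit
# between two markers; appending a fourth evicts the earliest match.)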
class TreeBuilder(object):
"""Base treebuilder implementation
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
"""
#Document class
documentClass = None
#The class to use for creating a node
elementClass = None
#The class to use for creating comments
commentClass = None
#The class to use for creating doctypes
doctypeClass = None
#Fragment class
fragmentClass = None
def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
self.reset()
def reset(self):
self.openElements = []
self.activeFormattingElements = ActiveFormattingElements()
#XXX - rename these to headElement, formElement
self.headPointer = None
self.formPointer = None
self.insertFromTable = False
self.document = self.documentClass()
def elementInScope(self, target, variant=None):
# If we pass a node in, we match that node. If we pass a string, we
# match any node with that name.
exactNode = hasattr(target, "nameTuple")
listElementsMap = {
None:(scopingElements, False),
"button":(scopingElements | set([(namespaces["html"], "button")]), False),
"list":(scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")]), False),
"table":(set([(namespaces["html"], "html"),
(namespaces["html"], "table")]), False),
"select":(set([(namespaces["html"], "optgroup"),
(namespaces["html"], "option")]), True)
}
listElements, invert = listElementsMap[variant]
for node in reversed(self.openElements):
if (node.name == target and not exactNode or
node == target and exactNode):
return True
elif (invert ^ (node.nameTuple in listElements)):
return False
assert False # We should never reach this point
def reconstructActiveFormattingElements(self):
# Within this algorithm the order of steps described in the
# specification is not quite the same as the order of steps in the
# code. It should still do the same though.
# Step 1: stop the algorithm when there's nothing to do.
if not self.activeFormattingElements:
return
# Step 2 and step 3: we start with the last element. So i is -1.
i = len(self.activeFormattingElements) - 1
entry = self.activeFormattingElements[i]
if entry == Marker or entry in self.openElements:
return
# Step 6
while entry != Marker and entry not in self.openElements:
if i == 0:
#This will be reset to 0 below
i = -1
break
i -= 1
# Step 5: let entry be one earlier in the list.
entry = self.activeFormattingElements[i]
while True:
# Step 7
i += 1
# Step 8
entry = self.activeFormattingElements[i]
clone = entry.cloneNode() #Mainly to get a new copy of the attributes
# Step 9
element = self.insertElement({"type":"StartTag",
"name":clone.name,
"namespace":clone.namespace,
"data":clone.attributes})
# Step 10
self.activeFormattingElements[i] = element
# Step 11
if element == self.activeFormattingElements[-1]:
break
def clearActiveFormattingElements(self):
entry = self.activeFormattingElements.pop()
while self.activeFormattingElements and entry != Marker:
entry = self.activeFormattingElements.pop()
def elementInActiveFormattingElements(self, name):
"""Check if an element exists between the end of the active
formatting elements and the last marker. If it does, return it, else
return false"""
for item in self.activeFormattingElements[::-1]:
# Check for Marker first because if it's a Marker it doesn't have a
# name attribute.
if item == Marker:
break
elif item.name == name:
return item
return False
def insertRoot(self, token):
element = self.createElement(token)
self.openElements.append(element)
self.document.appendChild(element)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype)
def insertComment(self, token, parent=None):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(self.commentClass(token["data"]))
def createElement(self, token):
"""Create an element but don't insert it anywhere"""
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
return element
def _getInsertFromTable(self):
return self._insertFromTable
def _setInsertFromTable(self, value):
"""Switch the function used to insert an element from the
normal one to the misnested table one and back again"""
self._insertFromTable = value
if value:
self.insertElement = self.insertElementTable
else:
self.insertElement = self.insertElementNormal
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, token):
name = token["name"]
assert type(name) == unicode, "Element %s not unicode"%name
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
self.openElements[-1].appendChild(element)
self.openElements.append(element)
return element
def insertElementTable(self, token):
"""Create an element and insert it into the tree"""
element = self.createElement(token)
if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(token)
else:
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
if insertBefore is None:
parent.appendChild(element)
else:
parent.insertBefore(element, insertBefore)
self.openElements.append(element)
return element
def insertText(self, data, parent=None):
"""Insert text data."""
if parent is None:
parent = self.openElements[-1]
if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name
not in tableInsertModeElements)):
parent.insertText(data)
else:
# We should be in the InTable mode. This means we want to do
# special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition()
parent.insertText(data, insertBefore)
def getTableMisnestedNodePosition(self):
"""Get the foster parent element, and sibling to insert before
(or None) when inserting a misnested table node"""
# The foster parent element is the one which comes before the most
# recently opened table element
# XXX - this is really inelegant
lastTable=None
fosterParent = None
insertBefore = None
for elm in self.openElements[::-1]:
if elm.name == "table":
lastTable = elm
break
if lastTable:
# XXX - we should really check that this parent is actually a
# node here
if lastTable.parent:
fosterParent = lastTable.parent
insertBefore = lastTable
else:
fosterParent = self.openElements[
self.openElements.index(lastTable) - 1]
else:
fosterParent = self.openElements[0]
return fosterParent, insertBefore
def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
# XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
and name != exclude):
self.openElements.pop()
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):
"Return the final tree"
return self.document
def getFragment(self):
"Return the final fragment"
#assert self.innerHTML
fragment = self.fragmentClass()
self.openElements[0].reparentChildren(fragment)
return fragment
def testSerializer(self, node):
"""Serialize the subtree of node in the format required by unit tests
node - the node from which to start serializing"""
raise NotImplementedError
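# Illustrative sketch (hypothetical names, not part of the library): the
# smallest concrete Node specialization the interface above calls for. A
# real builder also needs distinct Document/Comment/Doctype classes and a
# nameTuple property for the scope and formatting-element machinery.
class _SketchNode(Node):
    def appendChild(self, node):
        self.childNodes.append(node)
        node.parent = self
    def insertText(self, data, insertBefore=None):
        # Text is stored as value-only child nodes in this toy tree.
        text = _SketchNode(None)
        text.value = data
        if insertBefore is None:
            self.appendChild(text)
        else:
            self.insertBefore(text, insertBefore)
    def insertBefore(self, node, refNode):
        self.childNodes.insert(self.childNodes.index(refNode), node)
        node.parent = self
    def removeChild(self, node):
        self.childNodes.remove(node)
        node.parent = None
    def cloneNode(self):
        clone = _SketchNode(self.name)
        clone.attributes = dict(self.attributes)
        return clone
    def hasContent(self):
        return bool(self.childNodes or self.value)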

291
libs/html5lib/treebuilders/dom.py

@ -0,0 +1,291 @@
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
try:
from types import ModuleType
except ImportError:
from new import module as ModuleType
import re
import weakref
import _base
from html5lib import constants, ihatexml
from html5lib.constants import namespaces
moduleCache = {}
def getDomModule(DomImplementation):
name = "_" + DomImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = ModuleType(name)
objs = getDomBuilder(DomImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getDomBuilder(DomImplementation):
Dom = DomImplementation
class AttrList(object):
def __init__(self, element):
self.element = element
def __iter__(self):
return self.element.attributes.items().__iter__()
def __setitem__(self, name, value):
self.element.setAttribute(name, value)
def __len__(self):
return len(self.element.attributes.items())
def items(self):
return [(item[0], item[1]) for item in
self.element.attributes.items()]
def keys(self):
return self.element.attributes.keys()
def __getitem__(self, name):
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.hasAttribute(name)
class NodeBuilder(_base.Node):
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
and self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
else:
self.element.appendChild(text)
def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element)
node.parent = self
def removeChild(self, node):
if node.element.parentNode == self.element:
self.element.removeChild(node.element)
node.parent = None
def reparentChildren(self, newParent):
while self.element.hasChildNodes():
child = self.element.firstChild
self.element.removeChild(child)
newParent.element.appendChild(child)
self.childNodes = []
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" + name[1])
else:
qualifiedName = name[1]
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
name, value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False))
def hasContent(self):
return self.element.hasChildNodes()
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
return weakref.proxy(self)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
domimpl = Dom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom:
doctype.ownerDocument = self.dom
def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name)
else:
node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node)
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
if parent != self:
_base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types:
self.dom._child_node_types=list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))
name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
if element.publicId or element.systemId:
publicId = element.publicId or ""
systemId = element.systemId or ""
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
' '*indent, element.name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
element.namespaceURI != None):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>"%(' '*indent, name))
if element.hasAttributes():
attributes = []
for i in range(len(element.attributes)):
attr = element.attributes.item(i)
name = attr.nodeName
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s"%(constants.prefixes[ns], attr.localName)
else:
name = attr.nodeName
attributes.append((name, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else:
attributes = dict(node.attributes.itemsNS())
# gather namespace declarations
prefixes = []
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
del attributes[(attr.namespaceURI, attr.nodeName)]
# apply namespace declarations
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if prefix in nsmap:
del attributes[(attr.namespaceURI, attr.nodeName)]
attributes[(nsmap[prefix],attr.nodeName)]=attr.nodeValue
# SAX events
ns = node.namespaceURI or nsmap.get(None,None)
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
for prefix in prefixes: handler.endPrefixMapping(prefix)
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
handler.characters(node.nodeValue)
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
else:
# ATTRIBUTE_NODE
# ENTITY_NODE
# PROCESSING_INSTRUCTION_NODE
# COMMENT_NODE
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass
return locals()
# Keep backwards compatibility with things that directly load
# classes/functions from this module
for key, value in getDomModule(minidom).__dict__.items():
globals()[key] = value
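# Illustrative sketch: parsing straight into an xml.dom.minidom document via
# this builder, then using the ordinary DOM API on the result.
def _example_dom_tree():
    import html5lib
    document = html5lib.parse("<p id=x>hi", treebuilder="dom")
    return document.toxml()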

344
libs/html5lib/treebuilders/etree.py

@ -0,0 +1,344 @@
try:
from types import ModuleType
except ImportError:
from new import module as ModuleType
import re
import types
import _base
from html5lib import ihatexml
from html5lib import constants
from html5lib.constants import namespaces
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
def getETreeModule(ElementTreeImplementation, fullTree=False):
name = "_" + ElementTreeImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = ModuleType("_" + ElementTreeImplementation.__name__+"builder")
objs = getETreeBuilder(ElementTreeImplementation, fullTree)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
class Element(_base.Node):
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
self._element = ElementTree.Element(self._getETreeTag(name,
namespace))
if namespace is None:
self.nameTuple = namespaces["html"], self._name
else:
self.nameTuple = self._namespace, self._name
self.parent = None
self._childNodes = []
self._flags = []
def _getETreeTag(self, name, namespace):
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
return etree_tag
def _setName(self, name):
self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self):
return self._name
name = property(_getName, _setName)
def _setNamespace(self, namespace):
self._namespace = namespace
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getNamespace(self):
return self._namespace
namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], key[1])
else:
name = key
self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or len(self._element))
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = list(self._element).index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._element.remove(node._element)
node.parent=None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not self._element.text:
self._element.text = ""
self._element.text += data
elif insertBefore is None:
#Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
else:
#Insert the text before the specified node
children = list(self._element)
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index-1].tail:
self._element[index-1].tail = ""
self._element[index-1].tail += data
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
def cloneNode(self):
element = type(self)(self.name, self.namespace)
for name, value in self.attributes.iteritems():
element.attributes[name] = value
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
if not newParent._element.text:
newParent._element.text = ""
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
#Use the superclass constructor to set all properties on the
#wrapper element
self._element = ElementTree.Comment(data)
self.parent = None
self._childNodes = []
self._flags = []
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self.publicId = publicId
self.systemId = systemId
def _getPublicId(self):
return self._element.get(u"publicId", "")
def _setPublicId(self, value):
if value is not None:
self._element.set(u"publicId", value)
publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self):
return self._element.get(u"systemId", "")
def _setSystemId(self, value):
if value is not None:
self._element.set(u"systemId", value)
systemId = property(_getSystemId, _setSystemId)
class Document(Element):
def __init__(self):
Element.__init__(self, "<DOCUMENT_ROOT>")
class DocumentFragment(Element):
def __init__(self):
Element.__init__(self, "<DOCUMENT_FRAGMENT>")
def testSerializer(element):
rv = []
finalText = None
def serializeElement(element, indent=0):
if not(hasattr(element, "tag")):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
rv.append("#document")
if element.text:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
if element.tail:
finalText = element.tail
elif element.tag == ElementTree.Comment:
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
assert type(element.tag) in types.StringTypes, "Expected unicode, got %s"%type(element.tag)
nsmatch = tag_regexp.match(element.tag)
if nsmatch is None:
name = element.tag
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
attr_string = "%s %s"%(prefix, name)
else:
attr_string = name
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element:
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element):
if not hasattr(element, "tag"):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
if element.text:
rv.append(element.text)
if element.tail:
finalText = element.tail
for child in element:
serializeElement(child)
elif type(element.tag) == type(ElementTree.Comment):
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(filter.fromXmlName(element.tag),))
else:
attr = " ".join(["%s=\"%s\""%(
filter.fromXmlName(name), value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element:
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._element
else:
if self.defaultNamespace is not None:
return self.document._element.find(
"{%s}html"%self.defaultNamespace)
else:
return self.document._element.find("html")
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element
return locals()
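# Illustrative sketch: by default the "etree" builder hands back the <html>
# element of an ElementTree-style tree (see getDocument above), so the
# standard library API applies directly.
def _example_etree_tree():
    import xml.etree.ElementTree as ET
    import html5lib
    html = html5lib.parse("<p>hi", treebuilder="etree")
    return ET.tostring(html)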

336
libs/html5lib/treebuilders/etree_lxml.py

@ -0,0 +1,336 @@
import warnings
import re
import _base
from html5lib.constants import DataLossWarning
import html5lib.constants as constants
import etree as etree_builders
from html5lib import ihatexml
try:
import lxml.etree as etree
except ImportError:
pass
fullTree = True
tag_regexp = re.compile("{([^}]*)}(.*)")
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Doctypes with no name
When any of these things occur, we emit a DataLossWarning
"""
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self._childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
def _getChildNodes(self):
return self._childNodes
childNodes = property(_getChildNodes)
def testSerializer(element):
rv = []
finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
if hasattr(element, "getroot"):
#Full tree case
rv.append("#document")
if element.docinfo.internalDTD:
if not (element.docinfo.public_id or
element.docinfo.system_url):
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
else:
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
element.docinfo.root_name,
element.docinfo.public_id,
element.docinfo.system_url)
rv.append("|%s%s"%(' '*(indent+2), dtd_str))
next_element = element.getroot()
while next_element.getprevious() is not None:
next_element = next_element.getprevious()
while next_element is not None:
serializeElement(next_element, indent+2)
next_element = next_element.getnext()
elif isinstance(element, basestring):
#Text in a fragment
rv.append("|%s\"%s\""%(' '*indent, element))
else:
#Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent+2)
elif type(element.tag) == type(etree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append("|%s<%s %s>"%(' '*indent, prefix,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
attributes = []
for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
name = filter.fromXmlName(name)
prefix = constants.prefixes[ns]
attr_string = "%s %s"%(prefix, name)
else:
attr_string = filter.fromXmlName(name)
attributes.append((attr_string, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
if element.docinfo.internalDTD:
if element.docinfo.doctype:
dtd_str = element.docinfo.doctype
else:
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
rv.append(dtd_str)
serializeElement(element.getroot())
elif type(element.tag) == type(etree.Comment):
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if hasattr(element, "tail") and element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
def __init__(self, namespaceHTMLElements, fullTree = False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
filter = self.filter = ihatexml.InfosetFilter()
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value={}):
self._element = element
dict.__init__(self, value)
for key, value in self.iteritems():
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
self._element._element.attrib[name] = value
class Element(builder.Element):
def __init__(self, name, namespace):
name = filter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self)
def _setName(self, name):
self._name = filter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return filter.fromXmlName(self._name)
name = property(_getName, _setName)
def _getAttributes(self):
return self._attributes
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None):
data = filter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
class Comment(builder.Comment):
def __init__(self, data):
data = filter.coerceComment(data)
builder.Comment.__init__(self, data)
def _setData(self, data):
data = filter.coerceComment(data)
self._element.text = data
def _getData(self):
return self._element.text
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = Comment
#self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
_base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._elementTree
else:
return self.document._elementTree.getroot()
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(element.getchildren())
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
self.initial_comments.append(data)
def insertRoot(self, token):
"""Create the document root"""
#Because of the way libxml2 works, it doesn't seem to be possible to
#alter information like the doctype after the tree has been parsed.
#Therefore we need to use the built-in parser to create our initial
#tree, after which we can add elements like normal
docStr = ""
if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
docStr += "<!DOCTYPE %s"%self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
self.doctype.systemId or "")
docStr += ">"
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
try:
root = etree.fromstring(docStr)
except etree.XMLSyntaxError:
print docStr
raise
#Append the initial comments:
for comment_token in self.initial_comments:
root.addprevious(etree.Comment(comment_token["data"]))
#Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Give the root element the right name
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
root.tag = etree_tag
#Add the root element to the internal child/open data structures
root_element = self.elementClass(name, namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)
#Reset to the default insert comment function
self.insertComment = super(TreeBuilder, self).insertComment
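# Illustrative sketch (requires lxml): round-tripping through this builder;
# the doctype and pre-root comments survive because insertRoot above builds
# the initial tree with lxml's own parser.
def _example_lxml_tree():
    import html5lib
    root = html5lib.parse("<!DOCTYPE html><p>hi", treebuilder="lxml")
    return etree.tostring(root)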

256
libs/html5lib/treebuilders/simpletree.py

@ -0,0 +1,256 @@
import _base
from html5lib.constants import voidElements, namespaces, prefixes
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
type = -1
def __init__(self, name):
self.name = name
self.parent = None
self.value = None
self.childNodes = []
self._flags = []
def __iter__(self):
for node in self.childNodes:
yield node
for item in node:
yield item
def __unicode__(self):
return self.name
def toxml(self):
raise NotImplementedError
def printTree(self, indent=0):
tree = '\n|%s%s' % (' '* indent, unicode(self))
for child in self.childNodes:
tree += child.printTree(indent + 2)
return tree
def appendChild(self, node):
assert isinstance(node, Node)
if (isinstance(node, TextNode) and self.childNodes and
isinstance(self.childNodes[-1], TextNode)):
self.childNodes[-1].value += node.value
else:
self.childNodes.append(node)
node.parent = self
def insertText(self, data, insertBefore=None):
assert isinstance(data, unicode), "data %s is of type %s expected unicode"%(repr(data), type(data))
if insertBefore is None:
self.appendChild(TextNode(data))
else:
self.insertBefore(TextNode(data), insertBefore)
def insertBefore(self, node, refNode):
index = self.childNodes.index(refNode)
if (isinstance(node, TextNode) and index > 0 and
isinstance(self.childNodes[index - 1], TextNode)):
self.childNodes[index - 1].value += node.value
else:
self.childNodes.insert(index, node)
node.parent = self
def removeChild(self, node):
self.childNodes.remove(node)
node.parent = None
def cloneNode(self):
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self.childNodes)
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class Document(Node):
type = 1
def __init__(self):
Node.__init__(self, None)
def __str__(self):
return "#document"
def __unicode__(self):
return str(self)
def appendChild(self, child):
Node.appendChild(self, child)
def toxml(self, encoding="utf=8"):
result = ""
for child in self.childNodes:
result += child.toxml()
return result.encode(encoding)
def hilite(self, encoding="utf-8"):
result = "<pre>"
for child in self.childNodes:
result += child.hilite()
return result.encode(encoding) + "</pre>"
def printTree(self):
tree = unicode(self)
for child in self.childNodes:
tree += child.printTree(2)
return tree
def cloneNode(self):
return Document()
class DocumentFragment(Document):
type = 2
def __str__(self):
return "#document-fragment"
def __unicode__(self):
return str(self)
def cloneNode(self):
return DocumentFragment()
class DocumentType(Node):
type = 3
def __init__(self, name, publicId, systemId):
Node.__init__(self, name)
self.publicId = publicId
self.systemId = systemId
def __unicode__(self):
if self.publicId or self.systemId:
publicId = self.publicId or ""
systemId = self.systemId or ""
return """<!DOCTYPE %s "%s" "%s">"""%(
self.name, publicId, systemId)
else:
return u"<!DOCTYPE %s>" % self.name
toxml = __unicode__
def hilite(self):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
def cloneNode(self):
return DocumentType(self.name, self.publicId, self.systemId)
class TextNode(Node):
type = 4
def __init__(self, value):
Node.__init__(self, None)
self.value = value
def __unicode__(self):
return u"\"%s\"" % self.value
def toxml(self):
return escape(self.value)
hilite = toxml
def cloneNode(self):
return TextNode(self.value)
class Element(Node):
type = 5
def __init__(self, name, namespace=None):
Node.__init__(self, name)
self.namespace = namespace
self.attributes = {}
def __unicode__(self):
if self.namespace == None:
return u"<%s>" % self.name
else:
return u"<%s %s>"%(prefixes[self.namespace], self.name)
def toxml(self):
result = '<' + self.name
if self.attributes:
for name,value in self.attributes.iteritems():
result += u' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
if self.childNodes:
result += '>'
for child in self.childNodes:
result += child.toxml()
result += u'</%s>' % self.name
else:
result += u'/>'
return result
def hilite(self):
result = '&lt;<code class="markup element-name">%s</code>' % self.name
if self.attributes:
for name, value in self.attributes.iteritems():
result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'&quot;'}))
if self.childNodes:
result += ">"
for child in self.childNodes:
result += child.hilite()
elif self.name in voidElements:
return result + ">"
return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name
def printTree(self, indent):
tree = '\n|%s%s' % (' '*indent, unicode(self))
indent += 2
if self.attributes:
for name, value in sorted(self.attributes.iteritems()):
if isinstance(name, tuple):
name = "%s %s"%(name[0], name[1])
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)
return tree
def cloneNode(self):
newNode = Element(self.name)
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value
return newNode
class CommentNode(Node):
type = 6
def __init__(self, data):
Node.__init__(self, None)
self.data = data
def __unicode__(self):
return "<!-- %s -->" % self.data
def toxml(self):
return "<!--%s-->" % self.data
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
def cloneNode(self):
return CommentNode(self.data)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = CommentNode
fragmentClass = DocumentFragment
def testSerializer(self, node):
return node.printTree()
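# Illustrative sketch: simpletree documents can print themselves, which is
# exactly what testSerializer above delegates to.
def _example_simpletree():
    import html5lib
    doc = html5lib.parse("<p>hi", treebuilder="simpletree")
    return doc.printTree()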

236
libs/html5lib/treebuilders/soup.py

@ -0,0 +1,236 @@
import warnings
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
from html5lib.constants import namespaces, DataLossWarning
class AttrList(object):
def __init__(self, element):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self):
return self.attrs.items().__iter__()
def __setitem__(self, name, value):
"set attr", name, value
self.element[name] = value
def items(self):
return self.attrs.items()
def keys(self):
return self.attrs.keys()
def __getitem__(self, name):
return self.attrs[name]
def __contains__(self, name):
return name in self.attrs.keys()
def __eq__(self, other):
if len(self.keys()) != len(other.keys()):
return False
for item in self.keys():
if item not in other:
return False
if self[item] != other[item]:
return False
return True
class Element(_base.Node):
def __init__(self, element, soup, namespace):
_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def _nodeIndex(self, node, refNode):
# Finds a node by identity rather than equality
for index in range(len(self.element.contents)):
if id(self.element.contents[index]) == id(refNode.element):
return index
return None
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# Concatenate new text onto old text node
# (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
newStr = NavigableString(self.element.contents[-1]+node.element)
# Remove the old text node
# (Can't simply use .extract() by itself, because it fails if
# an equal text node exists within the parent node)
oldElement = self.element.contents[-1]
del self.element.contents[-1]
oldElement.parent = None
oldElement.extract()
self.element.insert(len(self.element.contents), newStr)
else:
self.element.insert(len(self.element.contents), node.element)
node.parent = self
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
self.element[name] = value
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(NavigableString(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self._nodeIndex(node, refNode)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
newStr = NavigableString(self.element.contents[index-1]+node.element)
oldNode = self.element.contents[index-1]
del self.element.contents[index-1]
oldNode.parent = None
oldNode.extract()
self.element.insert(index-1, newStr)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
index = self._nodeIndex(node.parent, node)
del node.parent.element.contents[index]
node.element.parent = None
node.element.extract()
node.parent = None
def reparentChildren(self, newParent):
while self.element.contents:
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(TextNode(child, self.soup))
def cloneNode(self):
node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
def hasContent(self):
return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
_base.Node.__init__(self, None)
self.element = element
self.soup = soup
def cloneNode(self):
raise NotImplementedError
class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
def documentClass(self):
self.soup = BeautifulSoup("")
return Element(self.soup, self.soup, None)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if publicId:
self.soup.insert(0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
elif systemId:
self.soup.insert(0, Declaration("DOCTYPE %s SYSTEM \"%s\""%
(name, systemId)))
else:
self.soup.insert(0, Declaration("DOCTYPE %s"%name))
def elementClass(self, name, namespace):
if namespace is not None:
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
return Element(Tag(self.soup, name), self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
def appendChild(self, node):
self.soup.insert(len(self.soup.contents), node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.soup
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def testSerializer(element):
import re
rv = []
def serializeElement(element, indent=0):
if isinstance(element, Declaration):
doctype_regexp = r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
m = re.compile(doctype_regexp).match(element.string)
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1') or ""
else:
systemId = m.group('systemId2')
if publicId is not None or systemId is not None:
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
(' '*indent, name, publicId or "", systemId or ""))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
elif isinstance(element, BeautifulSoup):
if element.name == "[document_fragment]":
rv.append("#document-fragment")
else:
rv.append("#document")
elif isinstance(element, Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
elif isinstance(element, unicode):
rv.append("|%s\"%s\"" %(' '*indent, element))
else:
rv.append("|%s<%s>"%(' '*indent, element.name))
if element.attrs:
for name, value in sorted(element.attrs):
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
if hasattr(element, "contents"):
for child in element.contents:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)

52
libs/html5lib/treewalkers/__init__.py

@ -0,0 +1,52 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.
To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""
treeWalkerCache = {}
def getTreeWalker(treeType, implementation=None, **kwargs):
"""Get a TreeWalker class for various types of tree with built-in support
treeType - the name of the tree type required (case-insensitive). Supported
values are "simpletree", "dom", "pulldom", "etree", "lxml",
"beautifulsoup" and "genshi"
"simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms.
"dom" - The xml.dom.minidom DOM implementation
"pulldom" - The xml.dom.pulldom event stream
"etree" - A generic walker for tree implementations exposing an
elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree).
"lxml" - Optimized walker for lxml.etree
"beautifulsoup" - Beautiful soup (if installed)
"genshi" - a Genshi stream
implementation - (Currently applies to the "etree" tree type only). A module
implementing the tree type e.g. xml.etree.ElementTree or
cElementTree."""
treeType = treeType.lower()
if treeType not in treeWalkerCache:
if treeType in ("dom", "pulldom", "simpletree"):
mod = __import__(treeType, globals())
treeWalkerCache[treeType] = mod.TreeWalker
elif treeType == "genshi":
import genshistream
treeWalkerCache[treeType] = genshistream.TreeWalker
elif treeType == "beautifulsoup":
import soup
treeWalkerCache[treeType] = soup.TreeWalker
elif treeType == "lxml":
import lxmletree
treeWalkerCache[treeType] = lxmletree.TreeWalker
elif treeType == "etree":
import etree
# XXX: NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeWalker
return treeWalkerCache.get(treeType)
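# Editor's sketch (not part of the upstream module): typical use of
# getTreeWalker with the "dom" walker, assuming html5lib is importable.
if __name__ == "__main__":
    from xml.dom import minidom
    doc = minidom.parseString("<p>Hello <b>world</b></p>")
    Walker = getTreeWalker("dom")
    for token in Walker(doc):
        print token["type"]  # e.g. StartTag, Characters, EndTag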

176
libs/html5lib/treewalkers/_base.py

@ -0,0 +1,176 @@
import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class TreeWalker(object):
def __init__(self, tree):
self.tree = tree
def __iter__(self):
raise NotImplementedError
def error(self, msg):
return {"type": "SerializeError", "data": msg}
def normalizeAttrs(self, attrs):
newattrs = {}
if attrs:
#TODO: treewalkers should always have attrs
for (namespace,name),value in attrs.iteritems():
namespace = unicode(namespace) if namespace else None
name = unicode(name)
value = unicode(value)
newattrs[(namespace,name)] = value
return newattrs
def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
if hasChildren:
yield self.error(_("Void element has children"))
def startTag(self, namespace, name, attrs):
return {"type": "StartTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
def endTag(self, namespace, name):
return {"type": "EndTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": {}}
def text(self, data):
data = unicode(data)
middle = data.lstrip(spaceCharacters)
left = data[:len(data)-len(middle)]
if left:
yield {"type": "SpaceCharacters", "data": left}
data = middle
middle = data.rstrip(spaceCharacters)
right = data[len(middle):]
if middle:
yield {"type": "Characters", "data": middle}
if right:
yield {"type": "SpaceCharacters", "data": right}
def comment(self, data):
return {"type": "Comment", "data": unicode(data)}
def doctype(self, name, publicId=None, systemId=None, correct=True):
return {"type": "Doctype",
"name": name is not None and unicode(name) or u"",
"publicId": publicId,
"systemId": systemId,
"correct": correct}
def entity(self, name):
return {"type": "Entity", "name": unicode(name)}
def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType)
class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node):
raise NodeImplementedError
def element(self, node, namespace, name, attrs, hasChildren):
if name in voidElements:
for token in self.emptyTag(namespace, name, attrs, hasChildren):
yield token
else:
yield self.startTag(name, attrs)
if hasChildren:
for token in self.walkChildren(node):
yield token
yield self.endTag(name)
from xml.dom import Node
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker):
def getNodeDetails(self, node):
raise NotImplementedError
def getFirstChild(self, node):
raise NotImplementedError
def getNextSibling(self, node):
raise NotImplementedError
def getParentNode(self, node):
raise NotImplementedError
def __iter__(self):
currentNode = self.tree
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
endTag = None
if type == DOCTYPE:
yield self.doctype(*details)
elif type == TEXT:
for token in self.text(*details):
yield token
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name in voidElements:
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
hasChildren = False
else:
endTag = name
yield self.startTag(namespace, name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT:
hasChildren = True
else:
yield self.unknown(details[0])
if hasChildren:
firstChild = self.getFirstChild(currentNode)
else:
firstChild = None
if firstChild is not None:
currentNode = firstChild
else:
while currentNode is not None:
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
if type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
break
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling
break
else:
currentNode = self.getParentNode(currentNode)
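# Editor's sketch: the text() helper above splits leading and trailing
# whitespace runs into separate SpaceCharacters tokens around a single
# Characters token (assuming the usual ASCII space characters).
if __name__ == "__main__":
    walker = TreeWalker(None)  # the tree argument is unused by text()
    for token in walker.text(u"  hi "):
        print token
    # {'type': 'SpaceCharacters', 'data': u'  '}
    # {'type': 'Characters', 'data': u'hi'}
    # {'type': 'SpaceCharacters', 'data': u' '}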

41
libs/html5lib/treewalkers/dom.py

@ -0,0 +1,41 @@
from xml.dom import Node
import gettext
_ = gettext.gettext
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.NonRecursiveTreeWalker):
def getNodeDetails(self, node):
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
return _base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
return _base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
attrs = {}
for attr in node.attributes.keys():
attr = node.getAttributeNode(attr)
attrs[(attr.namespaceURI,attr.localName)] = attr.value
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
attrs, node.hasChildNodes())
elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue
elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
return (_base.DOCUMENT,)
else:
return _base.UNKNOWN, node.nodeType
def getFirstChild(self, node):
return node.firstChild
def getNextSibling(self, node):
return node.nextSibling
def getParentNode(self, node):
return node.parentNode

141
libs/html5lib/treewalkers/etree.py

@ -0,0 +1,141 @@
import gettext
_ = gettext.gettext
try:
from types import ModuleType
except:
from new import module as ModuleType
import copy
import re
import _base
from html5lib.constants import voidElements
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
def getETreeModule(ElementTreeImplementation):
name = "_" + ElementTreeImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = ModuleType("_" + ElementTreeImplementation.__name__+"builder")
objs = getETreeBuilder(ElementTreeImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getETreeBuilder(ElementTreeImplementation):
ElementTree = ElementTreeImplementation
class TreeWalker(_base.NonRecursiveTreeWalker):
"""Given the particular ElementTree representation, this implementation,
to avoid using recursion, returns "nodes" as tuples with the following
content:
1. The current element
2. The index of the element relative to its parent
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, key, parents, flag = node
if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, flag)
else:
node = elt
if not(hasattr(node, "tag")):
node = node.getroot()
if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
return (_base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return (_base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif node.tag == ElementTree.Comment:
return _base.COMMENT, node.text
else:
assert type(node.tag) in (str, unicode), type(node.tag)
#This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
attrs = {}
for name, value in node.attrib.items():
match = tag_regexp.match(name)
if match:
attrs[(match.group(1),match.group(2))] = value
else:
attrs[(None,name)] = value
return (_base.ELEMENT, namespace, tag,
attrs, len(node) or node.text)
def getFirstChild(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
element, key, parents, flag = node, None, [], None
if flag in ("text", "tail"):
return None
else:
if element.text:
return element, key, parents, "text"
elif len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
else:
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key+1], key+1, parents, None
else:
return None
def getParentNode(self, node):
if isinstance(node, tuple):
element, key, parents, flag = node
else:
return None
if flag == "text":
if not parents:
return element
else:
return element, key, parents, None
else:
parent = parents.pop()
if not parents:
return parent
else:
return parent, list(parents[-1]).index(parent), parents, None
return locals()
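# Editor's sketch: building a walker for the stdlib ElementTree via the
# module factory above and streaming tokens from a small document.
if __name__ == "__main__":
    from xml.etree import ElementTree
    mod = getETreeModule(ElementTree)
    root = ElementTree.fromstring("<root>text<child/>tail</root>")
    for token in mod.TreeWalker(root):
        print token["type"]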

70
libs/html5lib/treewalkers/genshistream.py

@ -0,0 +1,70 @@
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
def __iter__(self):
depth = 0
ignore_until = None
previous = None
for event in self.tree:
if previous is not None:
if previous[0] == START:
depth += 1
if ignore_until <= depth:
ignore_until = None
if ignore_until is None:
for token in self.tokens(previous, event):
yield token
if token["type"] == "EmptyTag":
ignore_until = depth
if previous[0] == END:
depth -= 1
previous = event
if previous is not None:
if ignore_until is None or ignore_until <= depth:
for token in self.tokens(previous, None):
yield token
elif ignore_until is not None:
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
def tokens(self, event, next):
kind, data, pos = event
if kind == START:
tag, attrib = data
name = tag.localname
namespace = tag.namespace
if tag in voidElements:
for token in self.emptyTag(namespace, name, list(attrib),
not next or next[0] != END
or next[1] != tag):
yield token
else:
yield self.startTag(namespace, name, list(attrib))
elif kind == END:
name = data.localname
namespace = data.namespace
if name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:
yield self.comment(data)
elif kind == TEXT:
for token in self.text(data):
yield token
elif kind == DOCTYPE:
yield self.doctype(*data)
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
START_CDATA, END_CDATA, PI):
pass
else:
yield self.unknown(kind)

186
libs/html5lib/treewalkers/lxmletree.py

@ -0,0 +1,186 @@
from lxml import etree
from html5lib.treebuilders.etree import tag_regexp
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import voidElements
from html5lib import ihatexml
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
if et.docinfo.internalDTD:
self.children.append(Doctype(self, et.docinfo.root_name,
et.docinfo.public_id,
et.docinfo.system_url))
root = et.getroot()
node = root
while node.getprevious() is not None:
node = node.getprevious()
while node is not None:
self.children.append(node)
node = node.getnext()
self.text = None
self.tail = None
def __getitem__(self, key):
return self.children[key]
def getnext(self):
return None
def __len__(self):
return 1
class Doctype(object):
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
self.public_id = public_id
self.system_id = system_id
self.text = None
self.tail = None
def getnext(self):
return self.root_node.children[1]
class FragmentRoot(Root):
def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children]
self.text = self.tail = None
def getnext(self):
return None
class FragmentWrapper(object):
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
if hasattr(self.obj, 'text'):
self.text = self.obj.text
else:
self.text = None
if hasattr(self.obj, 'tail'):
self.tail = self.obj.tail
else:
self.tail = None
self.isstring = isinstance(obj, basestring)
def __getattr__(self, name):
return getattr(self.obj, name)
def getnext(self):
siblings = self.root_node.children
idx = siblings.index(self)
if idx < len(siblings) - 1:
return siblings[idx + 1]
else:
return None
def __getitem__(self, key):
return self.obj[key]
def __nonzero__(self):
return bool(self.obj)
def getparent(self):
return None
def __str__(self):
return str(self.obj)
def __unicode__(self):
return unicode(self.obj)
def __len__(self):
return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker):
def __init__(self, tree):
if hasattr(tree, "getroot"):
tree = Root(tree)
elif isinstance(tree, list):
tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
return _base.TEXT, getattr(node, key)
elif isinstance(node, Root):
return (_base.DOCUMENT,)
elif isinstance(node, Doctype):
return _base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and node.isstring:
return _base.TEXT, node
elif node.tag == etree.Comment:
return _base.COMMENT, node.text
elif node.tag == etree.Entity:
return _base.ENTITY, node.text[1:-1] # strip &;
else:
#This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
attrs = {}
for name, value in node.attrib.items():
match = tag_regexp.match(name)
if match:
attrs[(match.group(1),match.group(2))] = value
else:
attrs[(None,name)] = value
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
attrs, len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), _("Text nodes have no children")
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text")
else:
return node[0]
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
if len(node):
return node[0]
else:
return None
else: # tail
return node.getnext()
return node.tail and (node, "tail") or node.getnext()
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":
return node
# else: fallback to "normal" processing
return node.getparent()

60
libs/html5lib/treewalkers/pulldom.py

@ -0,0 +1,60 @@
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
def __iter__(self):
ignore_until = None
previous = None
for event in self.tree:
if previous is not None and \
(ignore_until is None or previous[1] is ignore_until):
if previous[1] is ignore_until:
ignore_until = None
for token in self.tokens(previous, event):
yield token
if token["type"] == "EmptyTag":
ignore_until = previous[1]
previous = event
if ignore_until is None or previous[1] is ignore_until:
for token in self.tokens(previous, None):
yield token
elif ignore_until is not None:
raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")
def tokens(self, event, next):
type, node = event
if type == START_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
attrs = {}
for attr in node.attributes.keys():
attr = node.getAttributeNode(attr)
attrs[(attr.namespaceURI,attr.localName)] = attr.value
if name in voidElements:
for token in self.emptyTag(namespace,
name,
attrs,
not next or next[1] is not node):
yield token
else:
yield self.startTag(namespace, name, attrs)
elif type == END_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
if name not in voidElements:
yield self.endTag(namespace, name)
elif type == COMMENT:
yield self.comment(node.nodeValue)
elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
for token in self.text(node.nodeValue):
yield token
else:
yield self.unknown(type)

78
libs/html5lib/treewalkers/simpletree.py

@ -0,0 +1,78 @@
import gettext
_ = gettext.gettext
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
"""Given that simpletree has no performant way of getting a node's
next sibling, this implementation returns "nodes" as tuples with the
following content:
1. The parent Node (Element, Document or DocumentFragment)
2. The child index of the current node in its parent's children list
3. A list used as a stack of all ancestors; each entry is a pair tuple
whose first item is a parent Node and second item is a child index.
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Node
parent, idx, parents = node
node = parent.childNodes[idx]
# testing node.type allows us not to import treebuilders.simpletree
if node.type in (1, 2): # Document or DocumentFragment
return (_base.DOCUMENT,)
elif node.type == 3: # DocumentType
return _base.DOCTYPE, node.name, node.publicId, node.systemId
elif node.type == 4: # TextNode
return _base.TEXT, node.value
elif node.type == 5: # Element
attrs = {}
for name, value in node.attributes.items():
if isinstance(name, tuple):
attrs[(name[2],name[1])] = value
else:
attrs[(None,name)] = value
return (_base.ELEMENT, node.namespace, node.name,
attrs, node.hasContent())
elif node.type == 6: # CommentNode
return _base.COMMENT, node.data
else:
return _base.UNKNOWN, node.type
def getFirstChild(self, node):
if isinstance(node, tuple): # It might be the root Node
parent, idx, parents = node
parents.append((parent, idx))
node = parent.childNodes[idx]
else:
parents = []
assert node.hasContent(), "Node has no children"
return (node, 0, parents)
def getNextSibling(self, node):
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
parent, idx, parents = node
idx += 1
if len(parent.childNodes) > idx:
return (parent, idx, parents)
else:
return None
def getParentNode(self, node):
assert isinstance(node, tuple)
parent, idx, parents = node
if parents:
parent, idx = parents.pop()
return parent, idx, parents
else:
# HACK: We could return ``parent`` but None will stop the algorithm the same way
return None

60
libs/html5lib/treewalkers/soup.py

@ -0,0 +1,60 @@
import re
import gettext
_ = gettext.gettext
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
from html5lib.constants import namespaces
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
doctype_regexp = re.compile(
r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
def getNodeDetails(self, node):
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
return (_base.DOCUMENT,)
elif isinstance(node, Declaration): # DocumentType
string = unicode(node.string)
#Slice needed to remove markup added during unicode conversion,
#but only in some versions of BeautifulSoup/Python
if string.startswith('<!') and string.endswith('>'):
string = string[2:-1]
m = self.doctype_regexp.match(string)
#This regexp approach seems wrong and fragile
#but beautiful soup stores the doctype as a single thing and we want the separate bits
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
#been modified at all
#We could just feed to it a html5lib tokenizer, I guess...
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1')
else:
systemId = m.group('systemId2')
return _base.DOCTYPE, name, publicId or "", systemId or ""
elif isinstance(node, Comment):
string = unicode(node.string)
if string.startswith('<!--') and string.endswith('-->'):
string = string[4:-3]
return _base.COMMENT, string
elif isinstance(node, unicode): # TextNode
return _base.TEXT, node
elif isinstance(node, Tag): # Element
return (_base.ELEMENT, namespaces["html"], node.name,
dict(node.attrs).items(), node.contents)
else:
return _base.UNKNOWN, node.__class__.__name__
def getFirstChild(self, node):
return node.contents[0]
def getNextSibling(self, node):
return node.nextSibling
def getParentNode(self, node):
return node.parent

175
libs/html5lib/utils.py

@ -0,0 +1,175 @@
try:
frozenset
except NameError:
#Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
class MethodDispatcher(dict):
"""Dict with 2 special properties:
On initialization, keys that are lists, sets or tuples are converted to
multiple keys so accessing any one of the items in the original
list-like object returns the matching value
md = MethodDispatcher({("foo", "bar"):"baz"})
md["foo"] == "baz"
A default value which can be set through the default attribute.
"""
def __init__(self, items=()):
# Using _dictEntries instead of directly assigning to self is about
# twice as fast. Please do careful performance testing before changing
# anything here.
_dictEntries = []
for name,value in items:
if type(name) in (list, tuple, frozenset, set):
for item in name:
_dictEntries.append((item, value))
else:
_dictEntries.append((name, value))
dict.__init__(self, _dictEntries)
self.default = None
def __getitem__(self, key):
return dict.get(self, key, self.default)
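# Editor's sketch: list-like keys fan out to one entry per item, and
# lookups of unknown keys return the default instead of raising KeyError.
if __name__ == "__main__":
    md = MethodDispatcher([(("foo", "bar"), "baz")])
    assert md["foo"] == "baz" and md["bar"] == "baz"
    assert md["other"] is None  # md.default, settable by the caller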
#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger
class deque(object):
def __init__(self, iterable=(), maxsize=-1):
if not hasattr(self, 'data'):
self.left = self.right = 0
self.data = {}
self.maxsize = maxsize
self.extend(iterable)
def append(self, x):
self.data[self.right] = x
self.right += 1
if self.maxsize != -1 and len(self) > self.maxsize:
self.popleft()
def appendleft(self, x):
self.left -= 1
self.data[self.left] = x
if self.maxsize != -1 and len(self) > self.maxsize:
self.pop()
def pop(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
self.right -= 1
elem = self.data[self.right]
del self.data[self.right]
return elem
def popleft(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
elem = self.data[self.left]
del self.data[self.left]
self.left += 1
return elem
def clear(self):
self.data.clear()
self.left = self.right = 0
def extend(self, iterable):
for elem in iterable:
self.append(elem)
def extendleft(self, iterable):
for elem in iterable:
self.appendleft(elem)
def rotate(self, n=1):
if self:
n %= len(self)
for i in xrange(n):
self.appendleft(self.pop())
def __getitem__(self, i):
if i < 0:
i += len(self)
try:
return self.data[i + self.left]
except KeyError:
raise IndexError
def __setitem__(self, i, value):
if i < 0:
i += len(self)
try:
self.data[i + self.left] = value
except KeyError:
raise IndexError
def __delitem__(self, i):
size = len(self)
if not (-size <= i < size):
raise IndexError
data = self.data
if i < 0:
i += size
for j in xrange(self.left+i, self.right-1):
data[j] = data[j+1]
self.pop()
def __len__(self):
return self.right - self.left
def __cmp__(self, other):
if type(self) != type(other):
return cmp(type(self), type(other))
return cmp(list(self), list(other))
def __repr__(self, _track=[]):
if id(self) in _track:
return '...'
_track.append(id(self))
r = 'deque(%r)' % (list(self),)
_track.remove(id(self))
return r
def __getstate__(self):
return (tuple(self),)
def __setstate__(self, s):
self.__init__(s[0])
def __hash__(self):
raise TypeError
def __copy__(self):
return self.__class__(self)
def __deepcopy__(self, memo={}):
from copy import deepcopy
result = self.__class__()
memo[id(self)] = result
result.__init__(deepcopy(tuple(self), memo))
return result
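# Editor's sketch: the backported deque behaves as a FIFO, and rotate()
# moves items from the right end to the left.
if __name__ == "__main__":
    d = deque([1, 2, 3])
    d.append(4)
    assert d.popleft() == 1
    d.rotate(1)
    assert list(d) == [4, 2, 3]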
#Some utility functions to deal with weirdness around UCS2 vs UCS4
#python builds
def encodingType():
# A narrow (UCS2) build stores a character outside the BMP as a
# surrogate pair of length 2. (Editor's fix: the original called len()
# with no argument, which is a TypeError.)
if len(u"\U0010FFFF") == 2:
return "UCS2"
else:
return "UCS4"
def isSurrogatePair(data):
return (len(data) == 2 and
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
def surrogatePairToCodepoint(data):
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
(ord(data[1]) - 0xDC00))
return char_val
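# Editor's sketch: U+1D11E (musical G clef) as a UTF-16 surrogate pair.
if __name__ == "__main__":
    pair = u"\ud834\udd1e"
    assert isSurrogatePair(pair)
    assert surrogatePairToCodepoint(pair) == 0x1D11E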

0
libs/oauthlib/__init__.py

155
libs/oauthlib/common.py

@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
"""
oauthlib.common
~~~~~~~~~~~~~~~
This module provides data structures and utilities common
to all implementations of OAuth.
"""
import re
import urllib
import urlparse
always_safe = (u'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
u'abcdefghijklmnopqrstuvwxyz'
u'0123456789' u'_.-')
def quote(s, safe=u'/'):
encoded = s.encode("utf-8")
quoted = urllib.quote(encoded, safe)
return quoted.decode("utf-8")
def unquote(s):
encoded = s.encode("utf-8")
unquoted = urllib.unquote(encoded)
return unquoted.decode("utf-8")
def urlencode(params):
utf8_params = encode_params_utf8(params)
urlencoded = urllib.urlencode(utf8_params)
return urlencoded.decode("utf-8")
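# Editor's sketch: quote()/unquote() round-trip unicode text through
# UTF-8 percent-encoding.
if __name__ == "__main__":
    assert quote(u'\xe9') == u'%C3%A9'
    assert unquote(u'%C3%A9') == u'\xe9'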
def encode_params_utf8(params):
"""Ensures that all parameters in a list of 2-element tuples are encoded to
bytestrings using UTF-8
"""
encoded = []
for k, v in params:
encoded.append((
k.encode('utf-8') if isinstance(k, unicode) else k,
v.encode('utf-8') if isinstance(v, unicode) else v))
return encoded
def decode_params_utf8(params):
"""Ensures that all parameters in a list of 2-element tuples are decoded to
unicode using UTF-8.
"""
decoded = []
for k, v in params:
decoded.append((
k.decode('utf-8') if isinstance(k, str) else k,
v.decode('utf-8') if isinstance(v, str) else v))
return decoded
urlencoded = set(always_safe) | set(u'=&;%+~')
def urldecode(query):
"""Decode a query string in x-www-form-urlencoded format into a sequence
of two-element tuples.
Unlike urlparse.parse_qsl(..., strict_parsing=True) urldecode will enforce
correct formatting of the query string by validation. If validation fails
a ValueError will be raised. urlparse.parse_qsl will only raise errors if
any of the name-value pairs omits the equals sign.
"""
# Check if query contains invalid characters
if query and not set(query) <= urlencoded:
raise ValueError('Invalid characters in query string.')
# Check for correctly hex encoded values using a regular expression
# All encoded values begin with % followed by two hex characters
# correct = %00, %A0, %0A, %FF
# invalid = %G0, %5H, %PO
invalid_hex = u'%[^0-9A-Fa-f]|%[0-9A-Fa-f][^0-9A-Fa-f]'
if len(re.findall(invalid_hex, query)):
raise ValueError('Invalid hex encoding in query string.')
query = query.decode('utf-8') if isinstance(query, str) else query
# We want to allow queries such as "c2" whereas urlparse.parse_qsl
# with the strict_parsing flag will not.
params = urlparse.parse_qsl(query, keep_blank_values=True)
# unicode all the things
return decode_params_utf8(params)
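# Editor's sketch: urldecode() keeps blank values and returns unicode
# pairs; malformed input raises ValueError instead of being dropped.
if __name__ == "__main__":
    assert urldecode(u'a=1&b=') == [(u'a', u'1'), (u'b', u'')]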
def extract_params(raw):
"""Extract parameters and return them as a list of 2-tuples.
Will successfully extract parameters from urlencoded query strings,
dicts, or lists of 2-tuples. Empty strings/dicts/lists will return an
empty list of parameters. Any other input will result in a return
value of None.
"""
if isinstance(raw, basestring):
try:
params = urldecode(raw)
except ValueError:
params = None
elif hasattr(raw, '__iter__'):
try:
dict(raw)
except ValueError:
params = None
except TypeError:
params = None
else:
params = list(raw.items() if isinstance(raw, dict) else raw)
params = decode_params_utf8(params)
else:
params = None
return params
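# Editor's sketch: the three accepted input shapes all normalize to a
# list of 2-tuples; anything else yields None.
if __name__ == "__main__":
    assert extract_params(u'a=1') == [(u'a', u'1')]
    assert extract_params({u'a': u'1'}) == [(u'a', u'1')]
    assert extract_params(object()) is None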
class Request(object):
"""A malleable representation of a signable HTTP request.
Body argument may contain any data, but parameters will only be decoded if
they are one of:
* urlencoded query string
* dict
* list of 2-tuples
Anything else will be treated as raw body data to be passed through
unmolested.
"""
def __init__(self, uri, http_method=u'GET', body=None, headers=None):
self.uri = uri
self.http_method = http_method
self.headers = headers or {}
self.body = body
self.decoded_body = extract_params(body)
self.oauth_params = []
@property
def uri_query(self):
return urlparse.urlparse(self.uri).query
@property
def uri_query_params(self):
return urlparse.parse_qsl(self.uri_query, keep_blank_values=True,
strict_parsing=True)
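# Editor's sketch: Request eagerly decodes a formencoded body and
# exposes the URI query separately (values here are illustrative).
if __name__ == "__main__":
    r = Request(u'http://example.com/path?q=1', body=u'a=1&b=2')
    assert r.decoded_body == [(u'a', u'1'), (u'b', u'2')]
    assert r.uri_query == u'q=1'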

13
libs/oauthlib/oauth1/__init__.py

@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
"""
oauthlib.oauth1
~~~~~~~~~~~~~~~
This module is a wrapper for the most recent implementation of OAuth 1.0 Client
and Server classes.
"""
from .rfc5849 import Client, Server

350
libs/oauthlib/oauth1/rfc5849/__init__.py

@ -0,0 +1,350 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
"""
oauthlib.oauth1.rfc5849
~~~~~~~~~~~~~~~~~~~~~~~
This module is an implementation of various logic needed
for signing and checking OAuth 1.0 RFC 5849 requests.
"""
import logging
import urlparse
from oauthlib.common import Request, urlencode
from . import parameters, signature, utils
SIGNATURE_HMAC = u"HMAC-SHA1"
SIGNATURE_RSA = u"RSA-SHA1"
SIGNATURE_PLAINTEXT = u"PLAINTEXT"
SIGNATURE_METHODS = (SIGNATURE_HMAC, SIGNATURE_RSA, SIGNATURE_PLAINTEXT)
SIGNATURE_TYPE_AUTH_HEADER = u'AUTH_HEADER'
SIGNATURE_TYPE_QUERY = u'QUERY'
SIGNATURE_TYPE_BODY = u'BODY'
CONTENT_TYPE_FORM_URLENCODED = u'application/x-www-form-urlencoded'
class Client(object):
"""A client used to sign OAuth 1.0 RFC 5849 requests"""
def __init__(self, client_key,
client_secret=None,
resource_owner_key=None,
resource_owner_secret=None,
callback_uri=None,
signature_method=SIGNATURE_HMAC,
signature_type=SIGNATURE_TYPE_AUTH_HEADER,
rsa_key=None, verifier=None):
self.client_key = client_key
self.client_secret = client_secret
self.resource_owner_key = resource_owner_key
self.resource_owner_secret = resource_owner_secret
self.signature_method = signature_method
self.signature_type = signature_type
self.callback_uri = callback_uri
self.rsa_key = rsa_key
self.verifier = verifier
if self.signature_method == SIGNATURE_RSA and self.rsa_key is None:
raise ValueError('rsa_key is required when using RSA signature method.')
def get_oauth_signature(self, request):
"""Get an OAuth signature to be used in signing a request
"""
if self.signature_method == SIGNATURE_PLAINTEXT:
# fast-path
return signature.sign_plaintext(self.client_secret,
self.resource_owner_secret)
uri, headers, body = self._render(request)
collected_params = signature.collect_parameters(
uri_query=urlparse.urlparse(uri).query,
body=body,
headers=headers)
logging.debug("Collected params: {0}".format(collected_params))
normalized_params = signature.normalize_parameters(collected_params)
normalized_uri = signature.normalize_base_string_uri(request.uri)
logging.debug("Normalized params: {0}".format(normalized_params))
logging.debug("Normalized URI: {0}".format(normalized_uri))
base_string = signature.construct_base_string(request.http_method,
normalized_uri, normalized_params)
logging.debug("Base signing string: {0}".format(base_string))
if self.signature_method == SIGNATURE_HMAC:
sig = signature.sign_hmac_sha1(base_string, self.client_secret,
self.resource_owner_secret)
elif self.signature_method == SIGNATURE_RSA:
sig = signature.sign_rsa_sha1(base_string, self.rsa_key)
else:
sig = signature.sign_plaintext(self.client_secret,
self.resource_owner_secret)
logging.debug("Signature: {0}".format(sig))
return sig
def get_oauth_params(self):
"""Get the basic OAuth parameters to be used in generating a signature.
"""
params = [
(u'oauth_nonce', utils.generate_nonce()),
(u'oauth_timestamp', utils.generate_timestamp()),
(u'oauth_version', u'1.0'),
(u'oauth_signature_method', self.signature_method),
(u'oauth_consumer_key', self.client_key),
]
if self.resource_owner_key:
params.append((u'oauth_token', self.resource_owner_key))
if self.callback_uri:
params.append((u'oauth_callback', self.callback_uri))
if self.verifier:
params.append((u'oauth_verifier', self.verifier))
return params
def _render(self, request, formencode=False):
"""Render a signed request according to signature type
Returns a 3-tuple containing the request URI, headers, and body.
If the formencode argument is True and the body contains parameters, it
is escaped and returned as a valid formencoded string.
"""
# TODO what if there are body params on a header-type auth?
# TODO what if there are query params on a body-type auth?
uri, headers, body = request.uri, request.headers, request.body
# TODO: right now these prepare_* methods are very narrow in scope--they
# only affect their little thing. In some cases (for example, with
# header auth) it might be advantageous to allow these methods to touch
# other parts of the request, like the headers—so the prepare_headers
# method could also set the Content-Type header to x-www-form-urlencoded
# like the spec requires. This would be a fundamental change though, and
# I'm not sure how I feel about it.
if self.signature_type == SIGNATURE_TYPE_AUTH_HEADER:
headers = parameters.prepare_headers(request.oauth_params, request.headers)
elif self.signature_type == SIGNATURE_TYPE_BODY and request.decoded_body is not None:
body = parameters.prepare_form_encoded_body(request.oauth_params, request.decoded_body)
if formencode:
body = urlencode(body)
headers['Content-Type'] = u'application/x-www-form-urlencoded'
elif self.signature_type == SIGNATURE_TYPE_QUERY:
uri = parameters.prepare_request_uri_query(request.oauth_params, request.uri)
else:
raise ValueError('Unknown signature type specified.')
return uri, headers, body
def sign(self, uri, http_method=u'GET', body=None, headers=None):
"""Sign a request
Signs an HTTP request with the specified parts.
Returns a 3-tuple of the signed request's URI, headers, and body.
Note that http_method is not returned as it is unaffected by the OAuth
signing process.
The body argument may be a dict, a list of 2-tuples, or a formencoded
string. The Content-Type header must be 'application/x-www-form-urlencoded'
if it is present.
If the body argument is not one of the above, it will be returned
verbatim as it is unaffected by the OAuth signing process. Attempting to
sign a request with non-formencoded data using the OAuth body signature
type is invalid and will raise an exception.
If the body does contain parameters, it will be returned as a properly-
formatted formencoded string.
All string data MUST be unicode. This includes strings inside body
dicts, for example.
"""
# normalize request data
request = Request(uri, http_method, body, headers)
# sanity check
content_type = request.headers.get('Content-Type', None)
multipart = content_type and content_type.startswith('multipart/')
should_have_params = content_type == CONTENT_TYPE_FORM_URLENCODED
has_params = request.decoded_body is not None
# 3.4.1.3.1. Parameter Sources
# [Parameters are collected from the HTTP request entity-body, but only
# if [...]:
# * The entity-body is single-part.
if multipart and has_params:
raise ValueError("Headers indicate a multipart body but body contains parameters.")
# * The entity-body follows the encoding requirements of the
# "application/x-www-form-urlencoded" content-type as defined by
# [W3C.REC-html40-19980424].
elif should_have_params and not has_params:
raise ValueError("Headers indicate a formencoded body but body was not decodable.")
# * The HTTP request entity-header includes the "Content-Type"
# header field set to "application/x-www-form-urlencoded".
elif not should_have_params and has_params:
raise ValueError("Body contains parameters but Content-Type header was not set.")
# 3.5.2. Form-Encoded Body
# Protocol parameters can be transmitted in the HTTP request entity-
# body, but only if the following REQUIRED conditions are met:
# o The entity-body is single-part.
# o The entity-body follows the encoding requirements of the
# "application/x-www-form-urlencoded" content-type as defined by
# [W3C.REC-html40-19980424].
# o The HTTP request entity-header includes the "Content-Type" header
# field set to "application/x-www-form-urlencoded".
elif self.signature_type == SIGNATURE_TYPE_BODY and not (
should_have_params and has_params and not multipart):
raise ValueError('Body signatures may only be used with form-urlencoded content')
# generate the basic OAuth parameters
request.oauth_params = self.get_oauth_params()
# generate the signature
request.oauth_params.append((u'oauth_signature', self.get_oauth_signature(request)))
# render the signed request and return it
return self._render(request, formencode=True)
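# Editor's sketch (illustrative key and secret, assuming the sibling
# parameters, signature and utils modules from this changeset): signing
# a plain GET with the default header signature type.
if __name__ == "__main__":
    client = Client(u'my_key', client_secret=u'my_secret')
    uri, headers, body = client.sign(u'http://example.com/resource')
    # headers[u'Authorization'] now reads u'OAuth oauth_nonce="...", ...'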
class Server(object):
"""A server used to verify OAuth 1.0 RFC 5849 requests"""
def __init__(self, signature_method=SIGNATURE_HMAC, rsa_key=None):
self.signature_method = signature_method
self.rsa_key = rsa_key
def get_client_secret(self, client_key):
raise NotImplementedError("Subclasses must implement this function.")
def get_resource_owner_secret(self, resource_owner_key):
raise NotImplementedError("Subclasses must implement this function.")
def get_signature_type_and_params(self, uri_query, headers, body):
signature_types_with_oauth_params = filter(lambda s: s[1], (
(SIGNATURE_TYPE_AUTH_HEADER, utils.filter_oauth_params(
signature.collect_parameters(headers=headers,
exclude_oauth_signature=False))),
(SIGNATURE_TYPE_BODY, utils.filter_oauth_params(
signature.collect_parameters(body=body,
exclude_oauth_signature=False))),
(SIGNATURE_TYPE_QUERY, utils.filter_oauth_params(
signature.collect_parameters(uri_query=uri_query,
exclude_oauth_signature=False))),
))
if len(signature_types_with_oauth_params) > 1:
raise ValueError('oauth_ params must come from only 1 signature type but were found in %s' % ', '.join(
[s[0] for s in signature_types_with_oauth_params]))
try:
signature_type, params = signature_types_with_oauth_params[0]
except IndexError:
raise ValueError('oauth_ params are missing. Could not determine signature type.')
return signature_type, dict(params)
def check_client_key(self, client_key):
raise NotImplementedError("Subclasses must implement this function.")
def check_resource_owner_key(self, client_key, resource_owner_key):
raise NotImplementedError("Subclasses must implement this function.")
def check_timestamp_and_nonce(self, timestamp, nonce):
raise NotImplementedError("Subclasses must implement this function.")
def check_request_signature(self, uri, http_method=u'GET', body='',
headers=None):
"""Check a request's supplied signature to make sure the request is
valid.
Servers should return HTTP status 400 if a ValueError exception
is raised and HTTP status 401 on return value False.
Per `section 3.2`_ of the spec.
.. _`section 3.2`: http://tools.ietf.org/html/rfc5849#section-3.2
"""
headers = headers or {}
signature_type = None
# FIXME: urlparse does not return unicode!
uri_query = urlparse.urlparse(uri).query
signature_type, params = self.get_signature_type_and_params(uri_query,
headers, body)
# the parameters may not include duplicate oauth entries
filtered_params = utils.filter_oauth_params(params)
if len(filtered_params) != len(params):
raise ValueError("Duplicate OAuth entries.")
params = dict(params)
request_signature = params.get(u'oauth_signature')
client_key = params.get(u'oauth_consumer_key')
resource_owner_key = params.get(u'oauth_token')
nonce = params.get(u'oauth_nonce')
timestamp = params.get(u'oauth_timestamp')
callback_uri = params.get(u'oauth_callback')
verifier = params.get(u'oauth_verifier')
signature_method = params.get(u'oauth_signature_method')
# ensure all mandatory parameters are present
if not all((request_signature, client_key, nonce,
timestamp, signature_method)):
raise ValueError("Missing OAuth parameters.")
# if version is supplied, it must be "1.0"
if u'oauth_version' in params and params[u'oauth_version'] != u'1.0':
raise ValueError("Invalid OAuth version.")
# signature method must be valid
if not signature_method in SIGNATURE_METHODS:
raise ValueError("Invalid signature method.")
# ensure client key is valid
if not self.check_client_key(client_key):
return False
# ensure resource owner key is valid and not expired
if not self.check_resource_owner_key(client_key, resource_owner_key):
return False
# ensure the nonce and timestamp haven't been used before
if not self.check_timestamp_and_nonce(timestamp, nonce):
return False
# FIXME: extract realm, then self.check_realm
# oauth_client parameters depend on client chosen signature method
# which may vary for each request, section 3.4
# HMAC-SHA1 and PLAINTEXT share parameters
if signature_method == SIGNATURE_RSA:
oauth_client = Client(client_key,
resource_owner_key=resource_owner_key,
callback_uri=callback_uri,
signature_method=signature_method,
signature_type=signature_type,
rsa_key=self.rsa_key, verifier=verifier)
else:
client_secret = self.get_client_secret(client_key)
resource_owner_secret = self.get_resource_owner_secret(
resource_owner_key)
oauth_client = Client(client_key,
client_secret=client_secret,
resource_owner_key=resource_owner_key,
resource_owner_secret=resource_owner_secret,
callback_uri=callback_uri,
signature_method=signature_method,
signature_type=signature_type,
verifier=verifier)
request = Request(uri, http_method, body, headers)
request.oauth_params = params
client_signature = oauth_client.get_oauth_signature(request)
# FIXME: use near constant time string compare to avoid timing attacks
return client_signature == request_signature
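# Editor's sketch: a concrete Server supplies credential lookups and
# replay checks; the names and secrets here are illustrative only.
class ExampleServer(Server):
    def get_client_secret(self, client_key):
        return u'my_secret'
    def get_resource_owner_secret(self, resource_owner_key):
        return u'owner_secret'
    def check_client_key(self, client_key):
        return client_key == u'my_key'
    def check_resource_owner_key(self, client_key, resource_owner_key):
        return True
    def check_timestamp_and_nonce(self, timestamp, nonce):
        return True  # a real server must reject replayed nonces here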

134
libs/oauthlib/oauth1/rfc5849/parameters.py

@ -0,0 +1,134 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
"""
oauthlib.parameters
~~~~~~~~~~~~~~~~~~~
This module contains methods related to `section 3.5`_ of the OAuth 1.0a spec.
.. _`section 3.5`: http://tools.ietf.org/html/rfc5849#section-3.5
"""
from urlparse import urlparse, urlunparse
from . import utils
from oauthlib.common import extract_params, urlencode
# TODO: do we need filter_params now that oauth_params are handled by Request?
# We can easily pass in just oauth protocol params.
@utils.filter_params
def prepare_headers(oauth_params, headers=None, realm=None):
"""**Prepare the Authorization header.**
Per `section 3.5.1`_ of the spec.
Protocol parameters can be transmitted using the HTTP "Authorization"
header field as defined by `RFC2617`_ with the auth-scheme name set to
"OAuth" (case insensitive).
For example::
Authorization: OAuth realm="Example",
oauth_consumer_key="0685bd9184jfhq22",
oauth_token="ad180jjd733klru7",
oauth_signature_method="HMAC-SHA1",
oauth_signature="wOJIO9A2W5mFwDgiDvZbTSMK%2FPY%3D",
oauth_timestamp="137131200",
oauth_nonce="4572616e48616d6d65724c61686176",
oauth_version="1.0"
.. _`section 3.5.1`: http://tools.ietf.org/html/rfc5849#section-3.5.1
.. _`RFC2617`: http://tools.ietf.org/html/rfc2617
"""
headers = headers or {}
# Protocol parameters SHALL be included in the "Authorization" header
# field as follows:
authorization_header_parameters_parts = []
for oauth_parameter_name, value in oauth_params:
# 1. Parameter names and values are encoded per Parameter Encoding
# (`Section 3.6`_)
#
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
escaped_name = utils.escape(oauth_parameter_name)
escaped_value = utils.escape(value)
# 2. Each parameter's name is immediately followed by an "=" character
# (ASCII code 61), a '"' character (ASCII code 34), the parameter
# value (MAY be empty), and another '"' character (ASCII code 34).
part = u'{0}="{1}"'.format(escaped_name, escaped_value)
authorization_header_parameters_parts.append(part)
# 3. Parameters are separated by a "," character (ASCII code 44) and
# OPTIONAL linear whitespace per `RFC2617`_.
#
# .. _`RFC2617`: http://tools.ietf.org/html/rfc2617
authorization_header_parameters = ', '.join(
authorization_header_parameters_parts)
# 4. The OPTIONAL "realm" parameter MAY be added and interpreted per
# `RFC2617 section 1.2`_.
#
# .. _`RFC2617 section 1.2`: http://tools.ietf.org/html/rfc2617#section-1.2
if realm:
# NOTE: realm should *not* be escaped
authorization_header_parameters = (u'realm="%s", ' % realm +
authorization_header_parameters)
# the auth-scheme name set to "OAuth" (case insensitive).
authorization_header = u'OAuth %s' % authorization_header_parameters
# contribute the Authorization header to the given headers
full_headers = {}
full_headers.update(headers)
full_headers[u'Authorization'] = authorization_header
return full_headers
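# Editor's sketch: two protocol parameters rendered into the header
# (assuming utils.escape and the filter_params decorator behave per
# Section 3.6, i.e. these already-safe values pass through unchanged).
if __name__ == "__main__":
    hdrs = prepare_headers([(u'oauth_token', u'abc'),
                            (u'oauth_version', u'1.0')], {})
    assert hdrs[u'Authorization'] == \
        u'OAuth oauth_token="abc", oauth_version="1.0"'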
def _append_params(oauth_params, params):
"""Append OAuth params to an existing set of parameters.
Both params and oauth_params must be lists of 2-tuples.
Per `section 3.5.2`_ and `3.5.3`_ of the spec.
.. _`section 3.5.2`: http://tools.ietf.org/html/rfc5849#section-3.5.2
.. _`3.5.3`: http://tools.ietf.org/html/rfc5849#section-3.5.3
"""
merged = list(params)
merged.extend(oauth_params)
# The request URI / entity-body MAY include other request-specific
# parameters, in which case, the protocol parameters SHOULD be appended
# following the request-specific parameters, properly separated by an "&"
# character (ASCII code 38)
merged.sort(key=lambda i: i[0].startswith('oauth_'))
return merged
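# Editor's sketch: request-specific parameters stay first; the stable
# sort only shifts oauth_ parameters to the end.
if __name__ == "__main__":
    merged = _append_params([(u'oauth_nonce', u'n')],
                            [(u'b', u'2'), (u'a', u'1')])
    assert merged == [(u'b', u'2'), (u'a', u'1'), (u'oauth_nonce', u'n')]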
def prepare_form_encoded_body(oauth_params, body):
"""Prepare the Form-Encoded Body.
Per `section 3.5.2`_ of the spec.
.. _`section 3.5.2`: http://tools.ietf.org/html/rfc5849#section-3.5.2
"""
# append OAuth params to the existing body
return _append_params(oauth_params, body)
def prepare_request_uri_query(oauth_params, uri):
"""Prepare the Request URI Query.
Per `section 3.5.3`_ of the spec.
.. _`section 3.5.3`: http://tools.ietf.org/html/rfc5849#section-3.5.3
"""
# append OAuth params to the existing set of query components
sch, net, path, par, query, fra = urlparse(uri)
query = urlencode(_append_params(oauth_params, extract_params(query) or []))
return urlunparse((sch, net, path, par, query, fra))

501
libs/oauthlib/oauth1/rfc5849/signature.py

@ -0,0 +1,501 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
"""
oauthlib.oauth1.rfc5849.signature
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This module represents a direct implementation of `section 3.4`_ of the spec.
Terminology:
* Client: software interfacing with an OAuth API
* Server: the API provider
* Resource Owner: the user who is granting authorization to the client
Steps for signing a request:
1. Collect parameters from the uri query, auth header, & body
2. Normalize those parameters
3. Normalize the uri
4. Pass the normalized uri, normalized parameters, and http method to
construct the base string
5. Pass the base string and any keys needed to a signing function
.. _`section 3.4`: http://tools.ietf.org/html/rfc5849#section-3.4
"""
import binascii
import hashlib
import hmac
import urlparse
from . import utils
from oauthlib.common import extract_params
def construct_base_string(http_method, base_string_uri,
normalized_encoded_request_parameters):
"""**String Construction**
Per `section 3.4.1.1`_ of the spec.
For example, the HTTP request::
POST /request?b5=%3D%253D&a3=a&c%40=&a2=r%20b HTTP/1.1
Host: example.com
Content-Type: application/x-www-form-urlencoded
Authorization: OAuth realm="Example",
oauth_consumer_key="9djdj82h48djs9d2",
oauth_token="kkk9d7dh3k39sjv7",
oauth_signature_method="HMAC-SHA1",
oauth_timestamp="137131201",
oauth_nonce="7d8f3e4a",
oauth_signature="bYT5CMsGcbgUdFHObYMEfcx6bsw%3D"
c2&a3=2+q
is represented by the following signature base string (line breaks
are for display purposes only)::
POST&http%3A%2F%2Fexample.com%2Frequest&a2%3Dr%2520b%26a3%3D2%2520q
%26a3%3Da%26b5%3D%253D%25253D%26c%2540%3D%26c2%3D%26oauth_consumer_
key%3D9djdj82h48djs9d2%26oauth_nonce%3D7d8f3e4a%26oauth_signature_m
ethod%3DHMAC-SHA1%26oauth_timestamp%3D137131201%26oauth_token%3Dkkk
9d7dh3k39sjv7
.. _`section 3.4.1.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.1
"""
# The signature base string is constructed by concatenating together,
# in order, the following HTTP request elements:
# 1. The HTTP request method in uppercase. For example: "HEAD",
# "GET", "POST", etc. If the request uses a custom HTTP method, it
# MUST be encoded (`Section 3.6`_).
#
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
base_string = utils.escape(http_method.upper())
# 2. An "&" character (ASCII code 38).
base_string += u'&'
# 3. The base string URI from `Section 3.4.1.2`_, after being encoded
# (`Section 3.6`_).
#
# .. _`Section 3.4.1.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.2
# .. _`Section 3.4.6`: http://tools.ietf.org/html/rfc5849#section-3.4.6
base_string += utils.escape(base_string_uri)
# 4. An "&" character (ASCII code 38).
base_string += u'&'
# 5. The request parameters as normalized in `Section 3.4.1.3.2`_, after
# being encoded (`Section 3.6`).
#
# .. _`Section 3.4.1.3.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.2
# .. _`Section 3.4.6`: http://tools.ietf.org/html/rfc5849#section-3.4.6
base_string += utils.escape(normalized_encoded_request_parameters)
return base_string
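# Editor's sketch (assuming utils.escape performs the Section 3.6
# percent-encoding): uppercased method, encoded URI and encoded
# parameter string joined by "&".
if __name__ == "__main__":
    base = construct_base_string(u'post', u'http://example.com/r', u'a=1')
    assert base == u'POST&http%3A%2F%2Fexample.com%2Fr&a%3D1'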
def normalize_base_string_uri(uri):
"""**Base String URI**
Per `section 3.4.1.2`_ of the spec.
For example, the HTTP request::
GET /r%20v/X?id=123 HTTP/1.1
Host: EXAMPLE.COM:80
is represented by the base string URI: "http://example.com/r%20v/X".
In another example, the HTTPS request::
GET /?q=1 HTTP/1.1
Host: www.example.net:8080
is represented by the base string URI: "https://www.example.net:8080/".
.. _`section 3.4.1.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.2
"""
if not isinstance(uri, unicode):
raise ValueError('uri must be a unicode object.')
# FIXME: urlparse does not support unicode
scheme, netloc, path, params, query, fragment = urlparse.urlparse(uri)
# The scheme, authority, and path of the request resource URI `RFC3986`
# are included by constructing an "http" or "https" URI representing
# the request resource (without the query or fragment) as follows:
#
# .. _`RFC2616`: http://tools.ietf.org/html/rfc3986
# 1. The scheme and host MUST be in lowercase.
scheme = scheme.lower()
netloc = netloc.lower()
# 2. The host and port values MUST match the content of the HTTP
# request "Host" header field.
# TODO: enforce this constraint
# 3. The port MUST be included if it is not the default port for the
# scheme, and MUST be excluded if it is the default. Specifically,
# the port MUST be excluded when making an HTTP request `RFC2616`_
# to port 80 or when making an HTTPS request `RFC2818`_ to port 443.
# All other non-default port numbers MUST be included.
#
# .. _`RFC2616`: http://tools.ietf.org/html/rfc2616
# .. _`RFC2818`: http://tools.ietf.org/html/rfc2818
default_ports = (
(u'http', u'80'),
(u'https', u'443'),
)
if u':' in netloc:
host, port = netloc.split(u':', 1)
if (scheme, port) in default_ports:
netloc = host
return urlparse.urlunparse((scheme, netloc, path, u'', u'', u''))
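# Editor's sketch: scheme and host are lowercased, the default port is
# dropped, and the query and fragment are discarded.
if __name__ == "__main__":
    assert (normalize_base_string_uri(u'HTTP://Example.COM:80/r?q=1')
            == u'http://example.com/r')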
# ** Request Parameters **
#
# Per `section 3.4.1.3`_ of the spec.
#
# In order to guarantee a consistent and reproducible representation of
# the request parameters, the parameters are collected and decoded to
# their original decoded form. They are then sorted and encoded in a
# particular manner that is often different from their original
# encoding scheme, and concatenated into a single string.
#
# .. _`section 3.4.1.3`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3
def collect_parameters(uri_query='', body=[], headers=None,
exclude_oauth_signature=True):
"""**Parameter Sources**
Parameters starting with `oauth_` will be unescaped.
Body parameters must be supplied as a dict, a list of 2-tuples, or a
formencoded query string.
Headers must be supplied as a dict.
Per `section 3.4.1.3.1`_ of the spec.
For example, the HTTP request::
POST /request?b5=%3D%253D&a3=a&c%40=&a2=r%20b HTTP/1.1
Host: example.com
Content-Type: application/x-www-form-urlencoded
Authorization: OAuth realm="Example",
oauth_consumer_key="9djdj82h48djs9d2",
oauth_token="kkk9d7dh3k39sjv7",
oauth_signature_method="HMAC-SHA1",
oauth_timestamp="137131201",
oauth_nonce="7d8f3e4a",
oauth_signature="djosJKDKJSD8743243%2Fjdk33klY%3D"
c2&a3=2+q
contains the following (fully decoded) parameters used in the
signature base string::
+------------------------+------------------+
| Name | Value |
+------------------------+------------------+
| b5 | =%3D |
| a3 | a |
| c@ | |
| a2 | r b |
| oauth_consumer_key | 9djdj82h48djs9d2 |
| oauth_token | kkk9d7dh3k39sjv7 |
| oauth_signature_method | HMAC-SHA1 |
| oauth_timestamp | 137131201 |
| oauth_nonce | 7d8f3e4a |
| c2 | |
| a3 | 2 q |
+------------------------+------------------+
Note that the value of "b5" is "=%3D" and not "==". Both "c@" and
"c2" have empty values. While the encoding rules specified in this
specification for the purpose of constructing the signature base
string exclude the use of a "+" character (ASCII code 43) to
represent an encoded space character (ASCII code 32), this practice
is widely used in "application/x-www-form-urlencoded" encoded values,
and MUST be properly decoded, as demonstrated by one of the "a3"
parameter instances (the "a3" parameter is used twice in this
request).
.. _`section 3.4.1.3.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.1
"""
headers = headers or {}
params = []
# The parameters from the following sources are collected into a single
# list of name/value pairs:
# * The query component of the HTTP request URI as defined by
# `RFC3986, Section 3.4`_. The query component is parsed into a list
# of name/value pairs by treating it as an
# "application/x-www-form-urlencoded" string, separating the names
# and values and decoding them as defined by
# `W3C.REC-html40-19980424`_, Section 17.13.4.
#
# .. _`RFC3986, Section 3.4`: http://tools.ietf.org/html/rfc3986#section-3.4
# .. _`W3C.REC-html40-19980424`: http://tools.ietf.org/html/rfc5849#ref-W3C.REC-html40-19980424
if uri_query:
params.extend(urlparse.parse_qsl(uri_query, keep_blank_values=True))
# * The OAuth HTTP "Authorization" header field (`Section 3.5.1`_) if
# present. The header's content is parsed into a list of name/value
# pairs excluding the "realm" parameter if present. The parameter
# values are decoded as defined by `Section 3.5.1`_.
#
# .. _`Section 3.5.1`: http://tools.ietf.org/html/rfc5849#section-3.5.1
if headers:
headers_lower = dict((k.lower(), v) for k, v in headers.items())
authorization_header = headers_lower.get(u'authorization')
if authorization_header is not None:
params.extend([i for i in utils.parse_authorization_header(
authorization_header) if i[0] != u'realm'])
# * The HTTP request entity-body, but only if all of the following
# conditions are met:
# * The entity-body is single-part.
#
# * The entity-body follows the encoding requirements of the
# "application/x-www-form-urlencoded" content-type as defined by
# `W3C.REC-html40-19980424`_.
# * The HTTP request entity-header includes the "Content-Type"
# header field set to "application/x-www-form-urlencoded".
#
# .._`W3C.REC-html40-19980424`: http://tools.ietf.org/html/rfc5849#ref-W3C.REC-html40-19980424
# TODO: enforce header param inclusion conditions
bodyparams = extract_params(body) or []
params.extend(bodyparams)
# ensure all oauth params are unescaped
unescaped_params = []
for k, v in params:
if k.startswith(u'oauth_'):
v = utils.unescape(v)
unescaped_params.append((k, v))
# The "oauth_signature" parameter MUST be excluded from the signature
# base string if present.
if exclude_oauth_signature:
unescaped_params = filter(lambda i: i[0] != u'oauth_signature',
unescaped_params)
return unescaped_params
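# Illustrative usage (editor's sketch, not part of the library; assumes the
# collect_parameters signature implied by the code above). Feeding in the
# example request reproduces the decoded list from the docstring table,
# minus "oauth_signature" and "realm":
#
#   collect_parameters(
#       uri_query=u'b5=%3D%253D&a3=a&c%40=&a2=r%20b',
#       body=u'c2&a3=2+q',
#       headers={u'Authorization': u'OAuth realm="Example", ...'})
#   # -> [(u'b5', u'=%3D'), (u'a3', u'a'), (u'c@', u''), (u'a2', u'r b'),
#   #     ..., (u'c2', u''), (u'a3', u'2 q')]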
def normalize_parameters(params):
"""**Parameters Normalization**
Per `section 3.4.1.3.2`_ of the spec.
For example, the list of parameters from the previous section would
be normalized as follows:
Encoded::
+------------------------+------------------+
| Name | Value |
+------------------------+------------------+
| b5 | %3D%253D |
| a3 | a |
| c%40 | |
| a2 | r%20b |
| oauth_consumer_key | 9djdj82h48djs9d2 |
| oauth_token | kkk9d7dh3k39sjv7 |
| oauth_signature_method | HMAC-SHA1 |
| oauth_timestamp | 137131201 |
| oauth_nonce | 7d8f3e4a |
| c2 | |
| a3 | 2%20q |
+------------------------+------------------+
Sorted::
+------------------------+------------------+
| Name | Value |
+------------------------+------------------+
| a2 | r%20b |
| a3 | 2%20q |
| a3 | a |
| b5 | %3D%253D |
| c%40 | |
| c2 | |
| oauth_consumer_key | 9djdj82h48djs9d2 |
| oauth_nonce | 7d8f3e4a |
| oauth_signature_method | HMAC-SHA1 |
| oauth_timestamp | 137131201 |
| oauth_token | kkk9d7dh3k39sjv7 |
+------------------------+------------------+
Concatenated Pairs::
+-------------------------------------+
| Name=Value |
+-------------------------------------+
| a2=r%20b |
| a3=2%20q |
| a3=a |
| b5=%3D%253D |
| c%40= |
| c2= |
| oauth_consumer_key=9djdj82h48djs9d2 |
| oauth_nonce=7d8f3e4a |
| oauth_signature_method=HMAC-SHA1 |
| oauth_timestamp=137131201 |
| oauth_token=kkk9d7dh3k39sjv7 |
+-------------------------------------+
and concatenated together into a single string (line breaks are for
display purposes only)::
a2=r%20b&a3=2%20q&a3=a&b5=%3D%253D&c%40=&c2=&oauth_consumer_key=9dj
dj82h48djs9d2&oauth_nonce=7d8f3e4a&oauth_signature_method=HMAC-SHA1
&oauth_timestamp=137131201&oauth_token=kkk9d7dh3k39sjv7
.. _`section 3.4.1.3.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.2
"""
# The parameters collected in `Section 3.4.1.3`_ are normalized into a
# single string as follows:
#
# .. _`Section 3.4.1.3`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3
# 1. First, the name and value of each parameter are encoded
# (`Section 3.6`_).
#
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
key_values = [(utils.escape(k), utils.escape(v)) for k, v in params]
# 2. The parameters are sorted by name, using ascending byte value
# ordering. If two or more parameters share the same name, they
# are sorted by their value.
key_values.sort()
# 3. The name of each parameter is concatenated to its corresponding
# value using an "=" character (ASCII code 61) as a separator, even
# if the value is empty.
parameter_parts = [u'{0}={1}'.format(k, v) for k, v in key_values]
# 4. The sorted name/value pairs are concatenated together into a
# single string by using an "&" character (ASCII code 38) as
# separator.
return u'&'.join(parameter_parts)
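# Illustrative usage (editor's sketch, not part of the library): round-tripping
# the decoded pairs from the docstring reproduces the concatenated string:
#
#   normalize_parameters([
#       (u'b5', u'=%3D'), (u'a3', u'a'), (u'c@', u''), (u'a2', u'r b'),
#       (u'oauth_consumer_key', u'9djdj82h48djs9d2'),
#       (u'oauth_token', u'kkk9d7dh3k39sjv7'),
#       (u'oauth_signature_method', u'HMAC-SHA1'),
#       (u'oauth_timestamp', u'137131201'), (u'oauth_nonce', u'7d8f3e4a'),
#       (u'c2', u''), (u'a3', u'2 q')])
#   # -> u'a2=r%20b&a3=2%20q&a3=a&b5=%3D%253D&c%40=&c2=&oauth_consumer_key=...'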
def sign_hmac_sha1(base_string, client_secret, resource_owner_secret):
"""**HMAC-SHA1**
The "HMAC-SHA1" signature method uses the HMAC-SHA1 signature
algorithm as defined in `RFC2104`_::
digest = HMAC-SHA1 (key, text)
Per `section 3.4.2`_ of the spec.
.. _`RFC2104`: http://tools.ietf.org/html/rfc2104
.. _`section 3.4.2`: http://tools.ietf.org/html/rfc5849#section-3.4.2
"""
# The HMAC-SHA1 function variables are used in following way:
# text is set to the value of the signature base string from
# `Section 3.4.1.1`_.
#
# .. _`Section 3.4.1.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.1
text = base_string
# key is set to the concatenated values of:
# 1. The client shared-secret, after being encoded (`Section 3.6`_).
#
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
key = utils.escape(client_secret or u'')
# 2. An "&" character (ASCII code 38), which MUST be included
# even when either secret is empty.
key += u'&'
# 3. The token shared-secret, after being encoded (`Section 3.6`_).
#
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
key += utils.escape(resource_owner_secret or u'')
# FIXME: HMAC does not support unicode!
key_utf8 = key.encode('utf-8')
text_utf8 = text.encode('utf-8')
signature = hmac.new(key_utf8, text_utf8, hashlib.sha1)
# digest is used to set the value of the "oauth_signature" protocol
# parameter, after the result octet string is base64-encoded
# per `RFC2045, Section 6.8`_.
#
# .. _`RFC2045, Section 6.8`: http://tools.ietf.org/html/rfc2045#section-6.8
return binascii.b2a_base64(signature.digest())[:-1].decode('utf-8')
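# Illustrative usage (editor's sketch): both secrets are escaped and joined
# with "&", so an empty resource owner secret still contributes the separator:
#
#   sign_hmac_sha1(base_string, u'kd94hf93k423kf44', u'')
#   # HMAC key used: 'kd94hf93k423kf44&'; result is a base64 text signature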
def sign_rsa_sha1(base_string, rsa_private_key):
"""**RSA-SHA1**
Per `section 3.4.3`_ of the spec.
The "RSA-SHA1" signature method uses the RSASSA-PKCS1-v1_5 signature
algorithm as defined in `RFC3447, Section 8.2`_ (also known as
PKCS#1), using SHA-1 as the hash function for EMSA-PKCS1-v1_5. To
use this method, the client MUST have established client credentials
with the server that included its RSA public key (in a manner that is
beyond the scope of this specification).
NOTE: this method requires the python-rsa library.
.. _`section 3.4.3`: http://tools.ietf.org/html/rfc5849#section-3.4.3
.. _`RFC3447, Section 8.2`: http://tools.ietf.org/html/rfc3447#section-8.2
"""
# TODO: finish RSA documentation
import rsa
key = rsa.PrivateKey.load_pkcs1(rsa_private_key)
sig = rsa.sign(base_string.encode('utf-8'), key, 'SHA-1')
return binascii.b2a_base64(sig)[:-1]
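# Illustrative usage (editor's sketch; requires the python-rsa package, key
# generation shown only for the example):
#
#   import rsa
#   _, private_key = rsa.newkeys(2048)
#   sig = sign_rsa_sha1(base_string, private_key.save_pkcs1())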
def sign_plaintext(client_secret, resource_owner_secret):
"""Sign a request using plaintext.
Per `section 3.4.4`_ of the spec.
The "PLAINTEXT" method does not employ a signature algorithm. It
MUST be used with a transport-layer mechanism such as TLS or SSL (or
sent over a secure channel with equivalent protections). It does not
utilize the signature base string or the "oauth_timestamp" and
"oauth_nonce" parameters.
.. _`section 3.4.4`: http://tools.ietf.org/html/rfc5849#section-3.4.4
"""
# The "oauth_signature" protocol parameter is set to the concatenated
# value of:
# 1. The client shared-secret, after being encoded (`Section 3.6`_).
#
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
signature = utils.escape(client_secret or u'')
# 2. An "&" character (ASCII code 38), which MUST be included even
# when either secret is empty.
signature += u'&'
# 3. The token shared-secret, after being encoded (`Section 3.6`_).
#
# .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
signature += utils.escape(resource_owner_secret or u'')
return signature
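# Illustrative usage (editor's sketch): with no signature algorithm, the
# result is just the two escaped secrets joined by "&":
#
#   sign_plaintext(u'kd94hf93k423kf44', u'pfkkdhi9sl3r4s00')
#   # -> u'kd94hf93k423kf44&pfkkdhi9sl3r4s00'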

141
libs/oauthlib/oauth1/rfc5849/utils.py

@ -0,0 +1,141 @@
# -*- coding: utf-8 -*-
"""
oauthlib.oauth1.rfc5849.utils
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This module contains utility methods used by various parts of the OAuth
spec.
"""
import string
import time
import urllib2
from random import getrandbits, choice
from oauthlib.common import quote, unquote
UNICODE_ASCII_CHARACTER_SET = (string.ascii_letters.decode('ascii') +
string.digits.decode('ascii'))
def filter_params(target):
"""Decorator which filters params to remove non-oauth_* parameters
Assumes the decorated method takes a params dict or list of tuples as its
first argument.
"""
def wrapper(params, *args, **kwargs):
params = filter_oauth_params(params)
return target(params, *args, **kwargs)
wrapper.__doc__ = target.__doc__
return wrapper
def filter_oauth_params(params):
"""Removes all non oauth parameters from a dict or a list of params."""
is_oauth = lambda kv: kv[0].startswith(u"oauth_")
if isinstance(params, dict):
return filter(is_oauth, params.items())
else:
return filter(is_oauth, params)
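# Illustrative usage (editor's sketch): only oauth_* pairs survive:
#
#   filter_oauth_params([(u'oauth_nonce', u'7d8f3e4a'), (u'a3', u'a')])
#   # -> [(u'oauth_nonce', u'7d8f3e4a')]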
def generate_timestamp():
"""Get seconds since epoch (UTC).
Per `section 3.3`_ of the spec.
.. _`section 3.3`: http://tools.ietf.org/html/rfc5849#section-3.3
"""
return unicode(int(time.time()))
def generate_nonce():
"""Generate pseudorandom nonce that is unlikely to repeat.
Per `section 3.3`_ of the spec.
A random 64-bit number is appended to the epoch timestamp for both
randomness and to decrease the likelihood of collisions.
.. _`section 3.3`: http://tools.ietf.org/html/rfc5849#section-3.3
"""
return unicode(getrandbits(64)) + generate_timestamp()
def generate_token(length=20, chars=UNICODE_ASCII_CHARACTER_SET):
"""Generates a generic OAuth token
According to `section 2`_ of the spec, the method of token
construction is undefined. This implementation is simply a random selection
of `length` choices from `chars`.
Credit to Ignacio Vazquez-Abrams for his excellent `Stackoverflow answer`_
.. _`Stackoverflow answer`: http://stackoverflow.com/questions/2257441/python-random-string-generation-with-upper-case-letters-and-digits
"""
return u''.join(choice(chars) for x in range(length))
def escape(u):
"""Escape a unicode string in an OAuth-compatible fashion.
Per `section 3.6`_ of the spec.
.. _`section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
"""
if not isinstance(u, unicode):
raise ValueError('Only unicode objects are escapable.')
# Letters, digits, and the characters '_.-' are already treated as safe
# by urllib.quote(). We need to add '~' to fully support rfc5849.
return quote(u, safe='~')
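# Illustrative usage (editor's sketch): spaces become %20 and "~" is left
# alone, matching the rfc5849 percent-encoding rules:
#
#   escape(u'r b~')
#   # -> u'r%20b~'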
def unescape(u):
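"""Unescape a unicode string in an OAuth-compatible fashion (inverse of escape)."""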
if not isinstance(u, unicode):
raise ValueError('Only unicode objects are unescapable.')
return unquote(u)
def urlencode(query):
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
Operates using an OAuth-safe escape() method, in contrast to urllib.urlencode.
"""
# Convert dictionaries to list of tuples
if isinstance(query, dict):
query = query.items()
return u"&".join([u'='.join([escape(k), escape(v)]) for k, v in query])
def parse_keqv_list(l):
"""A unicode-safe version of urllib2.parse_keqv_list"""
encoded_list = [u.encode('utf-8') for u in l]
encoded_parsed = urllib2.parse_keqv_list(encoded_list)
return dict((k.decode('utf-8'),
v.decode('utf-8')) for k,v in encoded_parsed.items())
def parse_http_list(u):
"""A unicode-safe version of urllib2.parse_http_list"""
encoded_str = u.encode('utf-8')
encoded_list = urllib2.parse_http_list(encoded_str)
return [s.decode('utf-8') for s in encoded_list]
def parse_authorization_header(authorization_header):
"""Parse an OAuth authorization header into a list of 2-tuples"""
auth_scheme = u'OAuth '
if authorization_header.startswith(auth_scheme):
authorization_header = authorization_header.replace(auth_scheme, u'', 1)
items = parse_http_list(authorization_header)
try:
return parse_keqv_list(items).items()
except ValueError:
raise ValueError('Malformed authorization header')
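# Illustrative usage (editor's sketch): quotes are stripped by parse_keqv_list,
# and the order of the returned pairs is not guaranteed (dict-backed):
#
#   parse_authorization_header(u'OAuth realm="Example", oauth_nonce="7d8f3e4a"')
#   # -> [(u'realm', u'Example'), (u'oauth_nonce', u'7d8f3e4a')]  (any order)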

13
libs/oauthlib/oauth2/__init__.py

@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
"""
oauthlib.oauth2
~~~~~~~~~~~~~~~
This module is a wrapper for the most recent implementation of OAuth 2.0 Client
and Server classes.
"""
from __future__ import absolute_import
from .draft25 import Client, Server

14
libs/oauthlib/oauth2/draft25/__init__.py

@ -0,0 +1,14 @@
"""
oauthlib.oauth2.draft25
~~~~~~~~~~~~~~~~~~~~~~~
This module is an implementation of various logic needed
for signing and checking OAuth 2.0 draft 25 requests.
"""
class Client(object):
pass
class Server(object):
pass

131
libs/oauthlib/oauth2/draft25/tokens.py

@ -0,0 +1,131 @@
"""
oauthlib.oauth2.draft25.tokens
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This module contains methods for adding two types of access tokens to requests.
- Bearer http://tools.ietf.org/html/draft-ietf-oauth-saml2-bearer-08
- MAC http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-00
"""
from __future__ import absolute_import
from binascii import b2a_base64
import hashlib
import hmac
from urlparse import urlparse
from . import utils
def prepare_mac_header(token, uri, key, http_method, nonce=None, headers=None,
body=None, ext=u'', hash_algorithm=u'hmac-sha-1'):
"""Add an `MAC Access Authentication`_ signature to headers.
Unlike OAuth 1, this HMAC signature does not require inclusion of the request
payload/body, neither does it use a combination of client_secret and
token_secret but rather a mac_key provided together with the access token.
Currently two algorithms are supported, "hmac-sha-1" and "hmac-sha-256",
`extension algorithms`_ are not supported.
Example MAC Authorization header, linebreaks added for clarity::
Authorization: MAC id="h480djs93hd8",
nonce="1336363200:dj83hs9s",
mac="bhCQXTVyfj5cmA9uKkPFx1zeOXM="
.. _`MAC Access Authentication`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01
.. _`extension algorithms`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-7.1
:param uri: Request URI.
:param headers: Request headers as a dictionary.
:param http_method: HTTP Request method.
:param key: MAC key provided by the token endpoint.
:param hash_algorithm: HMAC algorithm provided by the token endpoint.
:return: headers dictionary with the authorization field added.
"""
http_method = http_method.upper()
host, port = utils.host_from_uri(uri)
if hash_algorithm.lower() == u'hmac-sha-1':
h = hashlib.sha1
else:
h = hashlib.sha256
nonce = nonce or u'{0}:{1}'.format(utils.generate_nonce(), utils.generate_timestamp())
sch, net, path, par, query, fra = urlparse(uri)
if query:
request_uri = path + u'?' + query
else:
request_uri = path
# Hash the body/payload
if body is not None:
bodyhash = b2a_base64(h(body).digest())[:-1].decode('utf-8')
else:
bodyhash = u''
# Create the normalized base string
base = []
base.append(nonce)
base.append(http_method.upper())
base.append(request_uri)
base.append(host)
base.append(port)
base.append(bodyhash)
base.append(ext)
base_string = u'\n'.join(base) + u'\n'
# hmac struggles with unicode strings - http://bugs.python.org/issue5285
if isinstance(key, unicode):
key = key.encode('utf-8')
sign = hmac.new(key, base_string.encode('utf-8'), h)
sign = b2a_base64(sign.digest())[:-1].decode('utf-8')
header = []
header.append(u'MAC id="%s"' % token)
header.append(u'nonce="%s"' % nonce)
if bodyhash:
header.append(u'bodyhash="%s"' % bodyhash)
if ext:
header.append(u'ext="%s"' % ext)
header.append(u'mac="%s"' % sign)
headers = headers or {}
headers[u'Authorization'] = u', '.join(header)
return headers
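# Illustrative usage (editor's sketch; token and key values borrowed from the
# MAC draft examples):
#
#   headers = prepare_mac_header(
#       token=u'h480djs93hd8',
#       uri=u'http://example.com/resource/1?b=1&a=2',
#       key=u'489dks293j39',
#       http_method=u'GET')
#   # headers[u'Authorization'] -> u'MAC id="h480djs93hd8", nonce="...", mac="..."'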
def prepare_bearer_uri(token, uri):
"""Add a `Bearer Token`_ to the request URI.
Not recommended; use only if the client can't use the Authorization header or request body.
http://www.example.com/path?access_token=h480djs93hd8
.. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
"""
return utils.add_params_to_uri(uri, [(u'access_token', token)])
def prepare_bearer_headers(token, headers=None):
"""Add a `Bearer Token`_ to the request URI.
Recommended method of passing bearer tokens.
Authorization: Bearer h480djs93hd8
.. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
"""
headers = headers or {}
headers[u'Authorization'] = u'Bearer %s' % token
return headers
def prepare_bearer_body(token, body=u''):
"""Add a `Bearer Token`_ to the request body.
access_token=h480djs93hd8
.. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
"""
return utils.add_params_to_qs(body, [(u'access_token', token)])
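# Illustrative usage (editor's sketch):
#
#   prepare_bearer_headers(u'h480djs93hd8')
#   # -> {u'Authorization': u'Bearer h480djs93hd8'}
#   prepare_bearer_body(u'h480djs93hd8')
#   # -> u'access_token=h480djs93hd8'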

128
libs/oauthlib/oauth2/draft25/utils.py

@ -0,0 +1,128 @@
"""
oauthlib.oauth2.draft25.utils
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This module contains utility methods used by various parts of the OAuth 2 spec.
"""
import random
import string
import time
import urllib
from urlparse import urlparse, urlunparse, parse_qsl
UNICODE_ASCII_CHARACTER_SET = (string.ascii_letters.decode('ascii') +
string.digits.decode('ascii'))
def add_params_to_qs(query, params):
"""Extend a query with a list of two-tuples.
:param query: Query string.
:param params: List of two-tuples.
:return: extended query
"""
queryparams = parse_qsl(query, keep_blank_values=True)
queryparams.extend(params)
return urlencode(queryparams)
def add_params_to_uri(uri, params):
"""Add a list of two-tuples to the uri query components.
:param uri: Full URI.
:param params: List of two-tuples.
:return: uri with extended query
"""
sch, net, path, par, query, fra = urlparse(uri)
query = add_params_to_qs(query, params)
return urlunparse((sch, net, path, par, query, fra))
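# Illustrative usage (editor's sketch):
#
#   add_params_to_uri(u'https://example.com/path?a=1', [(u'b', u'2')])
#   # -> u'https://example.com/path?a=1&b=2'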
def escape(u):
"""Escape a string in an OAuth-compatible fashion.
Per `section 3.6`_ of the spec.
.. _`section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
"""
if not isinstance(u, unicode):
raise ValueError('Only unicode objects are escapable.')
return urllib.quote(u.encode('utf-8'), safe='~')
def generate_nonce():
"""Generate pseudorandom nonce that is unlikely to repeat.
Per `section 3.2.1`_ of the MAC Access Authentication spec.
A random 64-bit number is appended to the epoch timestamp for both
randomness and to decrease the likelihood of collisions.
.. _`section 3.2.1`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-3.2.1
"""
return unicode(random.getrandbits(64)) + generate_timestamp()
def generate_timestamp():
"""Get seconds since epoch (UTC).
Per `section 3.2.1`_ of the MAC Access Authentication spec.
.. _`section 3.2.1`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-3.2.1
"""
return unicode(int(time.time()))
def generate_token(length=20, chars=UNICODE_ASCII_CHARACTER_SET):
"""Generates a generic OAuth 2 token
According to `section 1.4`_ and `section 1.5`_ of the spec, the method of token
construction is undefined. This implementation is simply a random selection
of `length` choices from `chars`. SystemRandom is used since it draws from
a cryptographically secure entropy source, unlike the default random.choice.
.. _`section 1.4`: http://tools.ietf.org/html/draft-ietf-oauth-v2-25#section-1.4
.. _`section 1.5`: http://tools.ietf.org/html/draft-ietf-oauth-v2-25#section-1.5
"""
rand = random.SystemRandom()
return u''.join(rand.choice(chars) for x in range(length))
def host_from_uri(uri):
"""Extract hostname and port from URI.
Will use default port for HTTP and HTTPS if none is present in the URI.
>>> host_from_uri(u'https://www.example.com/path?query')
u'www.example.com', u'443'
>>> host_from_uri(u'http://www.example.com:8080/path?query')
u'www.example.com', u'8080'
:param uri: Full URI.
:param http_method: HTTP request method.
:return: hostname, port
"""
default_ports = {
u'HTTP' : u'80',
u'HTTPS' : u'443',
}
sch, netloc, path, par, query, fra = urlparse(uri)
if u':' in netloc:
netloc, port = netloc.split(u':', 1)
else:
port = default_ports.get(sch.upper())
return netloc, port
def urlencode(query):
"""Encode a sequence of two-element tuples or dictionary into a URL query string.
Operates using an OAuth-safe escape() method, in contrast to urllib.urlencode.
"""
# Convert dictionaries to list of tuples
if isinstance(query, dict):
query = query.items()
return "&".join(['='.join([escape(k), escape(v)]) for k, v in query])

17
libs/subliminal/api.py

@ -18,7 +18,8 @@
from .core import (SERVICES, LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE,
MATCHING_CONFIDENCE, create_list_tasks, consume_task, create_download_tasks,
group_by_video, key_subtitles)
from .languages import list_languages
import guessit
from guessit.language import ALL_LANGUAGES
import logging
@ -26,7 +27,7 @@ __all__ = ['list_subtitles', 'download_subtitles']
logger = logging.getLogger(__name__)
def list_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3):
def list_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None):
"""List subtitles in given paths according to the criteria
:param paths: path(s) to video file or folder
@ -37,19 +38,20 @@ def list_subtitles(paths, languages=None, services=None, force=True, multi=False
:param bool multi: search multiple languages for the same video
:param string cache_dir: path to the cache directory to use
:param int max_depth: maximum depth for scanning entries
:param function scan_filter: filter function that takes a path as argument and returns a boolean indicating whether it has to be filtered out (``True``) or not (``False``)
:return: found subtitles
:rtype: dict of :class:`~subliminal.videos.Video` => [:class:`~subliminal.subtitles.ResultSubtitle`]
"""
services = services or SERVICES
languages = set(languages or list_languages(1))
languages = set(map(guessit.Language, languages or []) or ALL_LANGUAGES)
if isinstance(paths, basestring):
paths = [paths]
if any([not isinstance(p, unicode) for p in paths]):
logger.warning(u'Not all entries are unicode')
results = []
service_instances = {}
tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth)
tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth, scan_filter)
for task in tasks:
try:
result = consume_task(task, service_instances)
@ -61,7 +63,7 @@ def list_subtitles(paths, languages=None, services=None, force=True, multi=False
return group_by_video(results)
def download_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, order=None):
def download_subtitles(paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None, order=None):
"""Download subtitles in given paths according to the criteria
:param paths: path(s) to video file or folder
@ -72,6 +74,7 @@ def download_subtitles(paths, languages=None, services=None, force=True, multi=F
:param bool multi: search multiple languages for the same video
:param string cache_dir: path to the cache directory to use
:param int max_depth: maximum depth for scanning entries
:param function scan_filter: filter function that takes a path as argument and returns a boolean indicating whether it has to be filtered out (``True``) or not (``False``)
:param order: preferred order for subtitles sorting
:type list: list of :data:`~subliminal.core.LANGUAGE_INDEX`, :data:`~subliminal.core.SERVICE_INDEX`, :data:`~subliminal.core.SERVICE_CONFIDENCE`, :data:`~subliminal.core.MATCHING_CONFIDENCE`
:return: found subtitles
@ -79,11 +82,11 @@ def download_subtitles(paths, languages=None, services=None, force=True, multi=F
"""
services = services or SERVICES
languages = languages or list_languages(1)
languages = map(guessit.Language, languages or []) or list(ALL_LANGUAGES)
if isinstance(paths, basestring):
paths = [paths]
order = order or [LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE, MATCHING_CONFIDENCE]
subtitles_by_video = list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth)
subtitles_by_video = list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth, scan_filter)
for video, subtitles in subtitles_by_video.iteritems():
subtitles.sort(key=lambda s: key_subtitles(s, video, languages, services, order), reverse=True)
results = []
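# Illustrative usage of the new scan_filter hook (editor's sketch, not part of
# the diff): skip sample files while downloading English subtitles:
#
#   download_subtitles([u'/videos'], languages=[u'en'],
#                      scan_filter=lambda path: u'sample' in path.lower())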

14
libs/subliminal/async.py

@ -18,7 +18,7 @@
from .core import (consume_task, LANGUAGE_INDEX, SERVICE_INDEX,
SERVICE_CONFIDENCE, MATCHING_CONFIDENCE, SERVICES, create_list_tasks,
create_download_tasks, group_by_video, key_subtitles)
from .languages import list_languages
from guessit.language import ALL_LANGUAGES
from .tasks import StopTask
import Queue
import logging
@ -108,29 +108,29 @@ class Pool(object):
break
return results
def list_subtitles(self, paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3):
def list_subtitles(self, paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None):
"""See :meth:`subliminal.list_subtitles`"""
services = services or SERVICES
languages = set(languages or list_languages(1))
languages = set(languages or ALL_LANGUAGES)
if isinstance(paths, basestring):
paths = [paths]
if any([not isinstance(p, unicode) for p in paths]):
logger.warning(u'Not all entries are unicode')
tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth)
tasks = create_list_tasks(paths, languages, services, force, multi, cache_dir, max_depth, scan_filter)
for task in tasks:
self.tasks.put(task)
self.join()
results = self.collect()
return group_by_video(results)
def download_subtitles(self, paths, languages=None, services=None, cache_dir=None, max_depth=3, force=True, multi=False, order=None):
def download_subtitles(self, paths, languages=None, services=None, force=True, multi=False, cache_dir=None, max_depth=3, scan_filter=None, order=None):
"""See :meth:`subliminal.download_subtitles`"""
services = services or SERVICES
languages = languages or list_languages(1)
languages = languages or list(ALL_LANGUAGES)
if isinstance(paths, basestring):
paths = [paths]
order = order or [LANGUAGE_INDEX, SERVICE_INDEX, SERVICE_CONFIDENCE, MATCHING_CONFIDENCE]
subtitles_by_video = self.list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth)
subtitles_by_video = self.list_subtitles(paths, set(languages), services, force, multi, cache_dir, max_depth, scan_filter)
for video, subtitles in subtitles_by_video.iteritems():
subtitles.sort(key=lambda s: key_subtitles(s, video, languages, services, order), reverse=True)
tasks = create_download_tasks(subtitles_by_video, multi)

Some files were not shown because too many files changed in this diff
