Browse Source

BeautifulSoup4 python 3

old/py3
Ruud 11 years ago
parent
commit
6b4e6857de
  1. 22
      libs/bs4/__init__.py
  2. 4
      libs/bs4/builder/__init__.py
  3. 12
      libs/bs4/builder/_html5lib.py
  4. 12
      libs/bs4/builder/_htmlparser.py
  5. 22
      libs/bs4/builder/_lxml.py
  6. 10
      libs/bs4/dammit.py
  7. 52
      libs/bs4/diagnose.py
  8. 110
      libs/bs4/element.py
  9. 32
      libs/bs4/testing.py

22
libs/bs4/__init__.py

@ -45,7 +45,7 @@ from .element import (
# The very first thing we do is give a useful error if someone is # The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it. # running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' syntax_error = 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag): class BeautifulSoup(Tag):
""" """
@ -69,7 +69,7 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then like HTML's <br> tag), call handle_starttag and then
handle_endtag. handle_endtag.
""" """
ROOT_TAG_NAME = u'[document]' ROOT_TAG_NAME = '[document]'
# If the end-user gives no indication which tree builder they # If the end-user gives no indication which tree builder they
# want, look for one with these features. # want, look for one with these features.
@ -135,12 +135,12 @@ class BeautifulSoup(Tag):
"fromEncoding", "from_encoding") "fromEncoding", "from_encoding")
if len(kwargs) > 0: if len(kwargs) > 0:
arg = kwargs.keys().pop() arg = list(kwargs.keys()).pop()
raise TypeError( raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg) "__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None: if builder is None:
if isinstance(features, basestring): if isinstance(features, str):
features = [features] features = [features]
if features is None or len(features) == 0: if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES features = self.DEFAULT_BUILDER_FEATURES
@ -164,7 +164,7 @@ class BeautifulSoup(Tag):
# involving passing non-markup to Beautiful Soup. # involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup, # Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants. # just in case that's what the user really wants.
if (isinstance(markup, unicode) if (isinstance(markup, str)
and not os.path.supports_unicode_filenames): and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8") possible_filename = markup.encode("utf8")
else: else:
@ -172,7 +172,7 @@ class BeautifulSoup(Tag):
is_file = False is_file = False
try: try:
is_file = os.path.exists(possible_filename) is_file = os.path.exists(possible_filename)
except Exception, e: except Exception as e:
# This is almost certainly a problem involving # This is almost certainly a problem involving
# characters not valid in filenames on this # characters not valid in filenames on this
# system. Just let it go. # system. Just let it go.
@ -184,7 +184,7 @@ class BeautifulSoup(Tag):
# TODO: This is ugly but I couldn't get it to work in # TODO: This is ugly but I couldn't get it to work in
# Python 3 otherwise. # Python 3 otherwise.
if ((isinstance(markup, bytes) and not b' ' in markup) if ((isinstance(markup, bytes) and not b' ' in markup)
or (isinstance(markup, unicode) and not u' ' in markup)): or (isinstance(markup, str) and not ' ' in markup)):
warnings.warn( warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
@ -259,7 +259,7 @@ class BeautifulSoup(Tag):
def endData(self, containerClass=NavigableString): def endData(self, containerClass=NavigableString):
if self.current_data: if self.current_data:
current_data = u''.join(self.current_data) current_data = ''.join(self.current_data)
# If whitespace is not preserved, and this string contains # If whitespace is not preserved, and this string contains
# nothing but ASCII spaces, replace it with a single space # nothing but ASCII spaces, replace it with a single space
# or newline. # or newline.
@ -367,9 +367,9 @@ class BeautifulSoup(Tag):
encoding_part = '' encoding_part = ''
if eventual_encoding != None: if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part prefix = '<?xml version="1.0"%s?>\n' % encoding_part
else: else:
prefix = u'' prefix = ''
if not pretty_print: if not pretty_print:
indent_level = None indent_level = None
else: else:
@ -403,4 +403,4 @@ class FeatureNotFound(ValueError):
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
soup = BeautifulSoup(sys.stdin) soup = BeautifulSoup(sys.stdin)
print soup.prettify() print(soup.prettify())

4
libs/bs4/builder/__init__.py

@ -153,13 +153,13 @@ class TreeBuilder(object):
universal = self.cdata_list_attributes.get('*', []) universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get( tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None) tag_name.lower(), None)
for attr in attrs.keys(): for attr in list(attrs.keys()):
if attr in universal or (tag_specific and attr in tag_specific): if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string # We have a "class"-type attribute whose string
# value is a whitespace-separated list of # value is a whitespace-separated list of
# values. Split it into a list. # values. Split it into a list.
value = attrs[attr] value = attrs[attr]
if isinstance(value, basestring): if isinstance(value, str):
values = whitespace_re.split(value) values = whitespace_re.split(value)
else: else:
# html5lib sometimes calls setAttributes twice # html5lib sometimes calls setAttributes twice

12
libs/bs4/builder/_html5lib.py

@ -37,7 +37,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, encoding=self.user_specified_encoding) doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer. # Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode): if isinstance(markup, str):
# We need to special-case this because html5lib sets # We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input. # charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None doc.original_encoding = None
@ -51,7 +51,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment return '<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
@ -124,7 +124,7 @@ class Element(html5lib.treebuilders._base.Node):
def appendChild(self, node): def appendChild(self, node):
string_child = child = None string_child = child = None
if isinstance(node, basestring): if isinstance(node, str):
# Some other piece of code decided to pass in a string # Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the # instead of creating a TextElement object to contain the
# string. # string.
@ -139,7 +139,7 @@ class Element(html5lib.treebuilders._base.Node):
else: else:
child = node.element child = node.element
if not isinstance(child, basestring) and child.parent is not None: if not isinstance(child, str) and child.parent is not None:
node.element.extract() node.element.extract()
if (string_child and self.element.contents if (string_child and self.element.contents
@ -152,7 +152,7 @@ class Element(html5lib.treebuilders._base.Node):
old_element.replace_with(new_element) old_element.replace_with(new_element)
self.soup._most_recent_element = new_element self.soup._most_recent_element = new_element
else: else:
if isinstance(node, basestring): if isinstance(node, str):
# Create a brand new NavigableString from this string. # Create a brand new NavigableString from this string.
child = self.soup.new_string(node) child = self.soup.new_string(node)
@ -183,7 +183,7 @@ class Element(html5lib.treebuilders._base.Node):
self.soup.builder._replace_cdata_list_attribute_values( self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes) self.name, attributes)
for name, value in attributes.items(): for name, value in list(attributes.items()):
self.element[name] = value self.element[name] = value
# The attributes may contain variables that need substitution. # The attributes may contain variables that need substitution.

12
libs/bs4/builder/_htmlparser.py

@ -4,7 +4,7 @@ __all__ = [
'HTMLParserTreeBuilder', 'HTMLParserTreeBuilder',
] ]
from HTMLParser import ( from html.parser import (
HTMLParser, HTMLParser,
HTMLParseError, HTMLParseError,
) )
@ -72,9 +72,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
real_name = int(name) real_name = int(name)
try: try:
data = unichr(real_name) data = chr(real_name)
except (ValueError, OverflowError), e: except (ValueError, OverflowError) as e:
data = u"\N{REPLACEMENT CHARACTER}" data = "\N{REPLACEMENT CHARACTER}"
self.handle_data(data) self.handle_data(data)
@ -142,7 +142,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
declared within markup, whether any characters had to be declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER). replaced with REPLACEMENT CHARACTER).
""" """
if isinstance(markup, unicode): if isinstance(markup, str):
yield (markup, None, None, False) yield (markup, None, None, False)
return return
@ -158,7 +158,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup parser.soup = self.soup
try: try:
parser.feed(markup) parser.feed(markup)
except HTMLParseError, e: except HTMLParseError as e:
warnings.warn(RuntimeWarning( warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e raise e

22
libs/bs4/builder/_lxml.py

@ -4,7 +4,7 @@ __all__ = [
] ]
from io import BytesIO from io import BytesIO
from StringIO import StringIO from io import StringIO
import collections import collections
from lxml import etree from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute from bs4.element import Comment, Doctype, NamespacedAttribute
@ -78,12 +78,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
Each 4-tuple represents a strategy for parsing the document. Each 4-tuple represents a strategy for parsing the document.
""" """
if isinstance(markup, unicode): if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on # We were given Unicode. Maybe lxml can parse Unicode on
# this system? # this system?
yield markup, None, document_declared_encoding, False yield markup, None, document_declared_encoding, False
if isinstance(markup, unicode): if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and # No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8. # tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8", yield (markup.encode("utf8"), "utf8",
@ -102,7 +102,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def feed(self, markup): def feed(self, markup):
if isinstance(markup, bytes): if isinstance(markup, bytes):
markup = BytesIO(markup) markup = BytesIO(markup)
elif isinstance(markup, unicode): elif isinstance(markup, str):
markup = StringIO(markup) markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty, # Call feed() at least once, even if the markup is empty,
@ -117,7 +117,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0: if len(data) != 0:
self.parser.feed(data) self.parser.feed(data)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(str(e))
def close(self): def close(self):
@ -135,12 +135,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.append(None) self.nsmaps.append(None)
elif len(nsmap) > 0: elif len(nsmap) > 0:
# A new namespace mapping has come into play. # A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items()) inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
self.nsmaps.append(inverted_nsmap) self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the # Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later. # tag, so we can recreate it later.
attrs = attrs.copy() attrs = attrs.copy()
for prefix, namespace in nsmap.items(): for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute( attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/") "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace attrs[attribute] = namespace
@ -149,7 +149,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and # from lxml with namespaces attached to their names, and
# turn them into NamespacedAttribute objects. # turn them into NamespacedAttribute objects.
new_attrs = {} new_attrs = {}
for attr, value in attrs.items(): for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr) namespace, attr = self._getNsTag(attr)
if namespace is None: if namespace is None:
new_attrs[attr] = value new_attrs[attr] = value
@ -207,7 +207,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@ -224,10 +224,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding) self.parser = self.parser_for(encoding)
self.parser.feed(markup) self.parser.feed(markup)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(str(e))
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`.""" """See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment return '<html><body>%s</body></html>' % fragment

10
libs/bs4/dammit.py

@ -8,7 +8,7 @@ XML or HTML to reflect a new encoding; that's the tree builder's job.
""" """
import codecs import codecs
from htmlentitydefs import codepoint2name from html.entities import codepoint2name
import re import re
import logging import logging
import string import string
@ -56,7 +56,7 @@ class EntitySubstitution(object):
reverse_lookup = {} reverse_lookup = {}
characters_for_re = [] characters_for_re = []
for codepoint, name in list(codepoint2name.items()): for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint) character = chr(codepoint)
if codepoint != 34: if codepoint != 34:
# There's no point in turning the quotation mark into # There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which # &quot;, unless it happens within an attribute value, which
@ -340,9 +340,9 @@ class UnicodeDammit:
self.detector = EncodingDetector(markup, override_encodings, is_html) self.detector = EncodingDetector(markup, override_encodings, is_html)
# Short-circuit if the data is in Unicode to begin with. # Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '': if isinstance(markup, str) or markup == '':
self.markup = markup self.markup = markup
self.unicode_markup = unicode(markup) self.unicode_markup = str(markup)
self.original_encoding = None self.original_encoding = None
return return
@ -425,7 +425,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"): def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode. '''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases''' %encoding is a string recognized by encodings.aliases'''
return unicode(data, encoding, errors) return str(data, encoding, errors)
@property @property
def declared_html_encoding(self): def declared_html_encoding(self):

52
libs/bs4/diagnose.py

@ -1,7 +1,7 @@
"""Diagnostic functions, mainly for use when doing tech support.""" """Diagnostic functions, mainly for use when doing tech support."""
import cProfile import cProfile
from StringIO import StringIO from io import StringIO
from HTMLParser import HTMLParser from html.parser import HTMLParser
import bs4 import bs4
from bs4 import BeautifulSoup, __version__ from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry from bs4.builder import builder_registry
@ -17,8 +17,8 @@ import cProfile
def diagnose(data): def diagnose(data):
"""Diagnostic suite for isolating common problems.""" """Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__ print("Diagnostic running on Beautiful Soup %s" % __version__)
print "Python version %s" % sys.version print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"] basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers: for name in basic_parsers:
@ -27,44 +27,44 @@ def diagnose(data):
break break
else: else:
basic_parsers.remove(name) basic_parsers.remove(name)
print ( print((
"I noticed that %s is not installed. Installing it may help." % "I noticed that %s is not installed. Installing it may help." %
name) name))
if 'lxml' in basic_parsers: if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"]) basic_parsers.append(["lxml", "xml"])
from lxml import etree from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
if 'html5lib' in basic_parsers: if 'html5lib' in basic_parsers:
import html5lib import html5lib
print "Found html5lib version %s" % html5lib.__version__ print("Found html5lib version %s" % html5lib.__version__)
if hasattr(data, 'read'): if hasattr(data, 'read'):
data = data.read() data = data.read()
elif os.path.exists(data): elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data print('"%s" looks like a filename. Reading data from the file.' % data)
data = open(data).read() data = open(data).read()
elif data.startswith("http:") or data.startswith("https:"): elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return return
print print()
for parser in basic_parsers: for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser print("Trying to parse your markup with %s" % parser)
success = False success = False
try: try:
soup = BeautifulSoup(data, parser) soup = BeautifulSoup(data, parser)
success = True success = True
except Exception, e: except Exception as e:
print "%s could not parse the markup." % parser print("%s could not parse the markup." % parser)
traceback.print_exc() traceback.print_exc()
if success: if success:
print "Here's what %s did with the markup:" % parser print("Here's what %s did with the markup:" % parser)
print soup.prettify() print(soup.prettify())
print "-" * 80 print("-" * 80)
def lxml_trace(data, html=True, **kwargs): def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing. """Print out the lxml events that occur during parsing.
@ -74,7 +74,7 @@ def lxml_trace(data, html=True, **kwargs):
""" """
from lxml import etree from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print("%s, %4s, %s" % (event, element.tag, element.text)) print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser): class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else.""" """Announces HTMLParser parse events, without doing anything else."""
@ -156,9 +156,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000): def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark.""" """Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__ print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements) data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data) print("Generated a large invalid HTML document (%d bytes)." % len(data))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False success = False
@ -167,24 +167,24 @@ def benchmark_parsers(num_elements=100000):
soup = BeautifulSoup(data, parser) soup = BeautifulSoup(data, parser)
b = time.time() b = time.time()
success = True success = True
except Exception, e: except Exception as e:
print "%s could not parse the markup." % parser print("%s could not parse the markup." % parser)
traceback.print_exc() traceback.print_exc()
if success: if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree from lxml import etree
a = time.time() a = time.time()
etree.HTML(data) etree.HTML(data)
b = time.time() b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a) print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib import html5lib
parser = html5lib.HTMLParser() parser = html5lib.HTMLParser()
a = time.time() a = time.time()
parser.parse(data) parser.parse(data)
b = time.time() b = time.time()
print "Raw html5lib parsed the markup in %.2fs." % (b-a) print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"): def profile(num_elements=100000, parser="lxml"):

110
libs/bs4/element.py

@ -21,22 +21,22 @@ def _alias(attr):
return alias return alias
class NamespacedAttribute(unicode): class NamespacedAttribute(str):
def __new__(cls, prefix, name, namespace=None): def __new__(cls, prefix, name, namespace=None):
if name is None: if name is None:
obj = unicode.__new__(cls, prefix) obj = str.__new__(cls, prefix)
elif prefix is None: elif prefix is None:
# Not really namespaced. # Not really namespaced.
obj = unicode.__new__(cls, name) obj = str.__new__(cls, name)
else: else:
obj = unicode.__new__(cls, prefix + ":" + name) obj = str.__new__(cls, prefix + ":" + name)
obj.prefix = prefix obj.prefix = prefix
obj.name = name obj.name = name
obj.namespace = namespace obj.namespace = namespace
return obj return obj
class AttributeValueWithCharsetSubstitution(unicode): class AttributeValueWithCharsetSubstitution(str):
"""A stand-in object for a character encoding specified in HTML.""" """A stand-in object for a character encoding specified in HTML."""
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@ -47,7 +47,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
""" """
def __new__(cls, original_value): def __new__(cls, original_value):
obj = unicode.__new__(cls, original_value) obj = str.__new__(cls, original_value)
obj.original_value = original_value obj.original_value = original_value
return obj return obj
@ -70,9 +70,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
match = cls.CHARSET_RE.search(original_value) match = cls.CHARSET_RE.search(original_value)
if match is None: if match is None:
# No substitution necessary. # No substitution necessary.
return unicode.__new__(unicode, original_value) return str.__new__(str, original_value)
obj = unicode.__new__(cls, original_value) obj = str.__new__(cls, original_value)
obj.original_value = original_value obj.original_value = original_value
return obj return obj
@ -152,7 +152,7 @@ class PageElement(object):
def format_string(self, s, formatter='minimal'): def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter.""" """Format the given string using the given formatter."""
if not callable(formatter): if not isinstance(formatter, collections.Callable):
formatter = self._formatter_for_name(formatter) formatter = self._formatter_for_name(formatter)
if formatter is None: if formatter is None:
output = s output = s
@ -272,7 +272,7 @@ class PageElement(object):
def insert(self, position, new_child): def insert(self, position, new_child):
if new_child is self: if new_child is self:
raise ValueError("Cannot insert a tag into itself.") raise ValueError("Cannot insert a tag into itself.")
if (isinstance(new_child, basestring) if (isinstance(new_child, str)
and not isinstance(new_child, NavigableString)): and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child) new_child = NavigableString(new_child)
@ -489,7 +489,7 @@ class PageElement(object):
result = (element for element in generator result = (element for element in generator
if isinstance(element, Tag)) if isinstance(element, Tag))
return ResultSet(strainer, result) return ResultSet(strainer, result)
elif isinstance(name, basestring): elif isinstance(name, str):
# Optimization to find all tags with a given name. # Optimization to find all tags with a given name.
result = (element for element in generator result = (element for element in generator
if isinstance(element, Tag) if isinstance(element, Tag)
@ -640,7 +640,7 @@ class PageElement(object):
return self.parents return self.parents
class NavigableString(unicode, PageElement): class NavigableString(str, PageElement):
PREFIX = '' PREFIX = ''
SUFFIX = '' SUFFIX = ''
@ -653,15 +653,15 @@ class NavigableString(unicode, PageElement):
passed in to the superclass's __new__ or the superclass won't know passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters. how to handle non-ASCII characters.
""" """
if isinstance(value, unicode): if isinstance(value, str):
return unicode.__new__(cls, value) return str.__new__(cls, value)
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __copy__(self): def __copy__(self):
return self return self
def __getnewargs__(self): def __getnewargs__(self):
return (unicode(self),) return (str(self),)
def __getattr__(self, attr): def __getattr__(self, attr):
"""text.string gives you text. This is for backwards """text.string gives you text. This is for backwards
@ -701,23 +701,23 @@ class PreformattedString(NavigableString):
class CData(PreformattedString): class CData(PreformattedString):
PREFIX = u'<![CDATA[' PREFIX = '<![CDATA['
SUFFIX = u']]>' SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString): class ProcessingInstruction(PreformattedString):
PREFIX = u'<?' PREFIX = '<?'
SUFFIX = u'?>' SUFFIX = '?>'
class Comment(PreformattedString): class Comment(PreformattedString):
PREFIX = u'<!--' PREFIX = '<!--'
SUFFIX = u'-->' SUFFIX = '-->'
class Declaration(PreformattedString): class Declaration(PreformattedString):
PREFIX = u'<!' PREFIX = '<!'
SUFFIX = u'!>' SUFFIX = '!>'
class Doctype(PreformattedString): class Doctype(PreformattedString):
@ -734,8 +734,8 @@ class Doctype(PreformattedString):
return Doctype(value) return Doctype(value)
PREFIX = u'<!DOCTYPE ' PREFIX = '<!DOCTYPE '
SUFFIX = u'>\n' SUFFIX = '>\n'
class Tag(PageElement): class Tag(PageElement):
@ -843,7 +843,7 @@ class Tag(PageElement):
for string in self._all_strings(True): for string in self._all_strings(True):
yield string yield string
def get_text(self, separator=u"", strip=False, def get_text(self, separator="", strip=False,
types=(NavigableString, CData)): types=(NavigableString, CData)):
""" """
Get all child strings, concatenated using the given separator. Get all child strings, concatenated using the given separator.
@ -915,7 +915,7 @@ class Tag(PageElement):
def __contains__(self, x): def __contains__(self, x):
return x in self.contents return x in self.contents
def __nonzero__(self): def __bool__(self):
"A tag is non-None even if it has no contents." "A tag is non-None even if it has no contents."
return True return True
@ -1014,7 +1014,7 @@ class Tag(PageElement):
# First off, turn a string formatter into a function. This # First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again. # will stop the lookup from happening over and over again.
if not callable(formatter): if not isinstance(formatter, collections.Callable):
formatter = self._formatter_for_name(formatter) formatter = self._formatter_for_name(formatter)
attrs = [] attrs = []
@ -1025,8 +1025,8 @@ class Tag(PageElement):
else: else:
if isinstance(val, list) or isinstance(val, tuple): if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val) val = ' '.join(val)
elif not isinstance(val, basestring): elif not isinstance(val, str):
val = unicode(val) val = str(val)
elif ( elif (
isinstance(val, AttributeValueWithCharsetSubstitution) isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None): and eventual_encoding is not None):
@ -1034,7 +1034,7 @@ class Tag(PageElement):
text = self.format_string(val, formatter) text = self.format_string(val, formatter)
decoded = ( decoded = (
unicode(key) + '=' str(key) + '='
+ EntitySubstitution.quoted_attribute_value(text)) + EntitySubstitution.quoted_attribute_value(text))
attrs.append(decoded) attrs.append(decoded)
close = '' close = ''
@ -1112,7 +1112,7 @@ class Tag(PageElement):
""" """
# First off, turn a string formatter into a function. This # First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again. # will stop the lookup from happening over and over again.
if not callable(formatter): if not isinstance(formatter, collections.Callable):
formatter = self._formatter_for_name(formatter) formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None) pretty_print = (indent_level is not None)
@ -1210,16 +1210,16 @@ class Tag(PageElement):
raise ValueError( raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1]) 'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug: if self._select_debug:
print 'Running CSS selector "%s"' % selector print('Running CSS selector "%s"' % selector)
for index, token in enumerate(tokens): for index, token in enumerate(tokens):
if self._select_debug: if self._select_debug:
print ' Considering token "%s"' % token print(' Considering token "%s"' % token)
recursive_candidate_generator = None recursive_candidate_generator = None
tag_name = None tag_name = None
if tokens[index-1] in self._selector_combinators: if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it. # This token was consumed by the previous combinator. Skip it.
if self._select_debug: if self._select_debug:
print ' Token was consumed by the previous combinator.' print(' Token was consumed by the previous combinator.')
continue continue
# Each operation corresponds to a checker function, a rule # Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the # for determining whether a candidate matches the
@ -1325,14 +1325,14 @@ class Tag(PageElement):
next_token = tokens[index+1] next_token = tokens[index+1]
def recursive_select(tag): def recursive_select(tag):
if self._select_debug: if self._select_debug:
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
print '-' * 40 print('-' * 40)
for i in tag.select(next_token, recursive_candidate_generator): for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug: if self._select_debug:
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
yield i yield i
if self._select_debug: if self._select_debug:
print '-' * 40 print('-' * 40)
_use_candidate_generator = recursive_select _use_candidate_generator = recursive_select
elif _candidate_generator is None: elif _candidate_generator is None:
# By default, a tag's candidates are all of its # By default, a tag's candidates are all of its
@ -1343,7 +1343,7 @@ class Tag(PageElement):
check = "[any]" check = "[any]"
else: else:
check = tag_name check = tag_name
print ' Default candidate generator, tag name="%s"' % check print(' Default candidate generator, tag name="%s"' % check)
if self._select_debug: if self._select_debug:
# This is redundant with later code, but it stops # This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the # a bunch of bogus tags from cluttering up the
@ -1365,8 +1365,8 @@ class Tag(PageElement):
new_context_ids = set([]) new_context_ids = set([])
for tag in current_context: for tag in current_context:
if self._select_debug: if self._select_debug:
print " Running candidate generator on %s %s" % ( print(" Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs)) tag.name, repr(tag.attrs)))
for candidate in _use_candidate_generator(tag): for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag): if not isinstance(candidate, Tag):
continue continue
@ -1381,21 +1381,21 @@ class Tag(PageElement):
break break
if checker is None or result: if checker is None or result:
if self._select_debug: if self._select_debug:
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
if id(candidate) not in new_context_ids: if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once, # If a tag matches a selector more than once,
# don't include it in the context more than once. # don't include it in the context more than once.
new_context.append(candidate) new_context.append(candidate)
new_context_ids.add(id(candidate)) new_context_ids.add(id(candidate))
elif self._select_debug: elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
current_context = new_context current_context = new_context
if self._select_debug: if self._select_debug:
print "Final verdict:" print("Final verdict:")
for i in current_context: for i in current_context:
print " %s %s" % (i.name, i.attrs) print(" %s %s" % (i.name, i.attrs))
return current_context return current_context
# Old names for backwards compatibility # Old names for backwards compatibility
@ -1439,7 +1439,7 @@ class SoupStrainer(object):
else: else:
attrs = kwargs attrs = kwargs
normalized_attrs = {} normalized_attrs = {}
for key, value in attrs.items(): for key, value in list(attrs.items()):
normalized_attrs[key] = self._normalize_search_value(value) normalized_attrs[key] = self._normalize_search_value(value)
self.attrs = normalized_attrs self.attrs = normalized_attrs
@ -1448,7 +1448,7 @@ class SoupStrainer(object):
def _normalize_search_value(self, value): def _normalize_search_value(self, value):
# Leave it alone if it's a Unicode string, a callable, a # Leave it alone if it's a Unicode string, a callable, a
# regular expression, a boolean, or None. # regular expression, a boolean, or None.
if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match')
or isinstance(value, bool) or value is None): or isinstance(value, bool) or value is None):
return value return value
@ -1461,7 +1461,7 @@ class SoupStrainer(object):
new_value = [] new_value = []
for v in value: for v in value:
if (hasattr(v, '__iter__') and not isinstance(v, bytes) if (hasattr(v, '__iter__') and not isinstance(v, bytes)
and not isinstance(v, unicode)): and not isinstance(v, str)):
# This is almost certainly the user's mistake. In the # This is almost certainly the user's mistake. In the
# interests of avoiding infinite loops, we'll let # interests of avoiding infinite loops, we'll let
# it through as-is rather than doing a recursive call. # it through as-is rather than doing a recursive call.
@ -1473,7 +1473,7 @@ class SoupStrainer(object):
# Otherwise, convert it into a Unicode string. # Otherwise, convert it into a Unicode string.
# The unicode(str()) thing is so this will do the same thing on Python 2 # The unicode(str()) thing is so this will do the same thing on Python 2
# and Python 3. # and Python 3.
return unicode(str(value)) return str(str(value))
def __str__(self): def __str__(self):
if self.text: if self.text:
@ -1527,7 +1527,7 @@ class SoupStrainer(object):
found = None found = None
# If given a list of items, scan it for a text element that # If given a list of items, scan it for a text element that
# matches. # matches.
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
for element in markup: for element in markup:
if isinstance(element, NavigableString) \ if isinstance(element, NavigableString) \
and self.search(element): and self.search(element):
@ -1540,7 +1540,7 @@ class SoupStrainer(object):
found = self.search_tag(markup) found = self.search_tag(markup)
# If it's text, make sure the text matches. # If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \ elif isinstance(markup, NavigableString) or \
isinstance(markup, basestring): isinstance(markup, str):
if not self.name and not self.attrs and self._matches(markup, self.text): if not self.name and not self.attrs and self._matches(markup, self.text):
found = markup found = markup
else: else:
@ -1554,7 +1554,7 @@ class SoupStrainer(object):
if isinstance(markup, list) or isinstance(markup, tuple): if isinstance(markup, list) or isinstance(markup, tuple):
# This should only happen when searching a multi-valued attribute # This should only happen when searching a multi-valued attribute
# like 'class'. # like 'class'.
if (isinstance(match_against, unicode) if (isinstance(match_against, str)
and ' ' in match_against): and ' ' in match_against):
# A bit of a special case. If they try to match "foo # A bit of a special case. If they try to match "foo
# bar" on a multivalue attribute's value, only accept # bar" on a multivalue attribute's value, only accept
@ -1589,7 +1589,7 @@ class SoupStrainer(object):
# None matches None, False, an empty string, an empty list, and so on. # None matches None, False, an empty string, an empty list, and so on.
return not match_against return not match_against
if isinstance(match_against, unicode): if isinstance(match_against, str):
# Exact string match # Exact string match
return markup == match_against return markup == match_against

32
libs/bs4/testing.py

@ -225,14 +225,14 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>') self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self): def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect) self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect) self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self): def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect) self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect) self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@ -243,7 +243,7 @@ class HTMLTreeBuilderSmokeTest(object):
'<p>I said "good day!"</p>') '<p>I said "good day!"</p>')
def test_out_of_range_entity(self): def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}" expect = "\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect) self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect) self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect) self.assertSoupEquals("&#1000000000;", expect)
@ -285,9 +285,9 @@ class HTMLTreeBuilderSmokeTest(object):
# A seemingly innocuous document... but it's in Unicode! And # A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the # it contains characters that can't be represented in the
# encoding found in the declaration! The horror! # encoding found in the declaration! The horror!
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>' markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self): def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers.""" """Parsers should be able to work with SoupStrainers."""
@ -327,7 +327,7 @@ class HTMLTreeBuilderSmokeTest(object):
# Both XML and HTML entities are converted to Unicode characters # Both XML and HTML entities are converted to Unicode characters
# during parsing. # during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>" expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected) self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self): def test_smart_quotes_converted_on_the_way_in(self):
@ -337,15 +337,15 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(quote) soup = self.soup(quote)
self.assertEqual( self.assertEqual(
soup.p.string, soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self): def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>") soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self): def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>" text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8") expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text) soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected) self.assertEqual(soup.p.encode("utf-8"), expected)
@ -354,7 +354,7 @@ class HTMLTreeBuilderSmokeTest(object):
# easy-to-understand document. # easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use # That's because we're going to encode it into ISO-Latin-1, and use
# that to test. # that to test.
@ -493,15 +493,15 @@ class XMLTreeBuilderSmokeTest(object):
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded) self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_can_parse_unicode_document(self): def test_can_parse_unicode_document(self):
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>' markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self): def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>' markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual( self.assertEqual(
unicode(soup.rss), markup) str(soup.rss), markup)
def test_docstring_includes_correct_encoding(self): def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>") soup = self.soup("<root/>")
@ -532,17 +532,17 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self): def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>' markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup) self.assertEqual(str(soup.p), markup)
def test_namespaced_attributes(self): def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>' markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup) self.assertEqual(str(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self): def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>' markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup) soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup) self.assertEqual(str(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5.""" """Smoke test for a tree builder that supports HTML5."""

Loading…
Cancel
Save