Browse Source

BeautifulSoup4 python 3

old/py3
Ruud 11 years ago
parent
commit
6b4e6857de
  1. 22
      libs/bs4/__init__.py
  2. 4
      libs/bs4/builder/__init__.py
  3. 12
      libs/bs4/builder/_html5lib.py
  4. 12
      libs/bs4/builder/_htmlparser.py
  5. 22
      libs/bs4/builder/_lxml.py
  6. 10
      libs/bs4/dammit.py
  7. 52
      libs/bs4/diagnose.py
  8. 110
      libs/bs4/element.py
  9. 32
      libs/bs4/testing.py

22
libs/bs4/__init__.py

@ -45,7 +45,7 @@ from .element import (
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
syntax_error = 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
@ -69,7 +69,7 @@ class BeautifulSoup(Tag):
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
ROOT_TAG_NAME = '[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
@ -135,12 +135,12 @@ class BeautifulSoup(Tag):
"fromEncoding", "from_encoding")
if len(kwargs) > 0:
arg = kwargs.keys().pop()
arg = list(kwargs.keys()).pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
if isinstance(features, basestring):
if isinstance(features, str):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
@ -164,7 +164,7 @@ class BeautifulSoup(Tag):
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
if (isinstance(markup, unicode)
if (isinstance(markup, str)
and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
@ -172,7 +172,7 @@ class BeautifulSoup(Tag):
is_file = False
try:
is_file = os.path.exists(possible_filename)
except Exception, e:
except Exception as e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
@ -184,7 +184,7 @@ class BeautifulSoup(Tag):
# TODO: This is ugly but I couldn't get it to work in
# Python 3 otherwise.
if ((isinstance(markup, bytes) and not b' ' in markup)
or (isinstance(markup, unicode) and not u' ' in markup)):
or (isinstance(markup, str) and not ' ' in markup)):
warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
@ -259,7 +259,7 @@ class BeautifulSoup(Tag):
def endData(self, containerClass=NavigableString):
if self.current_data:
current_data = u''.join(self.current_data)
current_data = ''.join(self.current_data)
# If whitespace is not preserved, and this string contains
# nothing but ASCII spaces, replace it with a single space
# or newline.
@ -367,9 +367,9 @@ class BeautifulSoup(Tag):
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
else:
prefix = u''
prefix = ''
if not pretty_print:
indent_level = None
else:
@ -403,4 +403,4 @@ class FeatureNotFound(ValueError):
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print soup.prettify()
print(soup.prettify())

4
libs/bs4/builder/__init__.py

@ -153,13 +153,13 @@ class TreeBuilder(object):
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None)
for attr in attrs.keys():
for attr in list(attrs.keys()):
if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string
# value is a whitespace-separated list of
# values. Split it into a list.
value = attrs[attr]
if isinstance(value, basestring):
if isinstance(value, str):
values = whitespace_re.split(value)
else:
# html5lib sometimes calls setAttributes twice

12
libs/bs4/builder/_html5lib.py

@ -37,7 +37,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
if isinstance(markup, str):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
@ -51,7 +51,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
return '<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
@ -124,7 +124,7 @@ class Element(html5lib.treebuilders._base.Node):
def appendChild(self, node):
string_child = child = None
if isinstance(node, basestring):
if isinstance(node, str):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
@ -139,7 +139,7 @@ class Element(html5lib.treebuilders._base.Node):
else:
child = node.element
if not isinstance(child, basestring) and child.parent is not None:
if not isinstance(child, str) and child.parent is not None:
node.element.extract()
if (string_child and self.element.contents
@ -152,7 +152,7 @@ class Element(html5lib.treebuilders._base.Node):
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
if isinstance(node, basestring):
if isinstance(node, str):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
@ -183,7 +183,7 @@ class Element(html5lib.treebuilders._base.Node):
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
for name, value in attributes.items():
for name, value in list(attributes.items()):
self.element[name] = value
# The attributes may contain variables that need substitution.

12
libs/bs4/builder/_htmlparser.py

@ -4,7 +4,7 @@ __all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import (
from html.parser import (
HTMLParser,
HTMLParseError,
)
@ -72,9 +72,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
real_name = int(name)
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
data = u"\N{REPLACEMENT CHARACTER}"
data = chr(real_name)
except (ValueError, OverflowError) as e:
data = "\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
@ -142,7 +142,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
if isinstance(markup, str):
yield (markup, None, None, False)
return
@ -158,7 +158,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
except HTMLParseError, e:
except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e

22
libs/bs4/builder/_lxml.py

@ -4,7 +4,7 @@ __all__ = [
]
from io import BytesIO
from StringIO import StringIO
from io import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
@ -78,12 +78,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
Each 4-tuple represents a strategy for parsing the document.
"""
if isinstance(markup, unicode):
if isinstance(markup, str):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
if isinstance(markup, unicode):
if isinstance(markup, str):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
@ -102,7 +102,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
elif isinstance(markup, unicode):
elif isinstance(markup, str):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
@ -117,7 +117,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def close(self):
@ -135,12 +135,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
for prefix, namespace in nsmap.items():
for prefix, namespace in list(nsmap.items()):
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
@ -149,7 +149,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
for attr, value in list(attrs.items()):
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
@ -207,7 +207,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
@ -224,10 +224,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment
return '<html><body>%s</body></html>' % fragment

10
libs/bs4/dammit.py

@ -8,7 +8,7 @@ XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
import codecs
from htmlentitydefs import codepoint2name
from html.entities import codepoint2name
import re
import logging
import string
@ -56,7 +56,7 @@ class EntitySubstitution(object):
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint)
character = chr(codepoint)
if codepoint != 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
@ -340,9 +340,9 @@ class UnicodeDammit:
self.detector = EncodingDetector(markup, override_encodings, is_html)
# Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '':
if isinstance(markup, str) or markup == '':
self.markup = markup
self.unicode_markup = unicode(markup)
self.unicode_markup = str(markup)
self.original_encoding = None
return
@ -425,7 +425,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
return unicode(data, encoding, errors)
return str(data, encoding, errors)
@property
def declared_html_encoding(self):

52
libs/bs4/diagnose.py

@ -1,7 +1,7 @@
"""Diagnostic functions, mainly for use when doing tech support."""
import cProfile
from StringIO import StringIO
from HTMLParser import HTMLParser
from io import StringIO
from html.parser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
@ -17,8 +17,8 @@ import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__
print "Python version %s" % sys.version
print("Diagnostic running on Beautiful Soup %s" % __version__)
print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
@ -27,44 +27,44 @@ def diagnose(data):
break
else:
basic_parsers.remove(name)
print (
print((
"I noticed that %s is not installed. Installing it may help." %
name)
name))
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
if 'html5lib' in basic_parsers:
import html5lib
print "Found html5lib version %s" % html5lib.__version__
print("Found html5lib version %s" % html5lib.__version__)
if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
print('"%s" looks like a filename. Reading data from the file.' % data)
data = open(data).read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
print
print()
for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser
print("Trying to parse your markup with %s" % parser)
success = False
try:
soup = BeautifulSoup(data, parser)
success = True
except Exception, e:
print "%s could not parse the markup." % parser
except Exception as e:
print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
print "Here's what %s did with the markup:" % parser
print soup.prettify()
print("Here's what %s did with the markup:" % parser)
print(soup.prettify())
print "-" * 80
print("-" * 80)
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
@ -74,7 +74,7 @@ def lxml_trace(data, html=True, **kwargs):
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print("%s, %4s, %s" % (event, element.tag, element.text))
print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else."""
@ -156,9 +156,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data)
print("Generated a large invalid HTML document (%d bytes)." % len(data))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
@ -167,24 +167,24 @@ def benchmark_parsers(num_elements=100000):
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
except Exception, e:
print "%s could not parse the markup." % parser
except Exception as e:
print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a)
print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"):

110
libs/bs4/element.py

@ -21,22 +21,22 @@ def _alias(attr):
return alias
class NamespacedAttribute(unicode):
class NamespacedAttribute(str):
def __new__(cls, prefix, name, namespace=None):
if name is None:
obj = unicode.__new__(cls, prefix)
obj = str.__new__(cls, prefix)
elif prefix is None:
# Not really namespaced.
obj = unicode.__new__(cls, name)
obj = str.__new__(cls, name)
else:
obj = unicode.__new__(cls, prefix + ":" + name)
obj = str.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
obj.name = name
obj.namespace = namespace
return obj
class AttributeValueWithCharsetSubstitution(unicode):
class AttributeValueWithCharsetSubstitution(str):
"""A stand-in object for a character encoding specified in HTML."""
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@ -47,7 +47,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
"""
def __new__(cls, original_value):
obj = unicode.__new__(cls, original_value)
obj = str.__new__(cls, original_value)
obj.original_value = original_value
return obj
@ -70,9 +70,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
match = cls.CHARSET_RE.search(original_value)
if match is None:
# No substitution necessary.
return unicode.__new__(unicode, original_value)
return str.__new__(str, original_value)
obj = unicode.__new__(cls, original_value)
obj = str.__new__(cls, original_value)
obj.original_value = original_value
return obj
@ -152,7 +152,7 @@ class PageElement(object):
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter."""
if not callable(formatter):
if not isinstance(formatter, collections.Callable):
formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
@ -272,7 +272,7 @@ class PageElement(object):
def insert(self, position, new_child):
if new_child is self:
raise ValueError("Cannot insert a tag into itself.")
if (isinstance(new_child, basestring)
if (isinstance(new_child, str)
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
@ -489,7 +489,7 @@ class PageElement(object):
result = (element for element in generator
if isinstance(element, Tag))
return ResultSet(strainer, result)
elif isinstance(name, basestring):
elif isinstance(name, str):
# Optimization to find all tags with a given name.
result = (element for element in generator
if isinstance(element, Tag)
@ -640,7 +640,7 @@ class PageElement(object):
return self.parents
class NavigableString(unicode, PageElement):
class NavigableString(str, PageElement):
PREFIX = ''
SUFFIX = ''
@ -653,15 +653,15 @@ class NavigableString(unicode, PageElement):
passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters.
"""
if isinstance(value, unicode):
return unicode.__new__(cls, value)
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
if isinstance(value, str):
return str.__new__(cls, value)
return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
def __copy__(self):
return self
def __getnewargs__(self):
return (unicode(self),)
return (str(self),)
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
@ -701,23 +701,23 @@ class PreformattedString(NavigableString):
class CData(PreformattedString):
PREFIX = u'<![CDATA['
SUFFIX = u']]>'
PREFIX = '<![CDATA['
SUFFIX = ']]>'
class ProcessingInstruction(PreformattedString):
PREFIX = u'<?'
SUFFIX = u'?>'
PREFIX = '<?'
SUFFIX = '?>'
class Comment(PreformattedString):
PREFIX = u'<!--'
SUFFIX = u'-->'
PREFIX = '<!--'
SUFFIX = '-->'
class Declaration(PreformattedString):
PREFIX = u'<!'
SUFFIX = u'!>'
PREFIX = '<!'
SUFFIX = '!>'
class Doctype(PreformattedString):
@ -734,8 +734,8 @@ class Doctype(PreformattedString):
return Doctype(value)
PREFIX = u'<!DOCTYPE '
SUFFIX = u'>\n'
PREFIX = '<!DOCTYPE '
SUFFIX = '>\n'
class Tag(PageElement):
@ -843,7 +843,7 @@ class Tag(PageElement):
for string in self._all_strings(True):
yield string
def get_text(self, separator=u"", strip=False,
def get_text(self, separator="", strip=False,
types=(NavigableString, CData)):
"""
Get all child strings, concatenated using the given separator.
@ -915,7 +915,7 @@ class Tag(PageElement):
def __contains__(self, x):
return x in self.contents
def __nonzero__(self):
def __bool__(self):
"A tag is non-None even if it has no contents."
return True
@ -1014,7 +1014,7 @@ class Tag(PageElement):
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not callable(formatter):
if not isinstance(formatter, collections.Callable):
formatter = self._formatter_for_name(formatter)
attrs = []
@ -1025,8 +1025,8 @@ class Tag(PageElement):
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
elif not isinstance(val, basestring):
val = unicode(val)
elif not isinstance(val, str):
val = str(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None):
@ -1034,7 +1034,7 @@ class Tag(PageElement):
text = self.format_string(val, formatter)
decoded = (
unicode(key) + '='
str(key) + '='
+ EntitySubstitution.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
@ -1112,7 +1112,7 @@ class Tag(PageElement):
"""
# First off, turn a string formatter into a function. This
# will stop the lookup from happening over and over again.
if not callable(formatter):
if not isinstance(formatter, collections.Callable):
formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None)
@ -1210,16 +1210,16 @@ class Tag(PageElement):
raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug:
print 'Running CSS selector "%s"' % selector
print('Running CSS selector "%s"' % selector)
for index, token in enumerate(tokens):
if self._select_debug:
print ' Considering token "%s"' % token
print(' Considering token "%s"' % token)
recursive_candidate_generator = None
tag_name = None
if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
print ' Token was consumed by the previous combinator.'
print(' Token was consumed by the previous combinator.')
continue
# Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the
@ -1325,14 +1325,14 @@ class Tag(PageElement):
next_token = tokens[index+1]
def recursive_select(tag):
if self._select_debug:
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
print '-' * 40
print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
print('-' * 40)
for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug:
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
yield i
if self._select_debug:
print '-' * 40
print('-' * 40)
_use_candidate_generator = recursive_select
elif _candidate_generator is None:
# By default, a tag's candidates are all of its
@ -1343,7 +1343,7 @@ class Tag(PageElement):
check = "[any]"
else:
check = tag_name
print ' Default candidate generator, tag name="%s"' % check
print(' Default candidate generator, tag name="%s"' % check)
if self._select_debug:
# This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the
@ -1365,8 +1365,8 @@ class Tag(PageElement):
new_context_ids = set([])
for tag in current_context:
if self._select_debug:
print " Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs))
print(" Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs)))
for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag):
continue
@ -1381,21 +1381,21 @@ class Tag(PageElement):
break
if checker is None or result:
if self._select_debug:
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once,
# don't include it in the context more than once.
new_context.append(candidate)
new_context_ids.add(id(candidate))
elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
current_context = new_context
if self._select_debug:
print "Final verdict:"
print("Final verdict:")
for i in current_context:
print " %s %s" % (i.name, i.attrs)
print(" %s %s" % (i.name, i.attrs))
return current_context
# Old names for backwards compatibility
@ -1439,7 +1439,7 @@ class SoupStrainer(object):
else:
attrs = kwargs
normalized_attrs = {}
for key, value in attrs.items():
for key, value in list(attrs.items()):
normalized_attrs[key] = self._normalize_search_value(value)
self.attrs = normalized_attrs
@ -1448,7 +1448,7 @@ class SoupStrainer(object):
def _normalize_search_value(self, value):
# Leave it alone if it's a Unicode string, a callable, a
# regular expression, a boolean, or None.
if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match')
or isinstance(value, bool) or value is None):
return value
@ -1461,7 +1461,7 @@ class SoupStrainer(object):
new_value = []
for v in value:
if (hasattr(v, '__iter__') and not isinstance(v, bytes)
and not isinstance(v, unicode)):
and not isinstance(v, str)):
# This is almost certainly the user's mistake. In the
# interests of avoiding infinite loops, we'll let
# it through as-is rather than doing a recursive call.
@ -1473,7 +1473,7 @@ class SoupStrainer(object):
# Otherwise, convert it into a Unicode string.
# The unicode(str()) thing is so this will do the same thing on Python 2
# and Python 3.
return unicode(str(value))
return str(str(value))
def __str__(self):
if self.text:
@ -1527,7 +1527,7 @@ class SoupStrainer(object):
found = None
# If given a list of items, scan it for a text element that
# matches.
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
for element in markup:
if isinstance(element, NavigableString) \
and self.search(element):
@ -1540,7 +1540,7 @@ class SoupStrainer(object):
found = self.search_tag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
isinstance(markup, basestring):
isinstance(markup, str):
if not self.name and not self.attrs and self._matches(markup, self.text):
found = markup
else:
@ -1554,7 +1554,7 @@ class SoupStrainer(object):
if isinstance(markup, list) or isinstance(markup, tuple):
# This should only happen when searching a multi-valued attribute
# like 'class'.
if (isinstance(match_against, unicode)
if (isinstance(match_against, str)
and ' ' in match_against):
# A bit of a special case. If they try to match "foo
# bar" on a multivalue attribute's value, only accept
@ -1589,7 +1589,7 @@ class SoupStrainer(object):
# None matches None, False, an empty string, an empty list, and so on.
return not match_against
if isinstance(match_against, unicode):
if isinstance(match_against, str):
# Exact string match
return markup == match_against

32
libs/bs4/testing.py

@ -225,14 +225,14 @@ class HTMLTreeBuilderSmokeTest(object):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
@ -243,7 +243,7 @@ class HTMLTreeBuilderSmokeTest(object):
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
expect = "\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
@ -285,9 +285,9 @@ class HTMLTreeBuilderSmokeTest(object):
# A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the
# encoding found in the declaration! The horror!
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
@ -327,7 +327,7 @@ class HTMLTreeBuilderSmokeTest(object):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
@ -337,15 +337,15 @@ class HTMLTreeBuilderSmokeTest(object):
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
expected = "<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
@ -354,7 +354,7 @@ class HTMLTreeBuilderSmokeTest(object):
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
@ -493,15 +493,15 @@ class XMLTreeBuilderSmokeTest(object):
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_can_parse_unicode_document(self):
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
unicode(soup.rss), markup)
str(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
@ -532,17 +532,17 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup)
self.assertEqual(str(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
self.assertEqual(str(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
self.assertEqual(str(soup.foo), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""

Loading…
Cancel
Save