From 6b4e6857de07659e73f776e366c3ee40cc06f565 Mon Sep 17 00:00:00 2001 From: Ruud Date: Mon, 6 Oct 2014 11:12:35 +0200 Subject: [PATCH] BeautifulSoup4 python 3 --- libs/bs4/__init__.py | 22 ++++---- libs/bs4/builder/__init__.py | 4 +- libs/bs4/builder/_html5lib.py | 12 ++--- libs/bs4/builder/_htmlparser.py | 12 ++--- libs/bs4/builder/_lxml.py | 22 ++++---- libs/bs4/dammit.py | 10 ++-- libs/bs4/diagnose.py | 52 +++++++++---------- libs/bs4/element.py | 110 ++++++++++++++++++++-------------------- libs/bs4/testing.py | 32 ++++++------ 9 files changed, 138 insertions(+), 138 deletions(-) diff --git a/libs/bs4/__init__.py b/libs/bs4/__init__.py index 7ba3426..df28de9 100644 --- a/libs/bs4/__init__.py +++ b/libs/bs4/__init__.py @@ -45,7 +45,7 @@ from .element import ( # The very first thing we do is give a useful error if someone is # running this code under Python 3 without converting it. -syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +syntax_error = 'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' class BeautifulSoup(Tag): """ @@ -69,7 +69,7 @@ class BeautifulSoup(Tag): like HTML's
tag), call handle_starttag and then handle_endtag. """ - ROOT_TAG_NAME = u'[document]' + ROOT_TAG_NAME = '[document]' # If the end-user gives no indication which tree builder they # want, look for one with these features. @@ -135,12 +135,12 @@ class BeautifulSoup(Tag): "fromEncoding", "from_encoding") if len(kwargs) > 0: - arg = kwargs.keys().pop() + arg = list(kwargs.keys()).pop() raise TypeError( "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: - if isinstance(features, basestring): + if isinstance(features, str): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES @@ -164,7 +164,7 @@ class BeautifulSoup(Tag): # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, # just in case that's what the user really wants. - if (isinstance(markup, unicode) + if (isinstance(markup, str) and not os.path.supports_unicode_filenames): possible_filename = markup.encode("utf8") else: @@ -172,7 +172,7 @@ class BeautifulSoup(Tag): is_file = False try: is_file = os.path.exists(possible_filename) - except Exception, e: + except Exception as e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. @@ -184,7 +184,7 @@ class BeautifulSoup(Tag): # TODO: This is ugly but I couldn't get it to work in # Python 3 otherwise. if ((isinstance(markup, bytes) and not b' ' in markup) - or (isinstance(markup, unicode) and not u' ' in markup)): + or (isinstance(markup, str) and not ' ' in markup)): warnings.warn( '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup) @@ -259,7 +259,7 @@ class BeautifulSoup(Tag): def endData(self, containerClass=NavigableString): if self.current_data: - current_data = u''.join(self.current_data) + current_data = ''.join(self.current_data) # If whitespace is not preserved, and this string contains # nothing but ASCII spaces, replace it with a single space # or newline. @@ -367,9 +367,9 @@ class BeautifulSoup(Tag): encoding_part = '' if eventual_encoding != None: encoding_part = ' encoding="%s"' % eventual_encoding - prefix = u'\n' % encoding_part + prefix = '\n' % encoding_part else: - prefix = u'' + prefix = '' if not pretty_print: indent_level = None else: @@ -403,4 +403,4 @@ class FeatureNotFound(ValueError): if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) - print soup.prettify() + print(soup.prettify()) diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py index 740f5f2..f281ac2 100644 --- a/libs/bs4/builder/__init__.py +++ b/libs/bs4/builder/__init__.py @@ -153,13 +153,13 @@ class TreeBuilder(object): universal = self.cdata_list_attributes.get('*', []) tag_specific = self.cdata_list_attributes.get( tag_name.lower(), None) - for attr in attrs.keys(): + for attr in list(attrs.keys()): if attr in universal or (tag_specific and attr in tag_specific): # We have a "class"-type attribute whose string # value is a whitespace-separated list of # values. Split it into a list. value = attrs[attr] - if isinstance(value, basestring): + if isinstance(value, str): values = whitespace_re.split(value) else: # html5lib sometimes calls setAttributes twice diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py index 7de36ae..ddcf0b1 100644 --- a/libs/bs4/builder/_html5lib.py +++ b/libs/bs4/builder/_html5lib.py @@ -37,7 +37,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): doc = parser.parse(markup, encoding=self.user_specified_encoding) # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): + if isinstance(markup, str): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None @@ -51,7 +51,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'%s' % fragment + return '%s' % fragment class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder): @@ -124,7 +124,7 @@ class Element(html5lib.treebuilders._base.Node): def appendChild(self, node): string_child = child = None - if isinstance(node, basestring): + if isinstance(node, str): # Some other piece of code decided to pass in a string # instead of creating a TextElement object to contain the # string. @@ -139,7 +139,7 @@ class Element(html5lib.treebuilders._base.Node): else: child = node.element - if not isinstance(child, basestring) and child.parent is not None: + if not isinstance(child, str) and child.parent is not None: node.element.extract() if (string_child and self.element.contents @@ -152,7 +152,7 @@ class Element(html5lib.treebuilders._base.Node): old_element.replace_with(new_element) self.soup._most_recent_element = new_element else: - if isinstance(node, basestring): + if isinstance(node, str): # Create a brand new NavigableString from this string. child = self.soup.new_string(node) @@ -183,7 +183,7 @@ class Element(html5lib.treebuilders._base.Node): self.soup.builder._replace_cdata_list_attribute_values( self.name, attributes) - for name, value in attributes.items(): + for name, value in list(attributes.items()): self.element[name] = value # The attributes may contain variables that need substitution. diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py index ca8d8b8..fa69e97 100644 --- a/libs/bs4/builder/_htmlparser.py +++ b/libs/bs4/builder/_htmlparser.py @@ -4,7 +4,7 @@ __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import ( +from html.parser import ( HTMLParser, HTMLParseError, ) @@ -72,9 +72,9 @@ class BeautifulSoupHTMLParser(HTMLParser): real_name = int(name) try: - data = unichr(real_name) - except (ValueError, OverflowError), e: - data = u"\N{REPLACEMENT CHARACTER}" + data = chr(real_name) + except (ValueError, OverflowError) as e: + data = "\N{REPLACEMENT CHARACTER}" self.handle_data(data) @@ -142,7 +142,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). """ - if isinstance(markup, unicode): + if isinstance(markup, str): yield (markup, None, None, False) return @@ -158,7 +158,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) - except HTMLParseError, e: + except HTMLParseError as e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py index fa5d498..d923c30 100644 --- a/libs/bs4/builder/_lxml.py +++ b/libs/bs4/builder/_lxml.py @@ -4,7 +4,7 @@ __all__ = [ ] from io import BytesIO -from StringIO import StringIO +from io import StringIO import collections from lxml import etree from bs4.element import Comment, Doctype, NamespacedAttribute @@ -78,12 +78,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): Each 4-tuple represents a strategy for parsing the document. """ - if isinstance(markup, unicode): + if isinstance(markup, str): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False - if isinstance(markup, unicode): + if isinstance(markup, str): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. yield (markup.encode("utf8"), "utf8", @@ -102,7 +102,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) - elif isinstance(markup, unicode): + elif isinstance(markup, str): markup = StringIO(markup) # Call feed() at least once, even if the markup is empty, @@ -117,7 +117,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): if len(data) != 0: self.parser.feed(data) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def close(self): @@ -135,12 +135,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.nsmaps.append(None) elif len(nsmap) > 0: # A new namespace mapping has come into play. - inverted_nsmap = dict((value, key) for key, value in nsmap.items()) + inverted_nsmap = dict((value, key) for key, value in list(nsmap.items())) self.nsmaps.append(inverted_nsmap) # Also treat the namespace mapping as a set of attributes on the # tag, so we can recreate it later. attrs = attrs.copy() - for prefix, namespace in nsmap.items(): + for prefix, namespace in list(nsmap.items()): attribute = NamespacedAttribute( "xmlns", prefix, "http://www.w3.org/2000/xmlns/") attrs[attribute] = namespace @@ -149,7 +149,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): # from lxml with namespaces attached to their names, and # turn then into NamespacedAttribute objects. new_attrs = {} - for attr, value in attrs.items(): + for attr, value in list(attrs.items()): namespace, attr = self._getNsTag(attr) if namespace is None: new_attrs[attr] = value @@ -207,7 +207,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'\n%s' % fragment + return '\n%s' % fragment class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): @@ -224,10 +224,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): self.parser = self.parser_for(encoding) self.parser.feed(markup) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def test_fragment_to_document(self, fragment): """See `TreeBuilder`.""" - return u'%s' % fragment + return '%s' % fragment diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py index 59640b7..f176eca 100644 --- a/libs/bs4/dammit.py +++ b/libs/bs4/dammit.py @@ -8,7 +8,7 @@ XML or HTML to reflect a new encoding; that's the tree builder's job. """ import codecs -from htmlentitydefs import codepoint2name +from html.entities import codepoint2name import re import logging import string @@ -56,7 +56,7 @@ class EntitySubstitution(object): reverse_lookup = {} characters_for_re = [] for codepoint, name in list(codepoint2name.items()): - character = unichr(codepoint) + character = chr(codepoint) if codepoint != 34: # There's no point in turning the quotation mark into # ", unless it happens within an attribute value, which @@ -340,9 +340,9 @@ class UnicodeDammit: self.detector = EncodingDetector(markup, override_encodings, is_html) # Short-circuit if the data is in Unicode to begin with. - if isinstance(markup, unicode) or markup == '': + if isinstance(markup, str) or markup == '': self.markup = markup - self.unicode_markup = unicode(markup) + self.unicode_markup = str(markup) self.original_encoding = None return @@ -425,7 +425,7 @@ class UnicodeDammit: def _to_unicode(self, data, encoding, errors="strict"): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' - return unicode(data, encoding, errors) + return str(data, encoding, errors) @property def declared_html_encoding(self): diff --git a/libs/bs4/diagnose.py b/libs/bs4/diagnose.py index 4d0b00a..8748ff0 100644 --- a/libs/bs4/diagnose.py +++ b/libs/bs4/diagnose.py @@ -1,7 +1,7 @@ """Diagnostic functions, mainly for use when doing tech support.""" import cProfile -from StringIO import StringIO -from HTMLParser import HTMLParser +from io import StringIO +from html.parser import HTMLParser import bs4 from bs4 import BeautifulSoup, __version__ from bs4.builder import builder_registry @@ -17,8 +17,8 @@ import cProfile def diagnose(data): """Diagnostic suite for isolating common problems.""" - print "Diagnostic running on Beautiful Soup %s" % __version__ - print "Python version %s" % sys.version + print("Diagnostic running on Beautiful Soup %s" % __version__) + print("Python version %s" % sys.version) basic_parsers = ["html.parser", "html5lib", "lxml"] for name in basic_parsers: @@ -27,44 +27,44 @@ def diagnose(data): break else: basic_parsers.remove(name) - print ( + print(( "I noticed that %s is not installed. Installing it may help." % - name) + name)) if 'lxml' in basic_parsers: basic_parsers.append(["lxml", "xml"]) from lxml import etree - print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) if 'html5lib' in basic_parsers: import html5lib - print "Found html5lib version %s" % html5lib.__version__ + print("Found html5lib version %s" % html5lib.__version__) if hasattr(data, 'read'): data = data.read() elif os.path.exists(data): - print '"%s" looks like a filename. Reading data from the file.' % data + print('"%s" looks like a filename. Reading data from the file.' % data) data = open(data).read() elif data.startswith("http:") or data.startswith("https:"): - print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data - print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." + print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") return - print + print() for parser in basic_parsers: - print "Trying to parse your markup with %s" % parser + print("Trying to parse your markup with %s" % parser) success = False try: soup = BeautifulSoup(data, parser) success = True - except Exception, e: - print "%s could not parse the markup." % parser + except Exception as e: + print("%s could not parse the markup." % parser) traceback.print_exc() if success: - print "Here's what %s did with the markup:" % parser - print soup.prettify() + print("Here's what %s did with the markup:" % parser) + print(soup.prettify()) - print "-" * 80 + print("-" * 80) def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. @@ -74,7 +74,7 @@ def lxml_trace(data, html=True, **kwargs): """ from lxml import etree for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): - print("%s, %4s, %s" % (event, element.tag, element.text)) + print(("%s, %4s, %s" % (event, element.tag, element.text))) class AnnouncingParser(HTMLParser): """Announces HTMLParser parse events, without doing anything else.""" @@ -156,9 +156,9 @@ def rdoc(num_elements=1000): def benchmark_parsers(num_elements=100000): """Very basic head-to-head performance benchmark.""" - print "Comparative parser benchmark on Beautiful Soup %s" % __version__ + print("Comparative parser benchmark on Beautiful Soup %s" % __version__) data = rdoc(num_elements) - print "Generated a large invalid HTML document (%d bytes)." % len(data) + print("Generated a large invalid HTML document (%d bytes)." % len(data)) for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: success = False @@ -167,24 +167,24 @@ def benchmark_parsers(num_elements=100000): soup = BeautifulSoup(data, parser) b = time.time() success = True - except Exception, e: - print "%s could not parse the markup." % parser + except Exception as e: + print("%s could not parse the markup." % parser) traceback.print_exc() if success: - print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) + print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) from lxml import etree a = time.time() etree.HTML(data) b = time.time() - print "Raw lxml parsed the markup in %.2fs." % (b-a) + print("Raw lxml parsed the markup in %.2fs." % (b-a)) import html5lib parser = html5lib.HTMLParser() a = time.time() parser.parse(data) b = time.time() - print "Raw html5lib parsed the markup in %.2fs." % (b-a) + print("Raw html5lib parsed the markup in %.2fs." % (b-a)) def profile(num_elements=100000, parser="lxml"): diff --git a/libs/bs4/element.py b/libs/bs4/element.py index da9afdf..2934470 100644 --- a/libs/bs4/element.py +++ b/libs/bs4/element.py @@ -21,22 +21,22 @@ def _alias(attr): return alias -class NamespacedAttribute(unicode): +class NamespacedAttribute(str): def __new__(cls, prefix, name, namespace=None): if name is None: - obj = unicode.__new__(cls, prefix) + obj = str.__new__(cls, prefix) elif prefix is None: # Not really namespaced. - obj = unicode.__new__(cls, name) + obj = str.__new__(cls, name) else: - obj = unicode.__new__(cls, prefix + ":" + name) + obj = str.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace return obj -class AttributeValueWithCharsetSubstitution(unicode): +class AttributeValueWithCharsetSubstitution(str): """A stand-in object for a character encoding specified in HTML.""" class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): @@ -47,7 +47,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): """ def __new__(cls, original_value): - obj = unicode.__new__(cls, original_value) + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -70,9 +70,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): match = cls.CHARSET_RE.search(original_value) if match is None: # No substitution necessary. - return unicode.__new__(unicode, original_value) + return str.__new__(str, original_value) - obj = unicode.__new__(cls, original_value) + obj = str.__new__(cls, original_value) obj.original_value = original_value return obj @@ -152,7 +152,7 @@ class PageElement(object): def format_string(self, s, formatter='minimal'): """Format the given string using the given formatter.""" - if not callable(formatter): + if not isinstance(formatter, collections.Callable): formatter = self._formatter_for_name(formatter) if formatter is None: output = s @@ -272,7 +272,7 @@ class PageElement(object): def insert(self, position, new_child): if new_child is self: raise ValueError("Cannot insert a tag into itself.") - if (isinstance(new_child, basestring) + if (isinstance(new_child, str) and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) @@ -489,7 +489,7 @@ class PageElement(object): result = (element for element in generator if isinstance(element, Tag)) return ResultSet(strainer, result) - elif isinstance(name, basestring): + elif isinstance(name, str): # Optimization to find all tags with a given name. result = (element for element in generator if isinstance(element, Tag) @@ -640,7 +640,7 @@ class PageElement(object): return self.parents -class NavigableString(unicode, PageElement): +class NavigableString(str, PageElement): PREFIX = '' SUFFIX = '' @@ -653,15 +653,15 @@ class NavigableString(unicode, PageElement): passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ - if isinstance(value, unicode): - return unicode.__new__(cls, value) - return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + if isinstance(value, str): + return str.__new__(cls, value) + return str.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) def __copy__(self): return self def __getnewargs__(self): - return (unicode(self),) + return (str(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -701,23 +701,23 @@ class PreformattedString(NavigableString): class CData(PreformattedString): - PREFIX = u'' + PREFIX = '' class ProcessingInstruction(PreformattedString): - PREFIX = u'' + PREFIX = '' class Comment(PreformattedString): - PREFIX = u'' + PREFIX = '' class Declaration(PreformattedString): - PREFIX = u'' + PREFIX = '' class Doctype(PreformattedString): @@ -734,8 +734,8 @@ class Doctype(PreformattedString): return Doctype(value) - PREFIX = u'\n' + PREFIX = '\n' class Tag(PageElement): @@ -843,7 +843,7 @@ class Tag(PageElement): for string in self._all_strings(True): yield string - def get_text(self, separator=u"", strip=False, + def get_text(self, separator="", strip=False, types=(NavigableString, CData)): """ Get all child strings, concatenated using the given separator. @@ -915,7 +915,7 @@ class Tag(PageElement): def __contains__(self, x): return x in self.contents - def __nonzero__(self): + def __bool__(self): "A tag is non-None even if it has no contents." return True @@ -1014,7 +1014,7 @@ class Tag(PageElement): # First off, turn a string formatter into a function. This # will stop the lookup from happening over and over again. - if not callable(formatter): + if not isinstance(formatter, collections.Callable): formatter = self._formatter_for_name(formatter) attrs = [] @@ -1025,8 +1025,8 @@ class Tag(PageElement): else: if isinstance(val, list) or isinstance(val, tuple): val = ' '.join(val) - elif not isinstance(val, basestring): - val = unicode(val) + elif not isinstance(val, str): + val = str(val) elif ( isinstance(val, AttributeValueWithCharsetSubstitution) and eventual_encoding is not None): @@ -1034,7 +1034,7 @@ class Tag(PageElement): text = self.format_string(val, formatter) decoded = ( - unicode(key) + '=' + str(key) + '=' + EntitySubstitution.quoted_attribute_value(text)) attrs.append(decoded) close = '' @@ -1112,7 +1112,7 @@ class Tag(PageElement): """ # First off, turn a string formatter into a function. This # will stop the lookup from happening over and over again. - if not callable(formatter): + if not isinstance(formatter, collections.Callable): formatter = self._formatter_for_name(formatter) pretty_print = (indent_level is not None) @@ -1210,16 +1210,16 @@ class Tag(PageElement): raise ValueError( 'Final combinator "%s" is missing an argument.' % tokens[-1]) if self._select_debug: - print 'Running CSS selector "%s"' % selector + print('Running CSS selector "%s"' % selector) for index, token in enumerate(tokens): if self._select_debug: - print ' Considering token "%s"' % token + print(' Considering token "%s"' % token) recursive_candidate_generator = None tag_name = None if tokens[index-1] in self._selector_combinators: # This token was consumed by the previous combinator. Skip it. if self._select_debug: - print ' Token was consumed by the previous combinator.' + print(' Token was consumed by the previous combinator.') continue # Each operation corresponds to a checker function, a rule # for determining whether a candidate matches the @@ -1325,14 +1325,14 @@ class Tag(PageElement): next_token = tokens[index+1] def recursive_select(tag): if self._select_debug: - print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) - print '-' * 40 + print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)) + print('-' * 40) for i in tag.select(next_token, recursive_candidate_generator): if self._select_debug: - print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) + print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)) yield i if self._select_debug: - print '-' * 40 + print('-' * 40) _use_candidate_generator = recursive_select elif _candidate_generator is None: # By default, a tag's candidates are all of its @@ -1343,7 +1343,7 @@ class Tag(PageElement): check = "[any]" else: check = tag_name - print ' Default candidate generator, tag name="%s"' % check + print(' Default candidate generator, tag name="%s"' % check) if self._select_debug: # This is redundant with later code, but it stops # a bunch of bogus tags from cluttering up the @@ -1365,8 +1365,8 @@ class Tag(PageElement): new_context_ids = set([]) for tag in current_context: if self._select_debug: - print " Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs)) + print(" Running candidate generator on %s %s" % ( + tag.name, repr(tag.attrs))) for candidate in _use_candidate_generator(tag): if not isinstance(candidate, Tag): continue @@ -1381,21 +1381,21 @@ class Tag(PageElement): break if checker is None or result: if self._select_debug: - print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) + print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))) if id(candidate) not in new_context_ids: # If a tag matches a selector more than once, # don't include it in the context more than once. new_context.append(candidate) new_context_ids.add(id(candidate)) elif self._select_debug: - print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) + print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs))) current_context = new_context if self._select_debug: - print "Final verdict:" + print("Final verdict:") for i in current_context: - print " %s %s" % (i.name, i.attrs) + print(" %s %s" % (i.name, i.attrs)) return current_context # Old names for backwards compatibility @@ -1439,7 +1439,7 @@ class SoupStrainer(object): else: attrs = kwargs normalized_attrs = {} - for key, value in attrs.items(): + for key, value in list(attrs.items()): normalized_attrs[key] = self._normalize_search_value(value) self.attrs = normalized_attrs @@ -1448,7 +1448,7 @@ class SoupStrainer(object): def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. - if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') + if (isinstance(value, str) or isinstance(value, collections.Callable) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value @@ -1461,7 +1461,7 @@ class SoupStrainer(object): new_value = [] for v in value: if (hasattr(v, '__iter__') and not isinstance(v, bytes) - and not isinstance(v, unicode)): + and not isinstance(v, str)): # This is almost certainly the user's mistake. In the # interests of avoiding infinite loops, we'll let # it through as-is rather than doing a recursive call. @@ -1473,7 +1473,7 @@ class SoupStrainer(object): # Otherwise, convert it into a Unicode string. # The unicode(str()) thing is so this will do the same thing on Python 2 # and Python 3. - return unicode(str(value)) + return str(str(value)) def __str__(self): if self.text: @@ -1527,7 +1527,7 @@ class SoupStrainer(object): found = None # If given a list of items, scan it for a text element that # matches. - if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): @@ -1540,7 +1540,7 @@ class SoupStrainer(object): found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): + isinstance(markup, str): if not self.name and not self.attrs and self._matches(markup, self.text): found = markup else: @@ -1554,7 +1554,7 @@ class SoupStrainer(object): if isinstance(markup, list) or isinstance(markup, tuple): # This should only happen when searching a multi-valued attribute # like 'class'. - if (isinstance(match_against, unicode) + if (isinstance(match_against, str) and ' ' in match_against): # A bit of a special case. If they try to match "foo # bar" on a multivalue attribute's value, only accept @@ -1589,7 +1589,7 @@ class SoupStrainer(object): # None matches None, False, an empty string, an empty list, and so on. return not match_against - if isinstance(match_against, unicode): + if isinstance(match_against, str): # Exact string match return markup == match_against diff --git a/libs/bs4/testing.py b/libs/bs4/testing.py index fd4495a..c754f24 100644 --- a/libs/bs4/testing.py +++ b/libs/bs4/testing.py @@ -225,14 +225,14 @@ class HTMLTreeBuilderSmokeTest(object): self.assertSoupEquals('', '') def test_entities_in_attributes_converted_to_unicode(self): - expect = u'

' + expect = '

' self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) self.assertSoupEquals('

', expect) def test_entities_in_text_converted_to_unicode(self): - expect = u'

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' + expect = '

pi\N{LATIN SMALL LETTER N WITH TILDE}ata

' self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) self.assertSoupEquals("

piñata

", expect) @@ -243,7 +243,7 @@ class HTMLTreeBuilderSmokeTest(object): '

I said "good day!"

') def test_out_of_range_entity(self): - expect = u"\N{REPLACEMENT CHARACTER}" + expect = "\N{REPLACEMENT CHARACTER}" self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) self.assertSoupEquals("�", expect) @@ -285,9 +285,9 @@ class HTMLTreeBuilderSmokeTest(object): # A seemingly innocuous document... but it's in Unicode! And # it contains characters that can't be represented in the # encoding found in the declaration! The horror! - markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' + markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' soup = self.soup(markup) - self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string) + self.assertEqual('Sacr\xe9 bleu!', soup.body.string) def test_soupstrainer(self): """Parsers should be able to work with SoupStrainers.""" @@ -327,7 +327,7 @@ class HTMLTreeBuilderSmokeTest(object): # Both XML and HTML entities are converted to Unicode characters # during parsing. text = "

<<sacré bleu!>>

" - expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" + expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

" self.assertSoupEquals(text, expected) def test_smart_quotes_converted_on_the_way_in(self): @@ -337,15 +337,15 @@ class HTMLTreeBuilderSmokeTest(object): soup = self.soup(quote) self.assertEqual( soup.p.string, - u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") + "\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") def test_non_breaking_spaces_converted_on_the_way_in(self): soup = self.soup("  ") - self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) + self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2) def test_entities_converted_on_the_way_out(self): text = "

<<sacré bleu!>>

" - expected = u"

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") + expected = "

<<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>>

".encode("utf-8") soup = self.soup(text) self.assertEqual(soup.p.encode("utf-8"), expected) @@ -354,7 +354,7 @@ class HTMLTreeBuilderSmokeTest(object): # easy-to-understand document. # Here it is in Unicode. Note that it claims to be in ISO-Latin-1. - unicode_html = u'

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' + unicode_html = '

Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!

' # That's because we're going to encode it into ISO-Latin-1, and use # that to test. @@ -493,15 +493,15 @@ class XMLTreeBuilderSmokeTest(object): self.assertTrue(b"< < hey > >" in encoded) def test_can_parse_unicode_document(self): - markup = u'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' + markup = 'Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!' soup = self.soup(markup) - self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string) + self.assertEqual('Sacr\xe9 bleu!', soup.root.string) def test_popping_namespaced_tag(self): markup = 'b2012-07-02T20:33:42Zcd' soup = self.soup(markup) self.assertEqual( - unicode(soup.rss), markup) + str(soup.rss), markup) def test_docstring_includes_correct_encoding(self): soup = self.soup("") @@ -532,17 +532,17 @@ class XMLTreeBuilderSmokeTest(object): def test_closing_namespaced_tag(self): markup = '

20010504

' soup = self.soup(markup) - self.assertEqual(unicode(soup.p), markup) + self.assertEqual(str(soup.p), markup) def test_namespaced_attributes(self): markup = '' soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) + self.assertEqual(str(soup.foo), markup) def test_namespaced_attributes_xml_namespace(self): markup = 'bar' soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) + self.assertEqual(str(soup.foo), markup) class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5."""