From 0ec577e69cde9ec34ce739190f11f75252a2ae6d Mon Sep 17 00:00:00 2001 From: JackDandy Date: Wed, 8 May 2019 13:51:20 +0100 Subject: [PATCH] =?UTF-8?q?Update=20Beautiful=20Soup=204.6.3=20(r475)=20?= =?UTF-8?q?=E2=86=92=204.7.1=20(r497).?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGES.md | 1 + lib/bs4/__init__.py | 110 ++++++----- lib/bs4/builder/__init__.py | 14 +- lib/bs4/builder/_html5lib.py | 18 +- lib/bs4/builder/_htmlparser.py | 4 +- lib/bs4/builder/_lxml.py | 49 ++++- lib/bs4/dammit.py | 3 +- lib/bs4/diagnose.py | 3 +- lib/bs4/element.py | 432 ++++++++--------------------------------- 9 files changed, 209 insertions(+), 425 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index aa7fafd..c742387 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ ### 0.20.0 (2019-xx-xx xx:xx:xx UTC) * Update attr 18.3.0.dev0 (55642b3) to 19.2.0.dev0 (de84609) +* Update Beautiful Soup 4.6.3 (r475) to 4.7.1 (r497) [develop changelog] diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index 470177f..d3a1086 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -17,12 +17,10 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ """ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.6.3" -__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson" +__version__ = "4.7.1" +__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson" +# Use of this source code is governed by the MIT license. __license__ = "MIT" __all__ = ['BeautifulSoup'] @@ -237,10 +235,11 @@ class BeautifulSoup(Tag): self.builder = builder self.is_xml = builder.is_xml self.known_xml = self.is_xml - self.builder.soup = self - + self._namespaces = dict() self.parse_only = parse_only + self.builder.initialize_soup(self) + if hasattr(markup, 'read'): # It's a file-type object. markup = markup.read() elif len(markup) <= 256 and ( @@ -382,7 +381,7 @@ class BeautifulSoup(Tag): def pushTag(self, tag): #print "Push", tag.name - if self.currentTag: + if self.currentTag is not None: self.currentTag.contents.append(tag) self.tagStack.append(tag) self.currentTag = self.tagStack[-1] @@ -421,60 +420,71 @@ class BeautifulSoup(Tag): def object_was_parsed(self, o, parent=None, most_recent_element=None): """Add an object to the parse tree.""" - parent = parent or self.currentTag - previous_element = most_recent_element or self._most_recent_element + if parent is None: + parent = self.currentTag + if most_recent_element is not None: + previous_element = most_recent_element + else: + previous_element = self._most_recent_element next_element = previous_sibling = next_sibling = None if isinstance(o, Tag): next_element = o.next_element next_sibling = o.next_sibling previous_sibling = o.previous_sibling - if not previous_element: + if previous_element is None: previous_element = o.previous_element + fix = parent.next_element is not None + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) self._most_recent_element = o parent.contents.append(o) - if parent.next_sibling: - # This node is being inserted into an element that has - # already been parsed. Deal with any dangling references. - index = len(parent.contents)-1 - while index >= 0: - if parent.contents[index] is o: - break - index -= 1 - else: - raise ValueError( - "Error building tree: supposedly %r was inserted " - "into %r after the fact, but I don't see it!" % ( - o, parent - ) - ) - if index == 0: - previous_element = parent - previous_sibling = None - else: - previous_element = previous_sibling = parent.contents[index-1] - if index == len(parent.contents)-1: - next_element = parent.next_sibling - next_sibling = None - else: - next_element = next_sibling = parent.contents[index+1] - - o.previous_element = previous_element - if previous_element: - previous_element.next_element = o - o.next_element = next_element - if next_element: - next_element.previous_element = o - o.next_sibling = next_sibling - if next_sibling: - next_sibling.previous_sibling = o - o.previous_sibling = previous_sibling - if previous_sibling: - previous_sibling.next_sibling = o + # Check if we are inserting into an already parsed node. + if fix: + self._linkage_fixer(parent) + + def _linkage_fixer(self, el): + """Make sure linkage of this fragment is sound.""" + + first = el.contents[0] + child = el.contents[-1] + descendant = child + + if child is first and el.parent is not None: + # Parent should be linked to first child + el.next_element = child + # We are no longer linked to whatever this element is + prev_el = child.previous_element + if prev_el is not None and prev_el is not el: + prev_el.next_element = None + # First child should be linked to the parent, and no previous siblings. + child.previous_element = el + child.previous_sibling = None + + # We have no sibling as we've been appended as the last. + child.next_sibling = None + + # This index is a tag, dig deeper for a "last descendant" + if isinstance(child, Tag) and child.contents: + descendant = child._last_descendant(False) + + # As the final step, link last descendant. It should be linked + # to the parent's next sibling (if found), else walk up the chain + # and find a parent with a sibling. It should have no next sibling. + descendant.next_element = None + descendant.next_sibling = None + target = el + while True: + if target is None: + break + elif target.next_sibling is not None: + descendant.next_element = target.next_sibling + target.next_sibling.previous_element = child + break + target = target.parent def _popToTag(self, name, nsprefix=None, inclusivePop=True): """Pops the tag stack up to and including the most recent @@ -520,7 +530,7 @@ class BeautifulSoup(Tag): self.currentTag, self._most_recent_element) if tag is None: return tag - if self._most_recent_element: + if self._most_recent_element is not None: self._most_recent_element.next_element = tag self._most_recent_element = tag self.pushTag(tag) diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index c9e3f3d..4207750 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -1,5 +1,5 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" from collections import defaultdict import itertools @@ -8,7 +8,7 @@ from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, HTMLAwareEntitySubstitution, - whitespace_re + nonwhitespace_re ) __all__ = [ @@ -102,6 +102,12 @@ class TreeBuilder(object): def __init__(self): self.soup = None + def initialize_soup(self, soup): + """The BeautifulSoup object has been initialized and is now + being associated with the TreeBuilder. + """ + self.soup = soup + def reset(self): pass @@ -167,7 +173,7 @@ class TreeBuilder(object): # values. Split it into a list. value = attrs[attr] if isinstance(value, basestring): - values = whitespace_re.split(value) + values = nonwhitespace_re.findall(value) else: # html5lib sometimes calls setAttributes twice # for the same tag when rearranging the parse diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index 5f54893..6fa8593 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -1,5 +1,5 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" __all__ = [ 'HTML5TreeBuilder', @@ -15,7 +15,7 @@ from bs4.builder import ( ) from bs4.element import ( NamespacedAttribute, - whitespace_re, + nonwhitespace_re, ) import html5lib from html5lib.constants import ( @@ -206,7 +206,7 @@ class AttrList(object): # A node that is being cloned may have already undergone # this procedure. if not isinstance(value, list): - value = whitespace_re.split(value) + value = nonwhitespace_re.findall(value) self.element[name] = value def items(self): return list(self.attrs.items()) @@ -249,7 +249,7 @@ class Element(treebuilder_base.Node): if not isinstance(child, basestring) and child.parent is not None: node.element.extract() - if (string_child and self.element.contents + if (string_child is not None and self.element.contents and self.element.contents[-1].__class__ == NavigableString): # We are appending a string onto another string. # TODO This has O(n^2) performance, for input like @@ -360,16 +360,16 @@ class Element(treebuilder_base.Node): # Set the first child's previous_element and previous_sibling # to elements within the new parent first_child = to_append[0] - if new_parents_last_descendant: + if new_parents_last_descendant is not None: first_child.previous_element = new_parents_last_descendant else: first_child.previous_element = new_parent_element first_child.previous_sibling = new_parents_last_child - if new_parents_last_descendant: + if new_parents_last_descendant is not None: new_parents_last_descendant.next_element = first_child else: new_parent_element.next_element = first_child - if new_parents_last_child: + if new_parents_last_child is not None: new_parents_last_child.next_sibling = first_child # Find the very last element being moved. It is now the @@ -379,7 +379,7 @@ class Element(treebuilder_base.Node): last_childs_last_descendant = to_append[-1]._last_descendant(False, True) last_childs_last_descendant.next_element = new_parents_last_descendant_next_element - if new_parents_last_descendant_next_element: + if new_parents_last_descendant_next_element is not None: # TODO: This code has no test coverage and I'm not sure # how to get html5lib to go through this path, but it's # just the other side of the previous line. diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index ee6c685..ff09ca3 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -1,8 +1,8 @@ # encoding: utf-8 """Use the HTMLParser library to parse HTML files that aren't too bad.""" -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" __all__ = [ 'HTMLParserTreeBuilder', diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py index 4a0f7de..b7e172c 100644 --- a/lib/bs4/builder/_lxml.py +++ b/lib/bs4/builder/_lxml.py @@ -1,5 +1,6 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. +__license__ = "MIT" + __all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', @@ -32,6 +33,10 @@ from bs4.dammit import EncodingDetector LXML = 'lxml' +def _invert(d): + "Invert a dictionary." + return dict((v,k) for k, v in d.items()) + class LXMLTreeBuilderForXML(TreeBuilder): DEFAULT_PARSER_CLASS = etree.XMLParser @@ -48,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder): # This namespace mapping is specified in the XML Namespace # standard. - DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} + DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace') + + DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) + + def initialize_soup(self, soup): + """Let the BeautifulSoup object know about the standard namespace + mapping. + """ + super(LXMLTreeBuilderForXML, self).initialize_soup(soup) + self._register_namespaces(self.DEFAULT_NSMAPS) + + def _register_namespaces(self, mapping): + """Let the BeautifulSoup object know about namespaces encountered + while parsing the document. + + This might be useful later on when creating CSS selectors. + """ + for key, value in mapping.items(): + if key and key not in self.soup._namespaces: + # Let the BeautifulSoup object know about a new namespace. + # If there are multiple namespaces defined with the same + # prefix, the first one in the document takes precedence. + self.soup._namespaces[key] = value def default_parser(self, encoding): # This can either return a parser object or a class, which @@ -75,8 +102,8 @@ class LXMLTreeBuilderForXML(TreeBuilder): if empty_element_tags is not None: self.empty_element_tags = set(empty_element_tags) self.soup = None - self.nsmaps = [self.DEFAULT_NSMAPS] - + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] + def _getNsTag(self, tag): # Split the namespace URL out of a fully-qualified lxml tag # name. Copied from lxml's src/lxml/sax.py. @@ -144,7 +171,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): raise ParserRejectedMarkup(str(e)) def close(self): - self.nsmaps = [self.DEFAULT_NSMAPS] + self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] def start(self, name, attrs, nsmap={}): # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. @@ -158,8 +185,14 @@ class LXMLTreeBuilderForXML(TreeBuilder): self.nsmaps.append(None) elif len(nsmap) > 0: # A new namespace mapping has come into play. - inverted_nsmap = dict((value, key) for key, value in nsmap.items()) - self.nsmaps.append(inverted_nsmap) + + # First, Let the BeautifulSoup object know about it. + self._register_namespaces(nsmap) + + # Then, add it to our running list of inverted namespace + # mappings. + self.nsmaps.append(_invert(nsmap)) + # Also treat the namespace mapping as a set of attributes on the # tag, so we can recreate it later. attrs = attrs.copy() diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py index be46b39..fb2f8b8 100644 --- a/lib/bs4/dammit.py +++ b/lib/bs4/dammit.py @@ -6,8 +6,7 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal Feed Parser. It works best on XML and HTML, but it does not rewrite the XML or HTML to reflect a new encoding; that's the tree builder's job. """ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. __license__ = "MIT" import codecs diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py index 7a28c09..f9835c3 100644 --- a/lib/bs4/diagnose.py +++ b/lib/bs4/diagnose.py @@ -1,7 +1,6 @@ """Diagnostic functions, mainly for use when doing tech support.""" -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. __license__ = "MIT" import cProfile diff --git a/lib/bs4/element.py b/lib/bs4/element.py index 886eb91..547b8ba 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -1,5 +1,4 @@ -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. +# Use of this source code is governed by the MIT license. __license__ = "MIT" try: @@ -7,14 +6,25 @@ try: except ImportError , e: from collections import Callable import re -import shlex import sys import warnings +try: + import soupsieve +except ImportError, e: + soupsieve = None + warnings.warn( + 'The soupsieve package is not installed. CSS selectors cannot be used.' + ) + from bs4.dammit import EntitySubstitution DEFAULT_OUTPUT_ENCODING = "utf-8" PY3K = (sys.version_info[0] > 2) +nonwhitespace_re = re.compile(r"\S+") + +# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on +# the off chance someone imported it for their own use. whitespace_re = re.compile(r"\s+") def _alias(attr): @@ -207,7 +217,7 @@ class PageElement(object): if formatter is None: output = s else: - if callable(formatter): + if isinstance(formatter, Callable): # Backwards compatibility -- you used to pass in a formatting method. output = formatter(s) else: @@ -256,26 +266,26 @@ class PageElement(object): self.previous_element.next_element = self self.next_element = next_element - if self.next_element: + if self.next_element is not None: self.next_element.previous_element = self self.next_sibling = next_sibling - if self.next_sibling: + if self.next_sibling is not None: self.next_sibling.previous_sibling = self - if (not previous_sibling + if (previous_sibling is None and self.parent is not None and self.parent.contents): previous_sibling = self.parent.contents[-1] self.previous_sibling = previous_sibling - if previous_sibling: + if previous_sibling is not None: self.previous_sibling.next_sibling = self nextSibling = _alias("next_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3 def replace_with(self, replace_with): - if not self.parent: + if self.parent is None: raise ValueError( "Cannot replace one element with another when the" "element to be replaced is not part of a tree.") @@ -292,7 +302,7 @@ class PageElement(object): def unwrap(self): my_parent = self.parent - if not self.parent: + if self.parent is None: raise ValueError( "Cannot replace an element with its contents when that" "element is not part of a tree.") @@ -340,7 +350,7 @@ class PageElement(object): def _last_descendant(self, is_initialized=True, accept_self=True): "Finds the last element beneath this object to be parsed." - if is_initialized and self.next_sibling: + if is_initialized and self.next_sibling is not None: last_child = self.next_sibling.previous_element else: last_child = self @@ -430,43 +440,54 @@ class PageElement(object): """Appends the given tag to the contents of this tag.""" self.insert(len(self.contents), tag) - def insert_before(self, predecessor): - """Makes the given element the immediate predecessor of this one. + def extend(self, tags): + """Appends the given tags to the contents of this tag.""" + for tag in tags: + self.append(tag) + + def insert_before(self, *args): + """Makes the given element(s) the immediate predecessor of this one. - The two elements will have the same parent, and the given element + The elements will have the same parent, and the given elements will be immediately before this one. """ - if self is predecessor: - raise ValueError("Can't insert an element before itself.") parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'before' has no meaning.") - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(predecessor, PageElement): - predecessor.extract() - index = parent.index(self) - parent.insert(index, predecessor) - - def insert_after(self, successor): - """Makes the given element the immediate successor of this one. - - The two elements will have the same parent, and the given element + if any(x is self for x in args): + raise ValueError("Can't insert an element before itself.") + for predecessor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(predecessor, PageElement): + predecessor.extract() + index = parent.index(self) + parent.insert(index, predecessor) + + def insert_after(self, *args): + """Makes the given element(s) the immediate successor of this one. + + The elements will have the same parent, and the given elements will be immediately after this one. """ - if self is successor: - raise ValueError("Can't insert an element after itself.") + # Do all error checking before modifying the tree. parent = self.parent if parent is None: raise ValueError( "Element has no parent, so 'after' has no meaning.") - # Extract first so that the index won't be screwed up if they - # are siblings. - if isinstance(successor, PageElement): - successor.extract() - index = parent.index(self) - parent.insert(index+1, successor) + if any(x is self for x in args): + raise ValueError("Can't insert an element after itself.") + + offset = 0 + for successor in args: + # Extract first so that the index won't be screwed up if they + # are siblings. + if isinstance(successor, PageElement): + successor.extract() + index = parent.index(self) + parent.insert(index+1+offset, successor) + offset += 1 def find_next(self, name=None, attrs={}, text=None, **kwargs): """Returns the first item that matches the given criteria and @@ -657,82 +678,6 @@ class PageElement(object): yield i i = i.parent - # Methods for supporting CSS selectors. - - tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$') - - # /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/ - # \---------------------------/ \---/\-------------/ \-------/ - # | | | | - # | | | The value - # | | ~,|,^,$,* or = - # | Attribute - # Tag - attribselect_re = re.compile( - r'^(?P[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P[\w-]+)(?P[=~\|\^\$\*]?)' + - r'=?"?(?P[^\]"]*)"?\]$' - ) - - def _attr_value_as_string(self, value, default=None): - """Force an attribute value into a string representation. - - A multi-valued attribute will be converted into a - space-separated stirng. - """ - value = self.get(value, default) - if isinstance(value, list) or isinstance(value, tuple): - value =" ".join(value) - return value - - def _tag_name_matches_and(self, function, tag_name): - if not tag_name: - return function - else: - def _match(tag): - return tag.name == tag_name and function(tag) - return _match - - def _attribute_checker(self, operator, attribute, value=''): - """Create a function that performs a CSS selector operation. - - Takes an operator, attribute and optional value. Returns a - function that will return True for elements that match that - combination. - """ - if operator == '=': - # string representation of `attribute` is equal to `value` - return lambda el: el._attr_value_as_string(attribute) == value - elif operator == '~': - # space-separated list representation of `attribute` - # contains `value` - def _includes_value(element): - attribute_value = element.get(attribute, []) - if not isinstance(attribute_value, list): - attribute_value = attribute_value.split() - return value in attribute_value - return _includes_value - elif operator == '^': - # string representation of `attribute` starts with `value` - return lambda el: el._attr_value_as_string( - attribute, '').startswith(value) - elif operator == '$': - # string representation of `attribute` ends with `value` - return lambda el: el._attr_value_as_string( - attribute, '').endswith(value) - elif operator == '*': - # string representation of `attribute` contains `value` - return lambda el: value in el._attr_value_as_string(attribute, '') - elif operator == '|': - # string representation of `attribute` is either exactly - # `value` or starts with `value` and then a dash. - def _is_or_starts_with_dash(element): - attribute_value = element._attr_value_as_string(attribute, '') - return (attribute_value == value or attribute_value.startswith( - value + '-')) - return _is_or_starts_with_dash - else: - return lambda el: el.has_attr(attribute) - # Old non-property versions of the generators, for backwards # compatibility with BS3. def nextGenerator(self): @@ -1193,7 +1138,7 @@ class Tag(PageElement): # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter) and not callable(formatter): + if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable): formatter = self._formatter_for_name(formatter) attrs = [] if self.attrs: @@ -1298,7 +1243,7 @@ class Tag(PageElement): """ # First off, turn a string formatter into a Formatter object. This # will stop the lookup from happening over and over again. - if not isinstance(formatter, Formatter) and not callable(formatter): + if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable): formatter = self._formatter_for_name(formatter) pretty_print = (indent_level is not None) @@ -1394,250 +1339,41 @@ class Tag(PageElement): current = current.next_element # CSS selector code - - _selector_combinators = ['>', '+', '~'] - _select_debug = False - quoted_colon = re.compile('"[^"]*:[^"]*"') - def select_one(self, selector): + def select_one(self, selector, namespaces=None, **kwargs): """Perform a CSS selection operation on the current element.""" - value = self.select(selector, limit=1) + value = self.select(selector, namespaces, 1, **kwargs) if value: return value[0] return None - def select(self, selector, _candidate_generator=None, limit=None): - """Perform a CSS selection operation on the current element.""" + def select(self, selector, namespaces=None, limit=None, **kwargs): + """Perform a CSS selection operation on the current element. - # Handle grouping selectors if ',' exists, ie: p,a - if ',' in selector: - context = [] - selectors = [x.strip() for x in selector.split(",")] - - # If a selector is mentioned multiple times we don't want - # to use it more than once. - used_selectors = set() - - # We also don't want to select the same element more than once, - # if it's matched by multiple selectors. - selected_object_ids = set() - for partial_selector in selectors: - if partial_selector == '': - raise ValueError('Invalid group selection syntax: %s' % selector) - if partial_selector in used_selectors: - continue - used_selectors.add(partial_selector) - candidates = self.select(partial_selector, limit=limit) - for candidate in candidates: - # This lets us distinguish between distinct tags that - # represent the same markup. - object_id = id(candidate) - if object_id not in selected_object_ids: - context.append(candidate) - selected_object_ids.add(object_id) - if limit and len(context) >= limit: - break - return context - tokens = shlex.split(selector) - current_context = [self] - - if tokens[-1] in self._selector_combinators: - raise ValueError( - 'Final combinator "%s" is missing an argument.' % tokens[-1]) + This uses the SoupSieve library. - if self._select_debug: - print 'Running CSS selector "%s"' % selector + :param selector: A string containing a CSS selector. - for index, token in enumerate(tokens): - new_context = [] - new_context_ids = set([]) + :param namespaces: A dictionary mapping namespace prefixes + used in the CSS selector to namespace URIs. By default, + Beautiful Soup will use the prefixes it encountered while + parsing the document. - if tokens[index-1] in self._selector_combinators: - # This token was consumed by the previous combinator. Skip it. - if self._select_debug: - print ' Token was consumed by the previous combinator.' - continue + :param limit: After finding this number of results, stop looking. - if self._select_debug: - print ' Considering token "%s"' % token - recursive_candidate_generator = None - tag_name = None - - # Each operation corresponds to a checker function, a rule - # for determining whether a candidate matches the - # selector. Candidates are generated by the active - # iterator. - checker = None - - m = self.attribselect_re.match(token) - if m is not None: - # Attribute selector - tag_name, attribute, operator, value = m.groups() - checker = self._attribute_checker(operator, attribute, value) - - elif '#' in token: - # ID selector - tag_name, tag_id = token.split('#', 1) - def id_matches(tag): - return tag.get('id', None) == tag_id - checker = id_matches - - elif '.' in token: - # Class selector - tag_name, klass = token.split('.', 1) - classes = set(klass.split('.')) - def classes_match(candidate): - return classes.issubset(candidate.get('class', [])) - checker = classes_match - - elif ':' in token and not self.quoted_colon.search(token): - # Pseudo-class - tag_name, pseudo = token.split(':', 1) - if tag_name == '': - raise ValueError( - "A pseudo-class must be prefixed with a tag name.") - pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo) - found = [] - if pseudo_attributes is None: - pseudo_type = pseudo - pseudo_value = None - else: - pseudo_type, pseudo_value = pseudo_attributes.groups() - if pseudo_type == 'nth-of-type': - try: - pseudo_value = int(pseudo_value) - except: - raise NotImplementedError( - 'Only numeric values are currently supported for the nth-of-type pseudo-class.') - if pseudo_value < 1: - raise ValueError( - 'nth-of-type pseudo-class value must be at least 1.') - class Counter(object): - def __init__(self, destination): - self.count = 0 - self.destination = destination - - def nth_child_of_type(self, tag): - self.count += 1 - if self.count == self.destination: - return True - else: - return False - checker = Counter(pseudo_value).nth_child_of_type - else: - raise NotImplementedError( - 'Only the following pseudo-classes are implemented: nth-of-type.') - - elif token == '*': - # Star selector -- matches everything - pass - elif token == '>': - # Run the next token as a CSS selector against the - # direct children of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.children - elif token == '~': - # Run the next token as a CSS selector against the - # siblings of each tag in the current context. - recursive_candidate_generator = lambda tag: tag.next_siblings - elif token == '+': - # For each tag in the current context, run the next - # token as a CSS selector against the tag's next - # sibling that's a tag. - def next_tag_sibling(tag): - yield tag.find_next_sibling(True) - recursive_candidate_generator = next_tag_sibling - - elif self.tag_name_re.match(token): - # Just a tag name. - tag_name = token - else: - raise ValueError( - 'Unsupported or invalid CSS selector: "%s"' % token) - if recursive_candidate_generator: - # This happens when the selector looks like "> foo". - # - # The generator calls select() recursively on every - # member of the current context, passing in a different - # candidate generator and a different selector. - # - # In the case of "> foo", the candidate generator is - # one that yields a tag's direct children (">"), and - # the selector is "foo". - next_token = tokens[index+1] - def recursive_select(tag): - if self._select_debug: - print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) - print '-' * 40 - for i in tag.select(next_token, recursive_candidate_generator): - if self._select_debug: - print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) - yield i - if self._select_debug: - print '-' * 40 - _use_candidate_generator = recursive_select - elif _candidate_generator is None: - # By default, a tag's candidates are all of its - # children. If tag_name is defined, only yield tags - # with that name. - if self._select_debug: - if tag_name: - check = "[any]" - else: - check = tag_name - print ' Default candidate generator, tag name="%s"' % check - if self._select_debug: - # This is redundant with later code, but it stops - # a bunch of bogus tags from cluttering up the - # debug log. - def default_candidate_generator(tag): - for child in tag.descendants: - if not isinstance(child, Tag): - continue - if tag_name and not child.name == tag_name: - continue - yield child - _use_candidate_generator = default_candidate_generator - else: - _use_candidate_generator = lambda tag: tag.descendants - else: - _use_candidate_generator = _candidate_generator - - count = 0 - for tag in current_context: - if self._select_debug: - print " Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs)) - for candidate in _use_candidate_generator(tag): - if not isinstance(candidate, Tag): - continue - if tag_name and candidate.name != tag_name: - continue - if checker is not None: - try: - result = checker(candidate) - except StopIteration: - # The checker has decided we should no longer - # run the generator. - break - if checker is None or result: - if self._select_debug: - print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) - if id(candidate) not in new_context_ids: - # If a tag matches a selector more than once, - # don't include it in the context more than once. - new_context.append(candidate) - new_context_ids.add(id(candidate)) - elif self._select_debug: - print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) - - current_context = new_context - if limit and len(current_context) >= limit: - current_context = current_context[:limit] - - if self._select_debug: - print "Final verdict:" - for i in current_context: - print " %s %s" % (i.name, i.attrs) - return current_context + :param kwargs: Any extra arguments you'd like to pass in to + soupsieve.select(). + """ + if namespaces is None: + namespaces = self._namespaces + + if limit is None: + limit = 0 + if soupsieve is None: + raise NotImplementedError( + "Cannot execute CSS selectors because the soupsieve package is not installed." + ) + + return soupsieve.select(selector, self, namespaces, limit, **kwargs) # Old names for backwards compatibility def childGenerator(self): @@ -1689,7 +1425,7 @@ class SoupStrainer(object): def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. - if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') + if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value