Browse Source

Update Beautiful Soup 4.6.3 (r475) → 4.7.1 (r497).

pull/1200/head
JackDandy 6 years ago
parent
commit
0ec577e69c
  1. 1
      CHANGES.md
  2. 110
      lib/bs4/__init__.py
  3. 14
      lib/bs4/builder/__init__.py
  4. 18
      lib/bs4/builder/_html5lib.py
  5. 4
      lib/bs4/builder/_htmlparser.py
  6. 49
      lib/bs4/builder/_lxml.py
  7. 3
      lib/bs4/dammit.py
  8. 3
      lib/bs4/diagnose.py
  9. 432
      lib/bs4/element.py

1
CHANGES.md

@ -1,6 +1,7 @@
### 0.20.0 (2019-xx-xx xx:xx:xx UTC) ### 0.20.0 (2019-xx-xx xx:xx:xx UTC)
* Update attr 18.3.0.dev0 (55642b3) to 19.2.0.dev0 (de84609) * Update attr 18.3.0.dev0 (55642b3) to 19.2.0.dev0 (de84609)
* Update Beautiful Soup 4.6.3 (r475) to 4.7.1 (r497)
[develop changelog] [develop changelog]

110
lib/bs4/__init__.py

@ -17,12 +17,10 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" """
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.6.3" __version__ = "4.7.1"
__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson" __copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT" __license__ = "MIT"
__all__ = ['BeautifulSoup'] __all__ = ['BeautifulSoup']
@ -237,10 +235,11 @@ class BeautifulSoup(Tag):
self.builder = builder self.builder = builder
self.is_xml = builder.is_xml self.is_xml = builder.is_xml
self.known_xml = self.is_xml self.known_xml = self.is_xml
self.builder.soup = self self._namespaces = dict()
self.parse_only = parse_only self.parse_only = parse_only
self.builder.initialize_soup(self)
if hasattr(markup, 'read'): # It's a file-type object. if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read() markup = markup.read()
elif len(markup) <= 256 and ( elif len(markup) <= 256 and (
@ -382,7 +381,7 @@ class BeautifulSoup(Tag):
def pushTag(self, tag): def pushTag(self, tag):
#print "Push", tag.name #print "Push", tag.name
if self.currentTag: if self.currentTag is not None:
self.currentTag.contents.append(tag) self.currentTag.contents.append(tag)
self.tagStack.append(tag) self.tagStack.append(tag)
self.currentTag = self.tagStack[-1] self.currentTag = self.tagStack[-1]
@ -421,60 +420,71 @@ class BeautifulSoup(Tag):
def object_was_parsed(self, o, parent=None, most_recent_element=None): def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree.""" """Add an object to the parse tree."""
parent = parent or self.currentTag if parent is None:
previous_element = most_recent_element or self._most_recent_element parent = self.currentTag
if most_recent_element is not None:
previous_element = most_recent_element
else:
previous_element = self._most_recent_element
next_element = previous_sibling = next_sibling = None next_element = previous_sibling = next_sibling = None
if isinstance(o, Tag): if isinstance(o, Tag):
next_element = o.next_element next_element = o.next_element
next_sibling = o.next_sibling next_sibling = o.next_sibling
previous_sibling = o.previous_sibling previous_sibling = o.previous_sibling
if not previous_element: if previous_element is None:
previous_element = o.previous_element previous_element = o.previous_element
fix = parent.next_element is not None
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
self._most_recent_element = o self._most_recent_element = o
parent.contents.append(o) parent.contents.append(o)
if parent.next_sibling: # Check if we are inserting into an already parsed node.
# This node is being inserted into an element that has if fix:
# already been parsed. Deal with any dangling references. self._linkage_fixer(parent)
index = len(parent.contents)-1
while index >= 0: def _linkage_fixer(self, el):
if parent.contents[index] is o: """Make sure linkage of this fragment is sound."""
break
index -= 1 first = el.contents[0]
else: child = el.contents[-1]
raise ValueError( descendant = child
"Error building tree: supposedly %r was inserted "
"into %r after the fact, but I don't see it!" % ( if child is first and el.parent is not None:
o, parent # Parent should be linked to first child
) el.next_element = child
) # We are no longer linked to whatever this element is
if index == 0: prev_el = child.previous_element
previous_element = parent if prev_el is not None and prev_el is not el:
previous_sibling = None prev_el.next_element = None
else: # First child should be linked to the parent, and no previous siblings.
previous_element = previous_sibling = parent.contents[index-1] child.previous_element = el
if index == len(parent.contents)-1: child.previous_sibling = None
next_element = parent.next_sibling
next_sibling = None # We have no sibling as we've been appended as the last.
else: child.next_sibling = None
next_element = next_sibling = parent.contents[index+1]
# This index is a tag, dig deeper for a "last descendant"
o.previous_element = previous_element if isinstance(child, Tag) and child.contents:
if previous_element: descendant = child._last_descendant(False)
previous_element.next_element = o
o.next_element = next_element # As the final step, link last descendant. It should be linked
if next_element: # to the parent's next sibling (if found), else walk up the chain
next_element.previous_element = o # and find a parent with a sibling. It should have no next sibling.
o.next_sibling = next_sibling descendant.next_element = None
if next_sibling: descendant.next_sibling = None
next_sibling.previous_sibling = o target = el
o.previous_sibling = previous_sibling while True:
if previous_sibling: if target is None:
previous_sibling.next_sibling = o break
elif target.next_sibling is not None:
descendant.next_element = target.next_sibling
target.next_sibling.previous_element = child
break
target = target.parent
def _popToTag(self, name, nsprefix=None, inclusivePop=True): def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent """Pops the tag stack up to and including the most recent
@ -520,7 +530,7 @@ class BeautifulSoup(Tag):
self.currentTag, self._most_recent_element) self.currentTag, self._most_recent_element)
if tag is None: if tag is None:
return tag return tag
if self._most_recent_element: if self._most_recent_element is not None:
self._most_recent_element.next_element = tag self._most_recent_element.next_element = tag
self._most_recent_element = tag self._most_recent_element = tag
self.pushTag(tag) self.pushTag(tag)

14
lib/bs4/builder/__init__.py

@ -1,5 +1,5 @@
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file. __license__ = "MIT"
from collections import defaultdict from collections import defaultdict
import itertools import itertools
@ -8,7 +8,7 @@ from bs4.element import (
CharsetMetaAttributeValue, CharsetMetaAttributeValue,
ContentMetaAttributeValue, ContentMetaAttributeValue,
HTMLAwareEntitySubstitution, HTMLAwareEntitySubstitution,
whitespace_re nonwhitespace_re
) )
__all__ = [ __all__ = [
@ -102,6 +102,12 @@ class TreeBuilder(object):
def __init__(self): def __init__(self):
self.soup = None self.soup = None
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.

:param soup: The object to store as this builder's ``soup`` attribute.
"""
self.soup = soup
def reset(self): def reset(self):
pass pass
@ -167,7 +173,7 @@ class TreeBuilder(object):
# values. Split it into a list. # values. Split it into a list.
value = attrs[attr] value = attrs[attr]
if isinstance(value, basestring): if isinstance(value, basestring):
values = whitespace_re.split(value) values = nonwhitespace_re.findall(value)
else: else:
# html5lib sometimes calls setAttributes twice # html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse # for the same tag when rearranging the parse

18
lib/bs4/builder/_html5lib.py

@ -1,5 +1,5 @@
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file. __license__ = "MIT"
__all__ = [ __all__ = [
'HTML5TreeBuilder', 'HTML5TreeBuilder',
@ -15,7 +15,7 @@ from bs4.builder import (
) )
from bs4.element import ( from bs4.element import (
NamespacedAttribute, NamespacedAttribute,
whitespace_re, nonwhitespace_re,
) )
import html5lib import html5lib
from html5lib.constants import ( from html5lib.constants import (
@ -206,7 +206,7 @@ class AttrList(object):
# A node that is being cloned may have already undergone # A node that is being cloned may have already undergone
# this procedure. # this procedure.
if not isinstance(value, list): if not isinstance(value, list):
value = whitespace_re.split(value) value = nonwhitespace_re.findall(value)
self.element[name] = value self.element[name] = value
def items(self): def items(self):
return list(self.attrs.items()) return list(self.attrs.items())
@ -249,7 +249,7 @@ class Element(treebuilder_base.Node):
if not isinstance(child, basestring) and child.parent is not None: if not isinstance(child, basestring) and child.parent is not None:
node.element.extract() node.element.extract()
if (string_child and self.element.contents if (string_child is not None and self.element.contents
and self.element.contents[-1].__class__ == NavigableString): and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string. # We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like # TODO This has O(n^2) performance, for input like
@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
# Set the first child's previous_element and previous_sibling # Set the first child's previous_element and previous_sibling
# to elements within the new parent # to elements within the new parent
first_child = to_append[0] first_child = to_append[0]
if new_parents_last_descendant: if new_parents_last_descendant is not None:
first_child.previous_element = new_parents_last_descendant first_child.previous_element = new_parents_last_descendant
else: else:
first_child.previous_element = new_parent_element first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child first_child.previous_sibling = new_parents_last_child
if new_parents_last_descendant: if new_parents_last_descendant is not None:
new_parents_last_descendant.next_element = first_child new_parents_last_descendant.next_element = first_child
else: else:
new_parent_element.next_element = first_child new_parent_element.next_element = first_child
if new_parents_last_child: if new_parents_last_child is not None:
new_parents_last_child.next_sibling = first_child new_parents_last_child.next_sibling = first_child
# Find the very last element being moved. It is now the # Find the very last element being moved. It is now the
@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
last_childs_last_descendant = to_append[-1]._last_descendant(False, True) last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element: if new_parents_last_descendant_next_element is not None:
# TODO: This code has no test coverage and I'm not sure # TODO: This code has no test coverage and I'm not sure
# how to get html5lib to go through this path, but it's # how to get html5lib to go through this path, but it's
# just the other side of the previous line. # just the other side of the previous line.

4
lib/bs4/builder/_htmlparser.py

@ -1,8 +1,8 @@
# encoding: utf-8 # encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad.""" """Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file. __license__ = "MIT"
__all__ = [ __all__ = [
'HTMLParserTreeBuilder', 'HTMLParserTreeBuilder',

49
lib/bs4/builder/_lxml.py

@ -1,5 +1,6 @@
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file. __license__ = "MIT"
__all__ = [ __all__ = [
'LXMLTreeBuilderForXML', 'LXMLTreeBuilderForXML',
'LXMLTreeBuilder', 'LXMLTreeBuilder',
@ -32,6 +33,10 @@ from bs4.dammit import EncodingDetector
LXML = 'lxml' LXML = 'lxml'
def _invert(d):
    """Invert a dictionary.

    :param d: A dictionary whose values are hashable.
    :return: A new dictionary mapping each value of `d` back to its key.
        If several keys share a value, the key reached last while
        iterating `d` wins.
    """
    return {value: key for key, value in d.items()}
class LXMLTreeBuilderForXML(TreeBuilder): class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser DEFAULT_PARSER_CLASS = etree.XMLParser
@ -48,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder):
# This namespace mapping is specified in the XML Namespace # This namespace mapping is specified in the XML Namespace
# standard. # standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.

Delegates to the parent class's initialize_soup() first, then
registers DEFAULT_NSMAPS through _register_namespaces().

:param soup: Passed through unchanged to the parent implementation.
"""
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
self._register_namespaces(self.DEFAULT_NSMAPS)
def _register_namespaces(self, mapping):
    """Let the BeautifulSoup object know about namespaces encountered
    while parsing the document.

    This might be useful later on when creating CSS selectors.

    :param mapping: A dictionary keyed by namespace prefix. Entries
        whose prefix is false-y (``None`` or ``''``) are skipped.
    """
    for prefix, uri in mapping.items():
        if prefix:
            # If there are multiple namespaces defined with the same
            # prefix, the first one in the document takes precedence,
            # so an already-registered prefix is never overwritten.
            self.soup._namespaces.setdefault(prefix, uri)
def default_parser(self, encoding): def default_parser(self, encoding):
# This can either return a parser object or a class, which # This can either return a parser object or a class, which
@ -75,8 +102,8 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if empty_element_tags is not None: if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags) self.empty_element_tags = set(empty_element_tags)
self.soup = None self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS] self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def _getNsTag(self, tag): def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag # Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py. # name. Copied from lxml's src/lxml/sax.py.
@ -144,7 +171,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(str(e))
def close(self): def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS] self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
def start(self, name, attrs, nsmap={}): def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
@ -158,8 +185,14 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.nsmaps.append(None) self.nsmaps.append(None)
elif len(nsmap) > 0: elif len(nsmap) > 0:
# A new namespace mapping has come into play. # A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap) # First, Let the BeautifulSoup object know about it.
self._register_namespaces(nsmap)
# Then, add it to our running list of inverted namespace
# mappings.
self.nsmaps.append(_invert(nsmap))
# Also treat the namespace mapping as a set of attributes on the # Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later. # tag, so we can recreate it later.
attrs = attrs.copy() attrs = attrs.copy()

3
lib/bs4/dammit.py

@ -6,8 +6,7 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job. XML or HTML to reflect a new encoding; that's the tree builder's job.
""" """
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
import codecs import codecs

3
lib/bs4/diagnose.py

@ -1,7 +1,6 @@
"""Diagnostic functions, mainly for use when doing tech support.""" """Diagnostic functions, mainly for use when doing tech support."""
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
import cProfile import cProfile

432
lib/bs4/element.py

@ -1,5 +1,4 @@
# Use of this source code is governed by a BSD-style license that can be # Use of this source code is governed by the MIT license.
# found in the LICENSE file.
__license__ = "MIT" __license__ = "MIT"
try: try:
@ -7,14 +6,25 @@ try:
except ImportError , e: except ImportError , e:
from collections import Callable from collections import Callable
import re import re
import shlex
import sys import sys
import warnings import warnings
try:
import soupsieve
except ImportError, e:
soupsieve = None
warnings.warn(
'The soupsieve package is not installed. CSS selectors cannot be used.'
)
from bs4.dammit import EntitySubstitution from bs4.dammit import EntitySubstitution
DEFAULT_OUTPUT_ENCODING = "utf-8" DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2) PY3K = (sys.version_info[0] > 2)
nonwhitespace_re = re.compile(r"\S+")
# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
whitespace_re = re.compile(r"\s+") whitespace_re = re.compile(r"\s+")
def _alias(attr): def _alias(attr):
@ -207,7 +217,7 @@ class PageElement(object):
if formatter is None: if formatter is None:
output = s output = s
else: else:
if callable(formatter): if isinstance(formatter, Callable):
# Backwards compatibility -- you used to pass in a formatting method. # Backwards compatibility -- you used to pass in a formatting method.
output = formatter(s) output = formatter(s)
else: else:
@ -256,26 +266,26 @@ class PageElement(object):
self.previous_element.next_element = self self.previous_element.next_element = self
self.next_element = next_element self.next_element = next_element
if self.next_element: if self.next_element is not None:
self.next_element.previous_element = self self.next_element.previous_element = self
self.next_sibling = next_sibling self.next_sibling = next_sibling
if self.next_sibling: if self.next_sibling is not None:
self.next_sibling.previous_sibling = self self.next_sibling.previous_sibling = self
if (not previous_sibling if (previous_sibling is None
and self.parent is not None and self.parent.contents): and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1] previous_sibling = self.parent.contents[-1]
self.previous_sibling = previous_sibling self.previous_sibling = previous_sibling
if previous_sibling: if previous_sibling is not None:
self.previous_sibling.next_sibling = self self.previous_sibling.next_sibling = self
nextSibling = _alias("next_sibling") # BS3 nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with): def replace_with(self, replace_with):
if not self.parent: if self.parent is None:
raise ValueError( raise ValueError(
"Cannot replace one element with another when the" "Cannot replace one element with another when the"
"element to be replaced is not part of a tree.") "element to be replaced is not part of a tree.")
@ -292,7 +302,7 @@ class PageElement(object):
def unwrap(self): def unwrap(self):
my_parent = self.parent my_parent = self.parent
if not self.parent: if self.parent is None:
raise ValueError( raise ValueError(
"Cannot replace an element with its contents when that" "Cannot replace an element with its contents when that"
"element is not part of a tree.") "element is not part of a tree.")
@ -340,7 +350,7 @@ class PageElement(object):
def _last_descendant(self, is_initialized=True, accept_self=True): def _last_descendant(self, is_initialized=True, accept_self=True):
"Finds the last element beneath this object to be parsed." "Finds the last element beneath this object to be parsed."
if is_initialized and self.next_sibling: if is_initialized and self.next_sibling is not None:
last_child = self.next_sibling.previous_element last_child = self.next_sibling.previous_element
else: else:
last_child = self last_child = self
@ -430,43 +440,54 @@ class PageElement(object):
"""Appends the given tag to the contents of this tag.""" """Appends the given tag to the contents of this tag."""
self.insert(len(self.contents), tag) self.insert(len(self.contents), tag)
def insert_before(self, predecessor): def extend(self, tags):
"""Makes the given element the immediate predecessor of this one. """Appends the given tags to the contents of this tag."""
for tag in tags:
self.append(tag)
def insert_before(self, *args):
"""Makes the given element(s) the immediate predecessor of this one.
The two elements will have the same parent, and the given element The elements will have the same parent, and the given elements
will be immediately before this one. will be immediately before this one.
""" """
if self is predecessor:
raise ValueError("Can't insert an element before itself.")
parent = self.parent parent = self.parent
if parent is None: if parent is None:
raise ValueError( raise ValueError(
"Element has no parent, so 'before' has no meaning.") "Element has no parent, so 'before' has no meaning.")
# Extract first so that the index won't be screwed up if they if any(x is self for x in args):
# are siblings. raise ValueError("Can't insert an element before itself.")
if isinstance(predecessor, PageElement): for predecessor in args:
predecessor.extract() # Extract first so that the index won't be screwed up if they
index = parent.index(self) # are siblings.
parent.insert(index, predecessor) if isinstance(predecessor, PageElement):
predecessor.extract()
def insert_after(self, successor): index = parent.index(self)
"""Makes the given element the immediate successor of this one. parent.insert(index, predecessor)
The two elements will have the same parent, and the given element def insert_after(self, *args):
"""Makes the given element(s) the immediate successor of this one.
The elements will have the same parent, and the given elements
will be immediately after this one. will be immediately after this one.
""" """
if self is successor: # Do all error checking before modifying the tree.
raise ValueError("Can't insert an element after itself.")
parent = self.parent parent = self.parent
if parent is None: if parent is None:
raise ValueError( raise ValueError(
"Element has no parent, so 'after' has no meaning.") "Element has no parent, so 'after' has no meaning.")
# Extract first so that the index won't be screwed up if they if any(x is self for x in args):
# are siblings. raise ValueError("Can't insert an element after itself.")
if isinstance(successor, PageElement):
successor.extract() offset = 0
index = parent.index(self) for successor in args:
parent.insert(index+1, successor) # Extract first so that the index won't be screwed up if they
# are siblings.
if isinstance(successor, PageElement):
successor.extract()
index = parent.index(self)
parent.insert(index+1+offset, successor)
offset += 1
def find_next(self, name=None, attrs={}, text=None, **kwargs): def find_next(self, name=None, attrs={}, text=None, **kwargs):
"""Returns the first item that matches the given criteria and """Returns the first item that matches the given criteria and
@ -657,82 +678,6 @@ class PageElement(object):
yield i yield i
i = i.parent i = i.parent
# Methods for supporting CSS selectors.
tag_name_re = re.compile('^[a-zA-Z0-9][-.a-zA-Z0-9:_]*$')
# /^([a-zA-Z0-9][-.a-zA-Z0-9:_]*)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
# \---------------------------/ \---/\-------------/ \-------/
# | | | |
# | | | The value
# | | ~,|,^,$,* or =
# | Attribute
# Tag
attribselect_re = re.compile(
r'^(?P<tag>[a-zA-Z0-9][-.a-zA-Z0-9:_]*)?\[(?P<attribute>[\w-]+)(?P<operator>[=~\|\^\$\*]?)' +
r'=?"?(?P<value>[^\]"]*)"?\]$'
)
def _attr_value_as_string(self, value, default=None):
    """Force an attribute value into a string representation.

    A multi-valued attribute will be converted into a
    space-separated string.

    :param value: The key looked up with ``self.get()``. Despite the
        parameter's name, this is the attribute's name, not its value.
    :param default: Handed to ``self.get()`` as the fallback for a
        missing attribute.
    :return: The looked-up value; a list or tuple is joined on single
        spaces, anything else is returned unchanged.
    """
    attr_value = self.get(value, default)
    if isinstance(attr_value, (list, tuple)):
        attr_value = " ".join(attr_value)
    return attr_value
def _tag_name_matches_and(self, function, tag_name):
    """Combine a checker function with an optional tag-name test.

    :param function: A callable taking a tag and returning a truth value.
    :param tag_name: The name a tag must have, or a false-y value
        (``None`` or ``''``) to impose no name restriction.
    :return: `function` itself when `tag_name` is false-y; otherwise a
        new callable that is true only for tags whose ``name`` equals
        `tag_name` and for which `function` is also true.
    """
    if not tag_name:
        return function

    def _match(tag):
        # The name test comes first so `function` is only called for
        # tags that already have the right name.
        return tag.name == tag_name and function(tag)
    return _match
def _attribute_checker(self, operator, attribute, value=''):
    """Create a function that performs a CSS selector operation.

    Takes an operator, attribute and optional value. Returns a
    function that will return True for elements that match that
    combination.

    :param operator: One of ``'='``, ``'~'``, ``'^'``, ``'$'``, ``'*'``
        or ``'|'``. Any other value selects a plain "has this
        attribute" check.
    :param attribute: The name of the attribute to inspect.
    :param value: The string the attribute is compared against.
    :return: A one-argument callable taking an element.
    """
    def _as_string(element):
        # A missing attribute is treated as the empty string.
        return element._attr_value_as_string(attribute, '')

    def _equals(element):
        # string representation of `attribute` is equal to `value`.
        # No default is supplied here, so a missing attribute yields
        # None rather than ''.
        return element._attr_value_as_string(attribute) == value

    def _includes_value(element):
        # space-separated list representation of `attribute`
        # contains `value`
        attribute_value = element.get(attribute, [])
        if not isinstance(attribute_value, list):
            attribute_value = attribute_value.split()
        return value in attribute_value

    def _starts_with(element):
        # string representation of `attribute` starts with `value`
        return _as_string(element).startswith(value)

    def _ends_with(element):
        # string representation of `attribute` ends with `value`
        return _as_string(element).endswith(value)

    def _contains(element):
        # string representation of `attribute` contains `value`
        return value in _as_string(element)

    def _is_or_starts_with_dash(element):
        # string representation of `attribute` is either exactly
        # `value` or starts with `value` and then a dash.
        attribute_value = _as_string(element)
        return (attribute_value == value or attribute_value.startswith(
            value + '-'))

    def _has_attribute(element):
        return element.has_attr(attribute)

    checkers = {
        '=': _equals,
        '~': _includes_value,
        '^': _starts_with,
        '$': _ends_with,
        '*': _contains,
        '|': _is_or_starts_with_dash,
    }
    return checkers.get(operator, _has_attribute)
# Old non-property versions of the generators, for backwards # Old non-property versions of the generators, for backwards
# compatibility with BS3. # compatibility with BS3.
def nextGenerator(self): def nextGenerator(self):
@ -1193,7 +1138,7 @@ class Tag(PageElement):
# First off, turn a string formatter into a Formatter object. This # First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again. # will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter) and not callable(formatter): if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
formatter = self._formatter_for_name(formatter) formatter = self._formatter_for_name(formatter)
attrs = [] attrs = []
if self.attrs: if self.attrs:
@ -1298,7 +1243,7 @@ class Tag(PageElement):
""" """
# First off, turn a string formatter into a Formatter object. This # First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again. # will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter) and not callable(formatter): if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
formatter = self._formatter_for_name(formatter) formatter = self._formatter_for_name(formatter)
pretty_print = (indent_level is not None) pretty_print = (indent_level is not None)
@ -1394,250 +1339,41 @@ class Tag(PageElement):
current = current.next_element current = current.next_element
# CSS selector code # CSS selector code
def select_one(self, selector, namespaces=None, **kwargs):
_selector_combinators = ['>', '+', '~']
_select_debug = False
quoted_colon = re.compile('"[^"]*:[^"]*"')
def select_one(self, selector):
"""Perform a CSS selection operation on the current element.""" """Perform a CSS selection operation on the current element."""
value = self.select(selector, limit=1) value = self.select(selector, namespaces, 1, **kwargs)
if value: if value:
return value[0] return value[0]
return None return None
def select(self, selector, _candidate_generator=None, limit=None): def select(self, selector, namespaces=None, limit=None, **kwargs):
"""Perform a CSS selection operation on the current element.""" """Perform a CSS selection operation on the current element.
# Handle grouping selectors if ',' exists, ie: p,a This uses the SoupSieve library.
if ',' in selector:
context = []
selectors = [x.strip() for x in selector.split(",")]
# If a selector is mentioned multiple times we don't want
# to use it more than once.
used_selectors = set()
# We also don't want to select the same element more than once,
# if it's matched by multiple selectors.
selected_object_ids = set()
for partial_selector in selectors:
if partial_selector == '':
raise ValueError('Invalid group selection syntax: %s' % selector)
if partial_selector in used_selectors:
continue
used_selectors.add(partial_selector)
candidates = self.select(partial_selector, limit=limit)
for candidate in candidates:
# This lets us distinguish between distinct tags that
# represent the same markup.
object_id = id(candidate)
if object_id not in selected_object_ids:
context.append(candidate)
selected_object_ids.add(object_id)
if limit and len(context) >= limit:
break
return context
tokens = shlex.split(selector)
current_context = [self]
if tokens[-1] in self._selector_combinators:
raise ValueError(
'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug: :param selector: A string containing a CSS selector.
print 'Running CSS selector "%s"' % selector
for index, token in enumerate(tokens): :param namespaces: A dictionary mapping namespace prefixes
new_context = [] used in the CSS selector to namespace URIs. By default,
new_context_ids = set([]) Beautiful Soup will use the prefixes it encountered while
parsing the document.
if tokens[index-1] in self._selector_combinators: :param limit: After finding this number of results, stop looking.
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
print ' Token was consumed by the previous combinator.'
continue
if self._select_debug: :param kwargs: Any extra arguments you'd like to pass in to
print ' Considering token "%s"' % token soupsieve.select().
recursive_candidate_generator = None """
tag_name = None if namespaces is None:
namespaces = self._namespaces
# Each operation corresponds to a checker function, a rule
# for determining whether a candidate matches the if limit is None:
# selector. Candidates are generated by the active limit = 0
# iterator. if soupsieve is None:
checker = None raise NotImplementedError(
"Cannot execute CSS selectors because the soupsieve package is not installed."
m = self.attribselect_re.match(token) )
if m is not None:
# Attribute selector return soupsieve.select(selector, self, namespaces, limit, **kwargs)
tag_name, attribute, operator, value = m.groups()
checker = self._attribute_checker(operator, attribute, value)
elif '#' in token:
# ID selector
tag_name, tag_id = token.split('#', 1)
def id_matches(tag):
return tag.get('id', None) == tag_id
checker = id_matches
elif '.' in token:
# Class selector
tag_name, klass = token.split('.', 1)
classes = set(klass.split('.'))
def classes_match(candidate):
return classes.issubset(candidate.get('class', []))
checker = classes_match
elif ':' in token and not self.quoted_colon.search(token):
# Pseudo-class
tag_name, pseudo = token.split(':', 1)
if tag_name == '':
raise ValueError(
"A pseudo-class must be prefixed with a tag name.")
pseudo_attributes = re.match(r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
found = []
if pseudo_attributes is None:
pseudo_type = pseudo
pseudo_value = None
else:
pseudo_type, pseudo_value = pseudo_attributes.groups()
if pseudo_type == 'nth-of-type':
try:
pseudo_value = int(pseudo_value)
except:
raise NotImplementedError(
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
if pseudo_value < 1:
raise ValueError(
'nth-of-type pseudo-class value must be at least 1.')
class Counter(object):
def __init__(self, destination):
self.count = 0
self.destination = destination
def nth_child_of_type(self, tag):
self.count += 1
if self.count == self.destination:
return True
else:
return False
checker = Counter(pseudo_value).nth_child_of_type
else:
raise NotImplementedError(
'Only the following pseudo-classes are implemented: nth-of-type.')
elif token == '*':
# Star selector -- matches everything
pass
elif token == '>':
# Run the next token as a CSS selector against the
# direct children of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.children
elif token == '~':
# Run the next token as a CSS selector against the
# siblings of each tag in the current context.
recursive_candidate_generator = lambda tag: tag.next_siblings
elif token == '+':
# For each tag in the current context, run the next
# token as a CSS selector against the tag's next
# sibling that's a tag.
def next_tag_sibling(tag):
yield tag.find_next_sibling(True)
recursive_candidate_generator = next_tag_sibling
elif self.tag_name_re.match(token):
# Just a tag name.
tag_name = token
else:
raise ValueError(
'Unsupported or invalid CSS selector: "%s"' % token)
if recursive_candidate_generator:
# This happens when the selector looks like "> foo".
#
# The generator calls select() recursively on every
# member of the current context, passing in a different
# candidate generator and a different selector.
#
# In the case of "> foo", the candidate generator is
# one that yields a tag's direct children (">"), and
# the selector is "foo".
next_token = tokens[index+1]
def recursive_select(tag):
if self._select_debug:
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
print '-' * 40
for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug:
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
yield i
if self._select_debug:
print '-' * 40
_use_candidate_generator = recursive_select
elif _candidate_generator is None:
# By default, a tag's candidates are all of its
# children. If tag_name is defined, only yield tags
# with that name.
if self._select_debug:
if tag_name:
check = "[any]"
else:
check = tag_name
print ' Default candidate generator, tag name="%s"' % check
if self._select_debug:
# This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the
# debug log.
def default_candidate_generator(tag):
for child in tag.descendants:
if not isinstance(child, Tag):
continue
if tag_name and not child.name == tag_name:
continue
yield child
_use_candidate_generator = default_candidate_generator
else:
_use_candidate_generator = lambda tag: tag.descendants
else:
_use_candidate_generator = _candidate_generator
count = 0
for tag in current_context:
if self._select_debug:
print " Running candidate generator on %s %s" % (
tag.name, repr(tag.attrs))
for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag):
continue
if tag_name and candidate.name != tag_name:
continue
if checker is not None:
try:
result = checker(candidate)
except StopIteration:
# The checker has decided we should no longer
# run the generator.
break
if checker is None or result:
if self._select_debug:
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once,
# don't include it in the context more than once.
new_context.append(candidate)
new_context_ids.add(id(candidate))
elif self._select_debug:
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
current_context = new_context
if limit and len(current_context) >= limit:
current_context = current_context[:limit]
if self._select_debug:
print "Final verdict:"
for i in current_context:
print " %s %s" % (i.name, i.attrs)
return current_context
# Old names for backwards compatibility # Old names for backwards compatibility
def childGenerator(self): def childGenerator(self):
@ -1689,7 +1425,7 @@ class SoupStrainer(object):
def _normalize_search_value(self, value): def _normalize_search_value(self, value):
# Leave it alone if it's a Unicode string, a callable, a # Leave it alone if it's a Unicode string, a callable, a
# regular expression, a boolean, or None. # regular expression, a boolean, or None.
if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') if (isinstance(value, unicode) or isinstance(value, Callable) or hasattr(value, 'match')
or isinstance(value, bool) or value is None): or isinstance(value, bool) or value is None):
return value return value

Loading…
Cancel
Save