Browse Source

Merge branch 'feature/UpdateBS4' into develop

pull/1200/head
JackDandy 6 years ago
parent
commit
f7136b1cdd
  1. 1
      CHANGES.md
  2. 115
      lib/bs4/__init__.py
  3. 71
      lib/bs4/builder/__init__.py
  4. 47
      lib/bs4/builder/_html5lib.py
  5. 21
      lib/bs4/builder/_htmlparser.py
  6. 13
      lib/bs4/builder/_lxml.py
  7. 56
      lib/bs4/dammit.py
  8. 411
      lib/bs4/element.py
  9. 99
      lib/bs4/formatter.py

1
CHANGES.md

@ -1,6 +1,7 @@
### 0.21.0 (2019-xx-xx xx:xx:xx UTC)
* Update attr 19.2.0.dev0 (de84609) to 19.2.0.dev0 (154b4e5)
* Update Beautiful Soup 4.7.1 (r497) to 4.8.0 (r526)
* Update Certifi 2019.03.09 (401100f) to 2019.06.16 (84dc766)
* Update DiskCache library 3.1.1 (2649ac9) to 4.0.0 (2c79bb9)
* Update feedparser 5.2.1 (2b11c80) to 5.2.1 (cbe18d0)

115
lib/bs4/__init__.py

@ -18,7 +18,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.7.1"
__version__ = "4.8.0"
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
@ -63,7 +63,7 @@ class BeautifulSoup(Tag):
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
endData(containerClass=NavigableString) # Ends the current data node
endData(containerClass) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
@ -78,14 +78,14 @@ class BeautifulSoup(Tag):
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
element_classes=None, **kwargs):
"""Constructor.
:param markup: A string or a file-like object representing
@ -98,8 +98,10 @@ class BeautifulSoup(Tag):
name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments.
:param builder: A specific TreeBuilder to use instead of looking one
up based on `features`. You shouldn't need to use this.
:param builder: A TreeBuilder subclass to instantiate (or
instance to use) instead of looking one up based on
`features`. You only need to use this if you've implemented a
custom TreeBuilder.
:param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful
@ -115,14 +117,26 @@ class BeautifulSoup(Tag):
the document's encoding but you know Beautiful Soup's guess is
wrong.
:param element_classes: A dictionary mapping BeautifulSoup
classes like Tag and NavigableString to other classes you'd
like to be instantiated instead as the parse tree is
built. This is useful for using subclasses to modify the
default behavior of Tag or NavigableString.
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
Beautiful Soup 4 and there's no need to actually pass keyword
arguments into the constructor.
Beautiful Soup 4; they will result in a warning and then be ignored.
Apart from this, any keyword arguments passed into the BeautifulSoup
constructor are propagated to the TreeBuilder constructor. This
makes it possible to configure a TreeBuilder beyond saying
which one to use.
"""
if 'convertEntities' in kwargs:
del kwargs['convertEntities']
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
@ -177,13 +191,19 @@ class BeautifulSoup(Tag):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
original_features = features
self.element_classes = element_classes or dict()
# We need this information to track whether or not the builder
# was specified well enough that we can omit the 'you need to
# specify a parser' warning.
original_builder = builder
original_features = features
if isinstance(builder, type):
# A builder class was passed in; it needs to be instantiated.
builder_class = builder
builder = None
elif builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
@ -194,9 +214,16 @@ class BeautifulSoup(Tag):
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
if not (original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES):
# At this point either we have a TreeBuilder instance in
# builder, or we have a builder_class that we can instantiate
# with the remaining **kwargs.
if builder is None:
builder = builder_class(**kwargs)
if not original_builder and not (
original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES
):
if builder.is_xml:
markup_type = "XML"
else:
@ -231,7 +258,10 @@ class BeautifulSoup(Tag):
markup_type=markup_type
)
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
else:
if kwargs:
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
self.builder = builder
self.is_xml = builder.is_xml
self.known_xml = self.is_xml
@ -272,6 +302,8 @@ class BeautifulSoup(Tag):
' Beautiful Soup.' % markup)
self._check_markup_is_url(markup)
rejections = []
success = False
for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in (
self.builder.prepare_markup(
@ -279,10 +311,18 @@ class BeautifulSoup(Tag):
self.reset()
try:
self._feed()
success = True
break
except ParserRejectedMarkup:
except ParserRejectedMarkup as e:
rejections.append(e)
pass
if not success:
other_exceptions = [unicode(e) for e in rejections]
raise ParserRejectedMarkup(
u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
)
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
@ -355,13 +395,20 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
sourceline=None, sourcepos=None, **kwattrs):
"""Create a new tag associated with this soup."""
kwattrs.update(attrs)
return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
return self.element_classes.get(Tag, Tag)(
None, self.builder, name, namespace, nsprefix, kwattrs,
sourceline=sourceline, sourcepos=sourcepos
)
def new_string(self, s, subclass=NavigableString):
def new_string(self, s, subclass=None):
"""Create a new NavigableString associated with this soup."""
subclass = subclass or self.element_classes.get(
NavigableString, NavigableString
)
return subclass(s)
def insert_before(self, successor):
@ -388,7 +435,17 @@ class BeautifulSoup(Tag):
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
def endData(self, containerClass=NavigableString):
def endData(self, containerClass=None):
# Default container is NavigableString.
containerClass = containerClass or NavigableString
# The user may want us to instantiate some alias for the
# container class.
containerClass = self.element_classes.get(
containerClass, containerClass
)
if self.current_data:
current_data = u''.join(self.current_data)
# If whitespace is not preserved, and this string contains
@ -509,7 +566,8 @@ class BeautifulSoup(Tag):
return most_recently_popped
def handle_starttag(self, name, namespace, nsprefix, attrs):
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
sourcepos=None):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
@ -526,8 +584,11 @@ class BeautifulSoup(Tag):
or not self.parse_only.search_tag(name, attrs))):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self._most_recent_element)
tag = self.element_classes.get(Tag, Tag)(
self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self._most_recent_element,
sourceline=sourceline, sourcepos=sourcepos
)
if tag is None:
return tag
if self._most_recent_element is not None:

71
lib/bs4/builder/__init__.py

@ -7,7 +7,6 @@ import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
HTMLAwareEntitySubstitution,
nonwhitespace_re
)
@ -90,18 +89,58 @@ class TreeBuilder(object):
is_xml = False
picklable = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
USE_DEFAULT = object()
def __init__(self):
# Most parsers don't keep track of line numbers.
TRACKS_LINE_NUMBERS = False
def __init__(self, multi_valued_attributes=USE_DEFAULT,
preserve_whitespace_tags=USE_DEFAULT,
store_line_numbers=USE_DEFAULT):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
TreeBuilder will not turn any values for attributes like
'class' into lists. Setting this do a dictionary will
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
for an example.
Internally, these are called "CDATA list attributes", but that
probably doesn't make sense to an end-user, so the argument name
is `multi_valued_attributes`.
:param preserve_whitespace_tags: A list of tags to treat
the way <pre> tags are treated in HTML. Tags in this list
will have
:param store_line_numbers: If the parser keeps track of the
line numbers and positions of the original markup, that
information will, by default, be stored in each corresponding
`Tag` object. You can turn this off by passing
store_line_numbers=False. If the parser you're using doesn't
keep track of this information, then setting store_line_numbers=True
will do nothing.
"""
self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
self.cdata_list_attributes = multi_valued_attributes
if preserve_whitespace_tags is self.USE_DEFAULT:
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
self.preserve_whitespace_tags = preserve_whitespace_tags
if store_line_numbers == self.USE_DEFAULT:
store_line_numbers = self.TRACKS_LINE_NUMBERS
self.store_line_numbers = store_line_numbers
def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder.
@ -131,13 +170,13 @@ class TreeBuilder(object):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
return markup, None, None, False
document_declared_encoding=None, exclude_encodings=None):
yield markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
@ -237,7 +276,6 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
@ -259,7 +297,7 @@ class HTMLTreeBuilder(TreeBuilder):
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
cdata_list_attributes = {
DEFAULT_CDATA_LIST_ATTRIBUTES = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
@ -276,6 +314,8 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"],
}
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
@ -323,8 +363,15 @@ def register_treebuilders_from(module):
this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception):
pass
def __init__(self, message_or_exception):
"""Explain why the parser rejected the given markup, either
with a textual explanation or another exception.
"""
if isinstance(message_or_exception, Exception):
e = message_or_exception
message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
super(ParserRejectedMarkup, self).__init__(message_or_exception)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only

47
lib/bs4/builder/_html5lib.py

@ -45,6 +45,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
features = [NAME, PERMISSIVE, HTML_5, HTML]
# html5lib can tell us which line number and position in the
# original file is the source of an element.
TRACKS_LINE_NUMBERS = True
def prepare_markup(self, markup, user_specified_encoding,
document_declared_encoding=None, exclude_encodings=None):
# Store the user-specified encoding for use later on.
@ -62,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser
extra_kwargs = dict()
if not isinstance(markup, unicode):
if new_html5lib:
@ -70,7 +74,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
else:
extra_kwargs['encoding'] = self.user_specified_encoding
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
# We need to special-case this because html5lib sets
@ -84,10 +88,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# with other tree builders.
original_encoding = original_encoding.name
doc.original_encoding = original_encoding
self.underlying_builder.parser = None
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
namespaceHTMLElements, self.soup)
namespaceHTMLElements, self.soup,
store_line_numbers=self.store_line_numbers
)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
@ -96,15 +103,26 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, soup=None):
def __init__(self, namespaceHTMLElements, soup=None,
store_line_numbers=True, **kwargs):
if soup:
self.soup = soup
else:
from bs4 import BeautifulSoup
self.soup = BeautifulSoup("", "html.parser")
# TODO: Why is the parser 'html.parser' here? To avoid an
# infinite loop?
self.soup = BeautifulSoup(
"", "html.parser", store_line_numbers=store_line_numbers,
**kwargs
)
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
# This will be set later to an html5lib.html5parser.HTMLParser
# object, which we can use to track the current line number.
self.parser = None
self.store_line_numbers = store_line_numbers
def documentClass(self):
self.soup.reset()
return Element(self.soup, self.soup, None)
@ -118,7 +136,16 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
tag = self.soup.new_tag(name, namespace)
kwargs = {}
if self.parser and self.store_line_numbers:
# This represents the point immediately after the end of the
# tag. We don't know when the tag started, but we do know
# where it ended -- the character just before this one.
sourceline, sourcepos = self.parser.tokenizer.stream.position()
kwargs['sourceline'] = sourceline
kwargs['sourcepos'] = sourcepos-1
tag = self.soup.new_tag(name, namespace, **kwargs)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
@ -126,6 +153,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def fragmentClass(self):
from bs4 import BeautifulSoup
# TODO: Why is the parser 'html.parser' here? To avoid an
# infinite loop?
self.soup = BeautifulSoup("", "html.parser")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
@ -199,7 +228,7 @@ class AttrList(object):
def __setitem__(self, name, value):
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
list_attr = HTML5TreeBuilder.cdata_list_attributes
list_attr = self.element.cdata_list_attributes
if (name in list_attr['*']
or (self.element.name in list_attr
and name in list_attr[self.element.name])):

21
lib/bs4/builder/_htmlparser.py

@ -99,7 +99,11 @@ class BeautifulSoupHTMLParser(HTMLParser):
attr_dict[key] = value
attrvalue = '""'
#print "START", name
tag = self.soup.handle_starttag(name, None, None, attr_dict)
sourceline, sourcepos = self.getpos()
tag = self.soup.handle_starttag(
name, None, None, attr_dict, sourceline=sourceline,
sourcepos=sourcepos
)
if tag and tag.is_empty_element and handle_empty_element:
# Unlike other parsers, html.parser doesn't send separate end tag
# events for empty-element tags. (It's handled in
@ -214,12 +218,19 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs):
# The html.parser knows which line number and position in the
# original file is the source of an element.
TRACKS_LINE_NUMBERS = True
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
kwargs['strict'] = False
parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
kwargs['convert_charrefs'] = False
self.parser_args = (args, kwargs)
parser_kwargs['convert_charrefs'] = False
self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):

13
lib/bs4/builder/_lxml.py

@ -57,6 +57,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
# NOTE: If we parsed Element objects and looked at .sourceline,
# we'd be able to see the line numbers from the original document.
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace
mapping.
@ -94,7 +100,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
def __init__(self, parser=None, empty_element_tags=None):
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
# TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new
# parsers for different encodings.
@ -103,6 +109,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.empty_element_tags = set(empty_element_tags)
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
@ -168,7 +175,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.parser.feed(data)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
raise ParserRejectedMarkup(str(e))
raise ParserRejectedMarkup(e)
def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
@ -287,7 +294,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser.feed(markup)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
raise ParserRejectedMarkup(str(e))
raise ParserRejectedMarkup(e)
def test_fragment_to_document(self, fragment):

56
lib/bs4/dammit.py

@ -22,6 +22,8 @@ try:
# PyPI package: cchardet
import cchardet
def chardet_dammit(s):
if isinstance(s, unicode):
return None
return cchardet.detect(s)['encoding']
except ImportError:
try:
@ -30,6 +32,8 @@ except ImportError:
# PyPI package: chardet
import chardet
def chardet_dammit(s):
if isinstance(s, unicode):
return None
return chardet.detect(s)['encoding']
#import chardet.constants
#chardet.constants._debug = 1
@ -44,10 +48,19 @@ try:
except ImportError:
pass
xml_encoding_re = re.compile(
'^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
xml_encoding = u'^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = dict()
encoding_res[bytes] = {
'html' : re.compile(html_meta.encode("ascii"), re.I),
'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[unicode] = {
'html' : re.compile(html_meta, re.I),
'xml' : re.compile(xml_encoding, re.I)
}
class EntitySubstitution(object):
@ -57,15 +70,24 @@ class EntitySubstitution(object):
lookup = {}
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
# &apos is an XHTML entity and an HTML 5, but not an HTML 4
# entity. We don't want to use it, but we want to recognize it on the way in.
#
# TODO: Ideally we would be able to recognize all HTML 5 named
# entities, but that's a little tricky.
extra = [(39, 'apos')]
for codepoint, name in list(codepoint2name.items()) + extra:
character = unichr(codepoint)
if codepoint != 34:
if codepoint not in (34, 39):
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
# &quot; or the single quote into &apos;, unless it
# happens within an attribute value, which is handled
# elsewhere.
characters_for_re.append(character)
lookup[character] = name
# But we do want to turn &quot; into the quotation mark.
# But we do want to recognize those entities on the way in and
# convert them to Unicode characters.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
@ -310,14 +332,22 @@ class EncodingDetector:
xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05))
if isinstance(markup, bytes):
res = encoding_res[bytes]
else:
res = encoding_res[unicode]
xml_re = res['xml']
html_re = res['html']
declared_encoding = None
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html:
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
declared_encoding_match = html_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode(
'ascii', 'replace')
declared_encoding = declared_encoding_match.groups()[0]
if declared_encoding:
if isinstance(declared_encoding, bytes):
declared_encoding = declared_encoding.decode('ascii', 'replace')
return declared_encoding.lower()
return None

411
lib/bs4/element.py

@ -16,7 +16,11 @@ except ImportError, e:
'The soupsieve package is not installed. CSS selectors cannot be used.'
)
from bs4.dammit import EntitySubstitution
from bs4.formatter import (
Formatter,
HTMLFormatter,
XMLFormatter,
)
DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2)
@ -42,6 +46,11 @@ def _alias(attr):
class NamespacedAttribute(unicode):
def __new__(cls, prefix, name, namespace=None):
if not name:
# This is the default namespace. Its name "has no value"
# per https://www.w3.org/TR/xml-names/#defaulting
name = None
if name is None:
obj = unicode.__new__(cls, prefix)
elif prefix is None:
@ -99,138 +108,71 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
class HTMLAwareEntitySubstitution(EntitySubstitution):
"""Entity substitution rules that are aware of some HTML quirks.
Specifically, the contents of <script> and <style> tags should not
undergo entity substitution.
Incoming NavigableString objects are checked to see if they're the
direct children of a <script> or <style> tag.
"""
cdata_containing_tags = set(["script", "style"])
preformatted_tags = set(["pre"])
preserve_whitespace_tags = set(['pre', 'textarea'])
@classmethod
def _substitute_if_appropriate(cls, ns, f):
if (isinstance(ns, NavigableString)
and ns.parent is not None
and ns.parent.name in cls.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
return f(ns)
@classmethod
def substitute_html(cls, ns):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_html)
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
other elements."""
self.parent = parent
@classmethod
def substitute_xml(cls, ns):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_xml)
self.previous_element = previous_element
if previous_element is not None:
self.previous_element.next_element = self
class Formatter(object):
"""Contains information about how to format a parse tree."""
# By default, represent void elements as <tag/> rather than <tag>
void_element_close_prefix = '/'
def substitute_entities(self, *args, **kwargs):
"""Transform certain characters into named entities."""
raise NotImplementedError()
class HTMLFormatter(Formatter):
"""The default HTML formatter."""
def substitute(self, *args, **kwargs):
return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
class MinimalHTMLFormatter(Formatter):
"""A minimal HTML formatter."""
def substitute(self, *args, **kwargs):
return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
class HTML5Formatter(HTMLFormatter):
"""An HTML formatter that omits the slash in a void tag."""
void_element_close_prefix = None
self.next_element = next_element
if self.next_element is not None:
self.next_element.previous_element = self
class XMLFormatter(Formatter):
"""Substitute only the essential XML entities."""
def substitute(self, *args, **kwargs):
return EntitySubstitution.substitute_xml(*args, **kwargs)
self.next_sibling = next_sibling
if self.next_sibling is not None:
self.next_sibling.previous_sibling = self
class HTMLXMLFormatter(Formatter):
"""Format XML using HTML rules."""
def substitute(self, *args, **kwargs):
return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
if (previous_sibling is None
and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1]
class PageElement(object):
"""Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
self.previous_sibling = previous_sibling
if previous_sibling is not None:
self.previous_sibling.next_sibling = self
# There are five possible values for the "formatter" argument passed in
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
# are converted to those entities on output.
# "html5" - The same as "html", but empty void tags are represented as
# <tag> rather than <tag/>
# "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: &amp; &lt; &gt;
# None - The null formatter. Unicode characters are never
# converted to entities. This is not recommended, but it's
# faster than "minimal".
# A callable function - it will be called on every string that needs to undergo entity substitution.
# A Formatter instance - Formatter.substitute(string) will be called on every string that
# needs to undergo entity substitution.
#
# In an HTML document, the default "html", "html5", and "minimal"
# functions will leave the contents of <script> and <style> tags
# alone. For an XML document, all tags will be given the same
# treatment.
HTML_FORMATTERS = {
"html" : HTMLFormatter(),
"html5" : HTML5Formatter(),
"minimal" : MinimalHTMLFormatter(),
None : None
}
XML_FORMATTERS = {
"html" : HTMLXMLFormatter(),
"minimal" : XMLFormatter(),
None : None
}
def format_string(self, s, formatter='minimal'):
def format_string(self, s, formatter):
"""Format the given string using the given formatter."""
if isinstance(formatter, basestring):
formatter = self._formatter_for_name(formatter)
if formatter is None:
output = s
else:
if isinstance(formatter, Callable):
# Backwards compatibility -- you used to pass in a formatting method.
output = formatter(s)
else:
output = formatter.substitute(s)
return s
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
output = formatter.substitute(s)
return output
def formatter_for_name(self, formatter):
"""Look up or create a Formatter for the given identifier,
if necessary.
:param formatter: Can be a Formatter object (used as-is), a
function (used as the entity substitution hook for an
XMLFormatter or HTMLFormatter), or a string (used to look up
an XMLFormatter or HTMLFormatter in the appropriate registry.
"""
if isinstance(formatter, Formatter):
return formatter
if self._is_xml:
c = XMLFormatter
else:
c = HTMLFormatter
if callable(formatter):
return c(entity_substitution=formatter)
return c.REGISTRY[formatter]
@property
def _is_xml(self):
"""Is this element part of an XML tree or an HTML tree?
This is used when mapping a formatter name ("minimal") to an
appropriate function (one that performs entity-substitution on
the contents of <script> and <style> tags, or not). It can be
This is used in formatter_for_name, when deciding whether an
XMLFormatter or HTMLFormatter is more appropriate. It can be
inefficient, but it should be called very rarely.
"""
if self.known_xml is not None:
@ -248,46 +190,13 @@ class PageElement(object):
return getattr(self, 'is_xml', False)
return self.parent._is_xml
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
return self.XML_FORMATTERS.get(name, XMLFormatter())
else:
return self.HTML_FORMATTERS.get(name, HTMLFormatter())
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
other elements."""
self.parent = parent
self.previous_element = previous_element
if previous_element is not None:
self.previous_element.next_element = self
self.next_element = next_element
if self.next_element is not None:
self.next_element.previous_element = self
self.next_sibling = next_sibling
if self.next_sibling is not None:
self.next_sibling.previous_sibling = self
if (previous_sibling is None
and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1]
self.previous_sibling = previous_sibling
if previous_sibling is not None:
self.previous_sibling.next_sibling = self
nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with):
if self.parent is None:
raise ValueError(
"Cannot replace one element with another when the"
"Cannot replace one element with another when the "
"element to be replaced is not part of a tree.")
if replace_with is self:
return
@ -742,6 +651,7 @@ class NavigableString(unicode, PageElement):
self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"):
"""Run the string through the provided formatter."""
output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX
@ -760,10 +670,12 @@ class PreformattedString(NavigableString):
but the return value will be ignored.
"""
def output_ready(self, formatter="minimal"):
"""CData strings are passed into the formatter.
But the return value is ignored."""
self.format_string(self, formatter)
def output_ready(self, formatter=None):
"""CData strings are passed into the formatter, purely
for any side effects. The return value is ignored.
"""
if formatter is not None:
ignore = self.format_string(self, formatter)
return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString):
@ -817,7 +729,10 @@ class Tag(PageElement):
def __init__(self, parser=None, builder=None, name=None, namespace=None,
prefix=None, attrs=None, parent=None, previous=None,
is_xml=None):
is_xml=None, sourceline=None, sourcepos=None,
can_be_empty_element=None, cdata_list_attributes=None,
preserve_whitespace_tags=None
):
"Basic constructor."
if parser is None:
@ -831,14 +746,10 @@ class Tag(PageElement):
self.name = name
self.namespace = namespace
self.prefix = prefix
if builder is not None:
preserve_whitespace_tags = builder.preserve_whitespace_tags
else:
if is_xml:
preserve_whitespace_tags = []
else:
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
self.preserve_whitespace_tags = preserve_whitespace_tags
if ((not builder or builder.store_line_numbers)
and (sourceline is not None or sourcepos is not None)):
self.sourceline = sourceline
self.sourcepos = sourcepos
if attrs is None:
attrs = {}
elif attrs:
@ -861,12 +772,33 @@ class Tag(PageElement):
self.setup(parent, previous)
self.hidden = False
# Set up any substitutions, such as the charset in a META tag.
if builder is not None:
if builder is None:
# In the absence of a TreeBuilder, use whatever values were
# passed in here. They're probably None, unless this is a copy of some
# other tag.
self.can_be_empty_element = can_be_empty_element
self.cdata_list_attributes = cdata_list_attributes
self.preserve_whitespace_tags = preserve_whitespace_tags
else:
# Set up any substitutions for this tag, such as the charset in a META tag.
builder.set_up_substitutions(self)
# Ask the TreeBuilder whether this tag might be an empty-element tag.
self.can_be_empty_element = builder.can_be_empty_element(name)
else:
self.can_be_empty_element = False
# Keep track of the list of attributes of this tag that
# might need to be treated as a list.
#
# For performance reasons, we store the whole data structure
# rather than asking the question of every tag. Asking would
# require building a new data structure every time, and
# (unlike can_be_empty_element), we almost never need
# to check this.
self.cdata_list_attributes = builder.cdata_list_attributes
# Keep track of the names that might cause this tag to be treated as a
# whitespace-preserved tag.
self.preserve_whitespace_tags = builder.preserve_whitespace_tags
parserClass = _alias("parser_class") # BS3
@ -874,8 +806,14 @@ class Tag(PageElement):
"""A copy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents.
"""
clone = type(self)(None, self.builder, self.name, self.namespace,
self.prefix, self.attrs, is_xml=self._is_xml)
clone = type(self)(
None, self.builder, self.name, self.namespace,
self.prefix, self.attrs, is_xml=self._is_xml,
sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes,
preserve_whitespace_tags=self.preserve_whitespace_tags
)
for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr))
for child in self.contents:
@ -981,6 +919,43 @@ class Tag(PageElement):
for element in self.contents[:]:
element.extract()
def smooth(self):
"""Smooth out this element's children by consolidating consecutive strings.
This makes pretty-printed output look more natural following a
lot of operations that modified the tree.
"""
# Mark the first position of every pair of children that need
# to be consolidated. Do this rather than making a copy of
# self.contents, since in most cases very few strings will be
# affected.
marked = []
for i, a in enumerate(self.contents):
if isinstance(a, Tag):
# Recursively smooth children.
a.smooth()
if i == len(self.contents)-1:
# This is the last item in .contents, and it's not a
# tag. There's no chance it needs any work.
continue
b = self.contents[i+1]
if (isinstance(a, NavigableString)
and isinstance(b, NavigableString)
and not isinstance(a, PreformattedString)
and not isinstance(b, PreformattedString)
):
marked.append(i)
# Go over the marked positions in reverse order, so that
# removing items from .contents won't affect the remaining
# positions.
for i in reversed(marked):
a = self.contents[i]
b = self.contents[i+1]
b.extract()
n = NavigableString(a+b)
a.replace_with(n)
def index(self, element):
"""
Find the index of a child by identity, not value. Avoids issues with
@ -1115,14 +1090,6 @@ class Tag(PageElement):
u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors)
def _should_pretty_print(self, indent_level):
"""Should this tag be pretty-printed?"""
return (
indent_level is not None
and self.name not in self.preserve_whitespace_tags
)
def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
@ -1136,30 +1103,32 @@ class Tag(PageElement):
encoding.
"""
# First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
formatter = self._formatter_for_name(formatter)
# First off, turn a non-Formatter `formatter` into a Formatter
# object. This will stop the lookup from happening over and
# over again.
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
attributes = formatter.attributes(self)
attrs = []
if self.attrs:
for key, val in sorted(self.attrs.items()):
if val is None:
decoded = key
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
elif not isinstance(val, basestring):
val = unicode(val)
elif (
for key, val in attributes:
if val is None:
decoded = key
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
elif not isinstance(val, basestring):
val = unicode(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None):
val = val.encode(eventual_encoding)
text = self.format_string(val, formatter)
decoded = (
unicode(key) + '='
+ EntitySubstitution.quoted_attribute_value(text))
attrs.append(decoded)
and eventual_encoding is not None
):
val = val.encode(eventual_encoding)
text = formatter.attribute_value(val)
decoded = (
unicode(key) + '='
+ formatter.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
closeTag = ''
@ -1168,9 +1137,7 @@ class Tag(PageElement):
prefix = self.prefix + ":"
if self.is_empty_element:
close = ''
if isinstance(formatter, Formatter):
close = formatter.void_element_close_prefix or close
close = formatter.void_element_close_prefix or ''
else:
closeTag = '</%s%s>' % (prefix, self.name)
@ -1185,7 +1152,8 @@ class Tag(PageElement):
else:
indent_contents = None
contents = self.decode_contents(
indent_contents, eventual_encoding, formatter)
indent_contents, eventual_encoding, formatter
)
if self.hidden:
# This is the 'document root' object.
@ -1217,6 +1185,16 @@ class Tag(PageElement):
s = ''.join(s)
return s
def _should_pretty_print(self, indent_level):
"""Should this tag be pretty-printed?"""
return (
indent_level is not None
and (
not self.preserve_whitespace_tags
or self.name not in self.preserve_whitespace_tags
)
)
def prettify(self, encoding=None, formatter="minimal"):
if encoding is None:
return self.decode(True, formatter=formatter)
@ -1232,19 +1210,19 @@ class Tag(PageElement):
indented this many spaces.
:param eventual_encoding: The tag is destined to be
encoded into this encoding. This method is _not_
encoded into this encoding. decode_contents() is _not_
responsible for performing that encoding. This information
is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's
encoding.
:param formatter: The output formatter responsible for converting
entities to Unicode characters.
:param formatter: A Formatter object, or a string naming one of
the standard Formatters.
"""
# First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable):
formatter = self._formatter_for_name(formatter)
if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
pretty_print = (indent_level is not None)
s = []
@ -1255,16 +1233,19 @@ class Tag(PageElement):
elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding,
formatter))
if text and indent_level and not self.name == 'pre':
preserve_whitespace = (
self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
)
if text and indent_level and not preserve_whitespace:
text = text.strip()
if text:
if pretty_print and not self.name == 'pre':
if pretty_print and not preserve_whitespace:
s.append(" " * (indent_level - 1))
s.append(text)
if pretty_print and not self.name == 'pre':
if pretty_print and not preserve_whitespace:
s.append("\n")
return ''.join(s)
def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):

99
lib/bs4/formatter.py

@ -0,0 +1,99 @@
from bs4.dammit import EntitySubstitution
class Formatter(EntitySubstitution):
"""Describes a strategy to use when outputting a parse tree to a string.
Some parts of this strategy come from the distinction between
HTML4, HTML5, and XML. Others are configurable by the user.
"""
# Registries of XML and HTML formatters.
XML_FORMATTERS = {}
HTML_FORMATTERS = {}
HTML = 'html'
XML = 'xml'
HTML_DEFAULTS = dict(
cdata_containing_tags=set(["script", "style"]),
)
def _default(self, language, value, kwarg):
if value is not None:
return value
if language == self.XML:
return set()
return self.HTML_DEFAULTS[kwarg]
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
):
"""
:param void_element_close_prefix: By default, represent void
elements as <tag/> rather than <tag>
"""
self.language = language
self.entity_substitution = entity_substitution
self.void_element_close_prefix = void_element_close_prefix
self.cdata_containing_tags = self._default(
language, cdata_containing_tags, 'cdata_containing_tags'
)
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution."""
if not self.entity_substitution:
return ns
from element import NavigableString
if (isinstance(ns, NavigableString)
and ns.parent is not None
and ns.parent.name in self.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
return self.entity_substitution(ns)
def attribute_value(self, value):
"""Process the value of an attribute."""
return self.substitute(value)
def attributes(self, tag):
"""Reorder a tag's attributes however you want."""
return sorted(tag.attrs.items())
class HTMLFormatter(Formatter):
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html,
void_element_close_prefix = None
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
)
XMLFormatter.REGISTRY[None] = Formatter(
Formatter(Formatter.XML, entity_substitution=None)
)
Loading…
Cancel
Save