Browse Source

Merge branch 'feature/UpdateBS4' into develop

pull/1200/head
JackDandy 6 years ago
parent
commit
f7136b1cdd
  1. 1
      CHANGES.md
  2. 115
      lib/bs4/__init__.py
  3. 71
      lib/bs4/builder/__init__.py
  4. 47
      lib/bs4/builder/_html5lib.py
  5. 21
      lib/bs4/builder/_htmlparser.py
  6. 13
      lib/bs4/builder/_lxml.py
  7. 56
      lib/bs4/dammit.py
  8. 411
      lib/bs4/element.py
  9. 99
      lib/bs4/formatter.py

1
CHANGES.md

@ -1,6 +1,7 @@
### 0.21.0 (2019-xx-xx xx:xx:xx UTC) ### 0.21.0 (2019-xx-xx xx:xx:xx UTC)
* Update attr 19.2.0.dev0 (de84609) to 19.2.0.dev0 (154b4e5) * Update attr 19.2.0.dev0 (de84609) to 19.2.0.dev0 (154b4e5)
* Update Beautiful Soup 4.7.1 (r497) to 4.8.0 (r526)
* Update Certifi 2019.03.09 (401100f) to 2019.06.16 (84dc766) * Update Certifi 2019.03.09 (401100f) to 2019.06.16 (84dc766)
* Update DiskCache library 3.1.1 (2649ac9) to 4.0.0 (2c79bb9) * Update DiskCache library 3.1.1 (2649ac9) to 4.0.0 (2c79bb9)
* Update feedparser 5.2.1 (2b11c80) to 5.2.1 (cbe18d0) * Update feedparser 5.2.1 (2b11c80) to 5.2.1 (cbe18d0)

115
lib/bs4/__init__.py

@ -18,7 +18,7 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
""" """
__author__ = "Leonard Richardson (leonardr@segfault.org)" __author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.7.1" __version__ = "4.8.0"
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson" __copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
# Use of this source code is governed by the MIT license. # Use of this source code is governed by the MIT license.
__license__ = "MIT" __license__ = "MIT"
@ -63,7 +63,7 @@ class BeautifulSoup(Tag):
handle_starttag(name, attrs) # See note about return value handle_starttag(name, attrs) # See note about return value
handle_endtag(name) handle_endtag(name)
handle_data(data) # Appends to the current data node handle_data(data) # Appends to the current data node
endData(containerClass=NavigableString) # Ends the current data node endData(containerClass) # Ends the current data node
No matter how complicated the underlying parser is, you should be No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events, able to build a tree using 'start tag' events, 'end tag' events,
@ -78,14 +78,14 @@ class BeautifulSoup(Tag):
# If the end-user gives no indication which tree builder they # If the end-user gives no indication which tree builder they
# want, look for one with these features. # want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast'] DEFAULT_BUILDER_FEATURES = ['html', 'fast']
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n" NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
def __init__(self, markup="", features=None, builder=None, def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None, parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs): element_classes=None, **kwargs):
"""Constructor. """Constructor.
:param markup: A string or a file-like object representing :param markup: A string or a file-like object representing
@ -98,8 +98,10 @@ class BeautifulSoup(Tag):
name a specific parser, so that Beautiful Soup gives you the name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments. same results across platforms and virtual environments.
:param builder: A specific TreeBuilder to use instead of looking one :param builder: A TreeBuilder subclass to instantiate (or
up based on `features`. You shouldn't need to use this. instance to use) instead of looking one up based on
`features`. You only need to use this if you've implemented a
custom TreeBuilder.
:param parse_only: A SoupStrainer. Only parts of the document :param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful matching the SoupStrainer will be considered. This is useful
@ -115,14 +117,26 @@ class BeautifulSoup(Tag):
the document's encoding but you know Beautiful Soup's guess is the document's encoding but you know Beautiful Soup's guess is
wrong. wrong.
:param element_classes: A dictionary mapping BeautifulSoup
classes like Tag and NavigableString to other classes you'd
like to be instantiated instead as the parse tree is
built. This is useful for using subclasses to modify the
default behavior of Tag or NavigableString.
:param kwargs: For backwards compatibility purposes, the :param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in Beautiful Soup 3. None of these arguments do anything in
Beautiful Soup 4 and there's no need to actually pass keyword Beautiful Soup 4; they will result in a warning and then be ignored.
arguments into the constructor.
Apart from this, any keyword arguments passed into the BeautifulSoup
constructor are propagated to the TreeBuilder constructor. This
makes it possible to configure a TreeBuilder beyond saying
which one to use.
""" """
if 'convertEntities' in kwargs: if 'convertEntities' in kwargs:
del kwargs['convertEntities']
warnings.warn( warnings.warn(
"BS4 does not respect the convertEntities argument to the " "BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted " "BeautifulSoup constructor. Entities are always converted "
@ -177,13 +191,19 @@ class BeautifulSoup(Tag):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None from_encoding = None
if len(kwargs) > 0: self.element_classes = element_classes or dict()
arg = kwargs.keys().pop()
raise TypeError( # We need this information to track whether or not the builder
"__init__() got an unexpected keyword argument '%s'" % arg) # was specified well enough that we can omit the 'you need to
# specify a parser' warning.
if builder is None: original_builder = builder
original_features = features original_features = features
if isinstance(builder, type):
# A builder class was passed in; it needs to be instantiated.
builder_class = builder
builder = None
elif builder is None:
if isinstance(features, basestring): if isinstance(features, basestring):
features = [features] features = [features]
if features is None or len(features) == 0: if features is None or len(features) == 0:
@ -194,9 +214,16 @@ class BeautifulSoup(Tag):
"Couldn't find a tree builder with the features you " "Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?" "requested: %s. Do you need to install a parser library?"
% ",".join(features)) % ",".join(features))
builder = builder_class()
if not (original_features == builder.NAME or # At this point either we have a TreeBuilder instance in
original_features in builder.ALTERNATE_NAMES): # builder, or we have a builder_class that we can instantiate
# with the remaining **kwargs.
if builder is None:
builder = builder_class(**kwargs)
if not original_builder and not (
original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES
):
if builder.is_xml: if builder.is_xml:
markup_type = "XML" markup_type = "XML"
else: else:
@ -231,7 +258,10 @@ class BeautifulSoup(Tag):
markup_type=markup_type markup_type=markup_type
) )
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2) warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
else:
if kwargs:
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
self.builder = builder self.builder = builder
self.is_xml = builder.is_xml self.is_xml = builder.is_xml
self.known_xml = self.is_xml self.known_xml = self.is_xml
@ -272,6 +302,8 @@ class BeautifulSoup(Tag):
' Beautiful Soup.' % markup) ' Beautiful Soup.' % markup)
self._check_markup_is_url(markup) self._check_markup_is_url(markup)
rejections = []
success = False
for (self.markup, self.original_encoding, self.declared_html_encoding, for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in ( self.contains_replacement_characters) in (
self.builder.prepare_markup( self.builder.prepare_markup(
@ -279,10 +311,18 @@ class BeautifulSoup(Tag):
self.reset() self.reset()
try: try:
self._feed() self._feed()
success = True
break break
except ParserRejectedMarkup: except ParserRejectedMarkup as e:
rejections.append(e)
pass pass
if not success:
other_exceptions = [unicode(e) for e in rejections]
raise ParserRejectedMarkup(
u"The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)
)
# Clear out the markup and remove the builder's circular # Clear out the markup and remove the builder's circular
# reference to this object. # reference to this object.
self.markup = None self.markup = None
@ -355,13 +395,20 @@ class BeautifulSoup(Tag):
self.preserve_whitespace_tag_stack = [] self.preserve_whitespace_tag_stack = []
self.pushTag(self) self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs): def new_tag(self, name, namespace=None, nsprefix=None, attrs={},
sourceline=None, sourcepos=None, **kwattrs):
"""Create a new tag associated with this soup.""" """Create a new tag associated with this soup."""
kwattrs.update(attrs) kwattrs.update(attrs)
return Tag(None, self.builder, name, namespace, nsprefix, kwattrs) return self.element_classes.get(Tag, Tag)(
None, self.builder, name, namespace, nsprefix, kwattrs,
sourceline=sourceline, sourcepos=sourcepos
)
def new_string(self, s, subclass=NavigableString): def new_string(self, s, subclass=None):
"""Create a new NavigableString associated with this soup.""" """Create a new NavigableString associated with this soup."""
subclass = subclass or self.element_classes.get(
NavigableString, NavigableString
)
return subclass(s) return subclass(s)
def insert_before(self, successor): def insert_before(self, successor):
@ -388,7 +435,17 @@ class BeautifulSoup(Tag):
if tag.name in self.builder.preserve_whitespace_tags: if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag) self.preserve_whitespace_tag_stack.append(tag)
def endData(self, containerClass=NavigableString): def endData(self, containerClass=None):
# Default container is NavigableString.
containerClass = containerClass or NavigableString
# The user may want us to instantiate some alias for the
# container class.
containerClass = self.element_classes.get(
containerClass, containerClass
)
if self.current_data: if self.current_data:
current_data = u''.join(self.current_data) current_data = u''.join(self.current_data)
# If whitespace is not preserved, and this string contains # If whitespace is not preserved, and this string contains
@ -509,7 +566,8 @@ class BeautifulSoup(Tag):
return most_recently_popped return most_recently_popped
def handle_starttag(self, name, namespace, nsprefix, attrs): def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
sourcepos=None):
"""Push a start tag on to the stack. """Push a start tag on to the stack.
If this method returns None, the tag was rejected by the If this method returns None, the tag was rejected by the
@ -526,8 +584,11 @@ class BeautifulSoup(Tag):
or not self.parse_only.search_tag(name, attrs))): or not self.parse_only.search_tag(name, attrs))):
return None return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, tag = self.element_classes.get(Tag, Tag)(
self.currentTag, self._most_recent_element) self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self._most_recent_element,
sourceline=sourceline, sourcepos=sourcepos
)
if tag is None: if tag is None:
return tag return tag
if self._most_recent_element is not None: if self._most_recent_element is not None:

71
lib/bs4/builder/__init__.py

@ -7,7 +7,6 @@ import sys
from bs4.element import ( from bs4.element import (
CharsetMetaAttributeValue, CharsetMetaAttributeValue,
ContentMetaAttributeValue, ContentMetaAttributeValue,
HTMLAwareEntitySubstitution,
nonwhitespace_re nonwhitespace_re
) )
@ -90,18 +89,58 @@ class TreeBuilder(object):
is_xml = False is_xml = False
picklable = False picklable = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents. # tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or # A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA. # comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {} DEFAULT_CDATA_LIST_ATTRIBUTES = {}
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
USE_DEFAULT = object()
def __init__(self): # Most parsers don't keep track of line numbers.
TRACKS_LINE_NUMBERS = False
def __init__(self, multi_valued_attributes=USE_DEFAULT,
preserve_whitespace_tags=USE_DEFAULT,
store_line_numbers=USE_DEFAULT):
"""Constructor.
:param multi_valued_attributes: If this is set to None, the
TreeBuilder will not turn any values for attributes like
'class' into lists. Setting this to a dictionary will
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
for an example.
Internally, these are called "CDATA list attributes", but that
probably doesn't make sense to an end-user, so the argument name
is `multi_valued_attributes`.
:param preserve_whitespace_tags: A list of tags to treat
the way <pre> tags are treated in HTML. Tags in this list
will have their whitespace preserved.
:param store_line_numbers: If the parser keeps track of the
line numbers and positions of the original markup, that
information will, by default, be stored in each corresponding
`Tag` object. You can turn this off by passing
store_line_numbers=False. If the parser you're using doesn't
keep track of this information, then setting store_line_numbers=True
will do nothing.
"""
self.soup = None self.soup = None
if multi_valued_attributes is self.USE_DEFAULT:
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
self.cdata_list_attributes = multi_valued_attributes
if preserve_whitespace_tags is self.USE_DEFAULT:
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
self.preserve_whitespace_tags = preserve_whitespace_tags
if store_line_numbers == self.USE_DEFAULT:
store_line_numbers = self.TRACKS_LINE_NUMBERS
self.store_line_numbers = store_line_numbers
def initialize_soup(self, soup): def initialize_soup(self, soup):
"""The BeautifulSoup object has been initialized and is now """The BeautifulSoup object has been initialized and is now
being associated with the TreeBuilder. being associated with the TreeBuilder.
@ -131,13 +170,13 @@ class TreeBuilder(object):
if self.empty_element_tags is None: if self.empty_element_tags is None:
return True return True
return tag_name in self.empty_element_tags return tag_name in self.empty_element_tags
def feed(self, markup): def feed(self, markup):
raise NotImplementedError() raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None, def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None): document_declared_encoding=None, exclude_encodings=None):
return markup, None, None, False yield markup, None, None, False
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document. """Wrap an HTML fragment to make it look like a document.
@ -237,7 +276,6 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags. Such as which tags are empty-element tags.
""" """
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set([ empty_element_tags = set([
# These are from HTML5. # These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
@ -259,7 +297,7 @@ class HTMLTreeBuilder(TreeBuilder):
# encounter one of these attributes, we will parse its value into # encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be # a list of values if possible. Upon output, the list will be
# converted back into a string. # converted back into a string.
cdata_list_attributes = { DEFAULT_CDATA_LIST_ATTRIBUTES = {
"*" : ['class', 'accesskey', 'dropzone'], "*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'], "a" : ['rel', 'rev'],
"link" : ['rel', 'rev'], "link" : ['rel', 'rev'],
@ -276,6 +314,8 @@ class HTMLTreeBuilder(TreeBuilder):
"output" : ["for"], "output" : ["for"],
} }
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
def set_up_substitutions(self, tag): def set_up_substitutions(self, tag):
# We are only interested in <meta> tags # We are only interested in <meta> tags
if tag.name != 'meta': if tag.name != 'meta':
@ -323,8 +363,15 @@ def register_treebuilders_from(module):
this_module.builder_registry.register(obj) this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception): class ParserRejectedMarkup(Exception):
pass def __init__(self, message_or_exception):
"""Explain why the parser rejected the given markup, either
with a textual explanation or another exception.
"""
if isinstance(message_or_exception, Exception):
e = message_or_exception
message_or_exception = "%s: %s" % (e.__class__.__name__, unicode(e))
super(ParserRejectedMarkup, self).__init__(message_or_exception)
# Builders are registered in reverse order of priority, so that custom # Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml # builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only # to take precedence over html5lib, because it's faster. And we only

47
lib/bs4/builder/_html5lib.py

@ -45,6 +45,10 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
features = [NAME, PERMISSIVE, HTML_5, HTML] features = [NAME, PERMISSIVE, HTML_5, HTML]
# html5lib can tell us which line number and position in the
# original file is the source of an element.
TRACKS_LINE_NUMBERS = True
def prepare_markup(self, markup, user_specified_encoding, def prepare_markup(self, markup, user_specified_encoding,
document_declared_encoding=None, exclude_encodings=None): document_declared_encoding=None, exclude_encodings=None):
# Store the user-specified encoding for use later on. # Store the user-specified encoding for use later on.
@ -62,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
if self.soup.parse_only is not None: if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder) parser = html5lib.HTMLParser(tree=self.create_treebuilder)
self.underlying_builder.parser = parser
extra_kwargs = dict() extra_kwargs = dict()
if not isinstance(markup, unicode): if not isinstance(markup, unicode):
if new_html5lib: if new_html5lib:
@ -70,7 +74,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
else: else:
extra_kwargs['encoding'] = self.user_specified_encoding extra_kwargs['encoding'] = self.user_specified_encoding
doc = parser.parse(markup, **extra_kwargs) doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer. # Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode): if isinstance(markup, unicode):
# We need to special-case this because html5lib sets # We need to special-case this because html5lib sets
@ -84,10 +88,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
# with other tree builders. # with other tree builders.
original_encoding = original_encoding.name original_encoding = original_encoding.name
doc.original_encoding = original_encoding doc.original_encoding = original_encoding
self.underlying_builder.parser = None
def create_treebuilder(self, namespaceHTMLElements): def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib( self.underlying_builder = TreeBuilderForHtml5lib(
namespaceHTMLElements, self.soup) namespaceHTMLElements, self.soup,
store_line_numbers=self.store_line_numbers
)
return self.underlying_builder return self.underlying_builder
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):
@ -96,15 +103,26 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, soup=None): def __init__(self, namespaceHTMLElements, soup=None,
store_line_numbers=True, **kwargs):
if soup: if soup:
self.soup = soup self.soup = soup
else: else:
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
self.soup = BeautifulSoup("", "html.parser") # TODO: Why is the parser 'html.parser' here? To avoid an
# infinite loop?
self.soup = BeautifulSoup(
"", "html.parser", store_line_numbers=store_line_numbers,
**kwargs
)
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
# This will be set later to an html5lib.html5parser.HTMLParser
# object, which we can use to track the current line number.
self.parser = None
self.store_line_numbers = store_line_numbers
def documentClass(self): def documentClass(self):
self.soup.reset() self.soup.reset()
return Element(self.soup, self.soup, None) return Element(self.soup, self.soup, None)
@ -118,7 +136,16 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
self.soup.object_was_parsed(doctype) self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace): def elementClass(self, name, namespace):
tag = self.soup.new_tag(name, namespace) kwargs = {}
if self.parser and self.store_line_numbers:
# This represents the point immediately after the end of the
# tag. We don't know when the tag started, but we do know
# where it ended -- the character just before this one.
sourceline, sourcepos = self.parser.tokenizer.stream.position()
kwargs['sourceline'] = sourceline
kwargs['sourcepos'] = sourcepos-1
tag = self.soup.new_tag(name, namespace, **kwargs)
return Element(tag, self.soup, namespace) return Element(tag, self.soup, namespace)
def commentClass(self, data): def commentClass(self, data):
@ -126,6 +153,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def fragmentClass(self): def fragmentClass(self):
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
# TODO: Why is the parser 'html.parser' here? To avoid an
# infinite loop?
self.soup = BeautifulSoup("", "html.parser") self.soup = BeautifulSoup("", "html.parser")
self.soup.name = "[document_fragment]" self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None) return Element(self.soup, self.soup, None)
@ -199,7 +228,7 @@ class AttrList(object):
def __setitem__(self, name, value): def __setitem__(self, name, value):
# If this attribute is a multi-valued attribute for this element, # If this attribute is a multi-valued attribute for this element,
# turn its value into a list. # turn its value into a list.
list_attr = HTML5TreeBuilder.cdata_list_attributes list_attr = self.element.cdata_list_attributes
if (name in list_attr['*'] if (name in list_attr['*']
or (self.element.name in list_attr or (self.element.name in list_attr
and name in list_attr[self.element.name])): and name in list_attr[self.element.name])):

21
lib/bs4/builder/_htmlparser.py

@ -99,7 +99,11 @@ class BeautifulSoupHTMLParser(HTMLParser):
attr_dict[key] = value attr_dict[key] = value
attrvalue = '""' attrvalue = '""'
#print "START", name #print "START", name
tag = self.soup.handle_starttag(name, None, None, attr_dict) sourceline, sourcepos = self.getpos()
tag = self.soup.handle_starttag(
name, None, None, attr_dict, sourceline=sourceline,
sourcepos=sourcepos
)
if tag and tag.is_empty_element and handle_empty_element: if tag and tag.is_empty_element and handle_empty_element:
# Unlike other parsers, html.parser doesn't send separate end tag # Unlike other parsers, html.parser doesn't send separate end tag
# events for empty-element tags. (It's handled in # events for empty-element tags. (It's handled in
@ -214,12 +218,19 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
NAME = HTMLPARSER NAME = HTMLPARSER
features = [NAME, HTML, STRICT] features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs): # The html.parser knows which line number and position in the
# original file is the source of an element.
TRACKS_LINE_NUMBERS = True
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
parser_args = parser_args or []
parser_kwargs = parser_kwargs or {}
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
kwargs['strict'] = False parser_kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
kwargs['convert_charrefs'] = False parser_kwargs['convert_charrefs'] = False
self.parser_args = (args, kwargs) self.parser_args = (parser_args, parser_kwargs)
def prepare_markup(self, markup, user_specified_encoding=None, def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None): document_declared_encoding=None, exclude_encodings=None):

13
lib/bs4/builder/_lxml.py

@ -57,6 +57,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS) DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
# NOTE: If we parsed Element objects and looked at .sourceline,
# we'd be able to see the line numbers from the original document.
# But instead we build an XMLParser or HTMLParser object to serve
# as the target of parse messages, and those messages don't include
# line numbers.
def initialize_soup(self, soup): def initialize_soup(self, soup):
"""Let the BeautifulSoup object know about the standard namespace """Let the BeautifulSoup object know about the standard namespace
mapping. mapping.
@ -94,7 +100,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
parser = parser(target=self, strip_cdata=False, encoding=encoding) parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser return parser
def __init__(self, parser=None, empty_element_tags=None): def __init__(self, parser=None, empty_element_tags=None, **kwargs):
# TODO: Issue a warning if parser is present but not a # TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new # callable, since that means there's no way to create new
# parsers for different encodings. # parsers for different encodings.
@ -103,6 +109,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.empty_element_tags = set(empty_element_tags) self.empty_element_tags = set(empty_element_tags)
self.soup = None self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
def _getNsTag(self, tag): def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag # Split the namespace URL out of a fully-qualified lxml tag
@ -168,7 +175,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
self.parser.feed(data) self.parser.feed(data)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError), e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(e)
def close(self): def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED] self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
@ -287,7 +294,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser.feed(markup) self.parser.feed(markup)
self.parser.close() self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e: except (UnicodeDecodeError, LookupError, etree.ParserError), e:
raise ParserRejectedMarkup(str(e)) raise ParserRejectedMarkup(e)
def test_fragment_to_document(self, fragment): def test_fragment_to_document(self, fragment):

56
lib/bs4/dammit.py

@ -22,6 +22,8 @@ try:
# PyPI package: cchardet # PyPI package: cchardet
import cchardet import cchardet
def chardet_dammit(s): def chardet_dammit(s):
if isinstance(s, unicode):
return None
return cchardet.detect(s)['encoding'] return cchardet.detect(s)['encoding']
except ImportError: except ImportError:
try: try:
@ -30,6 +32,8 @@ except ImportError:
# PyPI package: chardet # PyPI package: chardet
import chardet import chardet
def chardet_dammit(s): def chardet_dammit(s):
if isinstance(s, unicode):
return None
return chardet.detect(s)['encoding'] return chardet.detect(s)['encoding']
#import chardet.constants #import chardet.constants
#chardet.constants._debug = 1 #chardet.constants._debug = 1
@ -44,10 +48,19 @@ try:
except ImportError: except ImportError:
pass pass
xml_encoding_re = re.compile( # Build bytestring and Unicode versions of regular expressions for finding
'^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I) # a declared encoding inside an XML or HTML document.
html_meta_re = re.compile( xml_encoding = u'^\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I) html_meta = u'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = dict()
encoding_res[bytes] = {
'html' : re.compile(html_meta.encode("ascii"), re.I),
'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[unicode] = {
'html' : re.compile(html_meta, re.I),
'xml' : re.compile(xml_encoding, re.I)
}
class EntitySubstitution(object): class EntitySubstitution(object):
@ -57,15 +70,24 @@ class EntitySubstitution(object):
lookup = {} lookup = {}
reverse_lookup = {} reverse_lookup = {}
characters_for_re = [] characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
# &apos is an XHTML entity and an HTML 5, but not an HTML 4
# entity. We don't want to use it, but we want to recognize it on the way in.
#
# TODO: Ideally we would be able to recognize all HTML 5 named
# entities, but that's a little tricky.
extra = [(39, 'apos')]
for codepoint, name in list(codepoint2name.items()) + extra:
character = unichr(codepoint) character = unichr(codepoint)
if codepoint != 34: if codepoint not in (34, 39):
# There's no point in turning the quotation mark into # There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which # &quot; or the single quote into &apos;, unless it
# is handled elsewhere. # happens within an attribute value, which is handled
# elsewhere.
characters_for_re.append(character) characters_for_re.append(character)
lookup[character] = name lookup[character] = name
# But we do want to turn &quot; into the quotation mark. # But we do want to recognize those entities on the way in and
# convert them to Unicode characters.
reverse_lookup[name] = character reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re) re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition) return lookup, reverse_lookup, re.compile(re_definition)
@ -310,14 +332,22 @@ class EncodingDetector:
xml_endpos = 1024 xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05)) html_endpos = max(2048, int(len(markup) * 0.05))
if isinstance(markup, bytes):
res = encoding_res[bytes]
else:
res = encoding_res[unicode]
xml_re = res['xml']
html_re = res['html']
declared_encoding = None declared_encoding = None
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html: if not declared_encoding_match and is_html:
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) declared_encoding_match = html_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None: if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode( declared_encoding = declared_encoding_match.groups()[0]
'ascii', 'replace')
if declared_encoding: if declared_encoding:
if isinstance(declared_encoding, bytes):
declared_encoding = declared_encoding.decode('ascii', 'replace')
return declared_encoding.lower() return declared_encoding.lower()
return None return None

411
lib/bs4/element.py

@ -16,7 +16,11 @@ except ImportError, e:
'The soupsieve package is not installed. CSS selectors cannot be used.' 'The soupsieve package is not installed. CSS selectors cannot be used.'
) )
from bs4.dammit import EntitySubstitution from bs4.formatter import (
Formatter,
HTMLFormatter,
XMLFormatter,
)
DEFAULT_OUTPUT_ENCODING = "utf-8" DEFAULT_OUTPUT_ENCODING = "utf-8"
PY3K = (sys.version_info[0] > 2) PY3K = (sys.version_info[0] > 2)
@ -42,6 +46,11 @@ def _alias(attr):
class NamespacedAttribute(unicode): class NamespacedAttribute(unicode):
def __new__(cls, prefix, name, namespace=None): def __new__(cls, prefix, name, namespace=None):
if not name:
# This is the default namespace. Its name "has no value"
# per https://www.w3.org/TR/xml-names/#defaulting
name = None
if name is None: if name is None:
obj = unicode.__new__(cls, prefix) obj = unicode.__new__(cls, prefix)
elif prefix is None: elif prefix is None:
@ -99,138 +108,71 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value) return self.CHARSET_RE.sub(rewrite, self.original_value)
class HTMLAwareEntitySubstitution(EntitySubstitution):
class PageElement(object):
"""Entity substitution rules that are aware of some HTML quirks. """Contains the navigational information for some part of the page
(either a tag or a piece of text)"""
Specifically, the contents of <script> and <style> tags should not
undergo entity substitution. def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
Incoming NavigableString objects are checked to see if they're the """Sets up the initial relations between this element and
direct children of a <script> or <style> tag. other elements."""
""" self.parent = parent
cdata_containing_tags = set(["script", "style"])
preformatted_tags = set(["pre"])
preserve_whitespace_tags = set(['pre', 'textarea'])
@classmethod
def _substitute_if_appropriate(cls, ns, f):
if (isinstance(ns, NavigableString)
and ns.parent is not None
and ns.parent.name in cls.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
return f(ns)
@classmethod
def substitute_html(cls, ns):
return cls._substitute_if_appropriate(
ns, EntitySubstitution.substitute_html)
@classmethod self.previous_element = previous_element
def substitute_xml(cls, ns): if previous_element is not None:
return cls._substitute_if_appropriate( self.previous_element.next_element = self
ns, EntitySubstitution.substitute_xml)
class Formatter(object): self.next_element = next_element
"""Contains information about how to format a parse tree.""" if self.next_element is not None:
self.next_element.previous_element = self
# By default, represent void elements as <tag/> rather than <tag>
void_element_close_prefix = '/'
def substitute_entities(self, *args, **kwargs):
"""Transform certain characters into named entities."""
raise NotImplementedError()
class HTMLFormatter(Formatter):
"""The default HTML formatter."""
def substitute(self, *args, **kwargs):
return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
class MinimalHTMLFormatter(Formatter):
"""A minimal HTML formatter."""
def substitute(self, *args, **kwargs):
return HTMLAwareEntitySubstitution.substitute_xml(*args, **kwargs)
class HTML5Formatter(HTMLFormatter):
"""An HTML formatter that omits the slash in a void tag."""
void_element_close_prefix = None
class XMLFormatter(Formatter): self.next_sibling = next_sibling
"""Substitute only the essential XML entities.""" if self.next_sibling is not None:
def substitute(self, *args, **kwargs): self.next_sibling.previous_sibling = self
return EntitySubstitution.substitute_xml(*args, **kwargs)
class HTMLXMLFormatter(Formatter): if (previous_sibling is None
"""Format XML using HTML rules.""" and self.parent is not None and self.parent.contents):
def substitute(self, *args, **kwargs): previous_sibling = self.parent.contents[-1]
return HTMLAwareEntitySubstitution.substitute_html(*args, **kwargs)
self.previous_sibling = previous_sibling
class PageElement(object): if previous_sibling is not None:
"""Contains the navigational information for some part of the page self.previous_sibling.next_sibling = self
(either a tag or a piece of text)"""
# There are five possible values for the "formatter" argument passed in def format_string(self, s, formatter):
# to methods like encode() and prettify():
#
# "html" - All Unicode characters with corresponding HTML entities
# are converted to those entities on output.
# "html5" - The same as "html", but empty void tags are represented as
# <tag> rather than <tag/>
# "minimal" - Bare ampersands and angle brackets are converted to
# XML entities: &amp; &lt; &gt;
# None - The null formatter. Unicode characters are never
# converted to entities. This is not recommended, but it's
# faster than "minimal".
# A callable function - it will be called on every string that needs to undergo entity substitution.
# A Formatter instance - Formatter.substitute(string) will be called on every string that
# needs to undergo entity substitution.
#
# In an HTML document, the default "html", "html5", and "minimal"
# functions will leave the contents of <script> and <style> tags
# alone. For an XML document, all tags will be given the same
# treatment.
HTML_FORMATTERS = {
"html" : HTMLFormatter(),
"html5" : HTML5Formatter(),
"minimal" : MinimalHTMLFormatter(),
None : None
}
XML_FORMATTERS = {
"html" : HTMLXMLFormatter(),
"minimal" : XMLFormatter(),
None : None
}
def format_string(self, s, formatter='minimal'):
"""Format the given string using the given formatter.""" """Format the given string using the given formatter."""
if isinstance(formatter, basestring):
formatter = self._formatter_for_name(formatter)
if formatter is None: if formatter is None:
output = s return s
else: if not isinstance(formatter, Formatter):
if isinstance(formatter, Callable): formatter = self.formatter_for_name(formatter)
# Backwards compatibility -- you used to pass in a formatting method. output = formatter.substitute(s)
output = formatter(s)
else:
output = formatter.substitute(s)
return output return output
def formatter_for_name(self, formatter):
    """Resolve `formatter` to a Formatter object.

    :param formatter: One of:
        * a Formatter object, which is returned unchanged;
        * a callable, which becomes the entity-substitution hook of
          a newly created XMLFormatter or HTMLFormatter;
        * any other value (typically a string name or None), which
          is used as a key into the REGISTRY of XMLFormatter or
          HTMLFormatter.
    :return: A Formatter object. XMLFormatter is used when
        self._is_xml is true, HTMLFormatter otherwise.

    The registry access is a plain subscript, so a key that was never
    registered raises KeyError.
    """
    if isinstance(formatter, Formatter):
        return formatter
    # Pick the formatter family that matches the kind of tree this
    # element belongs to.
    formatter_class = XMLFormatter if self._is_xml else HTMLFormatter
    if callable(formatter):
        return formatter_class(entity_substitution=formatter)
    return formatter_class.REGISTRY[formatter]
@property @property
def _is_xml(self): def _is_xml(self):
"""Is this element part of an XML tree or an HTML tree? """Is this element part of an XML tree or an HTML tree?
This is used when mapping a formatter name ("minimal") to an This is used in formatter_for_name, when deciding whether an
appropriate function (one that performs entity-substitution on XMLFormatter or HTMLFormatter is more appropriate. It can be
the contents of <script> and <style> tags, or not). It can be
inefficient, but it should be called very rarely. inefficient, but it should be called very rarely.
""" """
if self.known_xml is not None: if self.known_xml is not None:
@ -248,46 +190,13 @@ class PageElement(object):
return getattr(self, 'is_xml', False) return getattr(self, 'is_xml', False)
return self.parent._is_xml return self.parent._is_xml
def _formatter_for_name(self, name):
"Look up a formatter function based on its name and the tree."
if self._is_xml:
return self.XML_FORMATTERS.get(name, XMLFormatter())
else:
return self.HTML_FORMATTERS.get(name, HTMLFormatter())
def setup(self, parent=None, previous_element=None, next_element=None,
previous_sibling=None, next_sibling=None):
"""Sets up the initial relations between this element and
other elements."""
self.parent = parent
self.previous_element = previous_element
if previous_element is not None:
self.previous_element.next_element = self
self.next_element = next_element
if self.next_element is not None:
self.next_element.previous_element = self
self.next_sibling = next_sibling
if self.next_sibling is not None:
self.next_sibling.previous_sibling = self
if (previous_sibling is None
and self.parent is not None and self.parent.contents):
previous_sibling = self.parent.contents[-1]
self.previous_sibling = previous_sibling
if previous_sibling is not None:
self.previous_sibling.next_sibling = self
nextSibling = _alias("next_sibling") # BS3 nextSibling = _alias("next_sibling") # BS3
previousSibling = _alias("previous_sibling") # BS3 previousSibling = _alias("previous_sibling") # BS3
def replace_with(self, replace_with): def replace_with(self, replace_with):
if self.parent is None: if self.parent is None:
raise ValueError( raise ValueError(
"Cannot replace one element with another when the" "Cannot replace one element with another when the "
"element to be replaced is not part of a tree.") "element to be replaced is not part of a tree.")
if replace_with is self: if replace_with is self:
return return
@ -742,6 +651,7 @@ class NavigableString(unicode, PageElement):
self.__class__.__name__, attr)) self.__class__.__name__, attr))
def output_ready(self, formatter="minimal"): def output_ready(self, formatter="minimal"):
"""Run the string through the provided formatter."""
output = self.format_string(self, formatter) output = self.format_string(self, formatter)
return self.PREFIX + output + self.SUFFIX return self.PREFIX + output + self.SUFFIX
@ -760,10 +670,12 @@ class PreformattedString(NavigableString):
but the return value will be ignored. but the return value will be ignored.
""" """
def output_ready(self, formatter="minimal"): def output_ready(self, formatter=None):
"""CData strings are passed into the formatter. """CData strings are passed into the formatter, purely
But the return value is ignored.""" for any side effects. The return value is ignored.
self.format_string(self, formatter) """
if formatter is not None:
ignore = self.format_string(self, formatter)
return self.PREFIX + self + self.SUFFIX return self.PREFIX + self + self.SUFFIX
class CData(PreformattedString): class CData(PreformattedString):
@ -817,7 +729,10 @@ class Tag(PageElement):
def __init__(self, parser=None, builder=None, name=None, namespace=None, def __init__(self, parser=None, builder=None, name=None, namespace=None,
prefix=None, attrs=None, parent=None, previous=None, prefix=None, attrs=None, parent=None, previous=None,
is_xml=None): is_xml=None, sourceline=None, sourcepos=None,
can_be_empty_element=None, cdata_list_attributes=None,
preserve_whitespace_tags=None
):
"Basic constructor." "Basic constructor."
if parser is None: if parser is None:
@ -831,14 +746,10 @@ class Tag(PageElement):
self.name = name self.name = name
self.namespace = namespace self.namespace = namespace
self.prefix = prefix self.prefix = prefix
if builder is not None: if ((not builder or builder.store_line_numbers)
preserve_whitespace_tags = builder.preserve_whitespace_tags and (sourceline is not None or sourcepos is not None)):
else: self.sourceline = sourceline
if is_xml: self.sourcepos = sourcepos
preserve_whitespace_tags = []
else:
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
self.preserve_whitespace_tags = preserve_whitespace_tags
if attrs is None: if attrs is None:
attrs = {} attrs = {}
elif attrs: elif attrs:
@ -861,12 +772,33 @@ class Tag(PageElement):
self.setup(parent, previous) self.setup(parent, previous)
self.hidden = False self.hidden = False
# Set up any substitutions, such as the charset in a META tag. if builder is None:
if builder is not None: # In the absence of a TreeBuilder, use whatever values were
# passed in here. They're probably None, unless this is a copy of some
# other tag.
self.can_be_empty_element = can_be_empty_element
self.cdata_list_attributes = cdata_list_attributes
self.preserve_whitespace_tags = preserve_whitespace_tags
else:
# Set up any substitutions for this tag, such as the charset in a META tag.
builder.set_up_substitutions(self) builder.set_up_substitutions(self)
# Ask the TreeBuilder whether this tag might be an empty-element tag.
self.can_be_empty_element = builder.can_be_empty_element(name) self.can_be_empty_element = builder.can_be_empty_element(name)
else:
self.can_be_empty_element = False # Keep track of the list of attributes of this tag that
# might need to be treated as a list.
#
# For performance reasons, we store the whole data structure
# rather than asking the question of every tag. Asking would
# require building a new data structure every time, and
# (unlike can_be_empty_element), we almost never need
# to check this.
self.cdata_list_attributes = builder.cdata_list_attributes
# Keep track of the names that might cause this tag to be treated as a
# whitespace-preserved tag.
self.preserve_whitespace_tags = builder.preserve_whitespace_tags
parserClass = _alias("parser_class") # BS3 parserClass = _alias("parser_class") # BS3
@ -874,8 +806,14 @@ class Tag(PageElement):
"""A copy of a Tag is a new Tag, unconnected to the parse tree. """A copy of a Tag is a new Tag, unconnected to the parse tree.
Its contents are a copy of the old Tag's contents. Its contents are a copy of the old Tag's contents.
""" """
clone = type(self)(None, self.builder, self.name, self.namespace, clone = type(self)(
self.prefix, self.attrs, is_xml=self._is_xml) None, self.builder, self.name, self.namespace,
self.prefix, self.attrs, is_xml=self._is_xml,
sourceline=self.sourceline, sourcepos=self.sourcepos,
can_be_empty_element=self.can_be_empty_element,
cdata_list_attributes=self.cdata_list_attributes,
preserve_whitespace_tags=self.preserve_whitespace_tags
)
for attr in ('can_be_empty_element', 'hidden'): for attr in ('can_be_empty_element', 'hidden'):
setattr(clone, attr, getattr(self, attr)) setattr(clone, attr, getattr(self, attr))
for child in self.contents: for child in self.contents:
@ -981,6 +919,43 @@ class Tag(PageElement):
for element in self.contents[:]: for element in self.contents[:]:
element.extract() element.extract()
def smooth(self):
    """Consolidate runs of adjacent strings among this element's children.

    Child Tags are smoothed recursively. Two neighbouring children
    are merged into a single NavigableString when both are
    NavigableStrings and neither is a PreformattedString. This makes
    pretty-printed output look more natural after a lot of
    operations that modified the tree.
    """
    def is_plain_string(node):
        # A NavigableString that is not one of the preformatted kinds.
        return (isinstance(node, NavigableString)
                and not isinstance(node, PreformattedString))

    # Record the index of the first member of every adjacent pair
    # that has to be merged, instead of copying self.contents; in
    # most cases very few strings are affected.
    merge_starts = []
    for position, child in enumerate(self.contents):
        if isinstance(child, Tag):
            # Recursively smooth children.
            child.smooth()
        if position + 1 == len(self.contents):
            # Last child: it has no right-hand neighbour to merge with.
            continue
        follower = self.contents[position + 1]
        if is_plain_string(child) and is_plain_string(follower):
            merge_starts.append(position)

    # Walk the recorded positions back to front, so that removing
    # items from .contents does not shift the positions that are
    # still waiting to be processed.
    for position in reversed(merge_starts):
        first = self.contents[position]
        second = self.contents[position + 1]
        second.extract()
        first.replace_with(NavigableString(first + second))
def index(self, element): def index(self, element):
""" """
Find the index of a child by identity, not value. Avoids issues with Find the index of a child by identity, not value. Avoids issues with
@ -1115,14 +1090,6 @@ class Tag(PageElement):
u = self.decode(indent_level, encoding, formatter) u = self.decode(indent_level, encoding, formatter)
return u.encode(encoding, errors) return u.encode(encoding, errors)
def _should_pretty_print(self, indent_level):
"""Should this tag be pretty-printed?"""
return (
indent_level is not None
and self.name not in self.preserve_whitespace_tags
)
def decode(self, indent_level=None, def decode(self, indent_level=None,
eventual_encoding=DEFAULT_OUTPUT_ENCODING, eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"): formatter="minimal"):
@ -1136,30 +1103,32 @@ class Tag(PageElement):
encoding. encoding.
""" """
# First off, turn a string formatter into a Formatter object. This # First off, turn a non-Formatter `formatter` into a Formatter
# will stop the lookup from happening over and over again. # object. This will stop the lookup from happening over and
if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable): # over again.
formatter = self._formatter_for_name(formatter) if not isinstance(formatter, Formatter):
formatter = self.formatter_for_name(formatter)
attributes = formatter.attributes(self)
attrs = [] attrs = []
if self.attrs: for key, val in attributes:
for key, val in sorted(self.attrs.items()): if val is None:
if val is None: decoded = key
decoded = key else:
else: if isinstance(val, list) or isinstance(val, tuple):
if isinstance(val, list) or isinstance(val, tuple): val = ' '.join(val)
val = ' '.join(val) elif not isinstance(val, basestring):
elif not isinstance(val, basestring): val = unicode(val)
val = unicode(val) elif (
elif (
isinstance(val, AttributeValueWithCharsetSubstitution) isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None): and eventual_encoding is not None
val = val.encode(eventual_encoding) ):
val = val.encode(eventual_encoding)
text = self.format_string(val, formatter)
decoded = ( text = formatter.attribute_value(val)
unicode(key) + '=' decoded = (
+ EntitySubstitution.quoted_attribute_value(text)) unicode(key) + '='
attrs.append(decoded) + formatter.quoted_attribute_value(text))
attrs.append(decoded)
close = '' close = ''
closeTag = '' closeTag = ''
@ -1168,9 +1137,7 @@ class Tag(PageElement):
prefix = self.prefix + ":" prefix = self.prefix + ":"
if self.is_empty_element: if self.is_empty_element:
close = '' close = formatter.void_element_close_prefix or ''
if isinstance(formatter, Formatter):
close = formatter.void_element_close_prefix or close
else: else:
closeTag = '</%s%s>' % (prefix, self.name) closeTag = '</%s%s>' % (prefix, self.name)
@ -1185,7 +1152,8 @@ class Tag(PageElement):
else: else:
indent_contents = None indent_contents = None
contents = self.decode_contents( contents = self.decode_contents(
indent_contents, eventual_encoding, formatter) indent_contents, eventual_encoding, formatter
)
if self.hidden: if self.hidden:
# This is the 'document root' object. # This is the 'document root' object.
@ -1217,6 +1185,16 @@ class Tag(PageElement):
s = ''.join(s) s = ''.join(s)
return s return s
def _should_pretty_print(self, indent_level):
    """Decide whether this tag should be pretty-printed.

    :param indent_level: The requested indentation level; None means
        that no pretty-printing was requested.
    :return: False when indent_level is None. Otherwise True, unless
        self.preserve_whitespace_tags is non-empty and contains
        self.name.
    """
    if indent_level is None:
        return False
    preserved = self.preserve_whitespace_tags
    if not preserved:
        # No whitespace-preserving tag names are known (None or an
        # empty collection), so nothing prevents pretty-printing.
        return True
    return self.name not in preserved
def prettify(self, encoding=None, formatter="minimal"): def prettify(self, encoding=None, formatter="minimal"):
if encoding is None: if encoding is None:
return self.decode(True, formatter=formatter) return self.decode(True, formatter=formatter)
@ -1232,19 +1210,19 @@ class Tag(PageElement):
indented this many spaces. indented this many spaces.
:param eventual_encoding: The tag is destined to be :param eventual_encoding: The tag is destined to be
encoded into this encoding. This method is _not_ encoded into this encoding. decode_contents() is _not_
responsible for performing that encoding. This information responsible for performing that encoding. This information
is passed in so that it can be substituted in if the is passed in so that it can be substituted in if the
document contains a <META> tag that mentions the document's document contains a <META> tag that mentions the document's
encoding. encoding.
:param formatter: The output formatter responsible for converting :param formatter: A Formatter object, or a string naming one of
entities to Unicode characters. the standard Formatters.
""" """
# First off, turn a string formatter into a Formatter object. This # First off, turn a string formatter into a Formatter object. This
# will stop the lookup from happening over and over again. # will stop the lookup from happening over and over again.
if not isinstance(formatter, Formatter) and not isinstance(formatter, Callable): if not isinstance(formatter, Formatter):
formatter = self._formatter_for_name(formatter) formatter = self.formatter_for_name(formatter)
pretty_print = (indent_level is not None) pretty_print = (indent_level is not None)
s = [] s = []
@ -1255,16 +1233,19 @@ class Tag(PageElement):
elif isinstance(c, Tag): elif isinstance(c, Tag):
s.append(c.decode(indent_level, eventual_encoding, s.append(c.decode(indent_level, eventual_encoding,
formatter)) formatter))
if text and indent_level and not self.name == 'pre': preserve_whitespace = (
self.preserve_whitespace_tags and self.name in self.preserve_whitespace_tags
)
if text and indent_level and not preserve_whitespace:
text = text.strip() text = text.strip()
if text: if text:
if pretty_print and not self.name == 'pre': if pretty_print and not preserve_whitespace:
s.append(" " * (indent_level - 1)) s.append(" " * (indent_level - 1))
s.append(text) s.append(text)
if pretty_print and not self.name == 'pre': if pretty_print and not preserve_whitespace:
s.append("\n") s.append("\n")
return ''.join(s) return ''.join(s)
def encode_contents( def encode_contents(
self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING, self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"): formatter="minimal"):

99
lib/bs4/formatter.py

@ -0,0 +1,99 @@
from bs4.dammit import EntitySubstitution
class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.
    """
    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    # Values accepted for the `language` constructor argument.
    HTML = 'html'
    XML = 'xml'

    # Per-keyword defaults applied when the language is not XML.
    HTML_DEFAULTS = dict(
        cdata_containing_tags=set(["script", "style"]),
    )

    def _default(self, language, value, kwarg):
        """Pick the effective value for a constructor keyword.

        :param language: The language passed to the constructor.
        :param value: The value the caller supplied; used as-is
            unless it is None.
        :param kwarg: The keyword's name, used as the key into
            HTML_DEFAULTS.
        :return: `value` if it is not None; otherwise a new empty set
            when `language` is Formatter.XML, or the HTML_DEFAULTS
            entry for `kwarg` for any other language.
        """
        if value is not None:
            return value
        if language == self.XML:
            return set()
        # NOTE: this is the class-level object itself, not a copy.
        return self.HTML_DEFAULTS[kwarg]

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
    ):
        """Constructor.

        :param language: Formatter.HTML or Formatter.XML; used to
            choose defaults for arguments left as None.
        :param entity_substitution: A function to call on strings
            that need entity substitution. If it is false-y,
            substitute() returns strings unchanged.
        :param void_element_close_prefix: By default, represent void
            elements as <tag/> rather than <tag>
        :param cdata_containing_tags: Names of tags whose direct
            string children substitute() leaves untouched. When None,
            the default for `language` is used.
        """
        self.language = language
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )

    def substitute(self, ns):
        """Process a string that needs to undergo entity substitution.

        :param ns: The string to process.
        :return: `ns` unchanged when no entity_substitution function
            is set, or when `ns` is a NavigableString whose parent's
            name is in cdata_containing_tags; otherwise the result of
            calling entity_substitution on `ns`.
        """
        if not self.entity_substitution:
            return ns
        # Imported here rather than at module level because
        # bs4.element itself imports from this module. The
        # package-qualified form matches the other bs4 imports and
        # does not rely on Python 2 implicit relative imports.
        from bs4.element import NavigableString
        if (isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags):
            # Do nothing.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value):
        """Process the value of an attribute.

        :param value: The attribute value.
        :return: The result of passing `value` through substitute().
        """
        return self.substitute(value)

    def attributes(self, tag):
        """Reorder a tag's attributes however you want.

        :param tag: An object whose `attrs` is a dictionary.
        :return: The (key, value) pairs of tag.attrs as a sorted
            list. Subclasses can override this to change the order.
        """
        return sorted(tag.attrs.items())
class HTMLFormatter(Formatter):
    """A Formatter whose language is always Formatter.HTML."""

    # Maps a formatter name to an HTMLFormatter instance; filled in
    # at module level.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Forward all arguments to Formatter, with HTML as the language."""
        super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
    """A Formatter whose language is always Formatter.XML."""

    # Maps a formatter name to a formatter instance; filled in at
    # module level.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Forward all arguments to Formatter, with XML as the language."""
        super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
# The null XML formatter is a real XMLFormatter, like its siblings.
# Wrapping one Formatter in another would pass a Formatter instance
# as the `language` argument, so the XML defaults would not apply.
XMLFormatter.REGISTRY[None] = XMLFormatter(
    entity_substitution=None
)
Loading…
Cancel
Save