115 changed files with 22575 additions and 2880 deletions
@@ -0,0 +1,355 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/

Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.

Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.

For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.0"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import re
import warnings

from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
    CData,
    Comment,
    DEFAULT_OUTPUT_ENCODING,
    Declaration,
    Doctype,
    NavigableString,
    PageElement,
    ProcessingInstruction,
    ResultSet,
    SoupStrainer,
    Tag,
    )

# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'

class BeautifulSoup(Tag):
    """
    This class defines the basic interface called by the tree builders.

    These methods will be called by the parser:
      reset()
      feed(markup)

    The tree builder may call these methods from its feed() implementation:
      handle_starttag(name, attrs) # See note about return value
      handle_endtag(name)
      handle_data(data) # Appends to the current data node
      endData(containerClass=NavigableString) # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """
    ROOT_TAG_NAME = u'[document]'

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = ['html', 'fast']

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone.
    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", features=None, builder=None,
                 parse_only=None, from_encoding=None, **kwargs):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser."""

        if 'convertEntities' in kwargs:
            warnings.warn(
                "BS4 does not respect the convertEntities argument to the "
                "BeautifulSoup constructor. Entities are always converted "
                "to Unicode characters.")

        if 'markupMassage' in kwargs:
            del kwargs['markupMassage']
            warnings.warn(
                "BS4 does not respect the markupMassage argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for any necessary markup massage.")

        if 'smartQuotesTo' in kwargs:
            del kwargs['smartQuotesTo']
            warnings.warn(
                "BS4 does not respect the smartQuotesTo argument to the "
                "BeautifulSoup constructor. Smart quotes are always converted "
                "to Unicode characters.")

        if 'selfClosingTags' in kwargs:
            del kwargs['selfClosingTags']
            warnings.warn(
                "BS4 does not respect the selfClosingTags argument to the "
                "BeautifulSoup constructor. The tree builder is responsible "
                "for understanding self-closing tags.")

        if 'isHTML' in kwargs:
            del kwargs['isHTML']
            warnings.warn(
                "BS4 does not respect the isHTML argument to the "
                "BeautifulSoup constructor. You can pass in features='html' "
                "or features='xml' to get a builder capable of handling "
                "one or the other.")

        def deprecated_argument(old_name, new_name):
            if old_name in kwargs:
                warnings.warn(
                    'The "%s" argument to the BeautifulSoup constructor '
                    'has been renamed to "%s."' % (old_name, new_name))
                value = kwargs[old_name]
                del kwargs[old_name]
                return value
            return None

        parse_only = parse_only or deprecated_argument(
            "parseOnlyThese", "parse_only")

        from_encoding = from_encoding or deprecated_argument(
            "fromEncoding", "from_encoding")

        if len(kwargs) > 0:
            arg = kwargs.keys().pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)

        if builder is None:
            if isinstance(features, basestring):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
            builder_class = builder_registry.lookup(*features)
            if builder_class is None:
                raise ValueError(
                    "Couldn't find a tree builder with the features you "
                    "requested: %s. Do you need to install a parser library?"
                    % ",".join(features))
            builder = builder_class()
        self.builder = builder
        self.is_xml = builder.is_xml
        self.builder.soup = self

        self.parse_only = parse_only

        self.reset()

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) = (
            self.builder.prepare_markup(markup, from_encoding))

        try:
            self._feed()
        except StopParsing:
            pass

        # Clear out the markup and remove the builder's circular
        # reference to this object.
        self.markup = None
        self.builder.soup = None

    def _feed(self):
        # Convert the document to Unicode.
        self.builder.reset()

        self.builder.feed(self.markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def reset(self):
        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
        self.hidden = 1
        self.builder.reset()
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.pushTag(self)

    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
        """Create a new tag associated with this soup."""
        return Tag(None, self.builder, name, namespace, nsprefix, attrs)

    def new_string(self, s):
        """Create a new NavigableString associated with this soup."""
        navigable = NavigableString(s)
        navigable.setup()
        return navigable

    def insert_before(self, successor):
        raise ValueError("BeautifulSoup objects don't support insert_before().")

    def insert_after(self, successor):
        raise ValueError("BeautifulSoup objects don't support insert_after().")

    def popTag(self):
        tag = self.tagStack.pop()
        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.builder.preserve_whitespace_tags)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            if self.parse_only and len(self.tagStack) <= 1 and \
                   (not self.parse_only.text or \
                    not self.parse_only.search(currentData)):
                return
            o = containerClass(currentData)
            self.object_was_parsed(o)

    def object_was_parsed(self, o):
        """Add an object to the parse tree."""
        o.setup(self.currentTag, self.previous_element)
        if self.previous_element:
            self.previous_element.next_element = o
        self.previous_element = o
        self.currentTag.contents.append(o)

    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            return

        numPops = 0
        mostRecentTag = None

        for i in range(len(self.tagStack) - 1, 0, -1):
            if (name == self.tagStack[i].name
                and nsprefix == self.tagStack[i].nsprefix):
                numPops = len(self.tagStack) - i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def handle_starttag(self, name, namespace, nsprefix, attrs):
        """Push a start tag on to the stack.

        If this method returns None, the tag was rejected by the
        SoupStrainer. You should proceed as if the tag had not occurred
        in the document. For instance, if this was a self-closing tag,
        don't call handle_endtag.
        """

        # print "Start tag %s: %s" % (name, attrs)
        self.endData()

        if (self.parse_only and len(self.tagStack) <= 1
            and (self.parse_only.text
                 or not self.parse_only.search_tag(name, attrs))):
            return None

        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
                  self.currentTag, self.previous_element)
        if tag is None:
            return tag
        if self.previous_element:
            self.previous_element.next_element = tag
        self.previous_element = tag
        self.pushTag(tag)
        return tag

    def handle_endtag(self, name, nsprefix=None):
        #print "End tag: " + name
        self.endData()
        self._popToTag(name, nsprefix)

    def handle_data(self, data):
        self.currentData.append(data)

    def decode(self, pretty_print=False,
               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
               formatter="minimal"):
        """Returns a string or Unicode representation of this document.
        To get Unicode, pass None for encoding."""

        if self.is_xml:
            # Print the XML declaration
            encoding_part = ''
            if eventual_encoding is not None:
                encoding_part = ' encoding="%s"' % eventual_encoding
            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
        else:
            prefix = u''
        if not pretty_print:
            indent_level = None
        else:
            indent_level = 0
        return prefix + super(BeautifulSoup, self).decode(
            indent_level, eventual_encoding, formatter)


class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        kwargs['features'] = 'xml'
        warnings.warn(
            'The BeautifulStoneSoup class is deprecated. Instead of using '
            'it, pass features="xml" into the BeautifulSoup constructor.')
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)


class StopParsing(Exception):
    pass


# By default, act as an HTML pretty-printer.
if __name__ == '__main__':
    import sys
    soup = BeautifulSoup(sys.stdin)
    print soup.prettify()
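
# A minimal usage sketch (not part of the module): parse a string and
# navigate the resulting tree.
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup('<html><body><b>bold</b> text</body></html>')
#     print soup.b.string     # u'bold'
#     print soup.prettify()   # re-indented markup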
@@ -0,0 +1,307 @@
from collections import defaultdict
import itertools
import sys
from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
    whitespace_re
    )

__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
    ]

# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'


class TreeBuilderRegistry(object):

    def __init__(self):
        self.builders_for_feature = defaultdict(list)
        self.builders = []

    def register(self, treebuilder_class):
        """Register a treebuilder based on its advertised features."""
        for feature in treebuilder_class.features:
            self.builders_for_feature[feature].insert(0, treebuilder_class)
        self.builders.insert(0, treebuilder_class)

    def lookup(self, *features):
        if len(self.builders) == 0:
            # There are no builders at all.
            return None

        if len(features) == 0:
            # They didn't ask for any features. Give them the most
            # recently registered builder.
            return self.builders[0]

        # Go down the list of features in order, and eliminate any builders
        # that don't match every feature.
        features = list(features)
        features.reverse()
        candidates = None
        candidate_set = None
        while len(features) > 0:
            feature = features.pop()
            we_have_the_feature = self.builders_for_feature.get(feature, [])
            if len(we_have_the_feature) > 0:
                if candidates is None:
                    candidates = we_have_the_feature
                    candidate_set = set(candidates)
                else:
                    # Eliminate any candidates that don't have this feature.
                    candidate_set = candidate_set.intersection(
                        set(we_have_the_feature))

        # The only valid candidates are the ones in candidate_set.
        # Go through the original list of candidates and pick the first one
        # that's in candidate_set.
        if candidate_set is None:
            return None
        for candidate in candidates:
            if candidate in candidate_set:
                return candidate
        return None

# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()

class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

    features = []

    is_xml = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.

    # A value for these tag/attribute combinations is a space- or
    # comma-separated list of CDATA, rather than a single CDATA.
    cdata_list_attributes = {}


    def __init__(self):
        self.soup = None

    def reset(self):
        pass

    def can_be_empty_element(self, tag_name):
        """Might a tag with this name be an empty-element tag?

        The final markup may or may not actually present this tag as
        self-closing.

        For instance: an HTMLBuilder does not consider a <p> tag to be
        an empty-element tag (it's not in
        HTMLBuilder.empty_element_tags). This means an empty <p> tag
        will be presented as "<p></p>", not "<p />".

        The default implementation has no opinion about which tags are
        empty-element tags, so a tag will be presented as an
        empty-element tag if and only if it has no contents.
        "<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
        be left alone.
        """
        if self.empty_element_tags is None:
            return True
        return tag_name in self.empty_element_tags

    def feed(self, markup):
        raise NotImplementedError()

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        return markup, None, None, False

    def test_fragment_to_document(self, fragment):
        """Wrap an HTML fragment to make it look like a document.

        Different parsers do this differently. For instance, lxml
        introduces an empty <head> tag, and html5lib
        doesn't. Abstracting this away lets us write simple tests
        which run HTML fragments through the parser and compare the
        results against other HTML fragments.

        This method should not be used outside of tests.
        """
        return fragment

    def set_up_substitutions(self, tag):
        return False

    def _replace_cdata_list_attribute_values(self, tag_name, attrs):
        """Replaces class="foo bar" with class=["foo", "bar"]

        Modifies its input in place.
        """
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
                tag_name.lower(), [])
            for cdata_list_attr in itertools.chain(universal, tag_specific):
                if cdata_list_attr in dict(attrs):
                    # Basically, we have a "class" attribute whose
                    # value is a whitespace-separated list of CSS
                    # classes. Split it into a list.
                    value = attrs[cdata_list_attr]
                    values = whitespace_re.split(value)
                    attrs[cdata_list_attr] = values
        return attrs
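
    # Illustrative sketch (not from the original source): with a builder
    # whose cdata_list_attributes maps '*' to ['class'], the attribute
    # dict is rewritten in place:
    #
    #     attrs = {'class': 'foo bar'}
    #     builder._replace_cdata_list_attribute_values('p', attrs)
    #     # attrs is now {'class': ['foo', 'bar']}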

class SAXTreeBuilder(TreeBuilder):
    """A Beautiful Soup treebuilder that listens for SAX events."""

    def feed(self, markup):
        raise NotImplementedError()

    def close(self):
        pass

    def startElement(self, name, attrs):
        attrs = dict((key[1], value) for key, value in list(attrs.items()))
        #print "Start %s, %r" % (name, attrs)
        self.soup.handle_starttag(name, attrs)

    def endElement(self, name):
        #print "End %s" % name
        self.soup.handle_endtag(name)

    def startElementNS(self, nsTuple, nodeName, attrs):
        # Throw away (ns, nodeName) for now.
        self.startElement(nodeName, attrs)

    def endElementNS(self, nsTuple, nodeName):
        # Throw away (ns, nodeName) for now.
        self.endElement(nodeName)
        #handler.endElementNS((ns, node.nodeName), node.nodeName)

    def startPrefixMapping(self, prefix, nodeValue):
        # Ignore the prefix for now.
        pass

    def endPrefixMapping(self, prefix):
        # Ignore the prefix for now.
        # handler.endPrefixMapping(prefix)
        pass

    def characters(self, content):
        self.soup.handle_data(content)

    def startDocument(self):
        pass

    def endDocument(self):
        pass


class HTMLTreeBuilder(TreeBuilder):
    """This TreeBuilder knows facts about HTML.

    Such as which tags are empty-element tags.
    """

    preserve_whitespace_tags = set(['pre', 'textarea'])
    empty_element_tags = set(['br', 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

    # The HTML standard defines these attributes as containing a
    # space-separated list of values, not a single value. That is,
    # class="foo bar" means that the 'class' attribute has two values,
    # 'foo' and 'bar', not the single value 'foo bar'. When we
    # encounter one of these attributes, we will parse its value into
    # a list of values if possible. Upon output, the list will be
    # converted back into a string.
    cdata_list_attributes = {
        "*" : ['class', 'accesskey', 'dropzone'],
        "a" : ['rel', 'rev'],
        "link" : ['rel', 'rev'],
        "td" : ["headers"],
        "th" : ["headers"],
        "form" : ["accept-charset"],
        "object" : ["archive"],

        # These are HTML5 specific, as are *.accesskey and *.dropzone above.
        "area" : ["rel"],
        "icon" : ["sizes"],
        "iframe" : ["sandbox"],
        "output" : ["for"],
        }

    def set_up_substitutions(self, tag):
        # We are only interested in <meta> tags
        if tag.name != 'meta':
            return False

        http_equiv = tag.get('http-equiv')
        content = tag.get('content')
        charset = tag.get('charset')

        # We are interested in <meta> tags that say what encoding the
        # document was originally in. This means HTML 5-style <meta>
        # tags that provide the "charset" attribute. It also means
        # HTML 4-style <meta> tags that provide the "content"
        # attribute and have "http-equiv" set to "content-type".
        #
        # In both cases we will replace the value of the appropriate
        # attribute with a standin object that can take on any
        # encoding.
        meta_encoding = None
        if charset is not None:
            # HTML 5 style:
            # <meta charset="utf8">
            meta_encoding = charset
            tag['charset'] = CharsetMetaAttributeValue(charset)

        elif (content is not None and http_equiv is not None
              and http_equiv.lower() == 'content-type'):
            # HTML 4 style:
            # <meta http-equiv="content-type" content="text/html; charset=utf8">
            tag['content'] = ContentMetaAttributeValue(content)

        return (meta_encoding is not None)


def register_treebuilders_from(module):
    """Copy TreeBuilders from the given module into this module."""
    # I'm fairly sure this is not the best way to do this.
    this_module = sys.modules['bs4.builder']
    for name in module.__all__:
        obj = getattr(module, name)

        if issubclass(obj, TreeBuilder):
            setattr(this_module, name, obj)
            this_module.__all__.append(name)
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)

# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass
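
# Illustrative sketch (not part of the module): BeautifulSoup resolves
# feature lists through this registry. Which class comes back depends on
# what's installed.
#
#     from bs4.builder import builder_registry
#     cls = builder_registry.lookup('html', 'fast')
#     if cls is not None:
#         builder = cls()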
@@ -0,0 +1,222 @@
__all__ = [
    'HTML5TreeBuilder',
    ]

import warnings
from bs4.builder import (
    PERMISSIVE,
    HTML,
    HTML_5,
    HTMLTreeBuilder,
    )
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
    Comment,
    Doctype,
    NavigableString,
    Tag,
    )

class HTML5TreeBuilder(HTMLTreeBuilder):
    """Use html5lib to build a tree."""

    features = ['html5lib', PERMISSIVE, HTML_5, HTML]

    def prepare_markup(self, markup, user_specified_encoding):
        # Store the user-specified encoding for use later on.
        self.user_specified_encoding = user_specified_encoding
        return markup, None, None, False

    # These methods are defined by Beautiful Soup.
    def feed(self, markup):
        if self.soup.parse_only is not None:
            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
        doc = parser.parse(markup, encoding=self.user_specified_encoding)

        # Set the character encoding detected by the tokenizer.
        if isinstance(markup, unicode):
            # We need to special-case this because html5lib sets
            # charEncoding to UTF-8 if it gets Unicode input.
            doc.original_encoding = None
        else:
            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]

    def create_treebuilder(self, namespaceHTMLElements):
        self.underlying_builder = TreeBuilderForHtml5lib(
            self.soup, namespaceHTMLElements)
        return self.underlying_builder

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><head></head><body>%s</body></html>' % fragment


class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):

    def __init__(self, soup, namespaceHTMLElements):
        self.soup = soup
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        # Import here to avoid a circular import at module load time.
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element

class AttrList(object):
    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)
    def __iter__(self):
        return list(self.attrs.items()).__iter__()
    def __setitem__(self, name, value):
        self.element[name] = value
    def items(self):
        return list(self.attrs.items())
    def keys(self):
        return list(self.attrs.keys())
    def __len__(self):
        return len(self.attrs)
    def __getitem__(self, name):
        return self.attrs[name]
    def __contains__(self, name):
        return name in list(self.attrs.keys())


class Element(html5lib.treebuilders._base.Node):
    def __init__(self, element, soup, namespace):
        html5lib.treebuilders._base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # XXX This has O(n^2) performance, for input like
            # "a</a>a</a>a</a>..."
            old_element = self.element.contents[-1]
            new_element = self.soup.new_string(old_element + node.element)
            old_element.replace_with(new_element)
        else:
            self.element.append(node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes is not None and len(attributes) > 0:

            for name, value in list(attributes.items()):
                if isinstance(name, tuple):
                    new_name = NamespacedAttribute(*name)
                    del attributes[name]
                    attributes[new_name] = value

            self.soup.builder._replace_cdata_list_attribute_values(
                self.name, attributes)
            for name, value in attributes.items():
                self.element[name] = value

            # The attributes may contain variables that need substitution.
            # Call set_up_substitutions manually.
            #
            # The Tag constructor called this method when the Tag was created,
            # but we just set/changed the attributes, so call it again.
            self.soup.builder.set_up_substitutions(self.element)
    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(self.soup.new_string(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            old_node = self.element.contents[index-1]
            new_str = self.soup.new_string(old_node + node.element)
            old_node.replace_with(new_str)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()

    def reparentChildren(self, newParent):
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(
                    Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(
                    TextNode(child, self.soup))

    def cloneNode(self):
        tag = self.soup.new_tag(self.element.name, self.namespace)
        node = Element(tag, self.soup, self.namespace)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

    def getNameTuple(self):
        if self.namespace is None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)

class TextNode(Element):
    def __init__(self, element, soup):
        html5lib.treebuilders._base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError
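
# Illustrative sketch (not part of the module): to select this builder,
# pass the 'html5lib' feature to the BeautifulSoup constructor (html5lib
# must be installed). html5lib repairs broken markup the way a browser
# would.
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup('<p>unclosed<p>tags', 'html5lib')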
@@ -0,0 +1,244 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from HTMLParser import (
    HTMLParser,
    HTMLParseError,
    )
import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
    major > 3
    or (major == 3 and minor > 2)
    or (major == 3 and minor == 2 and release >= 3))

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'

class BeautifulSoupHTMLParser(HTMLParser):
    def handle_starttag(self, name, attrs):
        # XXX namespace
        self.soup.handle_starttag(name, None, None, dict(attrs))

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)

    def handle_data(self, data):
        self.soup.handle_data(data)

    def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        else:
            real_name = int(name)

        try:
            data = unichr(real_name)
        except (ValueError, OverflowError), e:
            data = u"\N{REPLACEMENT CHARACTER}"

        self.handle_data(data)

    def handle_entityref(self, name):
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            data = "&%s;" % name
        self.handle_data(data)

    def handle_comment(self, data):
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        self.soup.endData()
        if data.endswith("?") and data.lower().startswith("xml"):
            # "An XHTML processing instruction using the trailing '?'
            # will cause the '?' to be included in data." - HTMLParser
            # docs.
            #
            # Strip the question mark so we don't end up with two
            # question marks.
            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):

    is_xml = False
    features = [HTML, STRICT, HTMLPARSER]

    def __init__(self, *args, **kwargs):
        if CONSTRUCTOR_TAKES_STRICT:
            kwargs['strict'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except HTMLParseError, e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    CONSTRUCTOR_TAKES_STRICT = True
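
# Illustrative sketch (not part of the module): to select this builder,
# pass the 'html.parser' feature to the BeautifulSoup constructor; no
# third-party parser needs to be installed.
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup('<a href="/link">text</a>', 'html.parser')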
@@ -0,0 +1,179 @@
__all__ = [
    'LXMLTreeBuilderForXML',
    'LXMLTreeBuilder',
    ]

from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
    FAST,
    HTML,
    HTMLTreeBuilder,
    PERMISSIVE,
    TreeBuilder,
    XML)
from bs4.dammit import UnicodeDammit

LXML = 'lxml'

class LXMLTreeBuilderForXML(TreeBuilder):
    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True

    # Well, it's permissive by XML parser standards.
    features = [LXML, XML, FAST, PERMISSIVE]

    CHUNK_SIZE = 512

    @property
    def default_parser(self):
        # This can either return a parser object or a class, which
        # will be instantiated with default arguments.
        return etree.XMLParser(target=self, strip_cdata=False, recover=True)

    def __init__(self, parser=None, empty_element_tags=None):
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        if parser is None:
            # Use the default parser.
            parser = self.default_parser
        if isinstance(parser, collections.Callable):
            # Instantiate the parser with default arguments
            parser = parser(target=self, strip_cdata=False)
        self.parser = parser
        self.soup = None
        self.nsmaps = None

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
            return markup, None, None, False

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
        return (dammit.markup, dammit.original_encoding,
                dammit.declared_html_encoding,
                dammit.contains_replacement_characters)

    def feed(self, markup):
        if isinstance(markup, basestring):
            markup = StringIO(markup)
        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        self.parser.feed(data)
        while data != '':
            # Now call feed() on the rest of the data, chunk by chunk.
            data = markup.read(self.CHUNK_SIZE)
            if data != '':
                self.parser.feed(data)
        self.parser.close()

    def close(self):
        self.nsmaps = None

    def start(self, name, attrs, nsmap={}):
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)

        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and self.nsmaps is not None:
            # There are no new namespaces for this tag, but namespaces
            # are in play, so we need a separate tag stack to know
            # when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.
            if self.nsmaps is None:
                self.nsmaps = []
            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
            self.nsmaps.append(inverted_nsmap)
            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in nsmap.items():
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace
        namespace, name = self._getNsTag(name)
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_starttag(name, namespace, nsprefix, attrs)

    def end(self, name):
        self.soup.endData()
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if self.nsmaps is not None:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            self.nsmaps.pop()
            if len(self.nsmaps) == 0:
                # Namespaces are no longer in play, so don't bother keeping
                # track of the namespace stack.
                self.nsmaps = None

    def pi(self, target, data):
        pass

    def data(self, content):
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment


class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):

    features = [LXML, HTML, FAST, PERMISSIVE]
    is_xml = False

    @property
    def default_parser(self):
        return etree.HTMLParser

    def feed(self, markup):
        self.parser.feed(markup)
        self.parser.close()

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return u'<html><body>%s</body></html>' % fragment
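
# Illustrative sketch (not part of the module): the 'xml' feature selects
# LXMLTreeBuilderForXML, and 'lxml' selects the HTML builder (lxml must
# be installed).
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup('<doc><a>1</a></doc>', 'xml')
#     soup = BeautifulSoup('<p>hello', 'lxml')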
@@ -0,0 +1,792 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This class forces XML data into a standard format (usually to UTF-8 or
Unicode). It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""

import codecs
from htmlentitydefs import codepoint2name
import re
import warnings

# Autodetects character encodings. Very useful.
# Download from http://chardet.feedparser.org/
# or 'apt-get install python-chardet'
# or 'easy_install chardet'
try:
    import chardet
    #import chardet.constants
    #chardet.constants._debug = 1
except ImportError:
    chardet = None

# Available from http://cjkpython.i18n.org/.
try:
    import iconv_codec
except ImportError:
    pass

xml_encoding_re = re.compile(
    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
    '<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)

class EntitySubstitution(object):

    """Substitute XML or HTML entities for the corresponding characters."""

    def _populate_class_variables():
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []
        for codepoint, name in list(codepoint2name.items()):
            character = unichr(codepoint)
            if codepoint != 34:
                # There's no point in turning the quotation mark into
                # &quot;, unless it happens within an attribute value, which
                # is handled elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to turn &quot; into the quotation mark.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           ")")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes. Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML. If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
          quoted, as befits an attribute value.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
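
# Illustrative sketch (not from the original source): the two public
# substitution entry points at work.
#
#     EntitySubstitution.substitute_xml('AT&T <test>')
#     # -> 'AT&amp;T &lt;test&gt;'
#     EntitySubstitution.substitute_html(u'caf\xe9')
#     # -> u'caf&eacute;'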


class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=[],
                 smart_quotes_to=None, is_html=False):
        self.declared_html_encoding = None
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False

        if markup == '' or isinstance(markup, unicode):
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        new_markup, document_encoding, sniffed_encoding = \
                    self._detectEncoding(markup, is_html)
        self.markup = new_markup

        u = None
        if new_markup != markup:
            # _detectEncoding modified the markup, then converted it to
            # Unicode and then to UTF-8. So convert it from UTF-8.
            u = self._convert_from("utf8")
            self.original_encoding = sniffed_encoding

        if not u:
            for proposed_encoding in (
                override_encodings + [document_encoding, sniffed_encoding]):
                if proposed_encoding is not None:
                    u = self._convert_from(proposed_encoding)
                    if u:
                        break

        # If no luck and we have an auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convert_from(chardet.detect(self.markup)['encoding'])
|||
|
|||
# As a last resort, try utf-8 and windows-1252: |
|||
if not u: |
|||
for proposed_encoding in ("utf-8", "windows-1252"): |
|||
u = self._convert_from(proposed_encoding) |
|||
if u: |
|||
break |
|||
|
|||
# As an absolute last resort, try the encodings again with |
|||
# character replacement. |
|||
if not u: |
|||
for proposed_encoding in ( |
|||
override_encodings + [ |
|||
document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): |
|||
if proposed_encoding != "ascii": |
|||
u = self._convert_from(proposed_encoding, "replace") |
|||
if u is not None: |
|||
warnings.warn( |
|||
UnicodeWarning( |
|||
"Some characters could not be decoded, and were " |
|||
"replaced with REPLACEMENT CHARACTER.")) |
|||
self.contains_replacement_characters = True |
|||
break |
|||
|
|||
# We could at this point force it to ASCII, but that would |
|||
# destroy so much data that I think giving up is better |
|||
self.unicode_markup = u |
|||
if not u: |
|||
self.original_encoding = None |
|||
|
|||
def _sub_ms_char(self, match): |
|||
"""Changes a MS smart quote character to an XML or HTML |
|||
entity, or an ASCII character.""" |
|||
orig = match.group(1) |
|||
if self.smart_quotes_to == 'ascii': |
|||
sub = self.MS_CHARS_TO_ASCII.get(orig).encode() |
|||
else: |
|||
sub = self.MS_CHARS.get(orig) |
|||
if type(sub) == tuple: |
|||
if self.smart_quotes_to == 'xml': |
|||
sub = '&#x'.encode() + sub[1].encode() + ';'.encode() |
|||
else: |
|||
sub = '&'.encode() + sub[0].encode() + ';'.encode() |
|||
else: |
|||
sub = sub.encode() |
|||
return sub |
|||
|
|||
def _convert_from(self, proposed, errors="strict"): |
|||
proposed = self.find_codec(proposed) |
|||
if not proposed or (proposed, errors) in self.tried_encodings: |
|||
return None |
|||
self.tried_encodings.append((proposed, errors)) |
|||
markup = self.markup |
|||
|
|||
# Convert smart quotes to HTML if coming from an encoding |
|||
# that might have them. |
|||
if (self.smart_quotes_to is not None |
|||
and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES): |
|||
smart_quotes_re = b"([\x80-\x9f])" |
|||
smart_quotes_compiled = re.compile(smart_quotes_re) |
|||
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup) |
|||
|
|||
try: |
|||
#print "Trying to convert document to %s (errors=%s)" % ( |
|||
# proposed, errors) |
|||
u = self._to_unicode(markup, proposed, errors) |
|||
self.markup = u |
|||
self.original_encoding = proposed |
|||
except Exception as e: |
|||
#print "That didn't work!" |
|||
#print e |
|||
return None |
|||
#print "Correct encoding: %s" % proposed |
|||
return self.markup |
|||
|
|||
def _to_unicode(self, data, encoding, errors="strict"): |
|||
'''Given a string and its encoding, decodes the string into Unicode. |
|||
%encoding is a string recognized by encodings.aliases''' |
|||
|
|||
# strip Byte Order Mark (if present) |
|||
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \ |
|||
and (data[2:4] != '\x00\x00'): |
|||
encoding = 'utf-16be' |
|||
data = data[2:] |
|||
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \ |
|||
and (data[2:4] != '\x00\x00'): |
|||
encoding = 'utf-16le' |
|||
data = data[2:] |
|||
elif data[:3] == '\xef\xbb\xbf': |
|||
encoding = 'utf-8' |
|||
data = data[3:] |
|||
elif data[:4] == '\x00\x00\xfe\xff': |
|||
encoding = 'utf-32be' |
|||
data = data[4:] |
|||
elif data[:4] == '\xff\xfe\x00\x00': |
|||
encoding = 'utf-32le' |
|||
data = data[4:] |
|||
newdata = unicode(data, encoding, errors) |
|||
return newdata |
|||
|
|||
def _detectEncoding(self, xml_data, is_html=False): |
|||
"""Given a document, tries to detect its XML encoding.""" |
|||
xml_encoding = sniffed_xml_encoding = None |
|||
try: |
|||
if xml_data[:4] == b'\x4c\x6f\xa7\x94': |
|||
# EBCDIC |
|||
xml_data = self._ebcdic_to_ascii(xml_data) |
|||
elif xml_data[:4] == b'\x00\x3c\x00\x3f': |
|||
# UTF-16BE |
|||
sniffed_xml_encoding = 'utf-16be' |
|||
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') |
|||
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \ |
|||
and (xml_data[2:4] != b'\x00\x00'): |
|||
# UTF-16BE with BOM |
|||
sniffed_xml_encoding = 'utf-16be' |
|||
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') |
|||
elif xml_data[:4] == b'\x3c\x00\x3f\x00': |
|||
# UTF-16LE |
|||
sniffed_xml_encoding = 'utf-16le' |
|||
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') |
|||
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \ |
|||
(xml_data[2:4] != b'\x00\x00'): |
|||
# UTF-16LE with BOM |
|||
sniffed_xml_encoding = 'utf-16le' |
|||
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') |
|||
elif xml_data[:4] == b'\x00\x00\x00\x3c': |
|||
# UTF-32BE |
|||
sniffed_xml_encoding = 'utf-32be' |
|||
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') |
|||
elif xml_data[:4] == b'\x3c\x00\x00\x00': |
|||
# UTF-32LE |
|||
sniffed_xml_encoding = 'utf-32le' |
|||
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') |
|||
elif xml_data[:4] == b'\x00\x00\xfe\xff': |
|||
# UTF-32BE with BOM |
|||
sniffed_xml_encoding = 'utf-32be' |
|||
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') |
|||
elif xml_data[:4] == b'\xff\xfe\x00\x00': |
|||
# UTF-32LE with BOM |
|||
sniffed_xml_encoding = 'utf-32le' |
|||
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') |
|||
elif xml_data[:3] == b'\xef\xbb\xbf': |
|||
# UTF-8 with BOM |
|||
sniffed_xml_encoding = 'utf-8' |
|||
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') |
|||
else: |
|||
sniffed_xml_encoding = 'ascii' |
|||
pass |
|||
except: |
|||
xml_encoding_match = None |
|||
xml_encoding_match = xml_encoding_re.match(xml_data) |
|||
if not xml_encoding_match and is_html: |
|||
xml_encoding_match = html_meta_re.search(xml_data) |
|||
if xml_encoding_match is not None: |
|||
xml_encoding = xml_encoding_match.groups()[0].decode( |
|||
'ascii').lower() |
|||
if is_html: |
|||
self.declared_html_encoding = xml_encoding |
|||
if sniffed_xml_encoding and \ |
|||
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', |
|||
'iso-10646-ucs-4', 'ucs-4', 'csucs4', |
|||
'utf-16', 'utf-32', 'utf_16', 'utf_32', |
|||
'utf16', 'u16')): |
|||
xml_encoding = sniffed_xml_encoding |
|||
return xml_data, xml_encoding, sniffed_xml_encoding |
|||
|
|||
def find_codec(self, charset): |
|||
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \ |
|||
or (charset and self._codec(charset.replace("-", ""))) \ |
|||
or (charset and self._codec(charset.replace("-", "_"))) \ |
|||
or charset |
|||
|
|||
def _codec(self, charset): |
|||
if not charset: |
|||
return charset |
|||
codec = None |
|||
try: |
|||
codecs.lookup(charset) |
|||
codec = charset |
|||
except (LookupError, ValueError): |
|||
pass |
|||
return codec |
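# The fallback chain in find_codec() exists because the same charset
# arrives under many spellings, while codecs.lookup() accepts only the
# ones it knows. A quick standard-library illustration:
#
#   >>> import codecs
#   >>> codecs.lookup('UTF8').name      # case and punctuation normalize
#   'utf-8'
#   >>> codecs.lookup('iso8859-8').name
#   'iso8859-8'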
|||
|
|||
EBCDIC_TO_ASCII_MAP = None |
|||
|
|||
def _ebcdic_to_ascii(self, s): |
|||
c = self.__class__ |
|||
if not c.EBCDIC_TO_ASCII_MAP: |
|||
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, |
|||
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, |
|||
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, |
|||
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, |
|||
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, |
|||
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, |
|||
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, |
|||
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, |
|||
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200, |
|||
201,202,106,107,108,109,110,111,112,113,114,203,204,205, |
|||
206,207,208,209,126,115,116,117,118,119,120,121,122,210, |
|||
211,212,213,214,215,216,217,218,219,220,221,222,223,224, |
|||
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72, |
|||
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81, |
|||
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89, |
|||
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57, |
|||
250,251,252,253,254,255) |
|||
import string |
|||
c.EBCDIC_TO_ASCII_MAP = string.maketrans( |
|||
''.join(map(chr, list(range(256)))), ''.join(map(chr, emap))) |
|||
return s.translate(c.EBCDIC_TO_ASCII_MAP) |
|||
|
|||
# A partial mapping of Windows-1252 (the \x80-\x9f "smart" range) to
# HTML entities/XML numeric entities.
|||
MS_CHARS = {b'\x80': ('euro', '20AC'), |
|||
b'\x81': ' ', |
|||
b'\x82': ('sbquo', '201A'), |
|||
b'\x83': ('fnof', '192'), |
|||
b'\x84': ('bdquo', '201E'), |
|||
b'\x85': ('hellip', '2026'), |
|||
b'\x86': ('dagger', '2020'), |
|||
b'\x87': ('Dagger', '2021'), |
|||
b'\x88': ('circ', '2C6'), |
|||
b'\x89': ('permil', '2030'), |
|||
b'\x8A': ('Scaron', '160'), |
|||
b'\x8B': ('lsaquo', '2039'), |
|||
b'\x8C': ('OElig', '152'), |
|||
b'\x8D': '?', |
|||
b'\x8E': ('#x17D', '17D'), |
|||
b'\x8F': '?', |
|||
b'\x90': '?', |
|||
b'\x91': ('lsquo', '2018'), |
|||
b'\x92': ('rsquo', '2019'), |
|||
b'\x93': ('ldquo', '201C'), |
|||
b'\x94': ('rdquo', '201D'), |
|||
b'\x95': ('bull', '2022'), |
|||
b'\x96': ('ndash', '2013'), |
|||
b'\x97': ('mdash', '2014'), |
|||
b'\x98': ('tilde', '2DC'), |
|||
b'\x99': ('trade', '2122'), |
|||
b'\x9a': ('scaron', '161'), |
|||
b'\x9b': ('rsaquo', '203A'), |
|||
b'\x9c': ('oelig', '153'), |
|||
b'\x9d': '?', |
|||
b'\x9e': ('#x17E', '17E'), |
|||
b'\x9f': ('Yuml', '178'),}
|||
|
|||
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains |
|||
# horrors like stripping diacritical marks to turn á into a, but also |
|||
# contains non-horrors like turning “ into ". |
|||
MS_CHARS_TO_ASCII = { |
|||
b'\x80' : 'EUR', |
|||
b'\x81' : ' ', |
|||
b'\x82' : ',', |
|||
b'\x83' : 'f', |
|||
b'\x84' : ',,', |
|||
b'\x85' : '...', |
|||
b'\x86' : '+', |
|||
b'\x87' : '++', |
|||
b'\x88' : '^', |
|||
b'\x89' : '%', |
|||
b'\x8a' : 'S', |
|||
b'\x8b' : '<', |
|||
b'\x8c' : 'OE', |
|||
b'\x8d' : '?', |
|||
b'\x8e' : 'Z', |
|||
b'\x8f' : '?', |
|||
b'\x90' : '?', |
|||
b'\x91' : "'", |
|||
b'\x92' : "'", |
|||
b'\x93' : '"', |
|||
b'\x94' : '"', |
|||
b'\x95' : '*', |
|||
b'\x96' : '-', |
|||
b'\x97' : '--', |
|||
b'\x98' : '~', |
|||
b'\x99' : '(TM)', |
|||
b'\x9a' : 's', |
|||
b'\x9b' : '>', |
|||
b'\x9c' : 'oe', |
|||
b'\x9d' : '?', |
|||
b'\x9e' : 'z', |
|||
b'\x9f' : 'Y', |
|||
b'\xa0' : ' ', |
|||
b'\xa1' : '!', |
|||
b'\xa2' : 'c', |
|||
b'\xa3' : 'GBP', |
|||
b'\xa4' : '$', #This approximation is especially parochial--this is the |
|||
#generic currency symbol. |
|||
b'\xa5' : 'YEN', |
|||
b'\xa6' : '|', |
|||
b'\xa7' : 'S', |
|||
b'\xa8' : '..', |
|||
b'\xa9' : '', |
|||
b'\xaa' : '(th)', |
|||
b'\xab' : '<<', |
|||
b'\xac' : '!', |
|||
b'\xad' : ' ', |
|||
b'\xae' : '(R)', |
|||
b'\xaf' : '-', |
|||
b'\xb0' : 'o', |
|||
b'\xb1' : '+-', |
|||
b'\xb2' : '2', |
|||
b'\xb3' : '3', |
|||
b'\xb4' : "'",
|||
b'\xb5' : 'u', |
|||
b'\xb6' : 'P', |
|||
b'\xb7' : '*', |
|||
b'\xb8' : ',', |
|||
b'\xb9' : '1', |
|||
b'\xba' : '(th)', |
|||
b'\xbb' : '>>', |
|||
b'\xbc' : '1/4', |
|||
b'\xbd' : '1/2', |
|||
b'\xbe' : '3/4', |
|||
b'\xbf' : '?', |
|||
b'\xc0' : 'A', |
|||
b'\xc1' : 'A', |
|||
b'\xc2' : 'A', |
|||
b'\xc3' : 'A', |
|||
b'\xc4' : 'A', |
|||
b'\xc5' : 'A', |
|||
b'\xc6' : 'AE', |
|||
b'\xc7' : 'C', |
|||
b'\xc8' : 'E', |
|||
b'\xc9' : 'E', |
|||
b'\xca' : 'E', |
|||
b'\xcb' : 'E', |
|||
b'\xcc' : 'I', |
|||
b'\xcd' : 'I', |
|||
b'\xce' : 'I', |
|||
b'\xcf' : 'I', |
|||
b'\xd0' : 'D', |
|||
b'\xd1' : 'N', |
|||
b'\xd2' : 'O', |
|||
b'\xd3' : 'O', |
|||
b'\xd4' : 'O', |
|||
b'\xd5' : 'O', |
|||
b'\xd6' : 'O', |
|||
b'\xd7' : '*', |
|||
b'\xd8' : 'O', |
|||
b'\xd9' : 'U', |
|||
b'\xda' : 'U', |
|||
b'\xdb' : 'U', |
|||
b'\xdc' : 'U', |
|||
b'\xdd' : 'Y', |
|||
b'\xde' : 'b', |
|||
b'\xdf' : 'B', |
|||
b'\xe0' : 'a', |
|||
b'\xe1' : 'a', |
|||
b'\xe2' : 'a', |
|||
b'\xe3' : 'a', |
|||
b'\xe4' : 'a', |
|||
b'\xe5' : 'a', |
|||
b'\xe6' : 'ae', |
|||
b'\xe7' : 'c', |
|||
b'\xe8' : 'e', |
|||
b'\xe9' : 'e', |
|||
b'\xea' : 'e', |
|||
b'\xeb' : 'e', |
|||
b'\xec' : 'i', |
|||
b'\xed' : 'i', |
|||
b'\xee' : 'i', |
|||
b'\xef' : 'i', |
|||
b'\xf0' : 'o', |
|||
b'\xf1' : 'n', |
|||
b'\xf2' : 'o', |
|||
b'\xf3' : 'o', |
|||
b'\xf4' : 'o', |
|||
b'\xf5' : 'o', |
|||
b'\xf6' : 'o', |
|||
b'\xf7' : '/', |
|||
b'\xf8' : 'o', |
|||
b'\xf9' : 'u', |
|||
b'\xfa' : 'u', |
|||
b'\xfb' : 'u', |
|||
b'\xfc' : 'u', |
|||
b'\xfd' : 'y', |
|||
b'\xfe' : 'b', |
|||
b'\xff' : 'y', |
|||
} |
|||
|
|||
# A map used when removing rogue Windows-1252/ISO-8859-1 |
|||
# characters in otherwise UTF-8 documents. |
|||
# |
|||
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in |
|||
# Windows-1252. |
|||
WINDOWS_1252_TO_UTF8 = { |
|||
0x80 : b'\xe2\x82\xac', # € |
|||
0x82 : b'\xe2\x80\x9a', # ‚ |
|||
0x83 : b'\xc6\x92', # ƒ |
|||
0x84 : b'\xe2\x80\x9e', # „ |
|||
0x85 : b'\xe2\x80\xa6', # … |
|||
0x86 : b'\xe2\x80\xa0', # † |
|||
0x87 : b'\xe2\x80\xa1', # ‡ |
|||
0x88 : b'\xcb\x86', # ˆ |
|||
0x89 : b'\xe2\x80\xb0', # ‰ |
|||
0x8a : b'\xc5\xa0', # Š |
|||
0x8b : b'\xe2\x80\xb9', # ‹ |
|||
0x8c : b'\xc5\x92', # Œ |
|||
0x8e : b'\xc5\xbd', # Ž |
|||
0x91 : b'\xe2\x80\x98', # ‘ |
|||
0x92 : b'\xe2\x80\x99', # ’ |
|||
0x93 : b'\xe2\x80\x9c', # “ |
|||
0x94 : b'\xe2\x80\x9d', # ” |
|||
0x95 : b'\xe2\x80\xa2', # • |
|||
0x96 : b'\xe2\x80\x93', # – |
|||
0x97 : b'\xe2\x80\x94', # — |
|||
0x98 : b'\xcb\x9c', # ˜ |
|||
0x99 : b'\xe2\x84\xa2', # ™ |
|||
0x9a : b'\xc5\xa1', # š |
|||
0x9b : b'\xe2\x80\xba', # › |
|||
0x9c : b'\xc5\x93', # œ |
|||
0x9e : b'\xc5\xbe', # ž |
|||
0x9f : b'\xc5\xb8', # Ÿ |
|||
0xa0 : b'\xc2\xa0', # |
|||
0xa1 : b'\xc2\xa1', # ¡ |
|||
0xa2 : b'\xc2\xa2', # ¢ |
|||
0xa3 : b'\xc2\xa3', # £ |
|||
0xa4 : b'\xc2\xa4', # ¤ |
|||
0xa5 : b'\xc2\xa5', # ¥ |
|||
0xa6 : b'\xc2\xa6', # ¦ |
|||
0xa7 : b'\xc2\xa7', # § |
|||
0xa8 : b'\xc2\xa8', # ¨ |
|||
0xa9 : b'\xc2\xa9', # © |
|||
0xaa : b'\xc2\xaa', # ª |
|||
0xab : b'\xc2\xab', # « |
|||
0xac : b'\xc2\xac', # ¬ |
|||
0xad : b'\xc2\xad', # |
|||
0xae : b'\xc2\xae', # ® |
|||
0xaf : b'\xc2\xaf', # ¯ |
|||
0xb0 : b'\xc2\xb0', # ° |
|||
0xb1 : b'\xc2\xb1', # ± |
|||
0xb2 : b'\xc2\xb2', # ² |
|||
0xb3 : b'\xc2\xb3', # ³ |
|||
0xb4 : b'\xc2\xb4', # ´ |
|||
0xb5 : b'\xc2\xb5', # µ |
|||
0xb6 : b'\xc2\xb6', # ¶ |
|||
0xb7 : b'\xc2\xb7', # · |
|||
0xb8 : b'\xc2\xb8', # ¸ |
|||
0xb9 : b'\xc2\xb9', # ¹ |
|||
0xba : b'\xc2\xba', # º |
|||
0xbb : b'\xc2\xbb', # » |
|||
0xbc : b'\xc2\xbc', # ¼ |
|||
0xbd : b'\xc2\xbd', # ½ |
|||
0xbe : b'\xc2\xbe', # ¾ |
|||
0xbf : b'\xc2\xbf', # ¿ |
|||
0xc0 : b'\xc3\x80', # À |
|||
0xc1 : b'\xc3\x81', # Á |
|||
0xc2 : b'\xc3\x82', # Â |
|||
0xc3 : b'\xc3\x83', # Ã |
|||
0xc4 : b'\xc3\x84', # Ä |
|||
0xc5 : b'\xc3\x85', # Å |
|||
0xc6 : b'\xc3\x86', # Æ |
|||
0xc7 : b'\xc3\x87', # Ç |
|||
0xc8 : b'\xc3\x88', # È |
|||
0xc9 : b'\xc3\x89', # É |
|||
0xca : b'\xc3\x8a', # Ê |
|||
0xcb : b'\xc3\x8b', # Ë |
|||
0xcc : b'\xc3\x8c', # Ì |
|||
0xcd : b'\xc3\x8d', # Í |
|||
0xce : b'\xc3\x8e', # Î |
|||
0xcf : b'\xc3\x8f', # Ï |
|||
0xd0 : b'\xc3\x90', # Ð |
|||
0xd1 : b'\xc3\x91', # Ñ |
|||
0xd2 : b'\xc3\x92', # Ò |
|||
0xd3 : b'\xc3\x93', # Ó |
|||
0xd4 : b'\xc3\x94', # Ô |
|||
0xd5 : b'\xc3\x95', # Õ |
|||
0xd6 : b'\xc3\x96', # Ö |
|||
0xd7 : b'\xc3\x97', # × |
|||
0xd8 : b'\xc3\x98', # Ø |
|||
0xd9 : b'\xc3\x99', # Ù |
|||
0xda : b'\xc3\x9a', # Ú |
|||
0xdb : b'\xc3\x9b', # Û |
|||
0xdc : b'\xc3\x9c', # Ü |
|||
0xdd : b'\xc3\x9d', # Ý |
|||
0xde : b'\xc3\x9e', # Þ |
|||
0xdf : b'\xc3\x9f', # ß |
|||
0xe0 : b'\xc3\xa0', # à |
|||
0xe1 : b'\xc3\xa1', # á
|||
0xe2 : b'\xc3\xa2', # â |
|||
0xe3 : b'\xc3\xa3', # ã |
|||
0xe4 : b'\xc3\xa4', # ä |
|||
0xe5 : b'\xc3\xa5', # å |
|||
0xe6 : b'\xc3\xa6', # æ |
|||
0xe7 : b'\xc3\xa7', # ç |
|||
0xe8 : b'\xc3\xa8', # è |
|||
0xe9 : b'\xc3\xa9', # é |
|||
0xea : b'\xc3\xaa', # ê |
|||
0xeb : b'\xc3\xab', # ë |
|||
0xec : b'\xc3\xac', # ì |
|||
0xed : b'\xc3\xad', # í |
|||
0xee : b'\xc3\xae', # î |
|||
0xef : b'\xc3\xaf', # ï |
|||
0xf0 : b'\xc3\xb0', # ð |
|||
0xf1 : b'\xc3\xb1', # ñ |
|||
0xf2 : b'\xc3\xb2', # ò |
|||
0xf3 : b'\xc3\xb3', # ó |
|||
0xf4 : b'\xc3\xb4', # ô |
|||
0xf5 : b'\xc3\xb5', # õ |
|||
0xf6 : b'\xc3\xb6', # ö |
|||
0xf7 : b'\xc3\xb7', # ÷ |
|||
0xf8 : b'\xc3\xb8', # ø |
|||
0xf9 : b'\xc3\xb9', # ù |
|||
0xfa : b'\xc3\xba', # ú |
|||
0xfb : b'\xc3\xbb', # û |
|||
0xfc : b'\xc3\xbc', # ü |
|||
0xfd : b'\xc3\xbd', # ý |
|||
0xfe : b'\xc3\xbe', # þ |
|||
0xff : b'\xc3\xbf', # ÿ
}
|||
|
|||
MULTIBYTE_MARKERS_AND_SIZES = [ |
|||
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF |
|||
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF |
|||
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4 |
|||
] |
|||
|
|||
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0] |
|||
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1] |
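# A hypothetical, self-contained helper showing how the table above
# classifies a UTF-8 lead byte (1 for ASCII; None for continuation bytes
# and invalid leads):
#
#   MARKERS = [(0xc2, 0xdf, 2), (0xe0, 0xef, 3), (0xf0, 0xf4, 4)]
#
#   def utf8_sequence_length(lead_byte):
#       if lead_byte < 0x80:
#           return 1
#       for start, end, size in MARKERS:
#           if start <= lead_byte <= end:
#               return size
#       return None
#
#   utf8_sequence_length(0xe2)  # 3 -- e.g. b'\xe2\x80\x9c' encodes U+201C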
|||
|
|||
@classmethod |
|||
def detwingle(cls, in_bytes, main_encoding="utf8", |
|||
embedded_encoding="windows-1252"): |
|||
"""Fix characters from one encoding embedded in some other encoding. |
|||
|
|||
Currently the only situation supported is Windows-1252 (or its |
|||
subset ISO-8859-1), embedded in UTF-8. |
|||
|
|||
The input must be a bytestring. If you've already converted |
|||
the document to Unicode, you're too late. |
|||
|
|||
The output is a bytestring in which `embedded_encoding` |
|||
characters have been converted to their `main_encoding` |
|||
equivalents. |
|||
""" |
|||
if embedded_encoding.replace('_', '-').lower() not in ( |
|||
'windows-1252', 'windows_1252'): |
|||
raise NotImplementedError( |
|||
"Windows-1252 and ISO-8859-1 are the only currently supported " |
|||
"embedded encodings.") |
|||
|
|||
if main_encoding.lower() not in ('utf8', 'utf-8'): |
|||
raise NotImplementedError( |
|||
"UTF-8 is the only currently supported main encoding.") |
|||
|
|||
byte_chunks = [] |
|||
|
|||
chunk_start = 0 |
|||
pos = 0 |
|||
while pos < len(in_bytes): |
|||
byte = in_bytes[pos] |
|||
if not isinstance(byte, int): |
|||
# Python 2.x |
|||
byte = ord(byte) |
|||
if (byte >= cls.FIRST_MULTIBYTE_MARKER |
|||
and byte <= cls.LAST_MULTIBYTE_MARKER): |
|||
# This is the start of a UTF-8 multibyte character. Skip |
|||
# to the end. |
|||
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES: |
|||
if byte >= start and byte <= end: |
|||
pos += size |
|||
break |
|||
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8: |
|||
# We found a Windows-1252 character! |
|||
# Save the string up to this point as a chunk. |
|||
byte_chunks.append(in_bytes[chunk_start:pos]) |
|||
|
|||
# Now translate the Windows-1252 character into UTF-8 |
|||
# and add it as another, one-byte chunk. |
|||
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte]) |
|||
pos += 1 |
|||
chunk_start = pos |
|||
else: |
|||
# Go on to the next character. |
|||
pos += 1 |
|||
if chunk_start == 0: |
|||
# The string is unchanged. |
|||
return in_bytes |
|||
else: |
|||
# Store the final chunk. |
|||
byte_chunks.append(in_bytes[chunk_start:]) |
|||
return b''.join(byte_chunks) |
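# A hedged usage sketch for detwingle(), assuming this class is exposed
# as bs4.UnicodeDammit. The rogue \x92 bytes below would make a plain
# .decode("utf8") raise UnicodeDecodeError; after detwingling they have
# been rewritten as b'\xe2\x80\x99' (U+2019):
#
#   >>> from bs4 import UnicodeDammit
#   >>> snippet = b"\xe2\x80\x9cHi\xe2\x80\x9d \x92mixed\x92 content"
#   >>> UnicodeDammit.detwingle(snippet).decode("utf8")
#   u'\u201cHi\u201d \u2019mixed\u2019 content'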
|||
|
File diff suppressed because it is too large
@ -0,0 +1,515 @@ |
|||
"""Helper classes for tests.""" |
|||
|
|||
import copy |
|||
import functools |
|||
import unittest |
|||
from unittest import TestCase |
|||
from bs4 import BeautifulSoup |
|||
from bs4.element import ( |
|||
CharsetMetaAttributeValue, |
|||
Comment, |
|||
ContentMetaAttributeValue, |
|||
Doctype, |
|||
SoupStrainer, |
|||
) |
|||
|
|||
from bs4.builder import HTMLParserTreeBuilder |
|||
default_builder = HTMLParserTreeBuilder |
|||
|
|||
|
|||
class SoupTest(unittest.TestCase): |
|||
|
|||
@property |
|||
def default_builder(self): |
|||
return default_builder() |
|||
|
|||
def soup(self, markup, **kwargs): |
|||
"""Build a Beautiful Soup object from markup.""" |
|||
builder = kwargs.pop('builder', self.default_builder) |
|||
return BeautifulSoup(markup, builder=builder, **kwargs) |
|||
|
|||
def document_for(self, markup): |
|||
"""Turn an HTML fragment into a document. |
|||
|
|||
The details depend on the builder. |
|||
""" |
|||
return self.default_builder.test_fragment_to_document(markup) |
|||
|
|||
def assertSoupEquals(self, to_parse, compare_parsed_to=None): |
|||
builder = self.default_builder |
|||
obj = BeautifulSoup(to_parse, builder=builder) |
|||
if compare_parsed_to is None: |
|||
compare_parsed_to = to_parse |
|||
|
|||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to)) |
|||
|
|||
|
|||
class HTMLTreeBuilderSmokeTest(object): |
|||
|
|||
"""A basic test of a treebuilder's competence. |
|||
|
|||
Any HTML treebuilder, present or future, should be able to pass |
|||
these tests. With invalid markup, there's room for interpretation, |
|||
and different parsers can handle it differently. But with the |
|||
markup in these tests, there's not much room for interpretation. |
|||
""" |
|||
|
|||
def assertDoctypeHandled(self, doctype_fragment): |
|||
"""Assert that a given doctype string is handled correctly.""" |
|||
doctype_str, soup = self._document_with_doctype(doctype_fragment) |
|||
|
|||
# Make sure a Doctype object was created. |
|||
doctype = soup.contents[0] |
|||
self.assertEqual(doctype.__class__, Doctype) |
|||
self.assertEqual(doctype, doctype_fragment) |
|||
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str) |
|||
|
|||
# Make sure that the doctype was correctly associated with the |
|||
# parse tree and that the rest of the document parsed. |
|||
self.assertEqual(soup.p.contents[0], 'foo') |
|||
|
|||
def _document_with_doctype(self, doctype_fragment): |
|||
"""Generate and parse a document with the given doctype.""" |
|||
doctype = '<!DOCTYPE %s>' % doctype_fragment |
|||
markup = doctype + '\n<p>foo</p>' |
|||
soup = self.soup(markup) |
|||
return doctype, soup |
|||
|
|||
def test_normal_doctypes(self): |
|||
"""Make sure normal, everyday HTML doctypes are handled correctly.""" |
|||
self.assertDoctypeHandled("html") |
|||
self.assertDoctypeHandled( |
|||
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"') |
|||
|
|||
def test_public_doctype_with_url(self): |
|||
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"' |
|||
self.assertDoctypeHandled(doctype) |
|||
|
|||
def test_system_doctype(self): |
|||
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"') |
|||
|
|||
def test_namespaced_system_doctype(self): |
|||
# We can handle a namespaced doctype with a system ID. |
|||
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"') |
|||
|
|||
def test_namespaced_public_doctype(self): |
|||
# Test a namespaced doctype with a public id. |
|||
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"') |
|||
|
|||
def test_real_xhtml_document(self): |
|||
"""A real XHTML document should come out more or less the same as it went in.""" |
|||
markup = b"""<?xml version="1.0" encoding="utf-8"?> |
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> |
|||
<html xmlns="http://www.w3.org/1999/xhtml"> |
|||
<head><title>Hello.</title></head> |
|||
<body>Goodbye.</body> |
|||
</html>""" |
|||
soup = self.soup(markup) |
|||
self.assertEqual( |
|||
soup.encode("utf-8").replace(b"\n", b""), |
|||
markup.replace(b"\n", b"")) |
|||
|
|||
def test_deepcopy(self): |
|||
"""Make sure you can copy the tree builder. |
|||
|
|||
This is important because the builder is part of a |
|||
BeautifulSoup object, and we want to be able to copy that. |
|||
""" |
|||
copy.deepcopy(self.default_builder) |
|||
|
|||
def test_p_tag_is_never_empty_element(self): |
|||
"""A <p> tag is never designated as an empty-element tag. |
|||
|
|||
Even if the markup shows it as an empty-element tag, it |
|||
shouldn't be presented that way. |
|||
""" |
|||
soup = self.soup("<p/>") |
|||
self.assertFalse(soup.p.is_empty_element) |
|||
self.assertEqual(str(soup.p), "<p></p>") |
|||
|
|||
def test_unclosed_tags_get_closed(self): |
|||
"""A tag that's not closed by the end of the document should be closed. |
|||
|
|||
This applies to all tags except empty-element tags. |
|||
""" |
|||
self.assertSoupEquals("<p>", "<p></p>") |
|||
self.assertSoupEquals("<b>", "<b></b>") |
|||
|
|||
self.assertSoupEquals("<br>", "<br/>") |
|||
|
|||
def test_br_is_always_empty_element_tag(self): |
|||
"""A <br> tag is designated as an empty-element tag. |
|||
|
|||
Some parsers treat <br></br> as one <br/> tag, some parsers as |
|||
two tags, but it should always be an empty-element tag. |
|||
""" |
|||
soup = self.soup("<br></br>") |
|||
self.assertTrue(soup.br.is_empty_element) |
|||
self.assertEqual(str(soup.br), "<br/>") |
|||
|
|||
def test_nested_formatting_elements(self): |
|||
self.assertSoupEquals("<em><em></em></em>") |
|||
|
|||
def test_comment(self): |
|||
# Comments are represented as Comment objects. |
|||
markup = "<p>foo<!--foobar-->baz</p>" |
|||
self.assertSoupEquals(markup) |
|||
|
|||
soup = self.soup(markup) |
|||
comment = soup.find(text="foobar") |
|||
self.assertEqual(comment.__class__, Comment) |
|||
|
|||
def test_preserved_whitespace_in_pre_and_textarea(self): |
|||
"""Whitespace must be preserved in <pre> and <textarea> tags.""" |
|||
self.assertSoupEquals("<pre> </pre>") |
|||
self.assertSoupEquals("<textarea> woo </textarea>") |
|||
|
|||
def test_nested_inline_elements(self): |
|||
"""Inline elements can be nested indefinitely.""" |
|||
b_tag = "<b>Inside a B tag</b>" |
|||
self.assertSoupEquals(b_tag) |
|||
|
|||
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>" |
|||
self.assertSoupEquals(nested_b_tag) |
|||
|
|||
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>" |
|||
self.assertSoupEquals(double_nested_b_tag)
|||
|
|||
def test_nested_block_level_elements(self): |
|||
"""Block elements can be nested.""" |
|||
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>') |
|||
blockquote = soup.blockquote |
|||
self.assertEqual(blockquote.p.b.string, 'Foo') |
|||
self.assertEqual(blockquote.b.string, 'Foo') |
|||
|
|||
def test_correctly_nested_tables(self): |
|||
"""One table can go inside another one.""" |
|||
markup = ('<table id="1">' |
|||
'<tr>' |
|||
"<td>Here's another table:" |
|||
'<table id="2">' |
|||
'<tr><td>foo</td></tr>' |
|||
'</table></td>') |
|||
|
|||
self.assertSoupEquals( |
|||
markup, |
|||
'<table id="1"><tr><td>Here\'s another table:' |
|||
'<table id="2"><tr><td>foo</td></tr></table>' |
|||
'</td></tr></table>') |
|||
|
|||
self.assertSoupEquals( |
|||
"<table><thead><tr><td>Foo</td></tr></thead>" |
|||
"<tbody><tr><td>Bar</td></tr></tbody>" |
|||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>") |
|||
|
|||
def test_angle_brackets_in_attribute_values_are_escaped(self): |
|||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>') |
|||
|
|||
def test_entities_in_attributes_converted_to_unicode(self): |
|||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>' |
|||
self.assertSoupEquals('<p id="piñata"></p>', expect) |
|||
self.assertSoupEquals('<p id="piñata"></p>', expect) |
|||
self.assertSoupEquals('<p id="piñata"></p>', expect) |
|||
|
|||
def test_entities_in_text_converted_to_unicode(self): |
|||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>' |
|||
self.assertSoupEquals("<p>piñata</p>", expect) |
|||
self.assertSoupEquals("<p>piñata</p>", expect) |
|||
self.assertSoupEquals("<p>piñata</p>", expect) |
|||
|
|||
def test_quot_entity_converted_to_quotation_mark(self): |
|||
self.assertSoupEquals("<p>I said "good day!"</p>", |
|||
'<p>I said "good day!"</p>') |
|||
|
|||
def test_out_of_range_entity(self): |
|||
expect = u"\N{REPLACEMENT CHARACTER}" |
|||
self.assertSoupEquals("�", expect) |
|||
self.assertSoupEquals("�", expect) |
|||
self.assertSoupEquals("�", expect) |
|||
|
|||
def test_basic_namespaces(self): |
|||
"""Parsers don't need to *understand* namespaces, but at the |
|||
very least they should not choke on namespaces or lose |
|||
data.""" |
|||
|
|||
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>' |
|||
soup = self.soup(markup) |
|||
self.assertEqual(markup, soup.encode()) |
|||
html = soup.html |
|||
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns']) |
|||
self.assertEqual( |
|||
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml']) |
|||
self.assertEqual( |
|||
'http://www.w3.org/2000/svg', soup.html['xmlns:svg']) |
|||
|
|||
def test_multivalued_attribute_value_becomes_list(self): |
|||
markup = b'<a class="foo bar">' |
|||
soup = self.soup(markup) |
|||
self.assertEqual(['foo', 'bar'], soup.a['class']) |
|||
|
|||
# |
|||
# Generally speaking, tests below this point are more tests of |
|||
# Beautiful Soup than tests of the tree builders. But parsers are |
|||
# weird, so we run these tests separately for every tree builder |
|||
# to detect any differences between them. |
|||
# |
|||
|
|||
def test_soupstrainer(self): |
|||
"""Parsers should be able to work with SoupStrainers.""" |
|||
strainer = SoupStrainer("b") |
|||
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>", |
|||
parse_only=strainer) |
|||
self.assertEqual(soup.decode(), "<b>bold</b>") |
|||
|
|||
def test_single_quote_attribute_values_become_double_quotes(self): |
|||
self.assertSoupEquals("<foo attr='bar'></foo>", |
|||
'<foo attr="bar"></foo>') |
|||
|
|||
def test_attribute_values_with_nested_quotes_are_left_alone(self): |
|||
text = """<foo attr='bar "brawls" happen'>a</foo>""" |
|||
self.assertSoupEquals(text) |
|||
|
|||
def test_attribute_values_with_double_nested_quotes_get_quoted(self): |
|||
text = """<foo attr='bar "brawls" happen'>a</foo>""" |
|||
soup = self.soup(text) |
|||
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"' |
|||
self.assertSoupEquals( |
|||
soup.foo.decode(), |
|||
"""<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""") |
|||
|
|||
def test_ampersand_in_attribute_value_gets_escaped(self): |
|||
self.assertSoupEquals('<this is="really messed up & stuff"></this>', |
|||
'<this is="really messed up & stuff"></this>') |
|||
|
|||
self.assertSoupEquals( |
|||
'<a href="http://example.org?a=1&b=2;3">foo</a>', |
|||
'<a href="http://example.org?a=1&b=2;3">foo</a>') |
|||
|
|||
def test_escaped_ampersand_in_attribute_value_is_left_alone(self): |
|||
self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>') |
|||
|
|||
def test_entities_in_strings_converted_during_parsing(self): |
|||
# Both XML and HTML entities are converted to Unicode characters |
|||
# during parsing. |
|||
text = "<p><<sacré bleu!>></p>" |
|||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>" |
|||
self.assertSoupEquals(text, expected) |
|||
|
|||
def test_smart_quotes_converted_on_the_way_in(self): |
|||
# Microsoft smart quotes are converted to Unicode characters during |
|||
# parsing. |
|||
quote = b"<p>\x91Foo\x92</p>" |
|||
soup = self.soup(quote) |
|||
self.assertEqual( |
|||
soup.p.string, |
|||
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}") |
|||
|
|||
def test_non_breaking_spaces_converted_on_the_way_in(self): |
|||
soup = self.soup("<a> </a>") |
|||
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2) |
|||
|
|||
def test_entities_converted_on_the_way_out(self): |
|||
text = "<p><<sacré bleu!>></p>" |
|||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8") |
|||
soup = self.soup(text) |
|||
self.assertEqual(soup.p.encode("utf-8"), expected) |
|||
|
|||
def test_real_iso_latin_document(self): |
|||
# Smoke test of interrelated functionality, using an |
|||
# easy-to-understand document. |
|||
|
|||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1. |
|||
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>' |
|||
|
|||
# That's because we're going to encode it into ISO-Latin-1, and use |
|||
# that to test. |
|||
iso_latin_html = unicode_html.encode("iso-8859-1") |
|||
|
|||
# Parse the ISO-Latin-1 HTML. |
|||
soup = self.soup(iso_latin_html) |
|||
# Encode it to UTF-8. |
|||
result = soup.encode("utf-8") |
|||
|
|||
# What do we expect the result to look like? Well, it would |
|||
# look like unicode_html, except that the META tag would say |
|||
# UTF-8 instead of ISO-Latin-1. |
|||
expected = unicode_html.replace("ISO-Latin-1", "utf-8") |
|||
|
|||
# And, of course, it would be in UTF-8, not Unicode. |
|||
expected = expected.encode("utf-8") |
|||
|
|||
# Ta-da! |
|||
self.assertEqual(result, expected) |
|||
|
|||
def test_real_shift_jis_document(self): |
|||
# Smoke test to make sure the parser can handle a document in |
|||
# Shift-JIS encoding, without choking. |
|||
shift_jis_html = ( |
|||
b'<html><head></head><body><pre>' |
|||
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f' |
|||
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c' |
|||
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B' |
|||
b'</pre></body></html>') |
|||
unicode_html = shift_jis_html.decode("shift-jis") |
|||
soup = self.soup(unicode_html) |
|||
|
|||
# Make sure the parse tree is correctly encoded to various |
|||
# encodings. |
|||
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8")) |
|||
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp")) |
|||
|
|||
def test_real_hebrew_document(self): |
|||
# A real-world test to make sure we can convert ISO-8859-8 (a
|||
# Hebrew encoding) to UTF-8. |
|||
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>' |
|||
soup = self.soup( |
|||
hebrew_document, from_encoding="iso8859-8") |
|||
self.assertEqual(soup.original_encoding, 'iso8859-8') |
|||
self.assertEqual( |
|||
soup.encode('utf-8'), |
|||
hebrew_document.decode("iso8859-8").encode("utf-8")) |
|||
|
|||
def test_meta_tag_reflects_current_encoding(self): |
|||
# Here's the <meta> tag saying that a document is |
|||
# encoded in Shift-JIS. |
|||
meta_tag = ('<meta content="text/html; charset=x-sjis" ' |
|||
'http-equiv="Content-type"/>') |
|||
|
|||
# Here's a document incorporating that meta tag. |
|||
shift_jis_html = ( |
|||
'<html><head>\n%s\n' |
|||
'<meta http-equiv="Content-language" content="ja"/>' |
|||
'</head><body>Shift-JIS markup goes here.') % meta_tag |
|||
soup = self.soup(shift_jis_html) |
|||
|
|||
# Parse the document, and the charset is seemingly unaffected. |
|||
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'}) |
|||
content = parsed_meta['content'] |
|||
self.assertEqual('text/html; charset=x-sjis', content) |
|||
|
|||
# But that value is actually a ContentMetaAttributeValue object. |
|||
self.assertTrue(isinstance(content, ContentMetaAttributeValue)) |
|||
|
|||
# And it will take on a value that reflects its current |
|||
# encoding. |
|||
self.assertEqual('text/html; charset=utf8', content.encode("utf8")) |
|||
|
|||
# For the rest of the story, see TestSubstitutions in |
|||
# test_tree.py. |
|||
|
|||
def test_html5_style_meta_tag_reflects_current_encoding(self): |
|||
# Here's the <meta> tag saying that a document is |
|||
# encoded in Shift-JIS. |
|||
meta_tag = ('<meta id="encoding" charset="x-sjis" />') |
|||
|
|||
# Here's a document incorporating that meta tag. |
|||
shift_jis_html = ( |
|||
'<html><head>\n%s\n' |
|||
'<meta http-equiv="Content-language" content="ja"/>' |
|||
'</head><body>Shift-JIS markup goes here.') % meta_tag |
|||
soup = self.soup(shift_jis_html) |
|||
|
|||
# Parse the document, and the charset is seemingly unaffected. |
|||
parsed_meta = soup.find('meta', id="encoding") |
|||
charset = parsed_meta['charset'] |
|||
self.assertEqual('x-sjis', charset) |
|||
|
|||
# But that value is actually a CharsetMetaAttributeValue object. |
|||
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue)) |
|||
|
|||
# And it will take on a value that reflects its current |
|||
# encoding. |
|||
self.assertEqual('utf8', charset.encode("utf8")) |
|||
|
|||
def test_tag_with_no_attributes_can_have_attributes_added(self): |
|||
data = self.soup("<a>text</a>") |
|||
data.a['foo'] = 'bar' |
|||
self.assertEqual('<a foo="bar">text</a>', data.a.decode()) |
|||
|
|||
class XMLTreeBuilderSmokeTest(object): |
|||
|
|||
def test_docstring_generated(self): |
|||
soup = self.soup("<root/>") |
|||
self.assertEqual( |
|||
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>') |
|||
|
|||
def test_real_xhtml_document(self): |
|||
"""A real XHTML document should come out *exactly* the same as it went in.""" |
|||
markup = b"""<?xml version="1.0" encoding="utf-8"?> |
|||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"> |
|||
<html xmlns="http://www.w3.org/1999/xhtml"> |
|||
<head><title>Hello.</title></head> |
|||
<body>Goodbye.</body> |
|||
</html>""" |
|||
soup = self.soup(markup) |
|||
self.assertEqual( |
|||
soup.encode("utf-8"), markup) |
|||
|
|||
|
|||
def test_docstring_includes_correct_encoding(self): |
|||
soup = self.soup("<root/>") |
|||
self.assertEqual( |
|||
soup.encode("latin1"), |
|||
b'<?xml version="1.0" encoding="latin1"?>\n<root/>') |
|||
|
|||
def test_large_xml_document(self): |
|||
"""A large XML document should come out the same as it went in.""" |
|||
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>' |
|||
+ b'0' * (2**12) |
|||
+ b'</root>') |
|||
soup = self.soup(markup) |
|||
self.assertEqual(soup.encode("utf-8"), markup) |
|||
|
|||
|
|||
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self): |
|||
self.assertSoupEquals("<p>", "<p/>") |
|||
self.assertSoupEquals("<p>foo</p>") |
|||
|
|||
def test_namespaces_are_preserved(self): |
|||
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>' |
|||
soup = self.soup(markup) |
|||
root = soup.root |
|||
self.assertEqual("http://example.com/", root['xmlns:a']) |
|||
self.assertEqual("http://example.net/", root['xmlns:b']) |
|||
|
|||
|
|||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): |
|||
"""Smoke test for a tree builder that supports HTML5.""" |
|||
|
|||
def test_real_xhtml_document(self): |
|||
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle |
|||
# XHTML documents in any particular way. |
|||
pass |
|||
|
|||
def test_html_tags_have_namespace(self): |
|||
markup = "<a>" |
|||
soup = self.soup(markup) |
|||
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace) |
|||
|
|||
def test_svg_tags_have_namespace(self): |
|||
markup = '<svg><circle/></svg>' |
|||
soup = self.soup(markup) |
|||
namespace = "http://www.w3.org/2000/svg" |
|||
self.assertEqual(namespace, soup.svg.namespace) |
|||
self.assertEqual(namespace, soup.circle.namespace) |
|||
|
|||
|
|||
def test_mathml_tags_have_namespace(self): |
|||
markup = '<math><msqrt>5</msqrt></math>' |
|||
soup = self.soup(markup) |
|||
namespace = 'http://www.w3.org/1998/Math/MathML' |
|||
self.assertEqual(namespace, soup.math.namespace) |
|||
self.assertEqual(namespace, soup.msqrt.namespace) |
|||
|
|||
|
|||
def skipIf(condition, reason): |
|||
def nothing(test, *args, **kwargs): |
|||
return None |
|||
|
|||
def decorator(test_item): |
|||
if condition: |
|||
return nothing |
|||
else: |
|||
return test_item |
|||
|
|||
return decorator |
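# A usage sketch for the decorator above (the condition shown is
# illustrative; any expression evaluated at import time works):
#
#   @skipIf(default_builder is None, "no tree builder available")
#   def test_something(self):
#       ...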
@ -0,0 +1 @@ |
|||
from .core import where |
File diff suppressed because it is too large
@ -0,0 +1,19 @@ |
|||
#!/usr/bin/env python |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
""" |
|||
certifi.py
|||
~~~~~~~~~~ |
|||
|
|||
This module returns the installation location of cacert.pem. |
|||
""" |
|||
|
|||
import os |
|||
|
|||
def where(): |
|||
f = os.path.split(__file__)[0] |
|||
|
|||
return os.path.join(f, 'cacert.pem') |
|||
|
|||
if __name__ == '__main__': |
|||
print(where()) |
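# A minimal usage sketch: where() only computes a path, so callers hand it
# to whatever performs the actual TLS verification. The requests kwarg
# below is an assumption about a consuming library, not something defined
# here:
#
#   import certifi
#   ca_bundle = certifi.where()  # absolute path to the bundled cacert.pem
#   # e.g. requests.get("https://example.com", verify=ca_bundle)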
@ -0,0 +1,17 @@ |
|||
""" |
|||
HTML parsing library based on the WHATWG "HTML5" |
|||
specification. The parser is designed to be compatible with existing |
|||
HTML found in the wild and implements well-defined error recovery that |
|||
is largely compatible with modern desktop web browsers. |
|||
|
|||
Example usage: |
|||
|
|||
import html5lib |
|||
f = open("my_document.html") |
|||
tree = html5lib.parse(f) |
|||
""" |
|||
__version__ = "0.95-dev" |
|||
from html5parser import HTMLParser, parse, parseFragment |
|||
from treebuilders import getTreeBuilder |
|||
from treewalkers import getTreeWalker |
|||
from serializer import serialize |
File diff suppressed because it is too large
@ -0,0 +1,10 @@ |
|||
|
|||
class Filter(object): |
|||
def __init__(self, source): |
|||
self.source = source |
|||
|
|||
def __iter__(self): |
|||
return iter(self.source) |
|||
|
|||
def __getattr__(self, name): |
|||
return getattr(self.source, name) |
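# This base class makes every filter a thin wrapper around a token
# iterator, so filters can be stacked freely. A hypothetical subclass
# sketch (the class name is illustrative):
#
#   class UppercaseCharacters(Filter):
#       def __iter__(self):
#           for token in Filter.__iter__(self):
#               if token["type"] == "Characters":
#                   token["data"] = token["data"].upper()
#               yield token
#
#   # stacking: UppercaseCharacters(AnotherFilter(tree_walker_tokens))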
@ -0,0 +1,127 @@ |
|||
# |
|||
# The goal is to finally have a form filler where you pass data for |
|||
# each form, using the algorithm for "Seeding a form with initial values" |
|||
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding |
|||
# |
|||
|
|||
import _base |
|||
|
|||
from html5lib.constants import spaceCharacters |
|||
spaceCharacters = u"".join(spaceCharacters) |
|||
|
|||
class SimpleFilter(_base.Filter): |
|||
def __init__(self, source, fieldStorage): |
|||
_base.Filter.__init__(self, source) |
|||
self.fieldStorage = fieldStorage |
|||
|
|||
def __iter__(self): |
|||
field_indices = {} |
|||
state = None |
|||
field_name = None |
|||
for token in _base.Filter.__iter__(self): |
|||
type = token["type"] |
|||
if type in ("StartTag", "EmptyTag"): |
|||
name = token["name"].lower() |
|||
if name == "input": |
|||
field_name = None |
|||
field_type = None |
|||
input_value_index = -1 |
|||
input_checked_index = -1 |
|||
for i,(n,v) in enumerate(token["data"]): |
|||
n = n.lower() |
|||
if n == u"name": |
|||
field_name = v.strip(spaceCharacters) |
|||
elif n == u"type": |
|||
field_type = v.strip(spaceCharacters) |
|||
elif n == u"checked": |
|||
input_checked_index = i |
|||
elif n == u"value": |
|||
input_value_index = i |
|||
|
|||
value_list = self.fieldStorage.getlist(field_name) |
|||
field_index = field_indices.setdefault(field_name, 0) |
|||
if field_index < len(value_list): |
|||
value = value_list[field_index] |
|||
else: |
|||
value = "" |
|||
|
|||
if field_type in (u"checkbox", u"radio"): |
|||
if value_list: |
|||
if token["data"][input_value_index][1] == value: |
|||
if input_checked_index < 0: |
|||
token["data"].append((u"checked", u"")) |
|||
field_indices[field_name] = field_index + 1 |
|||
elif input_checked_index >= 0: |
|||
del token["data"][input_checked_index] |
|||
|
|||
elif field_type not in (u"button", u"submit", u"reset"): |
|||
if input_value_index >= 0: |
|||
token["data"][input_value_index] = (u"value", value) |
|||
else: |
|||
token["data"].append((u"value", value)) |
|||
field_indices[field_name] = field_index + 1 |
|||
|
|||
field_type = None |
|||
field_name = None |
|||
|
|||
elif name == "textarea": |
|||
field_type = "textarea" |
|||
field_name = dict(token["data"][::-1]).get("name")
|||
|
|||
elif name == "select": |
|||
field_type = "select" |
|||
attributes = dict(token["data"][::-1]) |
|||
field_name = attributes.get("name") |
|||
is_select_multiple = "multiple" in attributes |
|||
is_selected_option_found = False |
|||
|
|||
elif field_type == "select" and field_name and name == "option": |
|||
option_selected_index = -1 |
|||
option_value = None |
|||
for i,(n,v) in enumerate(token["data"]): |
|||
n = n.lower() |
|||
if n == "selected": |
|||
option_selected_index = i |
|||
elif n == "value": |
|||
option_value = v.strip(spaceCharacters) |
|||
if option_value is None: |
|||
raise NotImplementedError("<option>s without a value= attribute") |
|||
else: |
|||
value_list = self.fieldStorage.getlist(field_name) |
|||
if value_list: |
|||
field_index = field_indices.setdefault(field_name, 0) |
|||
if field_index < len(value_list): |
|||
value = value_list[field_index] |
|||
else: |
|||
value = "" |
|||
if (is_select_multiple or not is_selected_option_found) and option_value == value: |
|||
if option_selected_index < 0: |
|||
token["data"].append((u"selected", u"")) |
|||
field_indices[field_name] = field_index + 1 |
|||
is_selected_option_found = True |
|||
elif option_selected_index >= 0: |
|||
del token["data"][option_selected_index] |
|||
|
|||
elif field_type is not None and field_name and type == "EndTag": |
|||
name = token["name"].lower() |
|||
if name == field_type: |
|||
if name == "textarea": |
|||
value_list = self.fieldStorage.getlist(field_name) |
|||
if value_list: |
|||
field_index = field_indices.setdefault(field_name, 0) |
|||
if field_index < len(value_list): |
|||
value = value_list[field_index] |
|||
else: |
|||
value = "" |
|||
yield {"type": "Characters", "data": value} |
|||
field_indices[field_name] = field_index + 1 |
|||
|
|||
field_name = None |
|||
|
|||
elif name == "option" and field_type == "select": |
|||
pass # TODO: part of "option without value= attribute" processing |
|||
|
|||
elif field_type == "textarea": |
|||
continue # ignore token |
|||
|
|||
yield token |
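# A hedged end-to-end sketch. The html5lib entry points used here match
# this package's top-level exports, but the "dom" tree name is an
# assumption; fieldStorage only needs a .getlist(name) method, so
# cgi.FieldStorage (or the stand-in below) both work:
#
#   import html5lib
#   from html5lib.filters.formfiller import SimpleFilter
#
#   doc = html5lib.parse('<form><input type="text" name="q"></form>',
#                        treebuilder="dom")
#   walker = html5lib.getTreeWalker("dom")
#
#   class FormData(object):  # stand-in for cgi.FieldStorage
#       def getlist(self, name):
#           return {"q": [u"soup"]}.get(name, [])
#
#   for token in SimpleFilter(walker(doc), FormData()):
#       print(token)  # the "q" input token now carries value="soup"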
@ -0,0 +1,62 @@ |
|||
import _base |
|||
|
|||
class Filter(_base.Filter): |
|||
def __init__(self, source, encoding): |
|||
_base.Filter.__init__(self, source) |
|||
self.encoding = encoding |
|||
|
|||
def __iter__(self): |
|||
state = "pre_head" |
|||
meta_found = (self.encoding is None) |
|||
pending = [] |
|||
|
|||
for token in _base.Filter.__iter__(self): |
|||
type = token["type"] |
|||
if type == "StartTag": |
|||
if token["name"].lower() == u"head": |
|||
state = "in_head" |
|||
|
|||
elif type == "EmptyTag": |
|||
if token["name"].lower() == u"meta": |
|||
# replace charset with actual encoding |
|||
has_http_equiv_content_type = False |
|||
for (namespace,name),value in token["data"].iteritems(): |
|||
if namespace != None: |
|||
continue |
|||
elif name.lower() == u'charset': |
|||
token["data"][(namespace,name)] = self.encoding |
|||
meta_found = True |
|||
break |
|||
elif name == u'http-equiv' and value.lower() == u'content-type': |
|||
has_http_equiv_content_type = True |
|||
else: |
|||
if has_http_equiv_content_type and (None, u"content") in token["data"]: |
|||
token["data"][(None, u"content")] = u'text/html; charset=%s' % self.encoding |
|||
meta_found = True |
|||
|
|||
elif token["name"].lower() == u"head" and not meta_found: |
|||
# insert meta into empty head |
|||
yield {"type": "StartTag", "name": u"head", |
|||
"data": token["data"]} |
|||
yield {"type": "EmptyTag", "name": u"meta", |
|||
"data": {(None, u"charset"): self.encoding}} |
|||
yield {"type": "EndTag", "name": u"head"} |
|||
meta_found = True |
|||
continue |
|||
|
|||
elif type == "EndTag": |
|||
if token["name"].lower() == u"head" and pending: |
|||
# insert meta into head (if necessary) and flush pending queue |
|||
yield pending.pop(0) |
|||
if not meta_found: |
|||
yield {"type": "EmptyTag", "name": u"meta", |
|||
"data": {(None, u"charset"): self.encoding}} |
|||
while pending: |
|||
yield pending.pop(0) |
|||
meta_found = True |
|||
state = "post_head" |
|||
|
|||
if state == "in_head": |
|||
pending.append(token) |
|||
else: |
|||
yield token |
@ -0,0 +1,88 @@ |
|||
from gettext import gettext |
|||
_ = gettext |
|||
|
|||
import _base |
|||
from html5lib.constants import cdataElements, rcdataElements, voidElements |
|||
|
|||
from html5lib.constants import spaceCharacters |
|||
spaceCharacters = u"".join(spaceCharacters) |
|||
|
|||
class LintError(Exception): pass |
|||
|
|||
class Filter(_base.Filter): |
|||
def __iter__(self): |
|||
open_elements = [] |
|||
contentModelFlag = "PCDATA" |
|||
for token in _base.Filter.__iter__(self): |
|||
type = token["type"] |
|||
if type in ("StartTag", "EmptyTag"): |
|||
name = token["name"] |
|||
if contentModelFlag != "PCDATA": |
|||
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name) |
|||
if not isinstance(name, unicode): |
|||
raise LintError(_(u"Tag name is not a string: %r") % name) |
|||
if not name: |
|||
raise LintError(_(u"Empty tag name")) |
|||
if type == "StartTag" and name in voidElements: |
|||
raise LintError(_(u"Void element reported as StartTag token: %s") % name) |
|||
elif type == "EmptyTag" and name not in voidElements: |
|||
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"]) |
|||
if type == "StartTag": |
|||
open_elements.append(name) |
|||
for name, value in token["data"]: |
|||
if not isinstance(name, unicode): |
|||
raise LintError(_("Attribute name is not a string: %r") % name) |
|||
if not name: |
|||
raise LintError(_(u"Empty attribute name")) |
|||
if not isinstance(value, unicode): |
|||
raise LintError(_("Attribute value is not a string: %r") % value) |
|||
if name in cdataElements: |
|||
contentModelFlag = "CDATA" |
|||
elif name in rcdataElements: |
|||
contentModelFlag = "RCDATA" |
|||
elif name == "plaintext": |
|||
contentModelFlag = "PLAINTEXT" |
|||
|
|||
elif type == "EndTag": |
|||
name = token["name"] |
|||
if not isinstance(name, unicode): |
|||
raise LintError(_(u"Tag name is not a string: %r") % name) |
|||
if not name: |
|||
raise LintError(_(u"Empty tag name")) |
|||
if name in voidElements: |
|||
raise LintError(_(u"Void element reported as EndTag token: %s") % name) |
|||
start_name = open_elements.pop() |
|||
if start_name != name: |
|||
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name)) |
|||
contentModelFlag = "PCDATA" |
|||
|
|||
elif type == "Comment": |
|||
if contentModelFlag != "PCDATA": |
|||
raise LintError(_("Comment not in PCDATA content model flag")) |
|||
|
|||
elif type in ("Characters", "SpaceCharacters"): |
|||
data = token["data"] |
|||
if not isinstance(data, unicode): |
|||
raise LintError(_(u"%s token data is not a string: %r") % (type, data))
|||
if not data: |
|||
raise LintError(_(u"%s token with empty data") % type) |
|||
if type == "SpaceCharacters": |
|||
data = data.strip(spaceCharacters) |
|||
if data: |
|||
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: %r") % data)
|||
|
|||
elif type == "Doctype": |
|||
name = token["name"] |
|||
if contentModelFlag != "PCDATA": |
|||
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name) |
|||
if not isinstance(name, unicode): |
|||
raise LintError(_(u"Tag name is not a string: %r") % name) |
|||
# XXX: what to do with token["data"] ? |
|||
|
|||
elif type in ("ParseError", "SerializeError"): |
|||
pass |
|||
|
|||
else: |
|||
raise LintError(_(u"Unknown token type: %s") % type) |
|||
|
|||
yield token |
@ -0,0 +1,202 @@ |
|||
import _base |
|||
|
|||
class Filter(_base.Filter): |
|||
def slider(self): |
|||
previous1 = previous2 = None |
|||
for token in self.source: |
|||
if previous1 is not None: |
|||
yield previous2, previous1, token |
|||
previous2 = previous1 |
|||
previous1 = token |
|||
yield previous2, previous1, None |
|||
|
|||
def __iter__(self): |
|||
for previous, token, next in self.slider(): |
|||
type = token["type"] |
|||
if type == "StartTag": |
|||
if (token["data"] or |
|||
not self.is_optional_start(token["name"], previous, next)): |
|||
yield token |
|||
elif type == "EndTag": |
|||
if not self.is_optional_end(token["name"], next): |
|||
yield token |
|||
else: |
|||
yield token |
|||
|
|||
def is_optional_start(self, tagname, previous, next): |
|||
type = next and next["type"] or None |
|||
if tagname == 'html':
|||
# An html element's start tag may be omitted if the first thing |
|||
# inside the html element is not a space character or a comment. |
|||
return type not in ("Comment", "SpaceCharacters") |
|||
elif tagname == 'head': |
|||
# A head element's start tag may be omitted if the first thing |
|||
# inside the head element is an element. |
|||
# XXX: we also omit the start tag if the head element is empty |
|||
if type in ("StartTag", "EmptyTag"): |
|||
return True |
|||
elif type == "EndTag": |
|||
return next["name"] == "head" |
|||
elif tagname == 'body': |
|||
# A body element's start tag may be omitted if the first thing |
|||
# inside the body element is not a space character or a comment, |
|||
# except if the first thing inside the body element is a script |
|||
# or style element and the node immediately preceding the body |
|||
# element is a head element whose end tag has been omitted. |
|||
if type in ("Comment", "SpaceCharacters"): |
|||
return False |
|||
elif type == "StartTag": |
|||
# XXX: we do not look at the preceding event, so we never omit |
|||
# the body element's start tag if it's followed by a script or |
|||
# a style element. |
|||
return next["name"] not in ('script', 'style') |
|||
else: |
|||
return True |
|||
elif tagname == 'colgroup': |
|||
# A colgroup element's start tag may be omitted if the first thing |
|||
# inside the colgroup element is a col element, and if the element |
|||
# is not immediately preceded by another colgroup element whose
|||
# end tag has been omitted. |
|||
if type in ("StartTag", "EmptyTag"): |
|||
# XXX: we do not look at the preceding event, so instead we never |
|||
# omit the colgroup element's end tag when it is immediately |
|||
# followed by another colgroup element. See is_optional_end. |
|||
return next["name"] == "col" |
|||
else: |
|||
return False |
|||
elif tagname == 'tbody': |
|||
# A tbody element's start tag may be omitted if the first thing |
|||
# inside the tbody element is a tr element, and if the element is |
|||
# not immediately preceded by a tbody, thead, or tfoot element
|||
# whose end tag has been omitted. |
|||
if type == "StartTag": |
|||
# omit the thead and tfoot elements' end tag when they are |
|||
# immediately followed by a tbody element. See is_optional_end. |
|||
if previous and previous['type'] == 'EndTag' and \ |
|||
previous['name'] in ('tbody','thead','tfoot'): |
|||
return False |
|||
return next["name"] == 'tr' |
|||
else: |
|||
return False |
|||
return False |
|||
|
|||
def is_optional_end(self, tagname, next): |
|||
type = next and next["type"] or None |
|||
if tagname in ('html', 'head', 'body'): |
|||
# An html element's end tag may be omitted if the html element |
|||
# is not immediately followed by a space character or a comment. |
|||
return type not in ("Comment", "SpaceCharacters") |
|||
elif tagname in ('li', 'optgroup', 'tr'): |
|||
# A li element's end tag may be omitted if the li element is |
|||
# immediately followed by another li element or if there is |
|||
# no more content in the parent element. |
|||
# An optgroup element's end tag may be omitted if the optgroup |
|||
# element is immediately followed by another optgroup element, |
|||
# or if there is no more content in the parent element. |
|||
# A tr element's end tag may be omitted if the tr element is |
|||
# immediately followed by another tr element, or if there is |
|||
# no more content in the parent element. |
|||
if type == "StartTag": |
|||
return next["name"] == tagname |
|||
else: |
|||
return type == "EndTag" or type is None |
|||
elif tagname in ('dt', 'dd'): |
|||
# A dt element's end tag may be omitted if the dt element is |
|||
# immediately followed by another dt element or a dd element. |
|||
# A dd element's end tag may be omitted if the dd element is |
|||
# immediately followed by another dd element or a dt element, |
|||
# or if there is no more content in the parent element. |
|||
if type == "StartTag": |
|||
return next["name"] in ('dt', 'dd') |
|||
elif tagname == 'dd': |
|||
return type == "EndTag" or type is None |
|||
else: |
|||
return False |
|||
elif tagname == 'p': |
|||
# A p element's end tag may be omitted if the p element is |
|||
# immediately followed by an address, article, aside, |
|||
# blockquote, datagrid, dialog, dir, div, dl, fieldset, |
|||
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu, |
|||
# nav, ol, p, pre, section, table, or ul element, or if
|||
# there is no more content in the parent element. |
|||
if type in ("StartTag", "EmptyTag"): |
|||
return next["name"] in ('address', 'article', 'aside', |
|||
'blockquote', 'datagrid', 'dialog', |
|||
'dir', 'div', 'dl', 'fieldset', 'footer', |
|||
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', |
|||
'header', 'hr', 'menu', 'nav', 'ol', |
|||
'p', 'pre', 'section', 'table', 'ul') |
|||
else: |
|||
return type == "EndTag" or type is None |
|||
elif tagname == 'option': |
|||
# An option element's end tag may be omitted if the option |
|||
# element is immediately followed by another option element, |
|||
# or if it is immediately followed by an optgroup
|||
# element, or if there is no more content in the parent |
|||
# element. |
|||
if type == "StartTag": |
|||
return next["name"] in ('option', 'optgroup') |
|||
else: |
|||
return type == "EndTag" or type is None |
|||
elif tagname in ('rt', 'rp'): |
|||
# An rt element's end tag may be omitted if the rt element is |
|||
# immediately followed by an rt or rp element, or if there is |
|||
# no more content in the parent element. |
|||
# An rp element's end tag may be omitted if the rp element is |
|||
# immediately followed by an rt or rp element, or if there is |
|||
# no more content in the parent element. |
|||
if type == "StartTag": |
|||
return next["name"] in ('rt', 'rp') |
|||
else: |
|||
return type == "EndTag" or type is None |
|||
elif tagname == 'colgroup': |
|||
# A colgroup element's end tag may be omitted if the colgroup |
|||
# element is not immediately followed by a space character or |
|||
# a comment. |
|||
if type in ("Comment", "SpaceCharacters"): |
|||
return False |
|||
elif type == "StartTag": |
|||
# XXX: we also look for an immediately following colgroup |
|||
# element. See is_optional_start. |
|||
return next["name"] != 'colgroup' |
|||
else: |
|||
return True |
|||
elif tagname in ('thead', 'tbody'): |
|||
# A thead element's end tag may be omitted if the thead element |
|||
# is immediately followed by a tbody or tfoot element. |
|||
# A tbody element's end tag may be omitted if the tbody element |
|||
# is immediately followed by a tbody or tfoot element, or if |
|||
# there is no more content in the parent element. |
|||
# A tfoot element's end tag may be omitted if the tfoot element |
|||
# is immediately followed by a tbody element, or if there is no |
|||
# more content in the parent element. |
|||
# XXX: we never omit the end tag when the following element is |
|||
# a tbody. See is_optional_start. |
|||
if type == "StartTag": |
|||
return next["name"] in ['tbody', 'tfoot'] |
|||
elif tagname == 'tbody': |
|||
return type == "EndTag" or type is None |
|||
else: |
|||
return False |
|||
elif tagname == 'tfoot': |
|||
# A tfoot element's end tag may be omitted if the tfoot element |
|||
# is immediately followed by a tbody element, or if there is no |
|||
# more content in the parent element. |
|||
# XXX: we never omit the end tag when the following element is |
|||
# a tbody. See is_optional_start. |
|||
if type == "StartTag": |
|||
return next["name"] == 'tbody' |
|||
else: |
|||
return type == "EndTag" or type is None |
|||
elif tagname in ('td', 'th'): |
|||
# A td element's end tag may be omitted if the td element is |
|||
# immediately followed by a td or th element, or if there is |
|||
# no more content in the parent element. |
|||
# A th element's end tag may be omitted if the th element is |
|||
# immediately followed by a td or th element, or if there is |
|||
# no more content in the parent element. |
|||
if type == "StartTag": |
|||
return next["name"] in ('td', 'th') |
|||
else: |
|||
return type == "EndTag" or type is None |
|||
return False |
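# This filter is what backs the serializer's omit_optional_tags option; a
# hedged sketch (the serializer module path is assumed from html5lib's
# layout of this era):
#
#   import html5lib
#   from html5lib.serializer.htmlserializer import HTMLSerializer
#
#   doc = html5lib.parse("<html><head></head><body><p>hi</p></body></html>",
#                        treebuilder="dom")
#   walker = html5lib.getTreeWalker("dom")
#   HTMLSerializer(omit_optional_tags=True).render(walker(doc))
#   # -> u'<p>hi': html, head, body and the trailing </p> are all optional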
@ -0,0 +1,8 @@ |
|||
import _base |
|||
from html5lib.sanitizer import HTMLSanitizerMixin |
|||
|
|||
class Filter(_base.Filter, HTMLSanitizerMixin): |
|||
def __iter__(self): |
|||
for token in _base.Filter.__iter__(self): |
|||
token = self.sanitize_token(token) |
|||
if token: yield token |
@ -0,0 +1,41 @@ |
|||
try: |
|||
frozenset |
|||
except NameError: |
|||
# Import from the sets module for python 2.3 |
|||
from sets import ImmutableSet as frozenset |
|||
|
|||
import re |
|||
|
|||
import _base |
|||
from html5lib.constants import rcdataElements, spaceCharacters |
|||
spaceCharacters = u"".join(spaceCharacters) |
|||
|
|||
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters) |
|||
|
|||
class Filter(_base.Filter): |
|||
|
|||
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) |
|||
|
|||
def __iter__(self): |
|||
preserve = 0 |
|||
for token in _base.Filter.__iter__(self): |
|||
type = token["type"] |
|||
if type == "StartTag" \ |
|||
and (preserve or token["name"] in self.spacePreserveElements): |
|||
preserve += 1 |
|||
|
|||
elif type == "EndTag" and preserve: |
|||
preserve -= 1 |
|||
|
|||
elif not preserve and type == "SpaceCharacters" and token["data"]: |
|||
# Test on token["data"] above to not introduce spaces where there were not |
|||
token["data"] = u" " |
|||
|
|||
elif not preserve and type == "Characters": |
|||
token["data"] = collapse_spaces(token["data"]) |
|||
|
|||
yield token |
|||
|
|||
def collapse_spaces(text): |
|||
return SPACES_REGEX.sub(' ', text) |
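# A quick sanity check of the helper above: any run of HTML space
# characters collapses to a single ASCII space.
#
#   >>> collapse_spaces(u"a \t\r\n  b")
#   u'a b'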
|||
|
File diff suppressed because it is too large
@ -0,0 +1,177 @@ |
|||
import re |
|||
|
|||
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]""" |
|||
|
|||
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]""" |
|||
|
|||
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A""" |
|||
|
|||
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]""" |
|||
|
|||
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]""" |
|||
|
|||
letter = " | ".join([baseChar, ideographic]) |
|||
|
|||
#The XML 1.0 NameChar production, minus ":" |
|||
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter, |
|||
extender]) |
|||
nameFirst = " | ".join([letter, "_"]) |
|||
|
|||
reChar = re.compile(r"#x([0-9A-F]{4})") |
|||
reCharRange = re.compile(r"\[#x([0-9A-F]{4})-#x([0-9A-F]{4})\]") |
|||
|
|||
def charStringToList(chars): |
|||
charRanges = [item.strip() for item in chars.split(" | ")] |
|||
rv = [] |
|||
for item in charRanges: |
|||
foundMatch = False |
|||
for regexp in (reChar, reCharRange): |
|||
match = regexp.match(item) |
|||
if match is not None: |
|||
rv.append([hexToInt(item) for item in match.groups()]) |
|||
if len(rv[-1]) == 1: |
|||
rv[-1] = rv[-1]*2 |
|||
foundMatch = True |
|||
break |
|||
if not foundMatch: |
|||
assert len(item) == 1 |
|||
|
|||
rv.append([ord(item)] * 2) |
|||
rv = normaliseCharList(rv) |
|||
return rv |
|||
|
|||
def normaliseCharList(charList): |
|||
charList = sorted(charList) |
|||
for item in charList: |
|||
assert item[1] >= item[0] |
|||
rv = [] |
|||
i = 0 |
|||
while i < len(charList): |
|||
j = 1 |
|||
rv.append(charList[i]) |
|||
while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1: |
|||
rv[-1][1] = charList[i+j][1] |
|||
j += 1 |
|||
i += j |
|||
return rv |
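# Editor's sketch (not part of the patch): the helpers above turn the XML
# spec's character-class prose into merged codepoint ranges.
assert charStringToList("[#x0041-#x005A] | #x005F") == [[0x41, 0x5A], [0x5F, 0x5F]]
assert normaliseCharList([[4, 6], [1, 3], [10, 12]]) == [[1, 6], [10, 12]]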
|||
|
|||
#We don't really support characters above the BMP :( |
|||
max_unicode = int("FFFF", 16) |
|||
|
|||
def missingRanges(charList): |
|||
rv = [] |
|||
if charList[0][0] != 0: |
|||
rv.append([0, charList[0][0] - 1]) |
|||
for i, item in enumerate(charList[:-1]): |
|||
rv.append([item[1]+1, charList[i+1][0] - 1]) |
|||
if charList[-1][1] != max_unicode: |
|||
rv.append([charList[-1][1] + 1, max_unicode]) |
|||
return rv |
|||
|
|||
def listToRegexpStr(charList): |
|||
rv = [] |
|||
for item in charList: |
|||
if item[0] == item[1]: |
|||
rv.append(escapeRegexp(unichr(item[0]))) |
|||
else: |
|||
rv.append(escapeRegexp(unichr(item[0])) + "-" + |
|||
escapeRegexp(unichr(item[1]))) |
|||
return "[%s]"%"".join(rv) |
|||
|
|||
def hexToInt(hex_str): |
|||
return int(hex_str, 16) |
|||
|
|||
def escapeRegexp(string): |
|||
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}", |
|||
"[", "]", "|", "(", ")", "-") |
|||
for char in specialCharacters: |
|||
string = string.replace(char, "\\" + char) |
|||
|
|||
return string |
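# Editor's sketch (not part of the patch): inverting the allowed ranges
# and rendering them as a character class is how the "nonXmlName*"
# regexps below were generated.
forbidden = missingRanges(charStringToList("[#x0041-#x005A]"))
print listToRegexpStr(forbidden)  # a class matching everything except A-Z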
|||
|
|||
#output from the above |
|||
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') |
|||
|
|||
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') |
|||
|
|||
class InfosetFilter(object): |
|||
replacementRegexp = re.compile(r"U[\dA-F]{5}") |
|||
def __init__(self, replaceChars = None, |
|||
dropXmlnsLocalName = False, |
|||
dropXmlnsAttrNs = False, |
|||
preventDoubleDashComments = False, |
|||
preventDashAtCommentEnd = False, |
|||
replaceFormFeedCharacters = True): |
|||
|
|||
self.dropXmlnsLocalName = dropXmlnsLocalName |
|||
self.dropXmlnsAttrNs = dropXmlnsAttrNs |
|||
|
|||
self.preventDoubleDashComments = preventDoubleDashComments |
|||
self.preventDashAtCommentEnd = preventDashAtCommentEnd |
|||
|
|||
self.replaceFormFeedCharacters = replaceFormFeedCharacters |
|||
|
|||
self.replaceCache = {} |
|||
|
|||
def coerceAttribute(self, name, namespace=None): |
|||
if self.dropXmlnsLocalName and name.startswith("xmlns:"): |
|||
#Need a datalosswarning here |
|||
return None |
|||
elif (self.dropXmlnsAttrNs and |
|||
namespace == "http://www.w3.org/2000/xmlns/"): |
|||
return None |
|||
else: |
|||
return self.toXmlName(name) |
|||
|
|||
def coerceElement(self, name, namespace=None): |
|||
return self.toXmlName(name) |
|||
|
|||
def coerceComment(self, data): |
|||
if self.preventDoubleDashComments: |
|||
while "--" in data: |
|||
data = data.replace("--", "- -") |
|||
return data |
|||
|
|||
def coerceCharacters(self, data): |
|||
if self.replaceFormFeedCharacters: |
|||
data = data.replace("\x0C", " ") |
|||
#Other non-xml characters |
|||
return data |
|||
|
|||
def toXmlName(self, name): |
|||
nameFirst = name[0] |
|||
nameRest = name[1:] |
|||
m = nonXmlNameFirstBMPRegexp.match(nameFirst) |
|||
if m: |
|||
nameFirstOutput = self.getReplacementCharacter(nameFirst) |
|||
else: |
|||
nameFirstOutput = nameFirst |
|||
|
|||
nameRestOutput = nameRest |
|||
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest)) |
|||
for char in replaceChars: |
|||
replacement = self.getReplacementCharacter(char) |
|||
nameRestOutput = nameRestOutput.replace(char, replacement) |
|||
return nameFirstOutput + nameRestOutput |
|||
|
|||
def getReplacementCharacter(self, char): |
|||
if char in self.replaceCache: |
|||
replacement = self.replaceCache[char] |
|||
else: |
|||
replacement = self.escapeChar(char) |
|||
return replacement |
|||
|
|||
def fromXmlName(self, name): |
|||
for item in set(self.replacementRegexp.findall(name)): |
|||
name = name.replace(item, self.unescapeChar(item)) |
|||
return name |
|||
|
|||
def escapeChar(self, char): |
|||
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0") |
|||
self.replaceCache[char] = replacement |
|||
return replacement |
|||
|
|||
def unescapeChar(self, charcode): |
|||
return unichr(int(charcode[1:], 16)) |
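# Editor's sketch (not part of the patch): names legal in HTML but not in
# XML are escaped reversibly.
f = InfosetFilter()
coerced = f.toXmlName(u"1data")
assert coerced == u"U00031data"
assert f.fromXmlName(coerced) == u"1data"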
@ -0,0 +1,782 @@ |
|||
import codecs |
|||
import re |
|||
import types |
|||
import sys |
|||
|
|||
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase |
|||
from constants import encodings, ReparseException |
|||
import utils |
|||
|
|||
#Non-unicode versions of constants for use in the pre-parser |
|||
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters]) |
|||
asciiLettersBytes = frozenset([str(item) for item in asciiLetters]) |
|||
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase]) |
|||
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"]) |
|||
|
|||
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]") |
|||
|
|||
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, |
|||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, |
|||
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, |
|||
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, |
|||
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE, |
|||
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, |
|||
0x10FFFE, 0x10FFFF]) |
|||
|
|||
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]") |
|||
|
|||
# Cache for charsUntil() |
|||
charsUntilRegEx = {} |
|||
|
|||
class BufferedStream: |
|||
"""Buffering for streams that do not have buffering of their own |
|||
|
|||
The buffer is implemented as a list of chunks because repeatedly |
|||
concatenating one growing string would be O(n**2) |
|||
""" |
|||
|
|||
def __init__(self, stream): |
|||
self.stream = stream |
|||
self.buffer = [] |
|||
self.position = [-1,0] #chunk number, offset |
|||
|
|||
def tell(self): |
|||
pos = 0 |
|||
for chunk in self.buffer[:self.position[0]]: |
|||
pos += len(chunk) |
|||
pos += self.position[1] |
|||
return pos |
|||
|
|||
def seek(self, pos): |
|||
assert pos < self._bufferedBytes() |
|||
offset = pos |
|||
i = 0 |
|||
while len(self.buffer[i]) < offset: |
|||
offset -= len(self.buffer[i]) |
|||
i += 1 |
|||
self.position = [i, offset] |
|||
|
|||
def read(self, bytes): |
|||
if not self.buffer: |
|||
return self._readStream(bytes) |
|||
elif (self.position[0] == len(self.buffer) - 1 and |
|||
self.position[1] == len(self.buffer[-1])): |
|||
return self._readStream(bytes) |
|||
else: |
|||
return self._readFromBuffer(bytes) |
|||
|
|||
def _bufferedBytes(self): |
|||
return sum([len(item) for item in self.buffer]) |
|||
|
|||
def _readStream(self, bytes): |
|||
data = self.stream.read(bytes) |
|||
self.buffer.append(data) |
|||
self.position[0] += 1 |
|||
self.position[1] = len(data) |
|||
return data |
|||
|
|||
def _readFromBuffer(self, bytes): |
|||
remainingBytes = bytes |
|||
rv = [] |
|||
bufferIndex = self.position[0] |
|||
bufferOffset = self.position[1] |
|||
while bufferIndex < len(self.buffer) and remainingBytes != 0: |
|||
assert remainingBytes > 0 |
|||
bufferedData = self.buffer[bufferIndex] |
|||
|
|||
if remainingBytes <= len(bufferedData) - bufferOffset: |
|||
bytesToRead = remainingBytes |
|||
self.position = [bufferIndex, bufferOffset + bytesToRead] |
|||
else: |
|||
bytesToRead = len(bufferedData) - bufferOffset |
|||
self.position = [bufferIndex, len(bufferedData)] |
|||
bufferIndex += 1 |
|||
rv.append(bufferedData[bufferOffset: |
|||
bufferOffset + bytesToRead]) |
|||
remainingBytes -= bytesToRead |
|||
|
|||
bufferOffset = 0 |
|||
|
|||
if remainingBytes: |
|||
rv.append(self._readStream(remainingBytes)) |
|||
|
|||
return "".join(rv) |
|||
|
|||
|
|||
|
|||
class HTMLInputStream: |
|||
"""Provides a unicode stream of characters to the HTMLTokenizer. |
|||
|
|||
This class takes care of character encoding and removing or replacing |
|||
incorrect byte-sequences and also provides column and line tracking. |
|||
|
|||
""" |
|||
|
|||
_defaultChunkSize = 10240 |
|||
|
|||
def __init__(self, source, encoding=None, parseMeta=True, chardet=True): |
|||
"""Initialises the HTMLInputStream. |
|||
|
|||
HTMLInputStream(source, [encoding]) -> Normalized stream from source |
|||
for use by html5lib. |
|||
|
|||
source can be either a file-object, local filename or a string. |
|||
|
|||
The optional encoding parameter must be a string that indicates |
|||
the encoding. If specified, that encoding will be used, |
|||
regardless of any BOM or later declaration (such as in a meta |
|||
element) |
|||
|
|||
parseMeta - Look for a <meta> element containing encoding information |
|||
|
|||
""" |
|||
|
|||
# Distinguish wide (UCS-4) from narrow (UCS-2) Python unicode builds |
|||
if len(u"\U0010FFFF") == 1: |
|||
self.reportCharacterErrors = self.characterErrorsUCS4 |
|||
self.replaceCharactersRegexp = re.compile(u"[\uD800-\uDFFF]") |
|||
else: |
|||
self.reportCharacterErrors = self.characterErrorsUCS2 |
|||
self.replaceCharactersRegexp = re.compile(u"([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])") |
|||
|
|||
# List of where new lines occur |
|||
self.newLines = [0] |
|||
|
|||
self.charEncoding = (codecName(encoding), "certain") |
|||
|
|||
# Raw Stream - for unicode objects this will encode to utf-8 and set |
|||
# self.charEncoding as appropriate |
|||
self.rawStream = self.openStream(source) |
|||
|
|||
# Encoding Information |
|||
#Number of bytes to use when looking for a meta element with |
|||
#encoding information |
|||
self.numBytesMeta = 512 |
|||
#Number of bytes to feed chardet when detecting the encoding |
|||
self.numBytesChardet = 100 |
|||
#Encoding to use if no other information can be found |
|||
self.defaultEncoding = "windows-1252" |
|||
|
|||
#Detect encoding iff no explicit "transport level" encoding is supplied |
|||
if (self.charEncoding[0] is None): |
|||
self.charEncoding = self.detectEncoding(parseMeta, chardet) |
|||
|
|||
|
|||
self.reset() |
|||
|
|||
def reset(self): |
|||
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream, |
|||
'replace') |
|||
|
|||
self.chunk = u"" |
|||
self.chunkSize = 0 |
|||
self.chunkOffset = 0 |
|||
self.errors = [] |
|||
|
|||
# number of (complete) lines in previous chunks |
|||
self.prevNumLines = 0 |
|||
# number of columns in the last line of the previous chunk |
|||
self.prevNumCols = 0 |
|||
|
|||
#Deal with CR LF and surrogates split over chunk boundaries |
|||
self._bufferedCharacter = None |
|||
|
|||
def openStream(self, source): |
|||
"""Produces a file object from source. |
|||
|
|||
source can be either a file object, local filename or a string. |
|||
|
|||
""" |
|||
# Already a file object |
|||
if hasattr(source, 'read'): |
|||
stream = source |
|||
else: |
|||
# Otherwise treat source as a string and convert to a file object |
|||
if isinstance(source, unicode): |
|||
source = source.encode('utf-8') |
|||
self.charEncoding = ("utf-8", "certain") |
|||
try: |
|||
from io import BytesIO |
|||
except ImportError: |
|||
# 2to3 converts this line to: from io import StringIO |
|||
from cStringIO import StringIO as BytesIO |
|||
stream = BytesIO(source) |
|||
|
|||
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or |
|||
stream is sys.stdin): |
|||
stream = BufferedStream(stream) |
|||
|
|||
return stream |
|||
|
|||
def detectEncoding(self, parseMeta=True, chardet=True): |
|||
#First look for a BOM |
|||
#This will also read past the BOM if present |
|||
encoding = self.detectBOM() |
|||
confidence = "certain" |
|||
#If there is no BOM need to look for meta elements with encoding |
|||
#information |
|||
if encoding is None and parseMeta: |
|||
encoding = self.detectEncodingMeta() |
|||
confidence = "tentative" |
|||
#Guess with chardet, if available |
|||
if encoding is None and chardet: |
|||
confidence = "tentative" |
|||
try: |
|||
from chardet.universaldetector import UniversalDetector |
|||
buffers = [] |
|||
detector = UniversalDetector() |
|||
while not detector.done: |
|||
buffer = self.rawStream.read(self.numBytesChardet) |
|||
if not buffer: |
|||
break |
|||
buffers.append(buffer) |
|||
detector.feed(buffer) |
|||
detector.close() |
|||
encoding = detector.result['encoding'] |
|||
self.rawStream.seek(0) |
|||
except ImportError: |
|||
pass |
|||
# If all else fails use the default encoding |
|||
if encoding is None: |
|||
confidence="tentative" |
|||
encoding = self.defaultEncoding |
|||
|
|||
#Substitute for equivalent encodings: |
|||
encodingSub = {"iso-8859-1":"windows-1252"} |
|||
|
|||
if encoding.lower() in encodingSub: |
|||
encoding = encodingSub[encoding.lower()] |
|||
|
|||
return encoding, confidence |
|||
|
|||
def changeEncoding(self, newEncoding): |
|||
newEncoding = codecName(newEncoding) |
|||
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"): |
|||
newEncoding = "utf-8" |
|||
if newEncoding is None: |
|||
return |
|||
elif newEncoding == self.charEncoding[0]: |
|||
self.charEncoding = (self.charEncoding[0], "certain") |
|||
else: |
|||
self.rawStream.seek(0) |
|||
self.reset() |
|||
oldEncoding = self.charEncoding[0] |
|||
self.charEncoding = (newEncoding, "certain") |
|||
raise ReparseException, "Encoding changed from %s to %s" % (oldEncoding, newEncoding) |
|||
|
|||
def detectBOM(self): |
|||
"""Attempts to detect at BOM at the start of the stream. If |
|||
an encoding can be determined from the BOM return the name of the |
|||
encoding otherwise return None""" |
|||
bomDict = { |
|||
codecs.BOM_UTF8: 'utf-8', |
|||
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be', |
|||
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be' |
|||
} |
|||
|
|||
# Go to beginning of file and read in 4 bytes |
|||
string = self.rawStream.read(4) |
|||
|
|||
# Try detecting the BOM using bytes from the string |
|||
encoding = bomDict.get(string[:3]) # UTF-8 |
|||
seek = 3 |
|||
if not encoding: |
|||
# Need to detect UTF-32 before UTF-16 |
|||
encoding = bomDict.get(string) # UTF-32 |
|||
seek = 4 |
|||
if not encoding: |
|||
encoding = bomDict.get(string[:2]) # UTF-16 |
|||
seek = 2 |
|||
|
|||
# Set the read position past the BOM if one was found, otherwise |
|||
# set it to the start of the stream |
|||
self.rawStream.seek(encoding and seek or 0) |
|||
|
|||
return encoding |
|||
|
|||
def detectEncodingMeta(self): |
|||
"""Report the encoding declared by the meta element |
|||
""" |
|||
buffer = self.rawStream.read(self.numBytesMeta) |
|||
parser = EncodingParser(buffer) |
|||
self.rawStream.seek(0) |
|||
encoding = parser.getEncoding() |
|||
|
|||
if encoding in ("utf-16", "utf-16-be", "utf-16-le"): |
|||
encoding = "utf-8" |
|||
|
|||
return encoding |
|||
|
|||
def _position(self, offset): |
|||
chunk = self.chunk |
|||
nLines = chunk.count(u'\n', 0, offset) |
|||
positionLine = self.prevNumLines + nLines |
|||
lastLinePos = chunk.rfind(u'\n', 0, offset) |
|||
if lastLinePos == -1: |
|||
positionColumn = self.prevNumCols + offset |
|||
else: |
|||
positionColumn = offset - (lastLinePos + 1) |
|||
return (positionLine, positionColumn) |
|||
|
|||
def position(self): |
|||
"""Returns (line, col) of the current position in the stream.""" |
|||
line, col = self._position(self.chunkOffset) |
|||
return (line+1, col) |
|||
|
|||
def char(self): |
|||
""" Read one character from the stream or queue if available. Return |
|||
EOF when EOF is reached. |
|||
""" |
|||
# Read a new chunk from the input stream if necessary |
|||
if self.chunkOffset >= self.chunkSize: |
|||
if not self.readChunk(): |
|||
return EOF |
|||
|
|||
chunkOffset = self.chunkOffset |
|||
char = self.chunk[chunkOffset] |
|||
self.chunkOffset = chunkOffset + 1 |
|||
|
|||
return char |
|||
|
|||
def readChunk(self, chunkSize=None): |
|||
if chunkSize is None: |
|||
chunkSize = self._defaultChunkSize |
|||
|
|||
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize) |
|||
|
|||
self.chunk = u"" |
|||
self.chunkSize = 0 |
|||
self.chunkOffset = 0 |
|||
|
|||
data = self.dataStream.read(chunkSize) |
|||
|
|||
#Deal with CR LF and surrogates broken across chunks |
|||
if self._bufferedCharacter: |
|||
data = self._bufferedCharacter + data |
|||
self._bufferedCharacter = None |
|||
elif not data: |
|||
# We have no more data, bye-bye stream |
|||
return False |
|||
|
|||
if len(data) > 1: |
|||
lastv = ord(data[-1]) |
|||
if lastv == 0x0D or 0xD800 <= lastv <= 0xDBFF: |
|||
self._bufferedCharacter = data[-1] |
|||
data = data[:-1] |
|||
|
|||
self.reportCharacterErrors(data) |
|||
|
|||
# Replace invalid characters |
|||
# Note U+0000 is dealt with in the tokenizer |
|||
data = self.replaceCharactersRegexp.sub(u"\ufffd", data) |
|||
|
|||
data = data.replace(u"\r\n", u"\n") |
|||
data = data.replace(u"\r", u"\n") |
|||
|
|||
self.chunk = data |
|||
self.chunkSize = len(data) |
|||
|
|||
return True |
|||
|
|||
def characterErrorsUCS4(self, data): |
|||
for i in xrange(len(invalid_unicode_re.findall(data))): |
|||
self.errors.append("invalid-codepoint") |
|||
|
|||
def characterErrorsUCS2(self, data): |
|||
#Someone picked the wrong compile option |
|||
#You lose |
|||
skip = False |
|||
for match in invalid_unicode_re.finditer(data): |
|||
if skip: |
|||
#Reset so only the one low surrogate that follows a pair is skipped |
|||
skip = False |
|||
continue |
|||
codepoint = ord(match.group()) |
|||
pos = match.start() |
|||
#Pretty sure there should be endianness issues here |
|||
if utils.isSurrogatePair(data[pos:pos+2]): |
|||
#We have a surrogate pair! |
|||
char_val = utils.surrogatePairToCodepoint(data[pos:pos+2]) |
|||
if char_val in non_bmp_invalid_codepoints: |
|||
self.errors.append("invalid-codepoint") |
|||
skip = True |
|||
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and |
|||
pos == len(data) - 1): |
|||
self.errors.append("invalid-codepoint") |
|||
else: |
|||
skip = False |
|||
self.errors.append("invalid-codepoint") |
|||
|
|||
def charsUntil(self, characters, opposite = False): |
|||
""" Returns a string of characters from the stream up to but not |
|||
including any character in 'characters' or EOF. 'characters' must be |
|||
a container that supports the 'in' method and iteration over its |
|||
characters. |
|||
""" |
|||
|
|||
# Use a cache of regexps to find the required characters |
|||
try: |
|||
chars = charsUntilRegEx[(characters, opposite)] |
|||
except KeyError: |
|||
if __debug__: |
|||
for c in characters: |
|||
assert(ord(c) < 128) |
|||
regex = u"".join([u"\\x%02x" % ord(c) for c in characters]) |
|||
if not opposite: |
|||
regex = u"^%s" % regex |
|||
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex) |
|||
|
|||
rv = [] |
|||
|
|||
while True: |
|||
# Find the longest matching prefix |
|||
m = chars.match(self.chunk, self.chunkOffset) |
|||
if m is None: |
|||
# If nothing matched, and it wasn't because we ran out of chunk, |
|||
# then stop |
|||
if self.chunkOffset != self.chunkSize: |
|||
break |
|||
else: |
|||
end = m.end() |
|||
# If not the whole chunk matched, return everything |
|||
# up to the part that didn't match |
|||
if end != self.chunkSize: |
|||
rv.append(self.chunk[self.chunkOffset:end]) |
|||
self.chunkOffset = end |
|||
break |
|||
# If the whole remainder of the chunk matched, |
|||
# use it all and read the next chunk |
|||
rv.append(self.chunk[self.chunkOffset:]) |
|||
if not self.readChunk(): |
|||
# Reached EOF |
|||
break |
|||
|
|||
r = u"".join(rv) |
|||
return r |
|||
|
|||
def unget(self, char): |
|||
# Only one character is allowed to be ungotten at once - it must |
|||
# be consumed again before any further call to unget |
|||
if char is not None: |
|||
if self.chunkOffset == 0: |
|||
# unget is called quite rarely, so it's a good idea to do |
|||
# more work here if it saves a bit of work in the frequently |
|||
# called char and charsUntil. |
|||
# So, just prepend the ungotten character onto the current |
|||
# chunk: |
|||
self.chunk = char + self.chunk |
|||
self.chunkSize += 1 |
|||
else: |
|||
self.chunkOffset -= 1 |
|||
assert self.chunk[self.chunkOffset] == char |
|||
|
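# Editor's sketch (not part of the patch): the stream decodes, normalises
# newlines and tracks (line, col). Assumes no BOM, no meta declaration and
# no chardet override, so the windows-1252 fallback applies.
stream = HTMLInputStream("a\r\nb")
assert stream.char() == u"a"
assert stream.char() == u"\n"
assert stream.position() == (2, 0)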
|||
class EncodingBytes(str): |
|||
"""String-like object with an associated position and various extra methods |
|||
If the position is ever greater than the string length then an exception is |
|||
raised""" |
|||
def __new__(cls, value): |
|||
return str.__new__(cls, value.lower()) |
|||
|
|||
def __init__(self, value): |
|||
self._position=-1 |
|||
|
|||
def __iter__(self): |
|||
return self |
|||
|
|||
def next(self): |
|||
p = self._position = self._position + 1 |
|||
if p >= len(self): |
|||
raise StopIteration |
|||
elif p < 0: |
|||
raise TypeError |
|||
return self[p] |
|||
|
|||
def previous(self): |
|||
p = self._position |
|||
if p >= len(self): |
|||
raise StopIteration |
|||
elif p < 0: |
|||
raise TypeError |
|||
self._position = p = p - 1 |
|||
return self[p] |
|||
|
|||
def setPosition(self, position): |
|||
if self._position >= len(self): |
|||
raise StopIteration |
|||
self._position = position |
|||
|
|||
def getPosition(self): |
|||
if self._position >= len(self): |
|||
raise StopIteration |
|||
if self._position >= 0: |
|||
return self._position |
|||
else: |
|||
return None |
|||
|
|||
position = property(getPosition, setPosition) |
|||
|
|||
def getCurrentByte(self): |
|||
return self[self.position] |
|||
|
|||
currentByte = property(getCurrentByte) |
|||
|
|||
def skip(self, chars=spaceCharactersBytes): |
|||
"""Skip past a list of characters""" |
|||
p = self.position # use property for the error-checking |
|||
while p < len(self): |
|||
c = self[p] |
|||
if c not in chars: |
|||
self._position = p |
|||
return c |
|||
p += 1 |
|||
self._position = p |
|||
return None |
|||
|
|||
def skipUntil(self, chars): |
|||
p = self.position |
|||
while p < len(self): |
|||
c = self[p] |
|||
if c in chars: |
|||
self._position = p |
|||
return c |
|||
p += 1 |
|||
self._position = p |
|||
return None |
|||
|
|||
def matchBytes(self, bytes): |
|||
"""Look for a sequence of bytes at the start of a string. If the bytes |
|||
are found return True and advance the position to the byte after the |
|||
match. Otherwise return False and leave the position alone""" |
|||
p = self.position |
|||
data = self[p:p+len(bytes)] |
|||
rv = data.startswith(bytes) |
|||
if rv: |
|||
self.position += len(bytes) |
|||
return rv |
|||
|
|||
def jumpTo(self, bytes): |
|||
"""Look for the next sequence of bytes matching a given sequence. If |
|||
a match is found advance the position to the last byte of the match""" |
|||
newPosition = self[self.position:].find(bytes) |
|||
if newPosition > -1: |
|||
# XXX: This is ugly, but I can't see a nicer way to fix this. |
|||
if self._position == -1: |
|||
self._position = 0 |
|||
self._position += (newPosition + len(bytes)-1) |
|||
return True |
|||
else: |
|||
raise StopIteration |
|||
|
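# Editor's sketch (not part of the patch): EncodingBytes lower-cases its
# data and exposes the cursor-style helpers the parser below relies on.
eb = EncodingBytes("<MeTa Charset=UTF-8>")
assert eb.next() == "<"
assert eb.matchBytes("<meta")
assert eb.currentByte == " "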
|||
class EncodingParser(object): |
|||
"""Mini parser for detecting character encoding from meta elements""" |
|||
|
|||
def __init__(self, data): |
|||
"""string - the data to work on for encoding detection""" |
|||
self.data = EncodingBytes(data) |
|||
self.encoding = None |
|||
|
|||
def getEncoding(self): |
|||
methodDispatch = ( |
|||
("<!--",self.handleComment), |
|||
("<meta",self.handleMeta), |
|||
("</",self.handlePossibleEndTag), |
|||
("<!",self.handleOther), |
|||
("<?",self.handleOther), |
|||
("<",self.handlePossibleStartTag)) |
|||
for byte in self.data: |
|||
keepParsing = True |
|||
for key, method in methodDispatch: |
|||
if self.data.matchBytes(key): |
|||
try: |
|||
keepParsing = method() |
|||
break |
|||
except StopIteration: |
|||
keepParsing=False |
|||
break |
|||
if not keepParsing: |
|||
break |
|||
|
|||
return self.encoding |
|||
|
|||
def handleComment(self): |
|||
"""Skip over comments""" |
|||
return self.data.jumpTo("-->") |
|||
|
|||
def handleMeta(self): |
|||
if self.data.currentByte not in spaceCharactersBytes: |
|||
#If "<meta" is not followed by a space it is not a meta element; keep going |
|||
return True |
|||
#We have a valid meta element; search it for attributes |
|||
while True: |
|||
#Try to find the next attribute after the current position |
|||
attr = self.getAttribute() |
|||
if attr is None: |
|||
return True |
|||
else: |
|||
if attr[0] == "charset": |
|||
tentativeEncoding = attr[1] |
|||
codec = codecName(tentativeEncoding) |
|||
if codec is not None: |
|||
self.encoding = codec |
|||
return False |
|||
elif attr[0] == "content": |
|||
contentParser = ContentAttrParser(EncodingBytes(attr[1])) |
|||
tentativeEncoding = contentParser.parse() |
|||
codec = codecName(tentativeEncoding) |
|||
if codec is not None: |
|||
self.encoding = codec |
|||
return False |
|||
|
|||
def handlePossibleStartTag(self): |
|||
return self.handlePossibleTag(False) |
|||
|
|||
def handlePossibleEndTag(self): |
|||
self.data.next() |
|||
return self.handlePossibleTag(True) |
|||
|
|||
def handlePossibleTag(self, endTag): |
|||
data = self.data |
|||
if data.currentByte not in asciiLettersBytes: |
|||
#If the next byte is not an ascii letter either ignore this |
|||
#fragment (possible start tag case) or treat it according to |
|||
#handleOther |
|||
if endTag: |
|||
data.previous() |
|||
self.handleOther() |
|||
return True |
|||
|
|||
c = data.skipUntil(spacesAngleBrackets) |
|||
if c == "<": |
|||
#return to the first step in the overall "two step" algorithm |
|||
#reprocessing the < byte |
|||
data.previous() |
|||
else: |
|||
#Read all attributes |
|||
attr = self.getAttribute() |
|||
while attr is not None: |
|||
attr = self.getAttribute() |
|||
return True |
|||
|
|||
def handleOther(self): |
|||
return self.data.jumpTo(">") |
|||
|
|||
def getAttribute(self): |
|||
"""Return a name,value pair for the next attribute in the stream, |
|||
if one is found, or None""" |
|||
data = self.data |
|||
# Step 1 (skip chars) |
|||
c = data.skip(spaceCharactersBytes | frozenset("/")) |
|||
# Step 2 |
|||
if c in (">", None): |
|||
return None |
|||
# Step 3 |
|||
attrName = [] |
|||
attrValue = [] |
|||
#Step 4 attribute name |
|||
while True: |
|||
if c == "=" and attrName: |
|||
break |
|||
elif c in spaceCharactersBytes: |
|||
#Step 6! |
|||
c = data.skip() |
|||
break |
|||
elif c in ("/", ">"): |
|||
return "".join(attrName), "" |
|||
elif c in asciiUppercaseBytes: |
|||
attrName.append(c.lower()) |
|||
elif c is None: |
|||
return None |
|||
else: |
|||
attrName.append(c) |
|||
#Step 5 |
|||
c = data.next() |
|||
#Step 7 |
|||
if c != "=": |
|||
data.previous() |
|||
return "".join(attrName), "" |
|||
#Step 8 |
|||
data.next() |
|||
#Step 9 |
|||
c = data.skip() |
|||
#Step 10 |
|||
if c in ("'", '"'): |
|||
#10.1 |
|||
quoteChar = c |
|||
while True: |
|||
#10.2 |
|||
c = data.next() |
|||
#10.3 |
|||
if c == quoteChar: |
|||
data.next() |
|||
return "".join(attrName), "".join(attrValue) |
|||
#10.4 |
|||
elif c in asciiUppercaseBytes: |
|||
attrValue.append(c.lower()) |
|||
#10.5 |
|||
else: |
|||
attrValue.append(c) |
|||
elif c == ">": |
|||
return "".join(attrName), "" |
|||
elif c in asciiUppercaseBytes: |
|||
attrValue.append(c.lower()) |
|||
elif c is None: |
|||
return None |
|||
else: |
|||
attrValue.append(c) |
|||
# Step 11 |
|||
while True: |
|||
c = data.next() |
|||
if c in spacesAngleBrackets: |
|||
return "".join(attrName), "".join(attrValue) |
|||
elif c in asciiUppercaseBytes: |
|||
attrValue.append(c.lower()) |
|||
elif c is None: |
|||
return None |
|||
else: |
|||
attrValue.append(c) |
|||
|
|||
|
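# Editor's sketch (not part of the patch): the mini parser only needs to
# find a charset before the real parse starts (codecName is defined at
# the bottom of this module).
parser = EncodingParser('<head><meta charset="UTF-8"></head>')
assert parser.getEncoding() == codecName("utf-8")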
|||
class ContentAttrParser(object): |
|||
def __init__(self, data): |
|||
self.data = data |
|||
def parse(self): |
|||
try: |
|||
#Check if the attr name is charset |
|||
#otherwise return |
|||
self.data.jumpTo("charset") |
|||
self.data.position += 1 |
|||
self.data.skip() |
|||
if self.data.currentByte != "=": |
|||
#If there is no = sign keep looking for attrs |
|||
return None |
|||
self.data.position += 1 |
|||
self.data.skip() |
|||
#Look for an encoding between matching quote marks |
|||
if self.data.currentByte in ('"', "'"): |
|||
quoteMark = self.data.currentByte |
|||
self.data.position += 1 |
|||
oldPosition = self.data.position |
|||
if self.data.jumpTo(quoteMark): |
|||
return self.data[oldPosition:self.data.position] |
|||
else: |
|||
return None |
|||
else: |
|||
#Unquoted value |
|||
oldPosition = self.data.position |
|||
try: |
|||
self.data.skipUntil(spaceCharactersBytes) |
|||
return self.data[oldPosition:self.data.position] |
|||
except StopIteration: |
|||
#Return the whole remaining value |
|||
return self.data[oldPosition:] |
|||
except StopIteration: |
|||
return None |
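# Editor's sketch (not part of the patch): ContentAttrParser digs the
# encoding out of a meta content attribute value.
p = ContentAttrParser(EncodingBytes('text/html; charset="ISO-8859-1"'))
assert p.parse() == "iso-8859-1"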
|||
|
|||
|
|||
def codecName(encoding): |
|||
"""Return the python codec name corresponding to an encoding or None if the |
|||
string doesn't correspond to a valid encoding.""" |
|||
if encoding is not None and isinstance(encoding, types.StringTypes): |
|||
canonicalName = ascii_punctuation_re.sub("", encoding).lower() |
|||
return encodings.get(canonicalName, None) |
|||
else: |
|||
return None |
@ -0,0 +1,258 @@ |
|||
import re |
|||
from xml.sax.saxutils import escape, unescape |
|||
|
|||
from tokenizer import HTMLTokenizer |
|||
from constants import tokenTypes |
|||
|
|||
class HTMLSanitizerMixin(object): |
|||
""" sanitization of XHTML+MathML+SVG and of inline style attributes.""" |
|||
|
|||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', |
|||
'article', 'aside', 'audio', 'b', 'big', 'blockquote', 'br', 'button', |
|||
'canvas', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', |
|||
'command', 'datagrid', 'datalist', 'dd', 'del', 'details', 'dfn', |
|||
'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source', 'fieldset', |
|||
'figcaption', 'figure', 'footer', 'font', 'form', 'header', 'h1', |
|||
'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input', 'ins', |
|||
'keygen', 'kbd', 'label', 'legend', 'li', 'm', 'map', 'menu', 'meter', |
|||
'multicol', 'nav', 'nextid', 'ol', 'output', 'optgroup', 'option', |
|||
'p', 'pre', 'progress', 'q', 's', 'samp', 'section', 'select', |
|||
'small', 'sound', 'source', 'spacer', 'span', 'strike', 'strong', |
|||
'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'time', 'tfoot', |
|||
'th', 'thead', 'tr', 'tt', 'u', 'ul', 'var', 'video'] |
|||
|
|||
mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi', |
|||
'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom', |
|||
'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', |
|||
'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', |
|||
'munderover', 'none'] |
|||
|
|||
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', |
|||
'animateTransform', 'clipPath', 'circle', 'defs', 'desc', 'ellipse', |
|||
'font-face', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', |
|||
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', |
|||
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', |
|||
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] |
|||
|
|||
acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey', |
|||
'action', 'align', 'alt', 'autocomplete', 'autofocus', 'axis', |
|||
'background', 'balance', 'bgcolor', 'bgproperties', 'border', |
|||
'bordercolor', 'bordercolordark', 'bordercolorlight', 'bottompadding', |
|||
'cellpadding', 'cellspacing', 'ch', 'challenge', 'char', 'charoff', |
|||
'choff', 'charset', 'checked', 'cite', 'class', 'clear', 'color', |
|||
'cols', 'colspan', 'compact', 'contenteditable', 'controls', 'coords', |
|||
'data', 'datafld', 'datapagesize', 'datasrc', 'datetime', 'default', |
|||
'delay', 'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end', |
|||
'face', 'for', 'form', 'frame', 'galleryimg', 'gutter', 'headers', |
|||
'height', 'hidefocus', 'hidden', 'high', 'href', 'hreflang', 'hspace', |
|||
'icon', 'id', 'inputmode', 'ismap', 'keytype', 'label', 'leftspacing', |
|||
'lang', 'list', 'longdesc', 'loop', 'loopcount', 'loopend', |
|||
'loopstart', 'low', 'lowsrc', 'max', 'maxlength', 'media', 'method', |
|||
'min', 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'open', |
|||
'optimum', 'pattern', 'ping', 'point-size', 'prompt', 'pqg', |
|||
'radiogroup', 'readonly', 'rel', 'repeat-max', 'repeat-min', |
|||
'replace', 'required', 'rev', 'rightspacing', 'rows', 'rowspan', |
|||
'rules', 'scope', 'selected', 'shape', 'size', 'span', 'src', 'start', |
|||
'step', 'style', 'summary', 'suppress', 'tabindex', 'target', |
|||
'template', 'title', 'toppadding', 'type', 'unselectable', 'usemap', |
|||
'urn', 'valign', 'value', 'variable', 'volume', 'vspace', 'vrml', |
|||
'width', 'wrap', 'xml:lang'] |
|||
|
|||
mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign', |
|||
'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth', |
|||
'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence', |
|||
'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace', |
|||
'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize', |
|||
'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines', |
|||
'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection', |
|||
'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show', |
|||
'xlink:type', 'xmlns', 'xmlns:xlink'] |
|||
|
|||
svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic', |
|||
'arabic-form', 'ascent', 'attributeName', 'attributeType', |
|||
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', |
|||
'class', 'clip-path', 'color', 'color-rendering', 'content', 'cx', |
|||
'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end', 'fill', |
|||
'fill-opacity', 'fill-rule', 'font-family', 'font-size', |
|||
'font-stretch', 'font-style', 'font-variant', 'font-weight', 'from', |
|||
'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits', 'hanging', |
|||
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k', |
|||
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'marker-end', |
|||
'marker-mid', 'marker-start', 'markerHeight', 'markerUnits', |
|||
'markerWidth', 'mathematical', 'max', 'min', 'name', 'offset', |
|||
'opacity', 'orient', 'origin', 'overline-position', |
|||
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points', |
|||
'preserveAspectRatio', 'r', 'refX', 'refY', 'repeatCount', |
|||
'repeatDur', 'requiredExtensions', 'requiredFeatures', 'restart', |
|||
'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv', 'stop-color', |
|||
'stop-opacity', 'strikethrough-position', 'strikethrough-thickness', |
|||
'stroke', 'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap', |
|||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity', |
|||
'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to', |
|||
'transform', 'type', 'u1', 'u2', 'underline-position', |
|||
'underline-thickness', 'unicode', 'unicode-range', 'units-per-em', |
|||
'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x', |
|||
'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole', |
|||
'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', |
|||
'xml:base', 'xml:lang', 'xml:space', 'xmlns', 'xmlns:xlink', 'y', |
|||
'y1', 'y2', 'zoomAndPan'] |
|||
|
|||
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', |
|||
'xlink:href', 'xml:base'] |
|||
|
|||
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill', |
|||
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', |
|||
'mask', 'stroke'] |
|||
|
|||
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', |
|||
'animateMotion', 'animateTransform', 'cursor', 'feImage', 'filter', |
|||
'linearGradient', 'pattern', 'radialGradient', 'textpath', 'tref', |
|||
'set', 'use'] |
|||
|
|||
acceptable_css_properties = ['azimuth', 'background-color', |
|||
'border-bottom-color', 'border-collapse', 'border-color', |
|||
'border-left-color', 'border-right-color', 'border-top-color', 'clear', |
|||
'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font', |
|||
'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight', |
|||
'height', 'letter-spacing', 'line-height', 'overflow', 'pause', |
|||
'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness', |
|||
'speak', 'speak-header', 'speak-numeral', 'speak-punctuation', |
|||
'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent', |
|||
'unicode-bidi', 'vertical-align', 'voice-family', 'volume', |
|||
'white-space', 'width'] |
|||
|
|||
acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue', |
|||
'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed', |
|||
'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left', |
|||
'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive', |
|||
'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top', |
|||
'transparent', 'underline', 'white', 'yellow'] |
|||
|
|||
acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule', |
|||
'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin', |
|||
'stroke-opacity'] |
|||
|
|||
acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc', |
|||
'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal', |
|||
'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag', |
|||
'ssh', 'sftp', 'rtsp', 'afs' ] |
|||
|
|||
# subclasses may define their own versions of these constants |
|||
allowed_elements = acceptable_elements + mathml_elements + svg_elements |
|||
allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes |
|||
allowed_css_properties = acceptable_css_properties |
|||
allowed_css_keywords = acceptable_css_keywords |
|||
allowed_svg_properties = acceptable_svg_properties |
|||
allowed_protocols = acceptable_protocols |
|||
|
|||
# Sanitize the html, escaping all elements not in ALLOWED_ELEMENTS, and |
|||
# stripping out all attributes not in ALLOWED_ATTRIBUTES. Style |
|||
# attributes are parsed, and a restricted set, specified by |
|||
# ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, is allowed through. |
|||
# Attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified |
|||
# in ALLOWED_PROTOCOLS are allowed. |
|||
# |
|||
# sanitize_html('<script> do_nasty_stuff() </script>') |
|||
# => &lt;script&gt; do_nasty_stuff() &lt;/script&gt; |
|||
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>') |
|||
# => <a>Click here for $100</a> |
|||
def sanitize_token(self, token): |
|||
|
|||
# accommodate filters which use token_type differently |
|||
token_type = token["type"] |
|||
if token_type in tokenTypes.keys(): |
|||
token_type = tokenTypes[token_type] |
|||
|
|||
if token_type in (tokenTypes["StartTag"], tokenTypes["EndTag"], |
|||
tokenTypes["EmptyTag"]): |
|||
if token["name"] in self.allowed_elements: |
|||
if token.has_key("data"): |
|||
attrs = dict([(name,val) for name,val in |
|||
token["data"][::-1] |
|||
if name in self.allowed_attributes]) |
|||
for attr in self.attr_val_is_uri: |
|||
if not attrs.has_key(attr): |
|||
continue |
|||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', |
|||
unescape(attrs[attr])).lower() |
|||
#remove replacement characters from unescaped characters |
|||
val_unescaped = val_unescaped.replace(u"\ufffd", "") |
|||
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and |
|||
(val_unescaped.split(':')[0] not in |
|||
self.allowed_protocols)): |
|||
del attrs[attr] |
|||
for attr in self.svg_attr_val_allows_ref: |
|||
if attr in attrs: |
|||
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)', |
|||
' ', |
|||
unescape(attrs[attr])) |
|||
if (token["name"] in self.svg_allow_local_href and |
|||
'xlink:href' in attrs and re.search('^\s*[^#\s].*', |
|||
attrs['xlink:href'])): |
|||
del attrs['xlink:href'] |
|||
if attrs.has_key('style'): |
|||
attrs['style'] = self.sanitize_css(attrs['style']) |
|||
token["data"] = [[name,val] for name,val in attrs.items()] |
|||
return token |
|||
else: |
|||
if token_type == tokenTypes["EndTag"]: |
|||
token["data"] = "</%s>" % token["name"] |
|||
elif token["data"]: |
|||
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]]) |
|||
token["data"] = "<%s%s>" % (token["name"],attrs) |
|||
else: |
|||
token["data"] = "<%s>" % token["name"] |
|||
if token.get("selfClosing"): |
|||
token["data"]=token["data"][:-1] + "/>" |
|||
|
|||
if token["type"] in tokenTypes.keys(): |
|||
token["type"] = "Characters" |
|||
else: |
|||
token["type"] = tokenTypes["Characters"] |
|||
|
|||
del token["name"] |
|||
return token |
|||
elif token_type == tokenTypes["Comment"]: |
|||
pass |
|||
else: |
|||
return token |
|||
|
|||
def sanitize_css(self, style): |
|||
# disallow urls |
|||
style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style) |
|||
|
|||
# gauntlet |
|||
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' |
|||
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return '' |
|||
|
|||
clean = [] |
|||
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): |
|||
if not value: continue |
|||
if prop.lower() in self.allowed_css_properties: |
|||
clean.append(prop + ': ' + value + ';') |
|||
elif prop.split('-')[0].lower() in ['background','border','margin', |
|||
'padding']: |
|||
for keyword in value.split(): |
|||
if not keyword in self.acceptable_css_keywords and \ |
|||
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword): |
|||
break |
|||
else: |
|||
clean.append(prop + ': ' + value + ';') |
|||
elif prop.lower() in self.allowed_svg_properties: |
|||
clean.append(prop + ': ' + value + ';') |
|||
|
|||
return ' '.join(clean) |
|||
|
|||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin): |
|||
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True, |
|||
lowercaseElementName=False, lowercaseAttrName=False, parser=None): |
|||
#Change case matching defaults as we only output lowercase html anyway |
|||
#This solution doesn't seem ideal... |
|||
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet, |
|||
lowercaseElementName, lowercaseAttrName, parser=parser) |
|||
|
|||
def __iter__(self): |
|||
for token in HTMLTokenizer.__iter__(self): |
|||
token = self.sanitize_token(token) |
|||
if token: |
|||
yield token |
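# Editor's sketch (not part of the patch): tokenizing untrusted markup
# through the sanitizer keeps the <a> element but drops its javascript:
# href (serializing the resulting tokens is the caller's job).
tokens = list(HTMLSanitizer('<a href="javascript:evil()">hi</a>'))
assert any(t.get("name") == "a" for t in tokens)
assert not any("javascript" in repr(t) for t in tokens)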
@ -0,0 +1,17 @@ |
|||
|
|||
from html5lib import treewalkers |
|||
|
|||
from htmlserializer import HTMLSerializer |
|||
from xhtmlserializer import XHTMLSerializer |
|||
|
|||
def serialize(input, tree="simpletree", format="html", encoding=None, |
|||
**serializer_opts): |
|||
# XXX: Should we cache this? |
|||
walker = treewalkers.getTreeWalker(tree) |
|||
if format == "html": |
|||
s = HTMLSerializer(**serializer_opts) |
|||
elif format == "xhtml": |
|||
s = XHTMLSerializer(**serializer_opts) |
|||
else: |
|||
raise ValueError, "type must be either html or xhtml" |
|||
return s.render(walker(input), encoding) |
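# Editor's sketch (not part of the patch): one-call serialization via the
# helper above. Tree names depend on the html5lib version vendored here.
import html5lib

doc = html5lib.parse("<p>one<p>two")
print serialize(doc, tree="simpletree", format="html")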
@ -0,0 +1,312 @@ |
|||
try: |
|||
frozenset |
|||
except NameError: |
|||
# Import from the sets module for python 2.3 |
|||
from sets import ImmutableSet as frozenset |
|||
|
|||
import gettext |
|||
_ = gettext.gettext |
|||
|
|||
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters |
|||
from html5lib.constants import rcdataElements, entities, xmlEntities |
|||
from html5lib import utils |
|||
from xml.sax.saxutils import escape |
|||
|
|||
spaceCharacters = u"".join(spaceCharacters) |
|||
|
|||
try: |
|||
from codecs import register_error, xmlcharrefreplace_errors |
|||
except ImportError: |
|||
unicode_encode_errors = "strict" |
|||
else: |
|||
unicode_encode_errors = "htmlentityreplace" |
|||
|
|||
|
|||
encode_entity_map = {} |
|||
is_ucs4 = len(u"\U0010FFFF") == 1 |
|||
for k, v in entities.items(): |
|||
#skip multi-character entities |
|||
if ((is_ucs4 and len(v) > 1) or |
|||
(not is_ucs4 and len(v) > 2)): |
|||
continue |
|||
if v != "&": |
|||
if len(v) == 2: |
|||
v = utils.surrogatePairToCodepoint(v) |
|||
else: |
|||
try: |
|||
v = ord(v) |
|||
except: |
|||
print v |
|||
raise |
|||
if not v in encode_entity_map or k.islower(): |
|||
# prefer < over < and similarly for &, >, etc. |
|||
encode_entity_map[v] = k |
|||
|
|||
def htmlentityreplace_errors(exc): |
|||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): |
|||
res = [] |
|||
codepoints = [] |
|||
skip = False |
|||
for i, c in enumerate(exc.object[exc.start:exc.end]): |
|||
if skip: |
|||
skip = False |
|||
continue |
|||
index = i + exc.start |
|||
if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]): |
|||
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2]) |
|||
skip = True |
|||
else: |
|||
codepoint = ord(c) |
|||
codepoints.append(codepoint) |
|||
for cp in codepoints: |
|||
e = encode_entity_map.get(cp) |
|||
if e: |
|||
res.append("&") |
|||
res.append(e) |
|||
if not e.endswith(";"): |
|||
res.append(";") |
|||
else: |
|||
res.append("&#x%s;"%(hex(cp)[2:])) |
|||
return (u"".join(res), exc.end) |
|||
else: |
|||
return xmlcharrefreplace_errors(exc) |
|||
|
|||
register_error(unicode_encode_errors, htmlentityreplace_errors) |
|||
|
|||
del register_error |
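# Editor's sketch (not part of the patch): with the error handler
# registered above (i.e. codecs.register_error was importable),
# unencodable characters degrade to named entities instead of raising.
assert u"caf\xe9".encode("ascii", unicode_encode_errors) == "caf&eacute;"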


class HTMLSerializer(object):

    # attribute quoting options
    quote_attr_values = False
    quote_char = u'"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "minimize_boolean_attributes", "use_trailing_solidus",
               "space_before_trailing_solidus", "omit_optional_tags",
               "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
               "escape_rcdata", "resolve_entities", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer.

        Keyword options (default given first unless specified) include:

        inject_meta_charset=True|False
          Whether or not to insert a meta element to define the character
          set of the document.
        quote_attr_values=True|False
          Whether to quote attribute values that don't require quoting
          per HTML5 parsing rules.
        quote_char=u'"'|u"'"
          Use given quote character for attribute quoting. Default is to
          use double quote unless attribute value contains a double quote,
          in which case single quotes are used instead.
        escape_lt_in_attrs=False|True
          Whether to escape < in attribute values.
        escape_rcdata=False|True
          Whether to escape characters that need to be escaped within normal
          elements within rcdata elements such as style.
        resolve_entities=True|False
          Whether to resolve named character entities that appear in the
          source tree. The XML predefined entities &lt; &gt; &amp; &quot; &apos;
          are unaffected by this setting.
        strip_whitespace=False|True
          Whether to remove semantically meaningless whitespace. (This
          compresses all whitespace to a single space except within pre.)
        minimize_boolean_attributes=True|False
          Shortens boolean attributes to give just the attribute value,
          for example <input disabled="disabled"> becomes <input disabled>.
        use_trailing_solidus=False|True
          Includes a close-tag slash at the end of the start tag of void
          elements (empty elements whose end tag is forbidden). E.g. <hr/>.
        space_before_trailing_solidus=True|False
          Places a space immediately before the closing slash in a tag
          using a trailing solidus. E.g. <hr />. Requires use_trailing_solidus.
        sanitize=False|True
          Strip all unsafe or unknown constructs from output.
          See `html5lib user documentation`_
        omit_optional_tags=True|False
          Omit start/end tags that are optional.

        .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation
        """
        if kwargs.has_key('quote_char'):
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False
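
A short sketch of constructing the class with non-default options. Every keyword must be one of the names in the options tuple above; keywords outside that tuple are silently ignored by this __init__:

s = HTMLSerializer(omit_optional_tags=False, quote_attr_values=True)
# use_best_quote_char stays True because quote_char was not passed.
assert s.omit_optional_tags is False
assert s.use_best_quote_char is True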

    def encode(self, string):
        assert(isinstance(string, unicode))
        if self.encoding:
            return string.encode(self.encoding, unicode_encode_errors)
        else:
            return string

    def encodeStrict(self, string):
        assert(isinstance(string, unicode))
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        self.encoding = encoding
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            from html5lib.filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from html5lib.filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from html5lib.filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from html5lib.filters.optionaltags import Filter
            treewalker = Filter(treewalker)
        for token in treewalker:
            type = token["type"]
            if type == "Doctype":
                doctype = u"<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += u' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += u" SYSTEM"
                if token["systemId"]:
                    if token["systemId"].find(u'"') >= 0:
                        if token["systemId"].find(u"'") >= 0:
                            self.serializeError(_("System identifier contains both single and double quote characters"))
                        quote_char = u"'"
                    else:
                        quote_char = u'"'
                    doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += u">"
                yield self.encodeStrict(doctype)

            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict(u"<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                attributes = []
                for (attr_namespace, attr_name), attr_value in sorted(token["data"].items()):
                    #TODO: Add namespace support here
                    k = attr_name
                    v = attr_value
                    yield self.encodeStrict(u' ')

                    yield self.encodeStrict(k)
                    if not self.minimize_boolean_attributes or \
                      (k not in booleanAttributes.get(name, tuple()) \
                       and k not in booleanAttributes.get("", tuple())):
                        yield self.encodeStrict(u"=")
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = reduce(lambda x, y: x or (y in v),
                                                spaceCharacters + u">\"'=", False)
                        v = v.replace(u"&", u"&amp;")
                        if self.escape_lt_in_attrs: v = v.replace(u"<", u"&lt;")
                        if quote_attr:
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if u"'" in v and u'"' not in v:
                                    quote_char = u'"'
                                elif u'"' in v and u"'" not in v:
                                    quote_char = u"'"
                            if quote_char == u"'":
                                v = v.replace(u"'", u"&#39;")
                            else:
                                v = v.replace(u'"', u"&quot;")
                            yield self.encodeStrict(quote_char)
                            yield self.encode(v)
                            yield self.encodeStrict(quote_char)
                        else:
                            yield self.encode(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(u" /")
                    else:
                        yield self.encodeStrict(u"/")
                yield self.encode(u">")

            elif type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                yield self.encodeStrict(u"</%s>" % name)

            elif type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                yield self.encodeStrict(u"<!--%s-->" % token["data"])

            elif type == "Entity":
                name = token["name"]
                key = name + ";"
                if not key in entities:
                    self.serializeError(_("Entity %s not recognized" % name))
                if self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = u"&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        if encoding:
            return "".join(list(self.serialize(treewalker, encoding)))
        else:
            return u"".join(list(self.serialize(treewalker)))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError


class SerializeError(Exception):
    """Error in serialized tree"""
    pass
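
An end-to-end sketch tying render() to a tree walker; the import paths and the exact output are assumptions based on this vendored copy:

import html5lib
from html5lib import treewalkers
from html5lib.serializer.htmlserializer import HTMLSerializer  # path assumed

doc = html5lib.parse('<input disabled="disabled">')
walker = treewalkers.getTreeWalker("simpletree")
s = HTMLSerializer(minimize_boolean_attributes=True)
output = s.render(walker(doc))
# With omit_optional_tags left at its default, output should be the
# unicode string u'<input disabled>'.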
@ -0,0 +1,9 @@
from htmlserializer import HTMLSerializer

class XHTMLSerializer(HTMLSerializer):
    quote_attr_values = True
    minimize_boolean_attributes = False
    use_trailing_solidus = True
    escape_lt_in_attrs = True
    omit_optional_tags = False
    escape_rcdata = True
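
A sketch of what these overrides buy relative to HTMLSerializer (module path is an assumption):

from html5lib.serializer.xhtmlserializer import XHTMLSerializer  # path assumed

s = XHTMLSerializer()
# quote_attr_values and use_trailing_solidus mean a void element now
# serializes as <br /> rather than <br>, and minimize_boolean_attributes
# being off keeps disabled="disabled" instead of a bare disabled.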
File diff suppressed because it is too large
@ -0,0 +1,96 @@
"""A collection of modules for building different kinds of tree from
HTML documents.

To create a treebuilder for a new type of tree, you need to
implement several things:

1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.

2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree

3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing the node and its children serialized
according to the format used in the unittests

The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""

treeBuilderCache = {}

import sys

def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are "simpletree", "dom", "etree" and "beautifulsoup"

               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
               "dom" - A generic builder for DOM implementations, defaulting to
                       a xml.dom.minidom based implementation for the sake of
                       backwards compatibility (as releases up until 0.10 had a
                       builder called "dom" that was a minidom implementation).
               "etree" - A generic builder for tree implementations exposing an
                         elementtree-like interface (known to work with
                         ElementTree, cElementTree and lxml.etree).
               "beautifulsoup" - Beautiful soup (if installed)

    implementation - (Currently applies to the "etree" and "dom" tree types). A
                     module implementing the tree type e.g.
                     xml.etree.ElementTree or lxml.etree."""

    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType == "dom":
            import dom
            # XXX: Keep backwards compatibility by using minidom if no implementation is given
            if implementation is None:
                from xml.dom import minidom
                implementation = minidom
            # XXX: NEVER cache here, caching is done in the dom submodule
            return dom.getDomModule(implementation, **kwargs).TreeBuilder
        elif treeType == "simpletree":
            import simpletree
            treeBuilderCache[treeType] = simpletree.TreeBuilder
        elif treeType == "beautifulsoup":
            import soup
            treeBuilderCache[treeType] = soup.TreeBuilder
        elif treeType == "lxml":
            import etree_lxml
            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
        elif treeType == "etree":
            # Come up with a sane default
            if implementation is None:
                try:
                    import xml.etree.cElementTree as ET
                except ImportError:
                    try:
                        import xml.etree.ElementTree as ET
                    except ImportError:
                        try:
                            import cElementTree as ET
                        except ImportError:
                            import elementtree.ElementTree as ET
                implementation = ET
            import etree
            # NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
        else:
            raise ValueError("""Unrecognised treebuilder "%s" """ % treeType)
    return treeBuilderCache.get(treeType)
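
A usage sketch, assuming this package is importable as html5lib.treebuilders and some ElementTree implementation is installed:

from html5lib import treebuilders  # path assumed

TreeBuilderClass = treebuilders.getTreeBuilder("etree")
builder = TreeBuilderClass(namespaceHTMLElements=False)
# Normally html5lib.HTMLParser(tree=TreeBuilderClass) constructs this
# for you; direct instantiation is mainly useful for testing.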
@ -0,0 +1,377 @@
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset

# The scope markers are inserted when entering object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, object elements, and marquees.
Marker = None

class Node(object):
    def __init__(self, name):
        """Node representing an item in the tree.
        name - The tag name associated with the node
        parent - The parent of the current node (or None for the document node)
        value - The value of the current node (applies to text nodes and
                comments)
        attributes - a dict holding name, value pairs for attributes of the node
        childNodes - a list of child nodes of the current node. This must
                     include all elements but not necessarily other node types
        _flags - A list of miscellaneous flags that can be set on the node
        """
        self.name = name
        self.parent = None
        self.value = None
        self.attributes = {}
        self.childNodes = []
        self._flags = []

    def __unicode__(self):
        attributesStr = " ".join(["%s=\"%s\"" % (name, value)
                                  for name, value in
                                  self.attributes.iteritems()])
        if attributesStr:
            return "<%s %s>" % (self.name, attributesStr)
        else:
            return "<%s>" % (self.name)

    def __repr__(self):
        return "<%s>" % (self.name)

    def appendChild(self, node):
        """Insert node as a child of the current node
        """
        raise NotImplementedError

    def insertText(self, data, insertBefore=None):
        """Insert data as text in the current node, positioned before the
        start of node insertBefore or to the end of the node's text.
        """
        raise NotImplementedError

    def insertBefore(self, node, refNode):
        """Insert node as a child of the current node, before refNode in the
        list of child nodes. Raises ValueError if refNode is not a child of
        the current node"""
        raise NotImplementedError

    def removeChild(self, node):
        """Remove node from the children of the current node
        """
        raise NotImplementedError

    def reparentChildren(self, newParent):
        """Move all the children of the current node to newParent.
        This is needed so that trees that don't store text as nodes move the
        text in the correct way
        """
        #XXX - should this method be made more general?
        for child in self.childNodes:
            newParent.appendChild(child)
        self.childNodes = []

    def cloneNode(self):
        """Return a shallow copy of the current node i.e. a node with the same
        name and attributes but with no parent or child nodes
        """
        raise NotImplementedError


    def hasContent(self):
        """Return true if the node has children or text, false otherwise
        """
        raise NotImplementedError

class ActiveFormattingElements(list):
    def append(self, node):
        equalCount = 0
        if node != Marker:
            for element in self[::-1]:
                if element == Marker:
                    break
                if self.nodesEqual(element, node):
                    equalCount += 1
                if equalCount == 3:
                    self.remove(element)
                    break
        list.append(self, node)

    def nodesEqual(self, node1, node2):
        if not node1.nameTuple == node2.nameTuple:
            return False

        if not node1.attributes == node2.attributes:
            return False

        return True
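
The append() above implements the specification's limit of three matching entries between markers. A sketch exercising it with stand-in objects that carry only the two compared fields:

class Fake(object):
    def __init__(self):
        self.nameTuple = ("http://www.w3.org/1999/xhtml", "b")
        self.attributes = {}

afe = ActiveFormattingElements()
for _ in range(4):
    afe.append(Fake())
# The fourth equal entry evicted the oldest of the previous three.
assert len(afe) == 3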

class TreeBuilder(object):
    """Base treebuilder implementation
    documentClass - the class to use for the bottommost node of a document
    elementClass - the class to use for HTML Elements
    commentClass - the class to use for comments
    doctypeClass - the class to use for doctypes
    """

    #Document class
    documentClass = None

    #The class to use for creating a node
    elementClass = None

    #The class to use for creating comments
    commentClass = None

    #The class to use for creating doctypes
    doctypeClass = None

    #Fragment class
    fragmentClass = None

    def __init__(self, namespaceHTMLElements):
        if namespaceHTMLElements:
            self.defaultNamespace = "http://www.w3.org/1999/xhtml"
        else:
            self.defaultNamespace = None
        self.reset()

    def reset(self):
        self.openElements = []
        self.activeFormattingElements = ActiveFormattingElements()

        #XXX - rename these to headElement, formElement
        self.headPointer = None
        self.formPointer = None

        self.insertFromTable = False

        self.document = self.documentClass()

    def elementInScope(self, target, variant=None):

        #If we pass a node in we match that. if we pass a string
        #match any node with that name
        exactNode = hasattr(target, "nameTuple")

        listElementsMap = {
            None: (scopingElements, False),
            "button": (scopingElements | set([(namespaces["html"], "button")]), False),
            "list": (scopingElements | set([(namespaces["html"], "ol"),
                                            (namespaces["html"], "ul")]), False),
            "table": (set([(namespaces["html"], "html"),
                           (namespaces["html"], "table")]), False),
            "select": (set([(namespaces["html"], "optgroup"),
                            (namespaces["html"], "option")]), True)
            }
        listElements, invert = listElementsMap[variant]

        for node in reversed(self.openElements):
            if (node.name == target and not exactNode or
                node == target and exactNode):
                return True
            elif (invert ^ (node.nameTuple in listElements)):
                return False

        assert False # We should never reach this point

    def reconstructActiveFormattingElements(self):
        # Within this algorithm the order of steps described in the
        # specification is not quite the same as the order of steps in the
        # code. It should still do the same though.

        # Step 1: stop the algorithm when there's nothing to do.
        if not self.activeFormattingElements:
            return

        # Step 2 and step 3: we start with the last element. So i is -1.
        i = len(self.activeFormattingElements) - 1
        entry = self.activeFormattingElements[i]
        if entry == Marker or entry in self.openElements:
            return

        # Step 6
        while entry != Marker and entry not in self.openElements:
            if i == 0:
                #This will be reset to 0 below
                i = -1
                break
            i -= 1
            # Step 5: let entry be one earlier in the list.
            entry = self.activeFormattingElements[i]

        while True:
            # Step 7
            i += 1

            # Step 8
            entry = self.activeFormattingElements[i]
            clone = entry.cloneNode() #Mainly to get a new copy of the attributes

            # Step 9
            element = self.insertElement({"type": "StartTag",
                                          "name": clone.name,
                                          "namespace": clone.namespace,
                                          "data": clone.attributes})

            # Step 10
            self.activeFormattingElements[i] = element

            # Step 11
            if element == self.activeFormattingElements[-1]:
                break

    def clearActiveFormattingElements(self):
        entry = self.activeFormattingElements.pop()
        while self.activeFormattingElements and entry != Marker:
            entry = self.activeFormattingElements.pop()

    def elementInActiveFormattingElements(self, name):
        """Check if an element exists between the end of the active
        formatting elements and the last marker. If it does, return it, else
        return false"""

        for item in self.activeFormattingElements[::-1]:
            # Check for Marker first because if it's a Marker it doesn't have a
            # name attribute.
            if item == Marker:
                break
            elif item.name == name:
                return item
        return False

    def insertRoot(self, token):
        element = self.createElement(token)
        self.openElements.append(element)
        self.document.appendChild(element)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = self.doctypeClass(name, publicId, systemId)
        self.document.appendChild(doctype)

    def insertComment(self, token, parent=None):
        if parent is None:
            parent = self.openElements[-1]
        parent.appendChild(self.commentClass(token["data"]))

    def createElement(self, token):
        """Create an element but don't insert it anywhere"""
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        element = self.elementClass(name, namespace)
        element.attributes = token["data"]
        return element

    def _getInsertFromTable(self):
        return self._insertFromTable

    def _setInsertFromTable(self, value):
        """Switch the function used to insert an element from the
        normal one to the misnested table one and back again"""
        self._insertFromTable = value
        if value:
            self.insertElement = self.insertElementTable
        else:
            self.insertElement = self.insertElementNormal

    insertFromTable = property(_getInsertFromTable, _setInsertFromTable)

    def insertElementNormal(self, token):
        name = token["name"]
        assert type(name) == unicode, "Element %s not unicode" % name
        namespace = token.get("namespace", self.defaultNamespace)
        element = self.elementClass(name, namespace)
        element.attributes = token["data"]
        self.openElements[-1].appendChild(element)
        self.openElements.append(element)
        return element

    def insertElementTable(self, token):
        """Create an element and insert it into the tree"""
        element = self.createElement(token)
        if self.openElements[-1].name not in tableInsertModeElements:
            return self.insertElementNormal(token)
        else:
            #We should be in the InTable mode. This means we want to do
            #special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            if insertBefore is None:
                parent.appendChild(element)
            else:
                parent.insertBefore(element, insertBefore)
            self.openElements.append(element)
        return element

    def insertText(self, data, parent=None):
        """Insert text data."""
        if parent is None:
            parent = self.openElements[-1]

        if (not self.insertFromTable or (self.insertFromTable and
                                         self.openElements[-1].name
                                         not in tableInsertModeElements)):
            parent.insertText(data)
        else:
            # We should be in the InTable mode. This means we want to do
            # special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            parent.insertText(data, insertBefore)

    def getTableMisnestedNodePosition(self):
        """Get the foster parent element, and sibling to insert before
        (or None) when inserting a misnested table node"""
        # The foster parent element is the one which comes before the most
        # recently opened table element
        # XXX - this is really inelegant
        lastTable = None
        fosterParent = None
        insertBefore = None
        for elm in self.openElements[::-1]:
            if elm.name == "table":
                lastTable = elm
                break
        if lastTable:
            # XXX - we should really check that this parent is actually a
            # node here
            if lastTable.parent:
                fosterParent = lastTable.parent
                insertBefore = lastTable
            else:
                fosterParent = self.openElements[
                    self.openElements.index(lastTable) - 1]
        else:
            fosterParent = self.openElements[0]
        return fosterParent, insertBefore

    def generateImpliedEndTags(self, exclude=None):
        name = self.openElements[-1].name
        # XXX td, th and tr are not actually needed
        if (name in frozenset(("dd", "dt", "li", "option", "optgroup", "p", "rp", "rt"))
            and name != exclude):
            self.openElements.pop()
            # XXX This is not entirely what the specification says. We should
            # investigate it more closely.
            self.generateImpliedEndTags(exclude)

    def getDocument(self):
        "Return the final tree"
        return self.document

    def getFragment(self):
        "Return the final fragment"
        #assert self.innerHTML
        fragment = self.fragmentClass()
        self.openElements[0].reparentChildren(fragment)
        return fragment

    def testSerializer(self, node):
        """Serialize the subtree of node in the format required by unit tests
        node - the node from which to start serializing"""
        raise NotImplementedError
@ -0,0 +1,291 @@

from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
try:
    from types import ModuleType
except ImportError:
    from new import module as ModuleType
import re
import weakref

import _base
from html5lib import constants, ihatexml
from html5lib.constants import namespaces

moduleCache = {}

def getDomModule(DomImplementation):
    name = "_" + DomImplementation.__name__ + "builder"
    if name in moduleCache:
        return moduleCache[name]
    else:
        mod = ModuleType(name)
        objs = getDomBuilder(DomImplementation)
        mod.__dict__.update(objs)
        moduleCache[name] = mod
        return mod

def getDomBuilder(DomImplementation):
    Dom = DomImplementation
    class AttrList(object):
        def __init__(self, element):
            self.element = element
        def __iter__(self):
            return self.element.attributes.items().__iter__()
        def __setitem__(self, name, value):
            self.element.setAttribute(name, value)
        def __len__(self):
            return len(self.element.attributes.items())
        def items(self):
            return [(item[0], item[1]) for item in
                    self.element.attributes.items()]
        def keys(self):
            return self.element.attributes.keys()
        def __getitem__(self, name):
            return self.element.getAttribute(name)

        def __contains__(self, name):
            if isinstance(name, tuple):
                raise NotImplementedError
            else:
                return self.element.hasAttribute(name)

    class NodeBuilder(_base.Node):
        def __init__(self, element):
            _base.Node.__init__(self, element.nodeName)
            self.element = element

        namespace = property(lambda self: hasattr(self.element, "namespaceURI")
                             and self.element.namespaceURI or None)

        def appendChild(self, node):
            node.parent = self
            self.element.appendChild(node.element)

        def insertText(self, data, insertBefore=None):
            text = self.element.ownerDocument.createTextNode(data)
            if insertBefore:
                self.element.insertBefore(text, insertBefore.element)
            else:
                self.element.appendChild(text)

        def insertBefore(self, node, refNode):
            self.element.insertBefore(node.element, refNode.element)
            node.parent = self

        def removeChild(self, node):
            if node.element.parentNode == self.element:
                self.element.removeChild(node.element)
            node.parent = None

        def reparentChildren(self, newParent):
            while self.element.hasChildNodes():
                child = self.element.firstChild
                self.element.removeChild(child)
                newParent.element.appendChild(child)
            self.childNodes = []

        def getAttributes(self):
            return AttrList(self.element)

        def setAttributes(self, attributes):
            if attributes:
                for name, value in attributes.items():
                    if isinstance(name, tuple):
                        if name[0] is not None:
                            qualifiedName = (name[0] + ":" + name[1])
                        else:
                            qualifiedName = name[1]
                        self.element.setAttributeNS(name[2], qualifiedName,
                                                    value)
                    else:
                        self.element.setAttribute(
                            name, value)
        attributes = property(getAttributes, setAttributes)

        def cloneNode(self):
            return NodeBuilder(self.element.cloneNode(False))

        def hasContent(self):
            return self.element.hasChildNodes()

        def getNameTuple(self):
            if self.namespace is None:
                return namespaces["html"], self.name
            else:
                return self.namespace, self.name

        nameTuple = property(getNameTuple)

    class TreeBuilder(_base.TreeBuilder):
        def documentClass(self):
            self.dom = Dom.getDOMImplementation().createDocument(None, None, None)
            return weakref.proxy(self)

        def insertDoctype(self, token):
            name = token["name"]
            publicId = token["publicId"]
            systemId = token["systemId"]

            domimpl = Dom.getDOMImplementation()
            doctype = domimpl.createDocumentType(name, publicId, systemId)
            self.document.appendChild(NodeBuilder(doctype))
            if Dom == minidom:
                doctype.ownerDocument = self.dom

        def elementClass(self, name, namespace=None):
            if namespace is None and self.defaultNamespace is None:
                node = self.dom.createElement(name)
            else:
                node = self.dom.createElementNS(namespace, name)

            return NodeBuilder(node)

        def commentClass(self, data):
            return NodeBuilder(self.dom.createComment(data))

        def fragmentClass(self):
            return NodeBuilder(self.dom.createDocumentFragment())

        def appendChild(self, node):
            self.dom.appendChild(node.element)

        def testSerializer(self, element):
            return testSerializer(element)

        def getDocument(self):
            return self.dom

        def getFragment(self):
            return _base.TreeBuilder.getFragment(self).element

        def insertText(self, data, parent=None):
            if parent != self:
                _base.TreeBuilder.insertText(self, data, parent)
            else:
                # HACK: allow text nodes as children of the document node
                if hasattr(self.dom, '_child_node_types'):
                    if not Node.TEXT_NODE in self.dom._child_node_types:
                        self.dom._child_node_types = list(self.dom._child_node_types)
                        self.dom._child_node_types.append(Node.TEXT_NODE)
                self.dom.appendChild(self.dom.createTextNode(data))

        name = None

    def testSerializer(element):
        element.normalize()
        rv = []
        def serializeElement(element, indent=0):
            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
                if element.name:
                    if element.publicId or element.systemId:
                        publicId = element.publicId or ""
                        systemId = element.systemId or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" % (
                            ' ' * indent, element.name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, element.name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif element.nodeType == Node.DOCUMENT_NODE:
                rv.append("#document")
            elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
                rv.append("#document-fragment")
            elif element.nodeType == Node.COMMENT_NODE:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.nodeValue))
            elif element.nodeType == Node.TEXT_NODE:
                rv.append("|%s\"%s\"" % (' ' * indent, element.nodeValue))
            else:
                if (hasattr(element, "namespaceURI") and
                    element.namespaceURI is not None):
                    name = "%s %s" % (constants.prefixes[element.namespaceURI],
                                      element.nodeName)
                else:
                    name = element.nodeName
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.hasAttributes():
                    attributes = []
                    for i in range(len(element.attributes)):
                        attr = element.attributes.item(i)
                        name = attr.nodeName
                        value = attr.value
                        ns = attr.namespaceURI
                        if ns:
                            name = "%s %s" % (constants.prefixes[ns], attr.localName)
                        else:
                            name = attr.nodeName
                        attributes.append((name, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
            indent += 2
            for child in element.childNodes:
                serializeElement(child, indent)
        serializeElement(element, 0)

        return "\n".join(rv)

    def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
        if node.nodeType == Node.ELEMENT_NODE:
            if not nsmap:
                handler.startElement(node.nodeName, node.attributes)
                for child in node.childNodes: dom2sax(child, handler, nsmap)
                handler.endElement(node.nodeName)
            else:
                attributes = dict(node.attributes.itemsNS())

                # gather namespace declarations
                prefixes = []
                for attrname in node.attributes.keys():
                    attr = node.getAttributeNode(attrname)
                    if (attr.namespaceURI == XMLNS_NAMESPACE or
                        (attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))):
                        prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
                        handler.startPrefixMapping(prefix, attr.nodeValue)
                        prefixes.append(prefix)
                        nsmap = nsmap.copy()
                        nsmap[prefix] = attr.nodeValue
                        del attributes[(attr.namespaceURI, attr.nodeName)]

                # apply namespace declarations
                for attrname in node.attributes.keys():
                    attr = node.getAttributeNode(attrname)
                    if attr.namespaceURI is None and ':' in attr.nodeName:
                        prefix = attr.nodeName.split(':')[0]
                        if nsmap.has_key(prefix):
                            del attributes[(attr.namespaceURI, attr.nodeName)]
                            attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue

                # SAX events
                ns = node.namespaceURI or nsmap.get(None, None)
                handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
                for child in node.childNodes: dom2sax(child, handler, nsmap)
                handler.endElementNS((ns, node.nodeName), node.nodeName)
                for prefix in prefixes: handler.endPrefixMapping(prefix)

        elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
            handler.characters(node.nodeValue)

        elif node.nodeType == Node.DOCUMENT_NODE:
            handler.startDocument()
            for child in node.childNodes: dom2sax(child, handler, nsmap)
            handler.endDocument()

        elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
            for child in node.childNodes: dom2sax(child, handler, nsmap)

        else:
            # ATTRIBUTE_NODE
            # ENTITY_NODE
            # PROCESSING_INSTRUCTION_NODE
            # COMMENT_NODE
            # DOCUMENT_TYPE_NODE
            # NOTATION_NODE
            pass

    return locals()

# Keep backwards compatibility with things that directly load
# classes/functions from this module
for key, value in getDomModule(minidom).__dict__.items():
    globals()[key] = value
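
A usage sketch for the minidom-backed builder; import paths follow this vendored copy and the parser API is assumed:

import html5lib
from html5lib import treebuilders

TreeBuilder = treebuilders.getTreeBuilder("dom")
parser = html5lib.HTMLParser(tree=TreeBuilder)
document = parser.parse("<p>one<p>two")
# getDocument() returns the underlying xml.dom.minidom Document, so
# standard DOM calls work on the result:
assert len(document.getElementsByTagName("p")) == 2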
@ -0,0 +1,344 @@
try:
    from types import ModuleType
except ImportError:
    from new import module as ModuleType
import re
import types

import _base
from html5lib import ihatexml
from html5lib import constants
from html5lib.constants import namespaces

tag_regexp = re.compile("{([^}]*)}(.*)")

moduleCache = {}

def getETreeModule(ElementTreeImplementation, fullTree=False):
    name = "_" + ElementTreeImplementation.__name__ + "builder"
    if name in moduleCache:
        return moduleCache[name]
    else:
        mod = ModuleType("_" + ElementTreeImplementation.__name__ + "builder")
        objs = getETreeBuilder(ElementTreeImplementation, fullTree)
        mod.__dict__.update(objs)
        moduleCache[name] = mod
        return mod

def getETreeBuilder(ElementTreeImplementation, fullTree=False):
    ElementTree = ElementTreeImplementation
    class Element(_base.Node):
        def __init__(self, name, namespace=None):
            self._name = name
            self._namespace = namespace
            self._element = ElementTree.Element(self._getETreeTag(name,
                                                                  namespace))
            if namespace is None:
                self.nameTuple = namespaces["html"], self._name
            else:
                self.nameTuple = self._namespace, self._name
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getETreeTag(self, name, namespace):
            if namespace is None:
                etree_tag = name
            else:
                etree_tag = "{%s}%s" % (namespace, name)
            return etree_tag

        def _setName(self, name):
            self._name = name
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getName(self):
            return self._name

        name = property(_getName, _setName)

        def _setNamespace(self, namespace):
            self._namespace = namespace
            self._element.tag = self._getETreeTag(self._name, self._namespace)

        def _getNamespace(self):
            return self._namespace

        namespace = property(_getNamespace, _setNamespace)

        def _getAttributes(self):
            return self._element.attrib

        def _setAttributes(self, attributes):
            #Delete existing attributes first
            #XXX - there may be a better way to do this...
            for key in self._element.attrib.keys():
                del self._element.attrib[key]
            for key, value in attributes.iteritems():
                if isinstance(key, tuple):
                    name = "{%s}%s" % (key[2], key[1])
                else:
                    name = key
                self._element.set(name, value)

        attributes = property(_getAttributes, _setAttributes)

        def _getChildNodes(self):
            return self._childNodes
        def _setChildNodes(self, value):
            del self._element[:]
            self._childNodes = []
            for element in value:
                self.insertChild(element)

        childNodes = property(_getChildNodes, _setChildNodes)

        def hasContent(self):
            """Return true if the node has children or text"""
            return bool(self._element.text or len(self._element))

        def appendChild(self, node):
            self._childNodes.append(node)
            self._element.append(node._element)
            node.parent = self

        def insertBefore(self, node, refNode):
            index = list(self._element).index(refNode._element)
            self._element.insert(index, node._element)
            node.parent = self

        def removeChild(self, node):
            self._element.remove(node._element)
            node.parent = None

        def insertText(self, data, insertBefore=None):
            if not(len(self._element)):
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data
            elif insertBefore is None:
                #Insert the text as the tail of the last child element
                if not self._element[-1].tail:
                    self._element[-1].tail = ""
                self._element[-1].tail += data
            else:
                #Insert the text before the specified node
                children = list(self._element)
                index = children.index(insertBefore._element)
                if index > 0:
                    if not self._element[index-1].tail:
                        self._element[index-1].tail = ""
                    self._element[index-1].tail += data
                else:
                    if not self._element.text:
                        self._element.text = ""
                    self._element.text += data

        def cloneNode(self):
            element = type(self)(self.name, self.namespace)
            for name, value in self.attributes.iteritems():
                element.attributes[name] = value
            return element

        def reparentChildren(self, newParent):
            if newParent.childNodes:
                newParent.childNodes[-1]._element.tail += self._element.text
            else:
                if not newParent._element.text:
                    newParent._element.text = ""
                if self._element.text is not None:
                    newParent._element.text += self._element.text
            self._element.text = ""
            _base.Node.reparentChildren(self, newParent)

    class Comment(Element):
        def __init__(self, data):
            #Use the superclass constructor to set all properties on the
            #wrapper element
            self._element = ElementTree.Comment(data)
            self.parent = None
            self._childNodes = []
            self._flags = []

        def _getData(self):
            return self._element.text

        def _setData(self, value):
            self._element.text = value

        data = property(_getData, _setData)

    class DocumentType(Element):
        def __init__(self, name, publicId, systemId):
            Element.__init__(self, "<!DOCTYPE>")
            self._element.text = name
            self.publicId = publicId
            self.systemId = systemId

        def _getPublicId(self):
            return self._element.get(u"publicId", "")

        def _setPublicId(self, value):
            if value is not None:
                self._element.set(u"publicId", value)

        publicId = property(_getPublicId, _setPublicId)

        def _getSystemId(self):
            return self._element.get(u"systemId", "")

        def _setSystemId(self, value):
            if value is not None:
                self._element.set(u"systemId", value)

        systemId = property(_getSystemId, _setSystemId)

    class Document(Element):
        def __init__(self):
            Element.__init__(self, "<DOCUMENT_ROOT>")

    class DocumentFragment(Element):
        def __init__(self):
            Element.__init__(self, "<DOCUMENT_FRAGMENT>")

    def testSerializer(element):
        rv = []
        finalText = None
        def serializeElement(element, indent=0):
            if not(hasattr(element, "tag")):
                element = element.getroot()
            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s "%s" "%s">""" % (
                        element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "<DOCUMENT_ROOT>":
                rv.append("#document")
                if element.text:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
                if element.tail:
                    finalText = element.tail
            elif element.tag == ElementTree.Comment:
                rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
            else:
                assert type(element.tag) in types.StringTypes, \
                    "Expected unicode, got %s" % type(element.tag)
                nsmatch = tag_regexp.match(element.tag)

                if nsmatch is None:
                    name = element.tag
                else:
                    ns, name = nsmatch.groups()
                    prefix = constants.prefixes[ns]
                    name = "%s %s" % (prefix, name)
                rv.append("|%s<%s>" % (' ' * indent, name))

                if hasattr(element, "attrib"):
                    attributes = []
                    for name, value in element.attrib.iteritems():
                        nsmatch = tag_regexp.match(name)
                        if nsmatch is not None:
                            ns, name = nsmatch.groups()
                            prefix = constants.prefixes[ns]
                            attr_string = "%s %s" % (prefix, name)
                        else:
                            attr_string = name
                        attributes.append((attr_string, value))

                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                if element.text:
                    rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
            indent += 2
            for child in element:
                serializeElement(child, indent)
            if element.tail:
                rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
        serializeElement(element, 0)

        if finalText is not None:
            rv.append("|%s\"%s\"" % (' ' * 2, finalText))

        return "\n".join(rv)

    def tostring(element):
        """Serialize an element and its child nodes to a string"""
        rv = []
        finalText = None
        filter = ihatexml.InfosetFilter()
        def serializeElement(element):
            if type(element) == type(ElementTree.ElementTree):
                element = element.getroot()

            if element.tag == "<!DOCTYPE>":
                if element.get("publicId") or element.get("systemId"):
                    publicId = element.get("publicId") or ""
                    systemId = element.get("systemId") or ""
                    rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" % (
                        element.text, publicId, systemId))
                else:
                    rv.append("<!DOCTYPE %s>" % (element.text,))
            elif element.tag == "<DOCUMENT_ROOT>":
                if element.text:
                    rv.append(element.text)
                if element.tail:
                    finalText = element.tail

                for child in element:
                    serializeElement(child)

            elif type(element.tag) == type(ElementTree.Comment):
                rv.append("<!--%s-->" % (element.text,))
            else:
                #This is assumed to be an ordinary element
                if not element.attrib:
                    rv.append("<%s>" % (filter.fromXmlName(element.tag),))
                else:
                    attr = " ".join(["%s=\"%s\"" % (
                        filter.fromXmlName(name), value)
                        for name, value in element.attrib.iteritems()])
                    rv.append("<%s %s>" % (filter.fromXmlName(element.tag), attr))
                if element.text:
                    rv.append(element.text)

                for child in element:
                    serializeElement(child)

                rv.append("</%s>" % (element.tag,))

            if element.tail:
                rv.append(element.tail)

        serializeElement(element)

        if finalText is not None:
            rv.append(finalText)

        return "".join(rv)

    class TreeBuilder(_base.TreeBuilder):
        documentClass = Document
        doctypeClass = DocumentType
        elementClass = Element
        commentClass = Comment
        fragmentClass = DocumentFragment

        def testSerializer(self, element):
            return testSerializer(element)

        def getDocument(self):
            if fullTree:
                return self.document._element
            else:
                if self.defaultNamespace is not None:
                    return self.document._element.find(
                        "{%s}html" % self.defaultNamespace)
                else:
                    return self.document._element.find("html")

        def getFragment(self):
            return _base.TreeBuilder.getFragment(self)._element

    return locals()
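
A usage sketch for the ElementTree-backed builder (paths and parser API assumed from this vendored copy):

import xml.etree.ElementTree as ET
import html5lib
from html5lib import treebuilders

parser = html5lib.HTMLParser(
    tree=treebuilders.getTreeBuilder("etree", ET),
    namespaceHTMLElements=False)
root = parser.parse("<title>hi</title>")
# With fullTree left at False, getDocument() returns the <html> element.
title = root.find("head/title")
assert title is not None and title.text == "hi"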
@ -0,0 +1,336 @@ |
|||
import warnings |
|||
import re |
|||
|
|||
import _base |
|||
from html5lib.constants import DataLossWarning |
|||
import html5lib.constants as constants |
|||
import etree as etree_builders |
|||
from html5lib import ihatexml |
|||
|
|||
try: |
|||
import lxml.etree as etree |
|||
except ImportError: |
|||
pass |
|||
|
|||
fullTree = True |
|||
tag_regexp = re.compile("{([^}]*)}(.*)") |
|||
|
|||
"""Module for supporting the lxml.etree library. The idea here is to use as much |
|||
of the native library as possible, without using fragile hacks like custom element |
|||
names that break between releases. The downside of this is that we cannot represent |
|||
all possible trees; specifically the following are known to cause problems: |
|||
|
|||
Text or comments as siblings of the root element |
|||
Docypes with no name |
|||
|
|||
When any of these things occur, we emit a DataLossWarning |
|||
""" |
|||
|
|||
class DocumentType(object): |
|||
def __init__(self, name, publicId, systemId): |
|||
self.name = name |
|||
self.publicId = publicId |
|||
self.systemId = systemId |
|||
|
|||
class Document(object): |
|||
def __init__(self): |
|||
self._elementTree = None |
|||
self._childNodes = [] |
|||
|
|||
def appendChild(self, element): |
|||
self._elementTree.getroot().addnext(element._element) |
|||
|
|||
def _getChildNodes(self): |
|||
return self._childNodes |
|||
|
|||
childNodes = property(_getChildNodes) |
|||
|
|||
def testSerializer(element): |
|||
rv = [] |
|||
finalText = None |
|||
filter = ihatexml.InfosetFilter() |
|||
def serializeElement(element, indent=0): |
|||
if not hasattr(element, "tag"): |
|||
if hasattr(element, "getroot"): |
|||
#Full tree case |
|||
rv.append("#document") |
|||
if element.docinfo.internalDTD: |
|||
if not (element.docinfo.public_id or |
|||
element.docinfo.system_url): |
|||
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name |
|||
else: |
|||
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%( |
|||
element.docinfo.root_name, |
|||
element.docinfo.public_id, |
|||
element.docinfo.system_url) |
|||
rv.append("|%s%s"%(' '*(indent+2), dtd_str)) |
|||
next_element = element.getroot() |
|||
while next_element.getprevious() is not None: |
|||
next_element = next_element.getprevious() |
|||
while next_element is not None: |
|||
serializeElement(next_element, indent+2) |
|||
next_element = next_element.getnext() |
|||
elif isinstance(element, basestring): |
|||
#Text in a fragment |
|||
rv.append("|%s\"%s\""%(' '*indent, element)) |
|||
else: |
|||
#Fragment case |
|||
rv.append("#document-fragment") |
|||
for next_element in element: |
|||
serializeElement(next_element, indent+2) |
|||
elif type(element.tag) == type(etree.Comment): |
|||
rv.append("|%s<!-- %s -->"%(' '*indent, element.text)) |
|||
else: |
|||
nsmatch = etree_builders.tag_regexp.match(element.tag) |
|||
if nsmatch is not None: |
|||
ns = nsmatch.group(1) |
|||
tag = nsmatch.group(2) |
|||
prefix = constants.prefixes[ns] |
|||
rv.append("|%s<%s %s>"%(' '*indent, prefix, |
|||
filter.fromXmlName(tag))) |
|||
else: |
|||
rv.append("|%s<%s>"%(' '*indent, |
|||
filter.fromXmlName(element.tag))) |
|||
|
|||
if hasattr(element, "attrib"): |
|||
attributes = [] |
|||
for name, value in element.attrib.iteritems(): |
|||
nsmatch = tag_regexp.match(name) |
|||
if nsmatch is not None: |
|||
ns, name = nsmatch.groups() |
|||
name = filter.fromXmlName(name) |
|||
prefix = constants.prefixes[ns] |
|||
attr_string = "%s %s"%(prefix, name) |
|||
else: |
|||
attr_string = filter.fromXmlName(name) |
|||
attributes.append((attr_string, value)) |
|||
|
|||
for name, value in sorted(attributes): |
|||
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) |
|||
|
|||
if element.text: |
|||
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text)) |
|||
indent += 2 |
|||
for child in element.getchildren(): |
|||
serializeElement(child, indent) |
|||
if hasattr(element, "tail") and element.tail: |
|||
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail)) |
|||
serializeElement(element, 0) |
|||
|
|||
if finalText is not None: |
|||
rv.append("|%s\"%s\""%(' '*2, finalText)) |
|||
|
|||
return "\n".join(rv) |
|||
|
|||
def tostring(element): |
|||
"""Serialize an element and its child nodes to a string""" |
|||
rv = [] |
|||
finalText = None |
|||
def serializeElement(element): |
|||
if not hasattr(element, "tag"): |
|||
if element.docinfo.internalDTD: |
|||
if element.docinfo.doctype: |
|||
dtd_str = element.docinfo.doctype |
|||
else: |
|||
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name |
|||
rv.append(dtd_str) |
|||
serializeElement(element.getroot()) |
|||
|
|||
elif type(element.tag) == type(etree.Comment): |
|||
rv.append("<!--%s-->"%(element.text,)) |
|||
|
|||
else: |
|||
#This is assumed to be an ordinary element |
|||
if not element.attrib: |
|||
rv.append("<%s>"%(element.tag,)) |
|||
else: |
|||
attr = " ".join(["%s=\"%s\""%(name, value) |
|||
for name, value in element.attrib.iteritems()]) |
|||
rv.append("<%s %s>"%(element.tag, attr)) |
|||
if element.text: |
|||
rv.append(element.text) |
|||
|
|||
for child in element.getchildren(): |
|||
serializeElement(child) |
|||
|
|||
rv.append("</%s>"%(element.tag,)) |
|||
|
|||
if hasattr(element, "tail") and element.tail: |
|||
rv.append(element.tail) |
|||
|
|||
serializeElement(element) |
|||
|
|||
if finalText is not None: |
|||
rv.append("%s\""%(' '*2, finalText)) |
|||
|
|||
return "".join(rv) |
|||
|
|||
|
|||
class TreeBuilder(_base.TreeBuilder): |
|||
documentClass = Document |
|||
doctypeClass = DocumentType |
|||
elementClass = None |
|||
commentClass = None |
|||
fragmentClass = Document |
|||
|
|||
def __init__(self, namespaceHTMLElements, fullTree = False): |
|||
builder = etree_builders.getETreeModule(etree, fullTree=fullTree) |
|||
filter = self.filter = ihatexml.InfosetFilter() |
|||
self.namespaceHTMLElements = namespaceHTMLElements |
|||
|
|||
class Attributes(dict): |
|||
def __init__(self, element, value={}): |
|||
self._element = element |
|||
dict.__init__(self, value) |
|||
for key, value in self.iteritems(): |
|||
if isinstance(key, tuple): |
|||
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) |
|||
else: |
|||
name = filter.coerceAttribute(key) |
|||
self._element._element.attrib[name] = value |
|||
|
|||
def __setitem__(self, key, value): |
|||
dict.__setitem__(self, key, value) |
|||
if isinstance(key, tuple): |
|||
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1])) |
|||
else: |
|||
name = filter.coerceAttribute(key) |
|||
self._element._element.attrib[name] = value |
|||
|
|||
class Element(builder.Element): |
|||
def __init__(self, name, namespace): |
|||
                name = filter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = filter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return filter.fromXmlName(self._name)

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, attributes):
                self._attributes = Attributes(self, attributes)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = filter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def appendChild(self, child):
                builder.Element.appendChild(self, child)


        class Comment(builder.Comment):
            def __init__(self, data):
                data = filter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = filter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        self.commentClass = builder.Comment
        #self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        _base.TreeBuilder.reset(self)
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(element.getchildren())
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
            warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)

        doctype = self.doctypeClass(name, publicId, systemId)
        self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        self.initial_comments.append(data)

    def insertRoot(self, token):
        """Create the document root"""
        #Because of the way libxml2 works, it doesn't seem to be possible to
        #alter information like the doctype after the tree has been parsed.
        #Therefore we need to use the built-in parser to create our initial
        #tree, after which we can add elements like normal
        docStr = ""
        if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
            docStr += "<!DOCTYPE %s"%self.doctype.name
            if (self.doctype.publicId is not None or
                self.doctype.systemId is not None):
                docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
                                               self.doctype.systemId or "")
            docStr += ">"
        docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"

        try:
            root = etree.fromstring(docStr)
        except etree.XMLSyntaxError:
            print docStr
            raise

        #Append the initial comments:
        for comment_token in self.initial_comments:
            root.addprevious(etree.Comment(comment_token["data"]))

        #Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()

        # Give the root element the right name
        name = token["name"]
        namespace = token.get("namespace", self.defaultNamespace)
        if namespace is None:
            etree_tag = name
        else:
            etree_tag = "{%s}%s"%(namespace, name)
        root.tag = etree_tag

        #Add the root element to the internal child/open data structures
        root_element = self.elementClass(name, namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)

        #Reset to the default insert comment function
        self.insertComment = super(TreeBuilder, self).insertComment
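
    # For illustration (a sketch, not part of the upstream source): given a
    # doctype token such as {"name": "html",
    # "publicId": "-//W3C//DTD HTML 4.01//EN",
    # "systemId": "http://www.w3.org/TR/html4/strict.dtd"},
    # the docStr built in insertRoot above would be:
    #
    #   <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"
    #     "http://www.w3.org/TR/html4/strict.dtd"><THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>
    #
    # The placeholder element only exists so libxml2 parses the doctype; its
    # tag is renamed to the real root name immediately afterwards.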
@@ -0,0 +1,256 @@
import _base
from html5lib.constants import voidElements, namespaces, prefixes
from xml.sax.saxutils import escape

# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
    type = -1
    def __init__(self, name):
        self.name = name
        self.parent = None
        self.value = None
        self.childNodes = []
        self._flags = []

    def __iter__(self):
        for node in self.childNodes:
            yield node
            for item in node:
                yield item

    def __unicode__(self):
        return self.name

    def toxml(self):
        raise NotImplementedError

    def printTree(self, indent=0):
        tree = '\n|%s%s' % (' '* indent, unicode(self))
        for child in self.childNodes:
            tree += child.printTree(indent + 2)
        return tree

    def appendChild(self, node):
        assert isinstance(node, Node)
        if (isinstance(node, TextNode) and self.childNodes and
            isinstance(self.childNodes[-1], TextNode)):
            self.childNodes[-1].value += node.value
        else:
            self.childNodes.append(node)
        node.parent = self

    def insertText(self, data, insertBefore=None):
        assert isinstance(data, unicode), "data %s is of type %s expected unicode"%(repr(data), type(data))
        if insertBefore is None:
            self.appendChild(TextNode(data))
        else:
            self.insertBefore(TextNode(data), insertBefore)

    def insertBefore(self, node, refNode):
        index = self.childNodes.index(refNode)
        if (isinstance(node, TextNode) and index > 0 and
            isinstance(self.childNodes[index - 1], TextNode)):
            self.childNodes[index - 1].value += node.value
        else:
            self.childNodes.insert(index, node)
        node.parent = self

    def removeChild(self, node):
        try:
            self.childNodes.remove(node)
        except:
            # XXX
            raise
        node.parent = None

    def cloneNode(self):
        raise NotImplementedError

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self.childNodes)

    def getNameTuple(self):
        if self.namespace == None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)

class Document(Node):
    type = 1
    def __init__(self):
        Node.__init__(self, None)

    def __str__(self):
        return "#document"

    def __unicode__(self):
        return str(self)

    def appendChild(self, child):
        Node.appendChild(self, child)

    def toxml(self, encoding="utf-8"):
        result = ""
        for child in self.childNodes:
            result += child.toxml()
        return result.encode(encoding)

    def hilite(self, encoding="utf-8"):
        result = "<pre>"
        for child in self.childNodes:
            result += child.hilite()
        return result.encode(encoding) + "</pre>"

    def printTree(self):
        tree = unicode(self)
        for child in self.childNodes:
            tree += child.printTree(2)
        return tree

    def cloneNode(self):
        return Document()

class DocumentFragment(Document):
    type = 2
    def __str__(self):
        return "#document-fragment"

    def __unicode__(self):
        return str(self)

    def cloneNode(self):
        return DocumentFragment()

class DocumentType(Node):
    type = 3
    def __init__(self, name, publicId, systemId):
        Node.__init__(self, name)
        self.publicId = publicId
        self.systemId = systemId

    def __unicode__(self):
        if self.publicId or self.systemId:
            publicId = self.publicId or ""
            systemId = self.systemId or ""
            return """<!DOCTYPE %s "%s" "%s">"""%(
                self.name, publicId, systemId)
        else:
            return u"<!DOCTYPE %s>" % self.name

    toxml = __unicode__

    def hilite(self):
        return '<code class="markup doctype">&lt;!DOCTYPE %s&gt;</code>' % self.name

    def cloneNode(self):
        return DocumentType(self.name, self.publicId, self.systemId)

class TextNode(Node):
    type = 4
    def __init__(self, value):
        Node.__init__(self, None)
        self.value = value

    def __unicode__(self):
        return u"\"%s\"" % self.value

    def toxml(self):
        return escape(self.value)

    hilite = toxml

    def cloneNode(self):
        return TextNode(self.value)

class Element(Node):
    type = 5
    def __init__(self, name, namespace=None):
        Node.__init__(self, name)
        self.namespace = namespace
        self.attributes = {}

    def __unicode__(self):
        if self.namespace == None:
            return u"<%s>" % self.name
        else:
            return u"<%s %s>"%(prefixes[self.namespace], self.name)

    def toxml(self):
        result = '<' + self.name
        if self.attributes:
            for name,value in self.attributes.iteritems():
                result += u' %s="%s"' % (name, escape(value, {'"': '&quot;'}))
        if self.childNodes:
            result += '>'
            for child in self.childNodes:
                result += child.toxml()
            result += u'</%s>' % self.name
        else:
            result += u'/>'
        return result

    def hilite(self):
        result = '&lt;<code class="markup element-name">%s</code>' % self.name
        if self.attributes:
            for name, value in self.attributes.iteritems():
                result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"': '&quot;'}))
        if self.childNodes:
            result += "&gt;"
            for child in self.childNodes:
                result += child.hilite()
        elif self.name in voidElements:
            return result + "&gt;"
        return result + '&lt;/<code class="markup element-name">%s</code>&gt;' % self.name

    def printTree(self, indent):
        tree = '\n|%s%s' % (' '*indent, unicode(self))
        indent += 2
        if self.attributes:
            for name, value in sorted(self.attributes.iteritems()):
                if isinstance(name, tuple):
                    name = "%s %s"%(name[0], name[1])
                tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
        for child in self.childNodes:
            tree += child.printTree(indent)
        return tree

    def cloneNode(self):
        newNode = Element(self.name)
        if hasattr(self, 'namespace'):
            newNode.namespace = self.namespace
        for attr, value in self.attributes.iteritems():
            newNode.attributes[attr] = value
        return newNode

class CommentNode(Node):
    type = 6
    def __init__(self, data):
        Node.__init__(self, None)
        self.data = data

    def __unicode__(self):
        return "<!-- %s -->" % self.data

    def toxml(self):
        return "<!--%s-->" % self.data

    def hilite(self):
        return '<code class="markup comment">&lt;!--%s--&gt;</code>' % escape(self.data)

    def cloneNode(self):
        return CommentNode(self.data)

class TreeBuilder(_base.TreeBuilder):
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = CommentNode
    fragmentClass = DocumentFragment

    def testSerializer(self, node):
        return node.printTree()
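
# For illustration (a sketch of the format produced by printTree above;
# assumes html5lib's usual insertion of html/head/body): a tree built from
# '<p id="a">text</p>' serializes roughly as
#
#   #document
#   |  <html>
#   |    <head>
#   |    <body>
#   |      <p>
#   |        id="a"
#   |        "text"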
@@ -0,0 +1,236 @@
import warnings

warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)

from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration

import _base
from html5lib.constants import namespaces, DataLossWarning

class AttrList(object):
    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)
    def __iter__(self):
        return self.attrs.items().__iter__()
    def __setitem__(self, name, value):
        # set attr: name, value
        self.element[name] = value
    def items(self):
        return self.attrs.items()
    def keys(self):
        return self.attrs.keys()
    def __getitem__(self, name):
        return self.attrs[name]
    def __contains__(self, name):
        return name in self.attrs.keys()
    def __eq__(self, other):
        if len(self.keys()) != len(other.keys()):
            return False
        for item in self.keys():
            if item not in other:
                return False
            if self[item] != other[item]:
                return False
        return True

class Element(_base.Node):
    def __init__(self, element, soup, namespace):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup
        self.namespace = namespace

    def _nodeIndex(self, node, refNode):
        # Finds a node by identity rather than equality
        for index in range(len(self.element.contents)):
            if id(self.element.contents[index]) == id(refNode.element):
                return index
        return None

    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Concatenate new text onto old text node
            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
            newStr = NavigableString(self.element.contents[-1]+node.element)

            # Remove the old text node
            # (Can't simply use .extract() by itself, because it fails if
            # an equal text node exists within the parent node)
            oldElement = self.element.contents[-1]
            del self.element.contents[-1]
            oldElement.parent = None
            oldElement.extract()

            self.element.insert(len(self.element.contents), newStr)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                self.element[name] = value

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self._nodeIndex(node, refNode)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # (See comments in appendChild)
            newStr = NavigableString(self.element.contents[index-1]+node.element)
            oldNode = self.element.contents[index-1]
            del self.element.contents[index-1]
            oldNode.parent = None
            oldNode.extract()

            self.element.insert(index-1, newStr)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        index = self._nodeIndex(node.parent, node)
        del node.parent.element.contents[index]
        node.element.parent = None
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup, namespaces["html"]))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
        for key,value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents

    def getNameTuple(self):
        if self.namespace == None:
            return namespaces["html"], self.name
        else:
            return self.namespace, self.name

    nameTuple = property(getNameTuple)

class TextNode(Element):
    def __init__(self, element, soup):
        _base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError

class TreeBuilder(_base.TreeBuilder):
    def __init__(self, namespaceHTMLElements):
        if namespaceHTMLElements:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def documentClass(self):
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        if publicId:
            self.soup.insert(0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
        elif systemId:
            self.soup.insert(0, Declaration("DOCTYPE %s SYSTEM \"%s\""%
                                            (name, systemId)))
        else:
            self.soup.insert(0, Declaration("DOCTYPE %s"%name))

    def elementClass(self, name, namespace):
        if namespace is not None:
            warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
        return Element(Tag(self.soup, name), self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        self.soup = BeautifulSoup("")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        self.soup.insert(len(self.soup.contents), node.element)

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return _base.TreeBuilder.getFragment(self).element

def testSerializer(element):
    import re
    rv = []
    def serializeElement(element, indent=0):
        if isinstance(element, Declaration):
            doctype_regexp = r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
            m = re.compile(doctype_regexp).match(element.string)
            assert m is not None, "DOCTYPE did not match expected format"
            name = m.group('name')
            publicId = m.group('publicId')
            if publicId is not None:
                systemId = m.group('systemId1') or ""
            else:
                systemId = m.group('systemId2')

            if publicId is not None or systemId is not None:
                rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
                          (' '*indent, name, publicId or "", systemId or ""))
            else:
                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))

        elif isinstance(element, BeautifulSoup):
            if element.name == "[document_fragment]":
                rv.append("#document-fragment")
            else:
                rv.append("#document")

        elif isinstance(element, Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
        elif isinstance(element, unicode):
            rv.append("|%s\"%s\"" %(' '*indent, element))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.name))
            if element.attrs:
                for name, value in sorted(element.attrs):
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
        indent += 2
        if hasattr(element, "contents"):
            for child in element.contents:
                serializeElement(child, indent)
    serializeElement(element, 0)

    return "\n".join(rv)
@@ -0,0 +1,52 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.

To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""

treeWalkerCache = {}

def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are "simpletree", "dom", "etree" and "beautifulsoup"

               "simpletree" - a built-in DOM-ish tree type with support for some
                              more pythonic idioms.
               "dom" - The xml.dom.minidom DOM implementation
               "pulldom" - The xml.dom.pulldom event stream
               "etree" - A generic walker for tree implementations exposing an
                         elementtree-like interface (known to work with
                         ElementTree, cElementTree and lxml.etree).
               "lxml" - Optimized walker for lxml.etree
               "beautifulsoup" - Beautiful soup (if installed)
               "genshi" - a Genshi stream

    implementation - (Currently applies to the "etree" tree type only). A module
                     implementing the tree type e.g. xml.etree.ElementTree or
                     cElementTree."""

    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType in ("dom", "pulldom", "simpletree"):
            mod = __import__(treeType, globals())
            treeWalkerCache[treeType] = mod.TreeWalker
        elif treeType == "genshi":
            import genshistream
            treeWalkerCache[treeType] = genshistream.TreeWalker
        elif treeType == "beautifulsoup":
            import soup
            treeWalkerCache[treeType] = soup.TreeWalker
        elif treeType == "lxml":
            import lxmletree
            treeWalkerCache[treeType] = lxmletree.TreeWalker
        elif treeType == "etree":
            import etree
            # XXX: NEVER cache here, caching is done in the etree submodule
            return etree.getETreeModule(implementation, **kwargs).TreeWalker
    return treeWalkerCache.get(treeType)
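
# A minimal usage sketch (illustrative, not part of the upstream source;
# it assumes the html5lib package itself is importable so a parsed tree
# is available, and that "simpletree" is the default tree type):
#
#     import html5lib
#     from html5lib import treewalkers
#
#     doc = html5lib.parse("<p>Hello</p>")
#     walker_class = treewalkers.getTreeWalker("simpletree")
#     for token in walker_class(doc):
#         print token["type"]   # e.g. StartTag, Characters, EndTag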
@@ -0,0 +1,176 @@
import gettext
_ = gettext.gettext

from html5lib.constants import voidElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)

class TreeWalker(object):
    def __init__(self, tree):
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        return {"type": "SerializeError", "data": msg}

    def normalizeAttrs(self, attrs):
        newattrs = {}
        if attrs:
            #TODO: treewalkers should always have attrs
            for (namespace,name),value in attrs.iteritems():
                namespace = unicode(namespace) if namespace else None
                name = unicode(name)
                value = unicode(value)
                newattrs[(namespace,name)] = value
        return newattrs

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        yield {"type": "EmptyTag", "name": unicode(name),
               "namespace":unicode(namespace),
               "data": self.normalizeAttrs(attrs)}
        if hasChildren:
            yield self.error(_("Void element has children"))

    def startTag(self, namespace, name, attrs):
        return {"type": "StartTag",
                "name": unicode(name),
                "namespace":unicode(namespace),
                "data": self.normalizeAttrs(attrs)}

    def endTag(self, namespace, name):
        return {"type": "EndTag",
                "name": unicode(name),
                "namespace":unicode(namespace),
                "data": {}}

    def text(self, data):
        data = unicode(data)
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data)-len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        return {"type": "Comment", "data": unicode(data)}

    def doctype(self, name, publicId=None, systemId=None, correct=True):
        return {"type": "Doctype",
                "name": name is not None and unicode(name) or u"",
                "publicId": publicId,
                "systemId": systemId,
                "correct": correct}

    def entity(self, name):
        return {"type": "Entity", "name": unicode(name)}

    def unknown(self, nodeType):
        return self.error(_("Unknown node type: ") + nodeType)

class RecursiveTreeWalker(TreeWalker):
    def walkChildren(self, node):
        raise NotImplementedError

    def element(self, node, namespace, name, attrs, hasChildren):
        if name in voidElements:
            for token in self.emptyTag(namespace, name, attrs, hasChildren):
                yield token
        else:
            yield self.startTag(namespace, name, attrs)
            if hasChildren:
                for token in self.walkChildren(node):
                    yield token
            yield self.endTag(namespace, name)

from xml.dom import Node

DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"

class NonRecursiveTreeWalker(TreeWalker):
    def getNodeDetails(self, node):
        raise NotImplementedError

    def getFirstChild(self, node):
        raise NotImplementedError

    def getNextSibling(self, node):
        raise NotImplementedError

    def getParentNode(self, node):
        raise NotImplementedError

    def __iter__(self):
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            type, details = details[0], details[1:]
            hasChildren = False
            endTag = None

            if type == DOCTYPE:
                yield self.doctype(*details)

            elif type == TEXT:
                for token in self.text(*details):
                    yield token

            elif type == ELEMENT:
                namespace, name, attributes, hasChildren = details
                if name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes,
                                               hasChildren):
                        yield token
                    hasChildren = False
                else:
                    endTag = name
                    yield self.startTag(namespace, name, attributes)

            elif type == COMMENT:
                yield self.comment(details[0])

            elif type == ENTITY:
                yield self.entity(details[0])

            elif type == DOCUMENT:
                hasChildren = True

            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None

            if firstChild is not None:
                currentNode = firstChild
            else:
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    type, details = details[0], details[1:]
                    if type == ELEMENT:
                        namespace, name, attributes, hasChildren = details
                        if name not in voidElements:
                            yield self.endTag(namespace, name)
                    if self.tree is currentNode:
                        currentNode = None
                        break
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    else:
                        currentNode = self.getParentNode(currentNode)
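
# For illustration (a sketch of the token stream the methods above produce;
# the exact dicts follow startTag/text/endTag): walking an HTML tree for
# '<p class="x">hi </p>' yields approximately
#
#     {"type": "StartTag", "name": u"p",
#      "namespace": u"http://www.w3.org/1999/xhtml",
#      "data": {(None, u"class"): u"x"}}
#     {"type": "Characters", "data": u"hi"}
#     {"type": "SpaceCharacters", "data": u" "}
#     {"type": "EndTag", "name": u"p",
#      "namespace": u"http://www.w3.org/1999/xhtml", "data": {}}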
@@ -0,0 +1,41 @@
from xml.dom import Node

import gettext
_ = gettext.gettext

import _base
from html5lib.constants import voidElements

class TreeWalker(_base.NonRecursiveTreeWalker):
    def getNodeDetails(self, node):
        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
            return _base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return _base.TEXT, node.nodeValue

        elif node.nodeType == Node.ELEMENT_NODE:
            attrs = {}
            for attr in node.attributes.keys():
                attr = node.getAttributeNode(attr)
                attrs[(attr.namespaceURI,attr.localName)] = attr.value
            return (_base.ELEMENT, node.namespaceURI, node.nodeName,
                    attrs, node.hasChildNodes())

        elif node.nodeType == Node.COMMENT_NODE:
            return _base.COMMENT, node.nodeValue

        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
            return (_base.DOCUMENT,)

        else:
            return _base.UNKNOWN, node.nodeType

    def getFirstChild(self, node):
        return node.firstChild

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parentNode
@@ -0,0 +1,141 @@
import gettext
_ = gettext.gettext

try:
    from types import ModuleType
except ImportError:
    from new import module as ModuleType
import copy
import re

import _base
from html5lib.constants import voidElements

tag_regexp = re.compile("{([^}]*)}(.*)")

moduleCache = {}

def getETreeModule(ElementTreeImplementation):
    name = "_" + ElementTreeImplementation.__name__+"builder"
    if name in moduleCache:
        return moduleCache[name]
    else:
        mod = ModuleType("_" + ElementTreeImplementation.__name__+"builder")
        objs = getETreeBuilder(ElementTreeImplementation)
        mod.__dict__.update(objs)
        moduleCache[name] = mod
        return mod
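
# Usage sketch (illustrative): the synthetic module returned above exposes
# the TreeWalker built for the given ElementTree implementation.
# "some_etree_document" is a hypothetical, previously parsed tree.
#
#     from xml.etree import ElementTree
#     TreeWalker = getETreeModule(ElementTree).TreeWalker
#     for token in TreeWalker(some_etree_document):
#         ...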

def getETreeBuilder(ElementTreeImplementation):
    ElementTree = ElementTreeImplementation

    class TreeWalker(_base.NonRecursiveTreeWalker):
        """Given the particular ElementTree representation, this implementation,
        to avoid using recursion, returns "nodes" as tuples with the following
        content:

        1. The current element

        2. The index of the element relative to its parent

        3. A stack of ancestor elements

        4. A flag "text", "tail" or None to indicate if the current node is a
           text node; either the text or tail of the current element (1)
        """
        def getNodeDetails(self, node):
            if isinstance(node, tuple): # It might be the root Element
                elt, key, parents, flag = node
                if flag in ("text", "tail"):
                    return _base.TEXT, getattr(elt, flag)
                else:
                    node = elt

            if not(hasattr(node, "tag")):
                node = node.getroot()

            if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
                return (_base.DOCUMENT,)

            elif node.tag == "<!DOCTYPE>":
                return (_base.DOCTYPE, node.text,
                        node.get("publicId"), node.get("systemId"))

            elif node.tag == ElementTree.Comment:
                return _base.COMMENT, node.text

            else:
                assert type(node.tag) in (str, unicode), type(node.tag)
                #This is assumed to be an ordinary element
                match = tag_regexp.match(node.tag)
                if match:
                    namespace, tag = match.groups()
                else:
                    namespace = None
                    tag = node.tag
                attrs = {}
                for name, value in node.attrib.items():
                    match = tag_regexp.match(name)
                    if match:
                        attrs[(match.group(1),match.group(2))] = value
                    else:
                        attrs[(None,name)] = value
                return (_base.ELEMENT, namespace, tag,
                        attrs, len(node) or node.text)

        def getFirstChild(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                element, key, parents, flag = node, None, [], None

            if flag in ("text", "tail"):
                return None
            else:
                if element.text:
                    return element, key, parents, "text"
                elif len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None

        def getNextSibling(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                if len(element):
                    parents.append(element)
                    return element[0], 0, parents, None
                else:
                    return None
            else:
                if element.tail and flag != "tail":
                    return element, key, parents, "tail"
                elif key < len(parents[-1]) - 1:
                    return parents[-1][key+1], key+1, parents, None
                else:
                    return None

        def getParentNode(self, node):
            if isinstance(node, tuple):
                element, key, parents, flag = node
            else:
                return None

            if flag == "text":
                if not parents:
                    return element
                else:
                    return element, key, parents, None
            else:
                parent = parents.pop()
                if not parents:
                    return parent
                else:
                    return parent, list(parents[-1]).index(parent), parents, None

    return locals()
@@ -0,0 +1,70 @@
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener

import _base

from html5lib.constants import voidElements

class TreeWalker(_base.TreeWalker):
    def __iter__(self):
        depth = 0
        ignore_until = None
        previous = None
        for event in self.tree:
            if previous is not None:
                if previous[0] == START:
                    depth += 1
                if ignore_until <= depth:
                    ignore_until = None
                if ignore_until is None:
                    for token in self.tokens(previous, event):
                        yield token
                        if token["type"] == "EmptyTag":
                            ignore_until = depth
                if previous[0] == END:
                    depth -= 1
            previous = event
        if previous is not None:
            if ignore_until is None or ignore_until <= depth:
                for token in self.tokens(previous, None):
                    yield token
            elif ignore_until is not None:
                raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        kind, data, pos = event
        if kind == START:
            tag, attrib = data
            name = tag.localname
            namespace = tag.namespace
            if tag in voidElements:
                for token in self.emptyTag(namespace, name, list(attrib),
                                           not next or next[0] != END
                                           or next[1] != tag):
                    yield token
            else:
                yield self.startTag(namespace, name, list(attrib))

        elif kind == END:
            name = data.localname
            namespace = data.namespace
            if name not in voidElements:
                yield self.endTag(namespace, name)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            yield self.doctype(*data)

        elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
                      START_CDATA, END_CDATA, PI):
            pass

        else:
            yield self.unknown(kind)
@@ -0,0 +1,186 @@
from lxml import etree
from html5lib.treebuilders.etree import tag_regexp

from gettext import gettext
_ = gettext

import _base

from html5lib.constants import voidElements
from html5lib import ihatexml

class Root(object):
    def __init__(self, et):
        self.elementtree = et
        self.children = []
        if et.docinfo.internalDTD:
            self.children.append(Doctype(self, et.docinfo.root_name,
                                         et.docinfo.public_id,
                                         et.docinfo.system_url))
        root = et.getroot()
        node = root

        while node.getprevious() is not None:
            node = node.getprevious()
        while node is not None:
            self.children.append(node)
            node = node.getnext()

        self.text = None
        self.tail = None

    def __getitem__(self, key):
        return self.children[key]

    def getnext(self):
        return None

    def __len__(self):
        return 1

class Doctype(object):
    def __init__(self, root_node, name, public_id, system_id):
        self.root_node = root_node
        self.name = name
        self.public_id = public_id
        self.system_id = system_id

        self.text = None
        self.tail = None

    def getnext(self):
        return self.root_node.children[1]

class FragmentRoot(Root):
    def __init__(self, children):
        self.children = [FragmentWrapper(self, child) for child in children]
        self.text = self.tail = None

    def getnext(self):
        return None

class FragmentWrapper(object):
    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj
        if hasattr(self.obj, 'text'):
            self.text = self.obj.text
        else:
            self.text = None
        if hasattr(self.obj, 'tail'):
            self.tail = self.obj.tail
        else:
            self.tail = None
        self.isstring = isinstance(obj, basestring)

    def __getattr__(self, name):
        return getattr(self.obj, name)

    def getnext(self):
        siblings = self.root_node.children
        idx = siblings.index(self)
        if idx < len(siblings) - 1:
            return siblings[idx + 1]
        else:
            return None

    def __getitem__(self, key):
        return self.obj[key]

    def __nonzero__(self):
        return bool(self.obj)

    def getparent(self):
        return None

    def __str__(self):
        return str(self.obj)

    def __unicode__(self):
        return unicode(self.obj)

    def __len__(self):
        return len(self.obj)


class TreeWalker(_base.NonRecursiveTreeWalker):
    def __init__(self, tree):
        if hasattr(tree, "getroot"):
            tree = Root(tree)
        elif isinstance(tree, list):
            tree = FragmentRoot(tree)
        _base.NonRecursiveTreeWalker.__init__(self, tree)
        self.filter = ihatexml.InfosetFilter()

    def getNodeDetails(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            return _base.TEXT, getattr(node, key)

        elif isinstance(node, Root):
            return (_base.DOCUMENT,)

        elif isinstance(node, Doctype):
            return _base.DOCTYPE, node.name, node.public_id, node.system_id

        elif isinstance(node, FragmentWrapper) and node.isstring:
            return _base.TEXT, node

        elif node.tag == etree.Comment:
            return _base.COMMENT, node.text

        elif node.tag == etree.Entity:
            return _base.ENTITY, node.text[1:-1] # strip &;

        else:
            #This is assumed to be an ordinary element
            match = tag_regexp.match(node.tag)
            if match:
                namespace, tag = match.groups()
            else:
                namespace = None
                tag = node.tag
            attrs = {}
            for name, value in node.attrib.items():
                match = tag_regexp.match(name)
                if match:
                    attrs[(match.group(1),match.group(2))] = value
                else:
                    attrs[(None,name)] = value
            return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                    attrs, len(node) > 0 or node.text)

    def getFirstChild(self, node):
        assert not isinstance(node, tuple), _("Text nodes have no children")

        assert len(node) or node.text, "Node has no children"
        if node.text:
            return (node, "text")
        else:
            return node[0]

    def getNextSibling(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
                if len(node):
                    return node[0]
                else:
                    return None
            else: # tail
                return node.getnext()

        return node.tail and (node, "tail") or node.getnext()

    def getParentNode(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                return node
            # else: fallback to "normal" processing

        return node.getparent()
@@ -0,0 +1,60 @@
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
    COMMENT, IGNORABLE_WHITESPACE, CHARACTERS

import _base

from html5lib.constants import voidElements

class TreeWalker(_base.TreeWalker):
    def __iter__(self):
        ignore_until = None
        previous = None
        for event in self.tree:
            if previous is not None and \
              (ignore_until is None or previous[1] is ignore_until):
                if previous[1] is ignore_until:
                    ignore_until = None
                for token in self.tokens(previous, event):
                    yield token
                    if token["type"] == "EmptyTag":
                        ignore_until = previous[1]
            previous = event
        if ignore_until is None or previous[1] is ignore_until:
            for token in self.tokens(previous, None):
                yield token
        elif ignore_until is not None:
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        type, node = event
        if type == START_ELEMENT:
            name = node.nodeName
            namespace = node.namespaceURI
            attrs = {}
            for attr in node.attributes.keys():
                attr = node.getAttributeNode(attr)
                attrs[(attr.namespaceURI,attr.localName)] = attr.value
            if name in voidElements:
                for token in self.emptyTag(namespace,
                                           name,
                                           attrs,
                                           not next or next[1] is not node):
                    yield token
            else:
                yield self.startTag(namespace, name, attrs)

        elif type == END_ELEMENT:
            name = node.nodeName
            namespace = node.namespaceURI
            if name not in voidElements:
                yield self.endTag(namespace, name)

        elif type == COMMENT:
            yield self.comment(node.nodeValue)

        elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
            for token in self.text(node.nodeValue):
                yield token

        else:
            yield self.unknown(type)
@@ -0,0 +1,78 @@
import gettext
_ = gettext.gettext

import _base

class TreeWalker(_base.NonRecursiveTreeWalker):
    """Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:

    1. The parent Node (Element, Document or DocumentFragment)

    2. The child index of the current node in its parent's children list

    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """

    def getNodeDetails(self, node):
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]

        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif node.type == 3: # DocumentType
            return _base.DOCTYPE, node.name, node.publicId, node.systemId

        elif node.type == 4: # TextNode
            return _base.TEXT, node.value

        elif node.type == 5: # Element
            attrs = {}
            for name, value in node.attributes.items():
                if isinstance(name, tuple):
                    attrs[(name[2],name[1])] = value
                else:
                    attrs[(None,name)] = value
            return (_base.ELEMENT, node.namespace, node.name,
                    attrs, node.hasContent())

        elif node.type == 6: # CommentNode
            return _base.COMMENT, node.data

        else:
            return _base.UNKNOWN, node.type

    def getFirstChild(self, node):
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []

        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)

    def getNextSibling(self, node):
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None

    def getParentNode(self, node):
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the algorithm the same way
            return None
@@ -0,0 +1,60 @@
import re
import gettext
_ = gettext.gettext

from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
from html5lib.constants import namespaces
import _base

class TreeWalker(_base.NonRecursiveTreeWalker):
    doctype_regexp = re.compile(
        r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
    def getNodeDetails(self, node):
        if isinstance(node, BeautifulSoup): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif isinstance(node, Declaration): # DocumentType
            string = unicode(node.string)
            #Slice needed to remove markup added during unicode conversion,
            #but only in some versions of BeautifulSoup/Python
            if string.startswith('<!') and string.endswith('>'):
                string = string[2:-1]
            m = self.doctype_regexp.match(string)
            #This regexp approach seems wrong and fragile
            #but BeautifulSoup stores the doctype as a single thing and we want the separate bits
            #It should work as long as the tree is created by html5lib itself but may be wrong if it's
            #been modified at all
            #We could just feed it to an html5lib tokenizer, I guess...
            assert m is not None, "DOCTYPE did not match expected format"

            name = m.group('name')
            publicId = m.group('publicId')
            if publicId is not None:
                systemId = m.group('systemId1')
            else:
                systemId = m.group('systemId2')
            return _base.DOCTYPE, name, publicId or "", systemId or ""

        elif isinstance(node, Comment):
            string = unicode(node.string)
            if string.startswith('<!--') and string.endswith('-->'):
                string = string[4:-3]
            return _base.COMMENT, string

        elif isinstance(node, unicode): # TextNode
            return _base.TEXT, node

        elif isinstance(node, Tag): # Element
            return (_base.ELEMENT, namespaces["html"], node.name,
                    dict(node.attrs).items(), node.contents)
        else:
            return _base.UNKNOWN, node.__class__.__name__

    def getFirstChild(self, node):
        return node.contents[0]

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parent
@@ -0,0 +1,175 @@
try:
    frozenset
except NameError:
    #Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset

class MethodDispatcher(dict):
    """Dict with 2 special properties:

    On initiation, keys that are lists, sets or tuples are converted to
    multiple keys so accessing any one of the items in the original
    list-like object returns the matching value

    md = MethodDispatcher([(("foo", "bar"), "baz")])
    md["foo"] == "baz"

    A default value which can be set through the default attribute.
    """

    def __init__(self, items=()):
        # Using _dictEntries instead of directly assigning to self is about
        # twice as fast. Please do careful performance testing before changing
        # anything here.
        _dictEntries = []
        for name,value in items:
            if type(name) in (list, tuple, frozenset, set):
                for item in name:
                    _dictEntries.append((item, value))
            else:
                _dictEntries.append((name, value))
        dict.__init__(self, _dictEntries)
        self.default = None

    def __getitem__(self, key):
        return dict.get(self, key, self.default)
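
# Usage sketch: every key in a tuple key maps to the same value, and
# lookups of unknown keys fall back to the (settable) default.
#
#     md = MethodDispatcher([(("foo", "bar"), "baz")])
#     assert md["foo"] == "baz" and md["bar"] == "baz"
#     assert md["missing"] is None   # self.default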

#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger

class deque(object):

    def __init__(self, iterable=(), maxsize=-1):
        if not hasattr(self, 'data'):
            self.left = self.right = 0
            self.data = {}
        self.maxsize = maxsize
        self.extend(iterable)

    def append(self, x):
        self.data[self.right] = x
        self.right += 1
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.popleft()

    def appendleft(self, x):
        self.left -= 1
        self.data[self.left] = x
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.pop()

    def pop(self):
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        self.right -= 1
        elem = self.data[self.right]
        del self.data[self.right]
        return elem

    def popleft(self):
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        elem = self.data[self.left]
        del self.data[self.left]
        self.left += 1
        return elem

    def clear(self):
        self.data.clear()
        self.left = self.right = 0

    def extend(self, iterable):
        for elem in iterable:
            self.append(elem)

    def extendleft(self, iterable):
        for elem in iterable:
            self.appendleft(elem)

    def rotate(self, n=1):
        if self:
            n %= len(self)
            for i in xrange(n):
                self.appendleft(self.pop())

    def __getitem__(self, i):
        if i < 0:
            i += len(self)
        try:
            return self.data[i + self.left]
        except KeyError:
            raise IndexError

    def __setitem__(self, i, value):
        if i < 0:
            i += len(self)
        try:
            self.data[i + self.left] = value
        except KeyError:
            raise IndexError

    def __delitem__(self, i):
        size = len(self)
        if not (-size <= i < size):
            raise IndexError
        data = self.data
        if i < 0:
            i += size
        for j in xrange(self.left+i, self.right-1):
            data[j] = data[j+1]
        self.pop()

    def __len__(self):
        return self.right - self.left

    def __cmp__(self, other):
        if type(self) != type(other):
            return cmp(type(self), type(other))
        return cmp(list(self), list(other))

    def __repr__(self, _track=[]):
        if id(self) in _track:
            return '...'
        _track.append(id(self))
        r = 'deque(%r)' % (list(self),)
        _track.remove(id(self))
        return r

    def __getstate__(self):
        return (tuple(self),)

    def __setstate__(self, s):
        self.__init__(s[0])

    def __hash__(self):
        raise TypeError

    def __copy__(self):
        return self.__class__(self)

    def __deepcopy__(self, memo={}):
        from copy import deepcopy
        result = self.__class__()
        memo[id(self)] = result
        result.__init__(deepcopy(tuple(self), memo))
        return result
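
# Usage sketch of this fallback deque (same surface as collections.deque
# for the operations used here, plus the bounded-size behavior):
#
#     d = deque([1, 2, 3], maxsize=3)
#     d.append(4)                 # exceeds maxsize, so popleft() drops 1
#     assert list(d) == [2, 3, 4]
#     d.rotate(1)
#     assert list(d) == [4, 2, 3]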

#Some utility functions to deal with weirdness around UCS2 vs UCS4
#python builds

def encodingType():
    # On narrow (UCS2) builds a non-BMP literal is stored as a surrogate
    # pair, so its length is 2; on wide (UCS4) builds it is 1.
    if len(u"\U0010FFFF") == 2:
        return "UCS2"
    else:
        return "UCS4"

def isSurrogatePair(data):
    return (len(data) == 2 and
            ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
            ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)

def surrogatePairToCodepoint(data):
    char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
                (ord(data[1]) - 0xDC00))
    return char_val
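
# For example, the musical G clef character (U+1D11E) appears as a
# surrogate pair on narrow builds:
#
#     pair = u"\ud834\udd1e"
#     assert isSurrogatePair(pair)
#     assert surrogatePairToCodepoint(pair) == 0x1D11E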
@@ -0,0 +1,155 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

"""
oauthlib.common
~~~~~~~~~~~~~~~

This module provides data structures and utilities common
to all implementations of OAuth.
"""

import re
import urllib
import urlparse


always_safe = (u'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               u'abcdefghijklmnopqrstuvwxyz'
               u'0123456789' u'_.-')


def quote(s, safe=u'/'):
    encoded = s.encode("utf-8")
    quoted = urllib.quote(encoded, safe)
    return quoted.decode("utf-8")


def unquote(s):
    encoded = s.encode("utf-8")
    unquoted = urllib.unquote(encoded)
    return unquoted.decode("utf-8")


def urlencode(params):
    utf8_params = encode_params_utf8(params)
    urlencoded = urllib.urlencode(utf8_params)
    return urlencoded.decode("utf-8")
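
# Round-trip sketch (doctest-style, illustrative): each helper keeps the
# unicode-in/unicode-out contract by encoding to UTF-8 around the stdlib call.
#
#     >>> quote(u'foo bar')
#     u'foo%20bar'
#     >>> unquote(u'foo%20bar')
#     u'foo bar'
#     >>> urlencode([(u'a', u'b c')])
#     u'a=b+c'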


def encode_params_utf8(params):
    """Ensures that all parameters in a list of 2-element tuples are encoded to
    bytestrings using UTF-8
    """
    encoded = []
    for k, v in params:
        encoded.append((
            k.encode('utf-8') if isinstance(k, unicode) else k,
            v.encode('utf-8') if isinstance(v, unicode) else v))
    return encoded


def decode_params_utf8(params):
    """Ensures that all parameters in a list of 2-element tuples are decoded to
    unicode using UTF-8.
    """
    decoded = []
    for k, v in params:
        decoded.append((
            k.decode('utf-8') if isinstance(k, str) else k,
            v.decode('utf-8') if isinstance(v, str) else v))
    return decoded


urlencoded = set(always_safe) | set(u'=&;%+~')


def urldecode(query):
    """Decode a query string in x-www-form-urlencoded format into a sequence
    of two-element tuples.

    Unlike urlparse.parse_qsl(..., strict_parsing=True) urldecode will enforce
    correct formatting of the query string by validation. If validation fails
    a ValueError will be raised. urlparse.parse_qsl will only raise errors if
    any of the name-value pairs omits the equals sign.
    """
    # Check if query contains invalid characters
    if query and not set(query) <= urlencoded:
        raise ValueError('Invalid characters in query string.')

    # Check for correctly hex encoded values using a regular expression
    # All encoded values begin with % followed by two hex characters
    # correct = %00, %A0, %0A, %FF
    # invalid = %G0, %5H, %PO
    invalid_hex = u'%[^0-9A-Fa-f]|%[0-9A-Fa-f][^0-9A-Fa-f]'
    if len(re.findall(invalid_hex, query)):
        raise ValueError('Invalid hex encoding in query string.')

    query = query.decode('utf-8') if isinstance(query, str) else query
    # We want to allow queries such as "c2" whereas urlparse.parse_qsl
    # with the strict_parsing flag will not.
    params = urlparse.parse_qsl(query, keep_blank_values=True)

    # unicode all the things
    return decode_params_utf8(params)
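
# Behavior sketch (illustrative):
#
#     >>> urldecode(u'a=b&c=')
#     [(u'a', u'b'), (u'c', u'')]
#     >>> urldecode(u'a=%xx')
#     Traceback (most recent call last):
#         ...
#     ValueError: Invalid hex encoding in query string.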


def extract_params(raw):
    """Extract parameters and return them as a list of 2-tuples.

    Will successfully extract parameters from urlencoded query strings,
    dicts, or lists of 2-tuples. Empty strings/dicts/lists will return an
    empty list of parameters. Any other input will result in a return
    value of None.
    """
    if isinstance(raw, basestring):
        try:
            params = urldecode(raw)
        except ValueError:
            params = None
    elif hasattr(raw, '__iter__'):
        try:
            dict(raw)
        except ValueError:
            params = None
        except TypeError:
            params = None
        else:
            params = list(raw.items() if isinstance(raw, dict) else raw)
            params = decode_params_utf8(params)
    else:
        params = None

    return params
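
# Behavior sketch (illustrative):
#
#     >>> extract_params(u'a=b')
#     [(u'a', u'b')]
#     >>> extract_params({u'a': u'b'})
#     [(u'a', u'b')]
#     >>> extract_params(u'not a query string') is None  # spaces fail urldecode
#     True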


class Request(object):
    """A malleable representation of a signable HTTP request.

    Body argument may contain any data, but parameters will only be decoded if
    they are one of:

    * urlencoded query string
    * dict
    * list of 2-tuples

    Anything else will be treated as raw body data to be passed through
    unmolested.
    """

    def __init__(self, uri, http_method=u'GET', body=None, headers=None):
        self.uri = uri
        self.http_method = http_method
        self.headers = headers or {}
        self.body = body
        self.decoded_body = extract_params(body)
        self.oauth_params = []

    @property
    def uri_query(self):
        return urlparse.urlparse(self.uri).query

    @property
    def uri_query_params(self):
        return urlparse.parse_qsl(self.uri_query, keep_blank_values=True,
                                  strict_parsing=True)
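
# Construction sketch (the URI is a stand-in example):
#
#     >>> r = Request(u'https://example.com/token?grant=1',
#     ...             http_method=u'POST', body=u'a=b')
#     >>> r.uri_query
#     u'grant=1'
#     >>> r.decoded_body
#     [(u'a', u'b')]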
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import

"""
oauthlib.oauth1
~~~~~~~~~~~~~~~

This module is a wrapper for the most recent implementation of OAuth 1.0 Client
and Server classes.
"""

from .rfc5849 import Client, Server
@ -0,0 +1,350 @@ |
|||
# -*- coding: utf-8 -*- |
|||
from __future__ import absolute_import |
|||
|
|||
""" |
|||
oauthlib.oauth1.rfc5849 |
|||
~~~~~~~~~~~~~~ |
|||
|
|||
This module is an implementation of various logic needed |
|||
for signing and checking OAuth 1.0 RFC 5849 requests. |
|||
""" |
|||
|
|||
import logging |
|||
import urlparse |
|||
|
|||
from oauthlib.common import Request, urlencode |
|||
from . import parameters, signature, utils |
|||
|
|||
SIGNATURE_HMAC = u"HMAC-SHA1" |
|||
SIGNATURE_RSA = u"RSA-SHA1" |
|||
SIGNATURE_PLAINTEXT = u"PLAINTEXT" |
|||
SIGNATURE_METHODS = (SIGNATURE_HMAC, SIGNATURE_RSA, SIGNATURE_PLAINTEXT) |
|||
|
|||
SIGNATURE_TYPE_AUTH_HEADER = u'AUTH_HEADER' |
|||
SIGNATURE_TYPE_QUERY = u'QUERY' |
|||
SIGNATURE_TYPE_BODY = u'BODY' |
|||
|
|||
CONTENT_TYPE_FORM_URLENCODED = u'application/x-www-form-urlencoded' |
|||
|
|||
|
|||
class Client(object): |
|||
"""A client used to sign OAuth 1.0 RFC 5849 requests""" |
|||
def __init__(self, client_key, |
|||
client_secret=None, |
|||
resource_owner_key=None, |
|||
resource_owner_secret=None, |
|||
callback_uri=None, |
|||
signature_method=SIGNATURE_HMAC, |
|||
signature_type=SIGNATURE_TYPE_AUTH_HEADER, |
|||
rsa_key=None, verifier=None): |
|||
self.client_key = client_key |
|||
self.client_secret = client_secret |
|||
self.resource_owner_key = resource_owner_key |
|||
self.resource_owner_secret = resource_owner_secret |
|||
self.signature_method = signature_method |
|||
self.signature_type = signature_type |
|||
self.callback_uri = callback_uri |
|||
self.rsa_key = rsa_key |
|||
self.verifier = verifier |
|||
|
|||
if self.signature_method == SIGNATURE_RSA and self.rsa_key is None: |
|||
raise ValueError('rsa_key is required when using RSA signature method.') |
|||
|
|||
    def get_oauth_signature(self, request):
        """Get an OAuth signature to be used in signing a request."""
        if self.signature_method == SIGNATURE_PLAINTEXT:
            # fast-path
            return signature.sign_plaintext(self.client_secret,
                self.resource_owner_secret)

        uri, headers, body = self._render(request)

        collected_params = signature.collect_parameters(
            uri_query=urlparse.urlparse(uri).query,
            body=body,
            headers=headers)
        logging.debug("Collected params: {0}".format(collected_params))

        normalized_params = signature.normalize_parameters(collected_params)
        normalized_uri = signature.normalize_base_string_uri(request.uri)
        logging.debug("Normalized params: {0}".format(normalized_params))
        logging.debug("Normalized URI: {0}".format(normalized_uri))

        base_string = signature.construct_base_string(request.http_method,
            normalized_uri, normalized_params)

        logging.debug("Base signing string: {0}".format(base_string))

        if self.signature_method == SIGNATURE_HMAC:
            sig = signature.sign_hmac_sha1(base_string, self.client_secret,
                self.resource_owner_secret)
        elif self.signature_method == SIGNATURE_RSA:
            sig = signature.sign_rsa_sha1(base_string, self.rsa_key)
        else:
            sig = signature.sign_plaintext(self.client_secret,
                self.resource_owner_secret)

        logging.debug("Signature: {0}".format(sig))
        return sig

    def get_oauth_params(self):
        """Get the basic OAuth parameters to be used in generating a signature."""
        params = [
            (u'oauth_nonce', utils.generate_nonce()),
            (u'oauth_timestamp', utils.generate_timestamp()),
            (u'oauth_version', u'1.0'),
            (u'oauth_signature_method', self.signature_method),
            (u'oauth_consumer_key', self.client_key),
        ]
        if self.resource_owner_key:
            params.append((u'oauth_token', self.resource_owner_key))
        if self.callback_uri:
            params.append((u'oauth_callback', self.callback_uri))
        if self.verifier:
            params.append((u'oauth_verifier', self.verifier))

        return params

    def _render(self, request, formencode=False):
        """Render a signed request according to signature type.

        Returns a 3-tuple containing the request URI, headers, and body.

        If the formencode argument is True and the body contains parameters,
        it is escaped and returned as a valid formencoded string.
        """
        # TODO what if there are body params on a header-type auth?
        # TODO what if there are query params on a body-type auth?

        uri, headers, body = request.uri, request.headers, request.body

        # TODO: right now these prepare_* methods are very narrow in scope--they
        # only affect their little thing. In some cases (for example, with
        # header auth) it might be advantageous to allow these methods to touch
        # other parts of the request, like the headers, so the prepare_headers
        # method could also set the Content-Type header to x-www-form-urlencoded
        # like the spec requires. This would be a fundamental change though, and
        # I'm not sure how I feel about it.
        if self.signature_type == SIGNATURE_TYPE_AUTH_HEADER:
            headers = parameters.prepare_headers(request.oauth_params, request.headers)
        elif self.signature_type == SIGNATURE_TYPE_BODY and request.decoded_body is not None:
            body = parameters.prepare_form_encoded_body(request.oauth_params, request.decoded_body)
            if formencode:
                body = urlencode(body)
            headers['Content-Type'] = u'application/x-www-form-urlencoded'
        elif self.signature_type == SIGNATURE_TYPE_QUERY:
            uri = parameters.prepare_request_uri_query(request.oauth_params, request.uri)
        else:
            raise ValueError('Unknown signature type specified.')

        return uri, headers, body

    def sign(self, uri, http_method=u'GET', body=None, headers=None):
        """Sign a request.

        Signs an HTTP request with the specified parts.

        Returns a 3-tuple of the signed request's URI, headers, and body.
        Note that http_method is not returned as it is unaffected by the OAuth
        signing process.

        The body argument may be a dict, a list of 2-tuples, or a formencoded
        string. The Content-Type header must be 'application/x-www-form-urlencoded'
        if it is present.

        If the body argument is not one of the above, it will be returned
        verbatim as it is unaffected by the OAuth signing process. Attempting to
        sign a request with non-formencoded data using the OAuth body signature
        type is invalid and will raise an exception.

        If the body does contain parameters, it will be returned as a
        properly-formatted formencoded string.

        All string data MUST be unicode. This includes strings inside body
        dicts, for example.
        """
        # normalize request data
        request = Request(uri, http_method, body, headers)

        # sanity check
        content_type = request.headers.get('Content-Type', None)
        multipart = content_type and content_type.startswith('multipart/')
        should_have_params = content_type == CONTENT_TYPE_FORM_URLENCODED
        has_params = request.decoded_body is not None
        # 3.4.1.3.1.  Parameter Sources
        # [Parameters are collected from the HTTP request entity-body, but only
        # if [...]:
        #    *  The entity-body is single-part.
        if multipart and has_params:
            raise ValueError("Headers indicate a multipart body but body contains parameters.")
        #    *  The entity-body follows the encoding requirements of the
        #       "application/x-www-form-urlencoded" content-type as defined by
        #       [W3C.REC-html40-19980424].
        elif should_have_params and not has_params:
            raise ValueError("Headers indicate a formencoded body but body was not decodable.")
        #    *  The HTTP request entity-header includes the "Content-Type"
        #       header field set to "application/x-www-form-urlencoded".
        elif not should_have_params and has_params:
            raise ValueError("Body contains parameters but Content-Type header was not set.")

        # 3.5.2.  Form-Encoded Body
        # Protocol parameters can be transmitted in the HTTP request entity-
        # body, but only if the following REQUIRED conditions are met:
        # o  The entity-body is single-part.
        # o  The entity-body follows the encoding requirements of the
        #    "application/x-www-form-urlencoded" content-type as defined by
        #    [W3C.REC-html40-19980424].
        # o  The HTTP request entity-header includes the "Content-Type" header
        #    field set to "application/x-www-form-urlencoded".
        elif self.signature_type == SIGNATURE_TYPE_BODY and not (
                should_have_params and has_params and not multipart):
            raise ValueError('Body signatures may only be used with form-urlencoded content')

        # generate the basic OAuth parameters
        request.oauth_params = self.get_oauth_params()

        # generate the signature
        request.oauth_params.append((u'oauth_signature', self.get_oauth_signature(request)))

        # render the signed request and return it
        return self._render(request, formencode=True)
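
    # A rough usage sketch for Client.sign (the key, secret, and URI below are
    # made-up values; the signature inside the resulting Authorization header
    # varies with the generated nonce and timestamp):
    #
    #     client = Client(u'your_client_key', client_secret=u'your_client_secret')
    #     uri, headers, body = client.sign(u'http://example.com/request?a=1')
    #     # with the default AUTH_HEADER signature type, only headers change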
|

class Server(object):
    """A server used to verify OAuth 1.0 RFC 5849 requests"""
    def __init__(self, signature_method=SIGNATURE_HMAC, rsa_key=None):
        self.signature_method = signature_method
        self.rsa_key = rsa_key

    def get_client_secret(self, client_key):
        raise NotImplementedError("Subclasses must implement this function.")

    def get_resource_owner_secret(self, resource_owner_key):
        raise NotImplementedError("Subclasses must implement this function.")

    def get_signature_type_and_params(self, uri_query, headers, body):
        signature_types_with_oauth_params = filter(lambda s: s[1], (
            (SIGNATURE_TYPE_AUTH_HEADER, utils.filter_oauth_params(
                signature.collect_parameters(headers=headers,
                exclude_oauth_signature=False))),
            (SIGNATURE_TYPE_BODY, utils.filter_oauth_params(
                signature.collect_parameters(body=body,
                exclude_oauth_signature=False))),
            (SIGNATURE_TYPE_QUERY, utils.filter_oauth_params(
                signature.collect_parameters(uri_query=uri_query,
                exclude_oauth_signature=False))),
        ))

        if len(signature_types_with_oauth_params) > 1:
            raise ValueError('oauth_ params must come from only one signature type but were found in %s' % ', '.join(
                [s[0] for s in signature_types_with_oauth_params]))
        try:
            signature_type, params = signature_types_with_oauth_params[0]
        except IndexError:
            raise ValueError('oauth_ params are missing. Could not determine signature type.')

        return signature_type, dict(params)

    def check_client_key(self, client_key):
        raise NotImplementedError("Subclasses must implement this function.")

    def check_resource_owner_key(self, client_key, resource_owner_key):
        raise NotImplementedError("Subclasses must implement this function.")

    def check_timestamp_and_nonce(self, timestamp, nonce):
        raise NotImplementedError("Subclasses must implement this function.")

    def check_request_signature(self, uri, http_method=u'GET', body='',
            headers=None):
        """Check a request's supplied signature to make sure the request is
        valid.

        Servers should return HTTP status 400 if a ValueError exception
        is raised and HTTP status 401 on return value False.

        Per `section 3.2`_ of the spec.

        .. _`section 3.2`: http://tools.ietf.org/html/rfc5849#section-3.2
        """
        headers = headers or {}
        # FIXME: urlparse does not return unicode!
        uri_query = urlparse.urlparse(uri).query

        signature_type, params = self.get_signature_type_and_params(uri_query,
            headers, body)

        # the parameters may not include duplicate oauth entries
        filtered_params = utils.filter_oauth_params(params)
        if len(filtered_params) != len(params):
            raise ValueError("Duplicate OAuth entries.")

        params = dict(params)
        request_signature = params.get(u'oauth_signature')
        client_key = params.get(u'oauth_consumer_key')
        resource_owner_key = params.get(u'oauth_token')
        nonce = params.get(u'oauth_nonce')
        timestamp = params.get(u'oauth_timestamp')
        callback_uri = params.get(u'oauth_callback')
        verifier = params.get(u'oauth_verifier')
        signature_method = params.get(u'oauth_signature_method')

        # ensure all mandatory parameters are present
        if not all((request_signature, client_key, nonce,
                timestamp, signature_method)):
            raise ValueError("Missing OAuth parameters.")

        # if version is supplied, it must be "1.0"
        if u'oauth_version' in params and params[u'oauth_version'] != u'1.0':
            raise ValueError("Invalid OAuth version.")

        # signature method must be valid
        if signature_method not in SIGNATURE_METHODS:
            raise ValueError("Invalid signature method.")

        # ensure client key is valid
        if not self.check_client_key(client_key):
            return False

        # ensure resource owner key is valid and not expired
        if not self.check_resource_owner_key(client_key, resource_owner_key):
            return False

        # ensure the nonce and timestamp haven't been used before
        if not self.check_timestamp_and_nonce(timestamp, nonce):
            return False

        # FIXME: extract realm, then self.check_realm

        # oauth_client parameters depend on client chosen signature method
        # which may vary for each request, section 3.4
        # HMAC-SHA1 and PLAINTEXT share parameters
        if signature_method == SIGNATURE_RSA:
            oauth_client = Client(client_key,
                resource_owner_key=resource_owner_key,
                callback_uri=callback_uri,
                signature_method=signature_method,
                signature_type=signature_type,
                rsa_key=self.rsa_key, verifier=verifier)
        else:
            client_secret = self.get_client_secret(client_key)
            resource_owner_secret = self.get_resource_owner_secret(
                resource_owner_key)
            oauth_client = Client(client_key,
                client_secret=client_secret,
                resource_owner_key=resource_owner_key,
                resource_owner_secret=resource_owner_secret,
                callback_uri=callback_uri,
                signature_method=signature_method,
                signature_type=signature_type,
                verifier=verifier)

        request = Request(uri, http_method, body, headers)
        request.oauth_params = params

        client_signature = oauth_client.get_oauth_signature(request)

        # FIXME: use near constant time string compare to avoid timing attacks
        return client_signature == request_signature
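
# A minimal verification sketch: Server is abstract, so a subclass must supply
# the credential lookups (the in-memory dicts below are hypothetical):
#
#     class ExampleServer(Server):
#         def get_client_secret(self, client_key):
#             return {u'key': u'secret'}.get(client_key)
#         def get_resource_owner_secret(self, resource_owner_key):
#             return {u'token': u'token_secret'}.get(resource_owner_key)
#         def check_client_key(self, client_key):
#             return client_key == u'key'
#         def check_resource_owner_key(self, client_key, resource_owner_key):
#             return resource_owner_key == u'token'
#         def check_timestamp_and_nonce(self, timestamp, nonce):
#             return True  # a real server must reject replayed pairs
#
#     # ExampleServer().check_request_signature(uri, headers=headers)
#     # returns True only when the recomputed signature matches.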
@ -0,0 +1,134 @@ |
# -*- coding: utf-8 -*-
from __future__ import absolute_import

"""
oauthlib.oauth1.rfc5849.parameters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This module contains methods related to `section 3.5`_ of the OAuth 1.0a spec.

.. _`section 3.5`: http://tools.ietf.org/html/rfc5849#section-3.5
"""

from urlparse import urlparse, urlunparse
from . import utils
from oauthlib.common import extract_params, urlencode


# TODO: do we need filter_params now that oauth_params are handled by Request?
#       We can easily pass in just oauth protocol params.
@utils.filter_params
def prepare_headers(oauth_params, headers=None, realm=None):
    """**Prepare the Authorization header.**
    Per `section 3.5.1`_ of the spec.

    Protocol parameters can be transmitted using the HTTP "Authorization"
    header field as defined by `RFC2617`_ with the auth-scheme name set to
    "OAuth" (case insensitive).

    For example::

        Authorization: OAuth realm="Example",
            oauth_consumer_key="0685bd9184jfhq22",
            oauth_token="ad180jjd733klru7",
            oauth_signature_method="HMAC-SHA1",
            oauth_signature="wOJIO9A2W5mFwDgiDvZbTSMK%2FPY%3D",
            oauth_timestamp="137131200",
            oauth_nonce="4572616e48616d6d65724c61686176",
            oauth_version="1.0"

    .. _`section 3.5.1`: http://tools.ietf.org/html/rfc5849#section-3.5.1
    .. _`RFC2617`: http://tools.ietf.org/html/rfc2617
    """
    headers = headers or {}

    # Protocol parameters SHALL be included in the "Authorization" header
    # field as follows:
    authorization_header_parameters_parts = []
    for oauth_parameter_name, value in oauth_params:
        # 1.  Parameter names and values are encoded per Parameter Encoding
        #     (`Section 3.6`_)
        #
        #     .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
        escaped_name = utils.escape(oauth_parameter_name)
        escaped_value = utils.escape(value)

        # 2.  Each parameter's name is immediately followed by an "=" character
        #     (ASCII code 61), a """ character (ASCII code 34), the parameter
        #     value (MAY be empty), and another """ character (ASCII code 34).
        part = u'{0}="{1}"'.format(escaped_name, escaped_value)

        authorization_header_parameters_parts.append(part)

    # 3.  Parameters are separated by a "," character (ASCII code 44) and
    #     OPTIONAL linear whitespace per `RFC2617`_.
    #
    #     .. _`RFC2617`: http://tools.ietf.org/html/rfc2617
    authorization_header_parameters = ', '.join(
        authorization_header_parameters_parts)

    # 4.  The OPTIONAL "realm" parameter MAY be added and interpreted per
    #     `RFC2617 section 1.2`_.
    #
    #     .. _`RFC2617 section 1.2`: http://tools.ietf.org/html/rfc2617#section-1.2
    if realm:
        # NOTE: realm should *not* be escaped
        authorization_header_parameters = (u'realm="%s", ' % realm +
            authorization_header_parameters)

    # the auth-scheme name set to "OAuth" (case insensitive).
    authorization_header = u'OAuth %s' % authorization_header_parameters

    # contribute the Authorization header to the given headers
    full_headers = {}
    full_headers.update(headers)
    full_headers[u'Authorization'] = authorization_header
    return full_headers

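# For instance, a minimal parameter list (values borrowed from the docstring
# example above) renders as:
#
#     prepare_headers([(u'oauth_consumer_key', u'0685bd9184jfhq22')],
#         realm=u'Example')
#     # {u'Authorization': u'OAuth realm="Example", oauth_consumer_key="0685bd9184jfhq22"'}
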
def _append_params(oauth_params, params):
    """Append OAuth params to an existing set of parameters.

    Both params and oauth_params must be lists of 2-tuples.

    Per `section 3.5.2`_ and `3.5.3`_ of the spec.

    .. _`section 3.5.2`: http://tools.ietf.org/html/rfc5849#section-3.5.2
    .. _`3.5.3`: http://tools.ietf.org/html/rfc5849#section-3.5.3

    """
    merged = list(params)
    merged.extend(oauth_params)
    # The request URI / entity-body MAY include other request-specific
    # parameters, in which case, the protocol parameters SHOULD be appended
    # following the request-specific parameters, properly separated by an "&"
    # character (ASCII code 38)
    merged.sort(key=lambda i: i[0].startswith('oauth_'))
    return merged


def prepare_form_encoded_body(oauth_params, body):
    """Prepare the Form-Encoded Body.

    Per `section 3.5.2`_ of the spec.

    .. _`section 3.5.2`: http://tools.ietf.org/html/rfc5849#section-3.5.2

    """
    # append OAuth params to the existing body
    return _append_params(oauth_params, body)


def prepare_request_uri_query(oauth_params, uri):
    """Prepare the Request URI Query.

    Per `section 3.5.3`_ of the spec.

    .. _`section 3.5.3`: http://tools.ietf.org/html/rfc5849#section-3.5.3

    """
    # append OAuth params to the existing set of query components
    sch, net, path, par, query, fra = urlparse(uri)
    query = urlencode(_append_params(oauth_params, extract_params(query) or []))
    return urlunparse((sch, net, path, par, query, fra))
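
# A small sketch of the resulting URI (parameter values are made up):
#
#     prepare_request_uri_query(
#         [(u'oauth_consumer_key', u'abc'), (u'oauth_nonce', u'xyz')],
#         u'http://example.com/request?b=1')
#     # u'http://example.com/request?b=1&oauth_consumer_key=abc&oauth_nonce=xyz'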
@ -0,0 +1,501 @@ |
# -*- coding: utf-8 -*-
from __future__ import absolute_import
"""
oauthlib.oauth1.rfc5849.signature
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This module represents a direct implementation of `section 3.4`_ of the spec.

Terminology:
    * Client: software interfacing with an OAuth API
    * Server: the API provider
    * Resource Owner: the user who is granting authorization to the client

Steps for signing a request:

    1. Collect parameters from the uri query, auth header, & body
    2. Normalize those parameters
    3. Normalize the uri
    4. Pass the normalized uri, normalized parameters, and http method to
       construct the base string
    5. Pass the base string and any keys needed to a signing function

.. _`section 3.4`: http://tools.ietf.org/html/rfc5849#section-3.4
"""
import binascii
import hashlib
import hmac
import urlparse
from . import utils
from oauthlib.common import extract_params

def construct_base_string(http_method, base_string_uri,
        normalized_encoded_request_parameters):
    """**String Construction**
    Per `section 3.4.1.1`_ of the spec.

    For example, the HTTP request::

        POST /request?b5=%3D%253D&a3=a&c%40=&a2=r%20b HTTP/1.1
        Host: example.com
        Content-Type: application/x-www-form-urlencoded
        Authorization: OAuth realm="Example",
            oauth_consumer_key="9djdj82h48djs9d2",
            oauth_token="kkk9d7dh3k39sjv7",
            oauth_signature_method="HMAC-SHA1",
            oauth_timestamp="137131201",
            oauth_nonce="7d8f3e4a",
            oauth_signature="bYT5CMsGcbgUdFHObYMEfcx6bsw%3D"

        c2&a3=2+q

    is represented by the following signature base string (line breaks
    are for display purposes only)::

        POST&http%3A%2F%2Fexample.com%2Frequest&a2%3Dr%2520b%26a3%3D2%2520q
        %26a3%3Da%26b5%3D%253D%25253D%26c%2540%3D%26c2%3D%26oauth_consumer_
        key%3D9djdj82h48djs9d2%26oauth_nonce%3D7d8f3e4a%26oauth_signature_m
        ethod%3DHMAC-SHA1%26oauth_timestamp%3D137131201%26oauth_token%3Dkkk
        9d7dh3k39sjv7

    .. _`section 3.4.1.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.1
    """

    # The signature base string is constructed by concatenating together,
    # in order, the following HTTP request elements:

    # 1.  The HTTP request method in uppercase.  For example: "HEAD",
    #     "GET", "POST", etc.  If the request uses a custom HTTP method, it
    #     MUST be encoded (`Section 3.6`_).
    #
    #     .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
    base_string = utils.escape(http_method.upper())

    # 2.  An "&" character (ASCII code 38).
    base_string += u'&'

    # 3.  The base string URI from `Section 3.4.1.2`_, after being encoded
    #     (`Section 3.6`_).
    #
    #     .. _`Section 3.4.1.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.2
    #     .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
    base_string += utils.escape(base_string_uri)

    # 4.  An "&" character (ASCII code 38).
    base_string += u'&'

    # 5.  The request parameters as normalized in `Section 3.4.1.3.2`_, after
    #     being encoded (`Section 3.6`_).
    #
    #     .. _`Section 3.4.1.3.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.2
    #     .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
    base_string += utils.escape(normalized_encoded_request_parameters)

    return base_string

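# A small illustration with already-normalized inputs (values are made up):
#
#     construct_base_string(u'post', u'http://example.com/request',
#         u'a2=r%20b&a3=a')
#     # u'POST&http%3A%2F%2Fexample.com%2Frequest&a2%3Dr%2520b%26a3%3Da'
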
def normalize_base_string_uri(uri):
    """**Base String URI**
    Per `section 3.4.1.2`_ of the spec.

    For example, the HTTP request::

        GET /r%20v/X?id=123 HTTP/1.1
        Host: EXAMPLE.COM:80

    is represented by the base string URI: "http://example.com/r%20v/X".

    In another example, the HTTPS request::

        GET /?q=1 HTTP/1.1
        Host: www.example.net:8080

    is represented by the base string URI: "https://www.example.net:8080/".

    .. _`section 3.4.1.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.2
    """
    if not isinstance(uri, unicode):
        raise ValueError('uri must be a unicode object.')

    # FIXME: urlparse does not support unicode
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(uri)

    # The scheme, authority, and path of the request resource URI `RFC3986`_
    # are included by constructing an "http" or "https" URI representing
    # the request resource (without the query or fragment) as follows:
    #
    # .. _`RFC3986`: http://tools.ietf.org/html/rfc3986

    # 1.  The scheme and host MUST be in lowercase.
    scheme = scheme.lower()
    netloc = netloc.lower()

    # 2.  The host and port values MUST match the content of the HTTP
    #     request "Host" header field.
    # TODO: enforce this constraint

    # 3.  The port MUST be included if it is not the default port for the
    #     scheme, and MUST be excluded if it is the default.  Specifically,
    #     the port MUST be excluded when making an HTTP request `RFC2616`_
    #     to port 80 or when making an HTTPS request `RFC2818`_ to port 443.
    #     All other non-default port numbers MUST be included.
    #
    #     .. _`RFC2616`: http://tools.ietf.org/html/rfc2616
    #     .. _`RFC2818`: http://tools.ietf.org/html/rfc2818
    default_ports = (
        (u'http', u'80'),
        (u'https', u'443'),
    )
    if u':' in netloc:
        host, port = netloc.split(u':', 1)
        if (scheme, port) in default_ports:
            netloc = host

    return urlparse.urlunparse((scheme, netloc, path, u'', u'', u''))

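# For example, default ports are dropped and case is folded:
#
#     normalize_base_string_uri(u'HTTP://Example.com:80/r%20v/X?id=123')
#     # u'http://example.com/r%20v/X'
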
# ** Request Parameters **
#
# Per `section 3.4.1.3`_ of the spec.
#
# In order to guarantee a consistent and reproducible representation of
# the request parameters, the parameters are collected and decoded to
# their original decoded form.  They are then sorted and encoded in a
# particular manner that is often different from their original
# encoding scheme, and concatenated into a single string.
#
# .. _`section 3.4.1.3`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3

def collect_parameters(uri_query='', body=[], headers=None,
        exclude_oauth_signature=True):
    """**Parameter Sources**

    Parameters starting with `oauth_` will be unescaped.

    Body parameters must be supplied as a dict, a list of 2-tuples, or a
    formencoded query string.

    Headers must be supplied as a dict.

    Per `section 3.4.1.3.1`_ of the spec.

    For example, the HTTP request::

        POST /request?b5=%3D%253D&a3=a&c%40=&a2=r%20b HTTP/1.1
        Host: example.com
        Content-Type: application/x-www-form-urlencoded
        Authorization: OAuth realm="Example",
            oauth_consumer_key="9djdj82h48djs9d2",
            oauth_token="kkk9d7dh3k39sjv7",
            oauth_signature_method="HMAC-SHA1",
            oauth_timestamp="137131201",
            oauth_nonce="7d8f3e4a",
            oauth_signature="djosJKDKJSD8743243%2Fjdk33klY%3D"

        c2&a3=2+q

    contains the following (fully decoded) parameters used in the
    signature base string::

        +------------------------+------------------+
        |          Name          |       Value      |
        +------------------------+------------------+
        |           b5           |       =%3D       |
        |           a3           |         a        |
        |           c@           |                  |
        |           a2           |        r b       |
        |   oauth_consumer_key   | 9djdj82h48djs9d2 |
        |       oauth_token      | kkk9d7dh3k39sjv7 |
        | oauth_signature_method |     HMAC-SHA1    |
        |     oauth_timestamp    |     137131201    |
        |       oauth_nonce      |     7d8f3e4a     |
        |           c2           |                  |
        |           a3           |        2 q       |
        +------------------------+------------------+

    Note that the value of "b5" is "=%3D" and not "==".  Both "c@" and
    "c2" have empty values.  While the encoding rules specified in this
    specification for the purpose of constructing the signature base
    string exclude the use of a "+" character (ASCII code 43) to
    represent an encoded space character (ASCII code 32), this practice
    is widely used in "application/x-www-form-urlencoded" encoded values,
    and MUST be properly decoded, as demonstrated by one of the "a3"
    parameter instances (the "a3" parameter is used twice in this
    request).

    .. _`section 3.4.1.3.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.1
    """
    headers = headers or {}
    params = []

    # The parameters from the following sources are collected into a single
    # list of name/value pairs:

    # *  The query component of the HTTP request URI as defined by
    #    `RFC3986, Section 3.4`_.  The query component is parsed into a list
    #    of name/value pairs by treating it as an
    #    "application/x-www-form-urlencoded" string, separating the names
    #    and values and decoding them as defined by
    #    `W3C.REC-html40-19980424`_, Section 17.13.4.
    #
    # .. _`RFC3986, Section 3.4`: http://tools.ietf.org/html/rfc3986#section-3.4
    # .. _`W3C.REC-html40-19980424`: http://tools.ietf.org/html/rfc5849#ref-W3C.REC-html40-19980424
    if uri_query:
        params.extend(urlparse.parse_qsl(uri_query, keep_blank_values=True))

    # *  The OAuth HTTP "Authorization" header field (`Section 3.5.1`_) if
    #    present.  The header's content is parsed into a list of name/value
    #    pairs excluding the "realm" parameter if present.  The parameter
    #    values are decoded as defined by `Section 3.5.1`_.
    #
    # .. _`Section 3.5.1`: http://tools.ietf.org/html/rfc5849#section-3.5.1
    if headers:
        headers_lower = dict((k.lower(), v) for k, v in headers.items())
        authorization_header = headers_lower.get(u'authorization')
        if authorization_header is not None:
            params.extend([i for i in utils.parse_authorization_header(
                authorization_header) if i[0] != u'realm'])

    # *  The HTTP request entity-body, but only if all of the following
    #    conditions are met:
    #    *  The entity-body is single-part.
    #
    #    *  The entity-body follows the encoding requirements of the
    #       "application/x-www-form-urlencoded" content-type as defined by
    #       `W3C.REC-html40-19980424`_.
    #
    #    *  The HTTP request entity-header includes the "Content-Type"
    #       header field set to "application/x-www-form-urlencoded".
    #
    # .. _`W3C.REC-html40-19980424`: http://tools.ietf.org/html/rfc5849#ref-W3C.REC-html40-19980424

    # TODO: enforce header param inclusion conditions
    bodyparams = extract_params(body) or []
    params.extend(bodyparams)

    # ensure all oauth params are unescaped
    unescaped_params = []
    for k, v in params:
        if k.startswith(u'oauth_'):
            v = utils.unescape(v)
        unescaped_params.append((k, v))

    # The "oauth_signature" parameter MUST be excluded from the signature
    # base string if present.
    if exclude_oauth_signature:
        unescaped_params = filter(lambda i: i[0] != u'oauth_signature',
            unescaped_params)

    return unescaped_params

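# A compact sketch of the collection step (inputs are made-up values):
#
#     collect_parameters(
#         uri_query=u'b5=%3D%253D&c2=',
#         body=[(u'a3', u'2 q')],
#         headers={u'Authorization': u'OAuth oauth_nonce="7d8f3e4a"'})
#     # [(u'b5', u'=%3D'), (u'c2', u''), (u'oauth_nonce', u'7d8f3e4a'), (u'a3', u'2 q')]
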
def normalize_parameters(params):
    """**Parameters Normalization**
    Per `section 3.4.1.3.2`_ of the spec.

    For example, the list of parameters from the previous section would
    be normalized as follows:

    Encoded::

        +------------------------+------------------+
        |          Name          |       Value      |
        +------------------------+------------------+
        |           b5           |     %3D%253D     |
        |           a3           |         a        |
        |          c%40          |                  |
        |           a2           |       r%20b      |
        |   oauth_consumer_key   | 9djdj82h48djs9d2 |
        |       oauth_token      | kkk9d7dh3k39sjv7 |
        | oauth_signature_method |     HMAC-SHA1    |
        |     oauth_timestamp    |     137131201    |
        |       oauth_nonce      |     7d8f3e4a     |
        |           c2           |                  |
        |           a3           |       2%20q      |
        +------------------------+------------------+

    Sorted::

        +------------------------+------------------+
        |          Name          |       Value      |
        +------------------------+------------------+
        |           a2           |       r%20b      |
        |           a3           |       2%20q      |
        |           a3           |         a        |
        |           b5           |     %3D%253D     |
        |          c%40          |                  |
        |           c2           |                  |
        |   oauth_consumer_key   | 9djdj82h48djs9d2 |
        |       oauth_nonce      |     7d8f3e4a     |
        | oauth_signature_method |     HMAC-SHA1    |
        |     oauth_timestamp    |     137131201    |
        |       oauth_token      | kkk9d7dh3k39sjv7 |
        +------------------------+------------------+

    Concatenated Pairs::

        +-------------------------------------+
        |              Name=Value             |
        +-------------------------------------+
        | a2=r%20b                            |
        | a3=2%20q                            |
        | a3=a                                |
        | b5=%3D%253D                         |
        | c%40=                               |
        | c2=                                 |
        | oauth_consumer_key=9djdj82h48djs9d2 |
        | oauth_nonce=7d8f3e4a                |
        | oauth_signature_method=HMAC-SHA1    |
        | oauth_timestamp=137131201           |
        | oauth_token=kkk9d7dh3k39sjv7        |
        +-------------------------------------+

    and concatenated together into a single string (line breaks are for
    display purposes only)::

        a2=r%20b&a3=2%20q&a3=a&b5=%3D%253D&c%40=&c2=&oauth_consumer_key=9dj
        dj82h48djs9d2&oauth_nonce=7d8f3e4a&oauth_signature_method=HMAC-SHA1
        &oauth_timestamp=137131201&oauth_token=kkk9d7dh3k39sjv7

    .. _`section 3.4.1.3.2`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3.2
    """

    # The parameters collected in `Section 3.4.1.3`_ are normalized into a
    # single string as follows:
    #
    # .. _`Section 3.4.1.3`: http://tools.ietf.org/html/rfc5849#section-3.4.1.3

    # 1.  First, the name and value of each parameter are encoded
    #     (`Section 3.6`_).
    #
    #     .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
    key_values = [(utils.escape(k), utils.escape(v)) for k, v in params]

    # 2.  The parameters are sorted by name, using ascending byte value
    #     ordering.  If two or more parameters share the same name, they
    #     are sorted by their value.
    key_values.sort()

    # 3.  The name of each parameter is concatenated to its corresponding
    #     value using an "=" character (ASCII code 61) as a separator, even
    #     if the value is empty.
    parameter_parts = [u'{0}={1}'.format(k, v) for k, v in key_values]

    # 4.  The sorted name/value pairs are concatenated together into a
    #     single string by using an "&" character (ASCII code 38) as
    #     separator.
    return u'&'.join(parameter_parts)

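# For instance:
#
#     normalize_parameters([(u'a3', u'2 q'), (u'a2', u'r b')])
#     # u'a2=r%20b&a3=2%20q'
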
def sign_hmac_sha1(base_string, client_secret, resource_owner_secret):
    """**HMAC-SHA1**

    The "HMAC-SHA1" signature method uses the HMAC-SHA1 signature
    algorithm as defined in `RFC2104`_::

        digest = HMAC-SHA1 (key, text)

    Per `section 3.4.2`_ of the spec.

    .. _`RFC2104`: http://tools.ietf.org/html/rfc2104
    .. _`section 3.4.2`: http://tools.ietf.org/html/rfc5849#section-3.4.2
    """

    # The HMAC-SHA1 function variables are used in following way:

    # text is set to the value of the signature base string from
    # `Section 3.4.1.1`_.
    #
    # .. _`Section 3.4.1.1`: http://tools.ietf.org/html/rfc5849#section-3.4.1.1
    text = base_string

    # key is set to the concatenated values of:
    # 1.  The client shared-secret, after being encoded (`Section 3.6`_).
    #
    #     .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
    key = utils.escape(client_secret or u'')

    # 2.  An "&" character (ASCII code 38), which MUST be included
    #     even when either secret is empty.
    key += u'&'

    # 3.  The token shared-secret, after being encoded (`Section 3.6`_).
    #
    #     .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
    key += utils.escape(resource_owner_secret or u'')

    # FIXME: HMAC does not support unicode!
    key_utf8 = key.encode('utf-8')
    text_utf8 = text.encode('utf-8')
    signature = hmac.new(key_utf8, text_utf8, hashlib.sha1)

    # digest is used to set the value of the "oauth_signature" protocol
    # parameter, after the result octet string is base64-encoded
    # per `RFC2045, Section 6.8`_.
    #
    # .. _`RFC2045, Section 6.8`: http://tools.ietf.org/html/rfc2045#section-6.8
    return binascii.b2a_base64(signature.digest())[:-1].decode('utf-8')

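# A usage sketch, assuming base_string came from construct_base_string above;
# the resulting digest depends on the exact base string and secrets:
#
#     sign_hmac_sha1(base_string, u'client_secret', u'token_secret')
#     # u'...' (base64-encoded HMAC-SHA1 digest)
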
def sign_rsa_sha1(base_string, rsa_private_key):
    """**RSA-SHA1**

    Per `section 3.4.3`_ of the spec.

    The "RSA-SHA1" signature method uses the RSASSA-PKCS1-v1_5 signature
    algorithm as defined in `RFC3447, Section 8.2`_ (also known as
    PKCS#1), using SHA-1 as the hash function for EMSA-PKCS1-v1_5.  To
    use this method, the client MUST have established client credentials
    with the server that included its RSA public key (in a manner that is
    beyond the scope of this specification).

    NOTE: this method requires the python-rsa library.

    .. _`section 3.4.3`: http://tools.ietf.org/html/rfc5849#section-3.4.3
    .. _`RFC3447, Section 8.2`: http://tools.ietf.org/html/rfc3447#section-8.2

    """

    # TODO: finish RSA documentation

    import rsa
    key = rsa.PrivateKey.load_pkcs1(rsa_private_key)
    sig = rsa.sign(base_string, key, 'SHA-1')
    return binascii.b2a_base64(sig)[:-1]

def sign_plaintext(client_secret, resource_owner_secret):
    """Sign a request using plaintext.

    Per `section 3.4.4`_ of the spec.

    The "PLAINTEXT" method does not employ a signature algorithm.  It
    MUST be used with a transport-layer mechanism such as TLS or SSL (or
    sent over a secure channel with equivalent protections).  It does not
    utilize the signature base string or the "oauth_timestamp" and
    "oauth_nonce" parameters.

    .. _`section 3.4.4`: http://tools.ietf.org/html/rfc5849#section-3.4.4

    """

    # The "oauth_signature" protocol parameter is set to the concatenated
    # value of:

    # 1.  The client shared-secret, after being encoded (`Section 3.6`_).
    #
    #     .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
    signature = utils.escape(client_secret or u'')

    # 2.  An "&" character (ASCII code 38), which MUST be included even
    #     when either secret is empty.
    signature += u'&'

    # 3.  The token shared-secret, after being encoded (`Section 3.6`_).
    #
    #     .. _`Section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6
    signature += utils.escape(resource_owner_secret or u'')

    return signature
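
# For example, with made-up secrets:
#
#     sign_plaintext(u'client_secret', u'token_secret')
#     # u'client_secret&token_secret'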
|
@ -0,0 +1,141 @@ |
# -*- coding: utf-8 -*-

"""
oauthlib.oauth1.rfc5849.utils
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This module contains utility methods used by various parts of the OAuth
spec.
"""

import string
import time
import urllib2
from random import getrandbits, choice

from oauthlib.common import quote, unquote

UNICODE_ASCII_CHARACTER_SET = (string.ascii_letters.decode('ascii') +
    string.digits.decode('ascii'))


def filter_params(target):
    """Decorator which filters params to remove non-oauth_* parameters

    Assumes the decorated method takes a params dict or list of tuples as its
    first argument.
    """
    def wrapper(params, *args, **kwargs):
        params = filter_oauth_params(params)
        return target(params, *args, **kwargs)

    wrapper.__doc__ = target.__doc__
    return wrapper


def filter_oauth_params(params):
    """Removes all non oauth parameters from a dict or a list of params."""
    is_oauth = lambda kv: kv[0].startswith(u"oauth_")
    if isinstance(params, dict):
        return filter(is_oauth, params.items())
    else:
        return filter(is_oauth, params)

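# For example:
#
#     filter_oauth_params([(u'oauth_nonce', u'abc'), (u'realm', u'x')])
#     # [(u'oauth_nonce', u'abc')]
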
def generate_timestamp():
    """Get seconds since epoch (UTC).

    Per `section 3.3`_ of the spec.

    .. _`section 3.3`: http://tools.ietf.org/html/rfc5849#section-3.3
    """
    return unicode(int(time.time()))


def generate_nonce():
    """Generate pseudorandom nonce that is unlikely to repeat.

    Per `section 3.3`_ of the spec.

    A random 64-bit number is appended to the epoch timestamp for both
    randomness and to decrease the likelihood of collisions.

    .. _`section 3.3`: http://tools.ietf.org/html/rfc5849#section-3.3
    """
    return unicode(getrandbits(64)) + generate_timestamp()


def generate_token(length=20, chars=UNICODE_ASCII_CHARACTER_SET):
    """Generates a generic OAuth token.

    According to `section 2`_ of the spec, the method of token
    construction is undefined. This implementation is simply a random selection
    of `length` choices from `chars`.

    Credit to Ignacio Vazquez-Abrams for his excellent `Stackoverflow answer`_.

    .. _`section 2`: http://tools.ietf.org/html/rfc5849#section-2
    .. _`Stackoverflow answer`: http://stackoverflow.com/questions/2257441/
       python-random-string-generation-with-upper-case-letters-and-digits

    """
    return u''.join(choice(chars) for x in range(length))

def escape(u):
    """Escape a unicode string in an OAuth-compatible fashion.

    Per `section 3.6`_ of the spec.

    .. _`section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6

    """
    if not isinstance(u, unicode):
        raise ValueError('Only unicode objects are escapable.')
    # Letters, digits, and the characters '_.-' are already treated as safe
    # by urllib.quote(). We need to add '~' to fully support rfc5849.
    return quote(u, safe='~')


def unescape(u):
    if not isinstance(u, unicode):
        raise ValueError('Only unicode objects are unescapable.')
    return unquote(u)


def urlencode(query):
    """Encode a sequence of two-element tuples or a dictionary into a URL query string.

    Operates using an OAuth-safe escape() method, in contrast to urllib.urlencode.
    """
    # Convert dictionaries to list of tuples
    if isinstance(query, dict):
        query = query.items()
    return u"&".join([u'='.join([escape(k), escape(v)]) for k, v in query])


def parse_keqv_list(l):
    """A unicode-safe version of urllib2.parse_keqv_list"""
    encoded_list = [u.encode('utf-8') for u in l]
    encoded_parsed = urllib2.parse_keqv_list(encoded_list)
    return dict((k.decode('utf-8'),
        v.decode('utf-8')) for k, v in encoded_parsed.items())


def parse_http_list(u):
    """A unicode-safe version of urllib2.parse_http_list"""
    encoded_str = u.encode('utf-8')
    encoded_list = urllib2.parse_http_list(encoded_str)
    return [s.decode('utf-8') for s in encoded_list]


def parse_authorization_header(authorization_header):
    """Parse an OAuth authorization header into a list of 2-tuples"""
    auth_scheme = u'OAuth '
    if authorization_header.startswith(auth_scheme):
        authorization_header = authorization_header.replace(auth_scheme, u'', 1)
    items = parse_http_list(authorization_header)
    try:
        return parse_keqv_list(items).items()
    except ValueError:
        raise ValueError('Malformed authorization header')
@ -0,0 +1,13 @@ |
# -*- coding: utf-8 -*-
from __future__ import absolute_import

"""
oauthlib.oauth2
~~~~~~~~~~~~~~~

This module is a wrapper for the most recent implementation of OAuth 2.0 Client
and Server classes.
"""

from .draft25 import Client, Server

@ -0,0 +1,14 @@ |
"""
oauthlib.oauth2.draft25
~~~~~~~~~~~~~~~~~~~~~~~

This module is an implementation of various logic needed
for signing and checking OAuth 2.0 draft 25 requests.
"""

class Client(object):
    pass

class Server(object):
    pass

@ -0,0 +1,131 @@ |
from __future__ import absolute_import
"""
oauthlib.oauth2.draft25.tokens
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This module contains methods for adding two types of access tokens to requests.

- Bearer http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
- MAC http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01

"""
from binascii import b2a_base64
import hashlib
import hmac
from urlparse import urlparse

from . import utils


def prepare_mac_header(token, uri, key, http_method, nonce=None, headers=None,
        body=None, ext=u'', hash_algorithm=u'hmac-sha-1'):
    """Add a `MAC Access Authentication`_ signature to headers.

    Unlike OAuth 1, this HMAC signature does not require inclusion of the
    request payload/body, nor does it use a combination of client_secret and
    token_secret but rather a mac_key provided together with the access token.

    Currently two algorithms are supported, "hmac-sha-1" and "hmac-sha-256",
    `extension algorithms`_ are not supported.

    Example MAC Authorization header, linebreaks added for clarity::

        Authorization: MAC id="h480djs93hd8",
            nonce="1336363200:dj83hs9s",
            mac="bhCQXTVyfj5cmA9uKkPFx1zeOXM="

    .. _`MAC Access Authentication`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01
    .. _`extension algorithms`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-7.1

    :param token: Access token identifier (the MAC key identifier).
    :param uri: Request URI.
    :param key: MAC key provided by the token endpoint.
    :param http_method: HTTP Request method.
    :param headers: Request headers as a dictionary.
    :param hash_algorithm: HMAC algorithm provided by the token endpoint.
    :return: headers dictionary with the authorization field added.
    """
    http_method = http_method.upper()
    host, port = utils.host_from_uri(uri)

    if hash_algorithm.lower() == u'hmac-sha-1':
        h = hashlib.sha1
    else:
        h = hashlib.sha256

    nonce = nonce or u'{0}:{1}'.format(utils.generate_nonce(), utils.generate_timestamp())
    sch, net, path, par, query, fra = urlparse(uri)

    if query:
        request_uri = path + u'?' + query
    else:
        request_uri = path

    # Hash the body/payload
    if body is not None:
        bodyhash = b2a_base64(h(body).digest())[:-1].decode('utf-8')
    else:
        bodyhash = u''

    # Create the normalized base string
    base = []
    base.append(nonce)
    base.append(http_method.upper())
    base.append(request_uri)
    base.append(host)
    base.append(port)
    base.append(bodyhash)
    base.append(ext)
    base_string = u'\n'.join(base) + u'\n'

    # hmac struggles with unicode strings - http://bugs.python.org/issue5285
    if isinstance(key, unicode):
        key = key.encode('utf-8')
    sign = hmac.new(key, base_string.encode('utf-8'), h)
    sign = b2a_base64(sign.digest())[:-1].decode('utf-8')

    header = []
    header.append(u'MAC id="%s"' % token)
    header.append(u'nonce="%s"' % nonce)
    if bodyhash:
        header.append(u'bodyhash="%s"' % bodyhash)
    if ext:
        header.append(u'ext="%s"' % ext)
    header.append(u'mac="%s"' % sign)

    headers = headers or {}
    headers[u'Authorization'] = u', '.join(header)
    return headers

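# A rough usage sketch (token, key, and URI are made-up values; the nonce and
# mac components vary per call):
#
#     prepare_mac_header(u'h480djs93hd8',
#         u'http://example.com/resource/1?b=1', u'489dks293j39', u'GET')
#     # {u'Authorization': u'MAC id="h480djs93hd8", nonce="...", mac="..."'}
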
def prepare_bearer_uri(token, uri):
    """Add a `Bearer Token`_ to the request URI.
    Not recommended, use only if client can't use authorization header or body.

    http://www.example.com/path?access_token=h480djs93hd8

    .. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
    """
    return utils.add_params_to_uri(uri, [(u'access_token', token)])


def prepare_bearer_headers(token, headers=None):
    """Add a `Bearer Token`_ to the request headers.
    Recommended method of passing bearer tokens.

    Authorization: Bearer h480djs93hd8

    .. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
    """
    headers = headers or {}
    headers[u'Authorization'] = u'Bearer %s' % token
    return headers


def prepare_bearer_body(token, body=u''):
    """Add a `Bearer Token`_ to the request body.

    access_token=h480djs93hd8

    .. _`Bearer Token`: http://tools.ietf.org/html/draft-ietf-oauth-v2-bearer-18
    """
    return utils.add_params_to_qs(body, [(u'access_token', token)])
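
# The three placements side by side (the token value is illustrative):
#
#     prepare_bearer_headers(u'h480djs93hd8')
#     # {u'Authorization': u'Bearer h480djs93hd8'}
#     prepare_bearer_uri(u'h480djs93hd8', u'http://example.com/path')
#     # u'http://example.com/path?access_token=h480djs93hd8'
#     prepare_bearer_body(u'h480djs93hd8')
#     # u'access_token=h480djs93hd8'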
@ -0,0 +1,128 @@ |
"""
oauthlib.oauth2.draft25.utils
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

This module contains utility methods used by various parts of the OAuth 2 spec.
"""

import random
import string
import time
import urllib
from urlparse import urlparse, urlunparse, parse_qsl

UNICODE_ASCII_CHARACTER_SET = (string.ascii_letters.decode('ascii') +
    string.digits.decode('ascii'))

def add_params_to_qs(query, params):
    """Extend a query with a list of two-tuples.

    :param query: Query string.
    :param params: List of two-tuples.
    :return: extended query
    """
    queryparams = parse_qsl(query, keep_blank_values=True)
    queryparams.extend(params)
    return urlencode(queryparams)


def add_params_to_uri(uri, params):
    """Add a list of two-tuples to the uri query components.

    :param uri: Full URI.
    :param params: List of two-tuples.
    :return: uri with extended query
    """
    sch, net, path, par, query, fra = urlparse(uri)
    query = add_params_to_qs(query, params)
    return urlunparse((sch, net, path, par, query, fra))

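# For example:
#
#     add_params_to_uri(u'http://example.com/path?a=1', [(u'b', u'2')])
#     # u'http://example.com/path?a=1&b=2'
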
def escape(u):
    """Escape a string in an OAuth-compatible fashion.

    Per `section 3.6`_ of the OAuth 1 spec, which OAuth 2 reuses for
    percent-encoding.

    .. _`section 3.6`: http://tools.ietf.org/html/rfc5849#section-3.6

    """
    if not isinstance(u, unicode):
        raise ValueError('Only unicode objects are escapable.')
    return urllib.quote(u.encode('utf-8'), safe='~')


def generate_nonce():
    """Generate pseudorandom nonce that is unlikely to repeat.

    Per `section 3.2.1`_ of the MAC Access Authentication spec.

    A random 64-bit number is appended to the epoch timestamp for both
    randomness and to decrease the likelihood of collisions.

    .. _`section 3.2.1`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-3.2.1
    """
    return unicode(random.getrandbits(64)) + generate_timestamp()


def generate_timestamp():
    """Get seconds since epoch (UTC).

    Per `section 3.2.1`_ of the MAC Access Authentication spec.

    .. _`section 3.2.1`: http://tools.ietf.org/html/draft-ietf-oauth-v2-http-mac-01#section-3.2.1
    """
    return unicode(int(time.time()))


def generate_token(length=20, chars=UNICODE_ASCII_CHARACTER_SET):
    """Generates a generic OAuth 2 token.

    According to `section 1.4`_ and `section 1.5`_ of the spec, the method of
    token construction is undefined. This implementation is simply a random
    selection of `length` choices from `chars`. SystemRandom is used since it
    provides higher entropy than random.choice.

    .. _`section 1.4`: http://tools.ietf.org/html/draft-ietf-oauth-v2-25#section-1.4
    .. _`section 1.5`: http://tools.ietf.org/html/draft-ietf-oauth-v2-25#section-1.5
    """
    rand = random.SystemRandom()
    return u''.join(rand.choice(chars) for x in range(length))


def host_from_uri(uri):
    """Extract hostname and port from URI.

    Will use default port for HTTP and HTTPS if none is present in the URI.

    >>> host_from_uri(u'https://www.example.com/path?query')
    (u'www.example.com', u'443')
    >>> host_from_uri(u'http://www.example.com:8080/path?query')
    (u'www.example.com', u'8080')

    :param uri: Full URI.
    :return: hostname, port
    """
    default_ports = {
        u'HTTP': u'80',
        u'HTTPS': u'443',
    }

    sch, netloc, path, par, query, fra = urlparse(uri)
    if u':' in netloc:
        netloc, port = netloc.split(u':', 1)
    else:
        port = default_ports.get(sch.upper())

    return netloc, port


def urlencode(query):
    """Encode a sequence of two-element tuples or a dictionary into a URL query string.

    Operates using an OAuth-safe escape() method, in contrast to urllib.urlencode.
    """
    # Convert dictionaries to list of tuples
    if isinstance(query, dict):
        query = query.items()
    return u"&".join([u'='.join([escape(k), escape(v)]) for k, v in query])
Some files were not shown because too many files changed in this diff