Browse Source

Update html5lib 1.1-dev (05b73ef) → 1.1 (f87487a).

tags/release_0.25.1
JackDandy 5 years ago
parent
commit
0812e36515
  1. 4
      CHANGES.md
  2. 2
      lib/html5lib/__init__.py
  3. 23
      lib/html5lib/_inputstream.py
  4. 16
      lib/html5lib/_tokenizer.py
  5. 13
      lib/html5lib/_trie/__init__.py
  6. 44
      lib/html5lib/_trie/datrie.py
  7. 47
      lib/html5lib/_utils.py
  8. 20
      lib/html5lib/filters/sanitizer.py
  9. 720
      lib/html5lib/html5parser.py
  10. 27
      lib/html5lib/treebuilders/etree.py
  11. 64
      lib/html5lib/treebuilders/etree_lxml.py
  12. 4
      lib/html5lib/treewalkers/etree_lxml.py

4
CHANGES.md

@ -23,7 +23,7 @@
* Update feedparser 6.0.0b1 (d12d3bd) to feedparser_py2 6.0.0b3 (7e255f0) * Update feedparser 6.0.0b1 (d12d3bd) to feedparser_py2 6.0.0b3 (7e255f0)
* Add feedparser_py3 6.0.0b3 (7e255f0) * Add feedparser_py3 6.0.0b3 (7e255f0)
* Update Fuzzywuzzy 0.17.0 (0cfb2c8) to 0.18.0 (2188520) * Update Fuzzywuzzy 0.17.0 (0cfb2c8) to 0.18.0 (2188520)
* Update html5lib 1.1-dev (4b22754) to 1.1-dev (05b73ef) * Update html5lib 1.1-dev (4b22754) to 1.1 (f87487a)
* Update idna library 2.8 (032fc55) to 2.9 (1233a73) * Update idna library 2.8 (032fc55) to 2.9 (1233a73)
* Update isotope library 3.0.1 (98ba374) to 3.0.6 (ad00807) * Update isotope library 3.0.1 (98ba374) to 3.0.6 (ad00807)
* Update functools_lru_cache 1.5 (21e85f5) to 1.6.1 (2dc65b5) * Update functools_lru_cache 1.5 (21e85f5) to 1.6.1 (2dc65b5)
@ -62,6 +62,8 @@
* Update Beautiful Soup 4.8.2 (r544) to 4.8.2 (r556) * Update Beautiful Soup 4.8.2 (r544) to 4.8.2 (r556)
* Update Certifi 2019.06.16 (84dc766) to 2019.11.28 (21abb9b) * Update Certifi 2019.06.16 (84dc766) to 2019.11.28 (21abb9b)
* Update dateutil 2.8.1 (fc9b162) to 2.8.1 (110a09b) * Update dateutil 2.8.1 (fc9b162) to 2.8.1 (110a09b)
* Update html5lib 1.1-dev (4b22754) to 1.1-dev (05b73ef)
* Update html5lib 1.1-dev (05b73ef) to 1.1 (f87487a)
* Update Requests library 2.22.0 (3d968ff) to 2.22.0 (d2f65af) * Update Requests library 2.22.0 (3d968ff) to 2.22.0 (d2f65af)
* Update Requests library 2.22.0 (d2f65af) to 2.23.0 (b7c6aba) * Update Requests library 2.22.0 (d2f65af) to 2.23.0 (b7c6aba)
* Update Requests library 2.23.0 (b7c6aba) to 2.24.0 (1b41763) * Update Requests library 2.23.0 (b7c6aba) to 2.24.0 (1b41763)

2
lib/html5lib/__init__.py

@ -32,4 +32,4 @@ __all__ = ["HTMLParser", "parse", "parseFragment", "getTreeBuilder",
# this has to be at the top level, see how setup.py parses this # this has to be at the top level, see how setup.py parses this
#: Distribution version number. #: Distribution version number.
__version__ = "1.1-dev" __version__ = "1.1"

23
lib/html5lib/_inputstream.py

@ -658,9 +658,7 @@ class EncodingBytes(bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes """Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone""" match. Otherwise return False and leave the position alone"""
p = self.position rv = self.startswith(bytes, self.position)
data = self[p:p + len(bytes)]
rv = data.startswith(bytes)
if rv: if rv:
self.position += len(bytes) self.position += len(bytes)
return rv return rv
@ -668,15 +666,11 @@ class EncodingBytes(bytes):
def jumpTo(self, bytes): def jumpTo(self, bytes):
"""Look for the next sequence of bytes matching a given sequence. If """Look for the next sequence of bytes matching a given sequence. If
a match is found advance the position to the last byte of the match""" a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes) try:
if newPosition > -1: self._position = self.index(bytes, self.position) + len(bytes) - 1
# XXX: This is ugly, but I can't see a nicer way to fix this. except ValueError:
if self._position == -1:
self._position = 0
self._position += (newPosition + len(bytes) - 1)
return True
else:
raise StopIteration raise StopIteration
return True
class EncodingParser(object): class EncodingParser(object):
@ -688,6 +682,9 @@ class EncodingParser(object):
self.encoding = None self.encoding = None
def getEncoding(self): def getEncoding(self):
if b"<meta" not in self.data:
return None
methodDispatch = ( methodDispatch = (
(b"<!--", self.handleComment), (b"<!--", self.handleComment),
(b"<meta", self.handleMeta), (b"<meta", self.handleMeta),
@ -697,6 +694,10 @@ class EncodingParser(object):
(b"<", self.handlePossibleStartTag)) (b"<", self.handlePossibleStartTag))
for _ in self.data: for _ in self.data:
keepParsing = True keepParsing = True
try:
self.data.jumpTo(b"<")
except StopIteration:
break
for key, method in methodDispatch: for key, method in methodDispatch:
if self.data.matchBytes(key): if self.data.matchBytes(key):
try: try:

16
lib/html5lib/_tokenizer.py

@ -2,7 +2,8 @@ from __future__ import absolute_import, division, unicode_literals
from six import unichr as chr from six import unichr as chr
from collections import deque from collections import deque, OrderedDict
from sys import version_info
from .constants import spaceCharacters from .constants import spaceCharacters
from .constants import entities from .constants import entities
@ -17,6 +18,11 @@ from ._trie import Trie
entitiesTrie = Trie(entities) entitiesTrie = Trie(entities)
if version_info >= (3, 7):
attributeMap = dict
else:
attributeMap = OrderedDict
class HTMLTokenizer(object): class HTMLTokenizer(object):
""" This class takes care of tokenizing HTML. """ This class takes care of tokenizing HTML.
@ -228,6 +234,14 @@ class HTMLTokenizer(object):
# Add token to the queue to be yielded # Add token to the queue to be yielded
if (token["type"] in tagTokenTypes): if (token["type"] in tagTokenTypes):
token["name"] = token["name"].translate(asciiUpper2Lower) token["name"] = token["name"].translate(asciiUpper2Lower)
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
data = attributeMap(raw)
if len(raw) > len(data):
# we had some duplicated attribute, fix so first wins
data.update(raw[::-1])
token["data"] = data
if token["type"] == tokenTypes["EndTag"]: if token["type"] == tokenTypes["EndTag"]:
if token["data"]: if token["data"]:
self.tokenQueue.append({"type": tokenTypes["ParseError"], self.tokenQueue.append({"type": tokenTypes["ParseError"],

13
lib/html5lib/_trie/__init__.py

@ -1,14 +1,5 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from .py import Trie as PyTrie from .py import Trie
Trie = PyTrie __all__ = ["Trie"]
# pylint:disable=wrong-import-position
try:
from .datrie import Trie as DATrie
except ImportError:
pass
else:
Trie = DATrie
# pylint:enable=wrong-import-position

44
lib/html5lib/_trie/datrie.py

@ -1,44 +0,0 @@
from __future__ import absolute_import, division, unicode_literals
from datrie import Trie as DATrie
from six import text_type
from ._base import Trie as ABCTrie
class Trie(ABCTrie):
    """Trie implementation that delegates storage and lookups to ``DATrie``.

    Every query method simply forwards to the wrapped ``DATrie`` instance
    held in ``self._data``; iteration is not supported.
    """

    def __init__(self, data):
        """Build the trie from the mapping `data`.

        All keys must be text strings, otherwise ``TypeError`` is raised.
        The wrapped ``DATrie`` is constructed from a string holding every
        distinct character that occurs in the keys, and is then filled with
        the key/value pairs of `data`.
        """
        seen_chars = set()
        for candidate in data.keys():
            if not isinstance(candidate, text_type):
                raise TypeError("All keys must be strings")
            # A string iterates character by character, so this records
            # each character of the key.
            seen_chars.update(candidate)

        self._data = DATrie("".join(seen_chars))
        for candidate, payload in data.items():
            self._data[candidate] = payload

    def __contains__(self, key):
        """Return whether `key` is stored in the wrapped trie."""
        return key in self._data

    def __len__(self):
        """Return the number of entries reported by the wrapped trie."""
        return len(self._data)

    def __iter__(self):
        """Iteration is deliberately unsupported for this implementation."""
        raise NotImplementedError()

    def __getitem__(self, key):
        """Return the value stored under `key` in the wrapped trie."""
        return self._data[key]

    def keys(self, prefix=None):
        """Forward to the wrapped trie's ``keys`` with the given `prefix`."""
        return self._data.keys(prefix)

    def has_keys_with_prefix(self, prefix):
        """Forward to the wrapped trie's ``has_keys_with_prefix``."""
        return self._data.has_keys_with_prefix(prefix)

    def longest_prefix(self, prefix):
        """Forward to the wrapped trie's ``longest_prefix``."""
        return self._data.longest_prefix(prefix)

    def longest_prefix_item(self, prefix):
        """Forward to the wrapped trie's ``longest_prefix_item``."""
        return self._data.longest_prefix_item(prefix)

47
lib/html5lib/_utils.py

@ -2,12 +2,20 @@ from __future__ import absolute_import, division, unicode_literals
from types import ModuleType from types import ModuleType
from six import text_type
try: try:
import xml.etree.cElementTree as default_etree from collections.abc import Mapping
except ImportError: except ImportError:
from collections import Mapping
from six import text_type, PY3
if PY3:
import xml.etree.ElementTree as default_etree import xml.etree.ElementTree as default_etree
else:
try:
import xml.etree.cElementTree as default_etree
except ImportError:
import xml.etree.ElementTree as default_etree
__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
@ -47,9 +55,6 @@ class MethodDispatcher(dict):
""" """
def __init__(self, items=()): def __init__(self, items=()):
# Using _dictEntries instead of directly assigning to self is about
# twice as fast. Please do careful performance testing before changing
# anything here.
_dictEntries = [] _dictEntries = []
for name, value in items: for name, value in items:
if isinstance(name, (list, tuple, frozenset, set)): if isinstance(name, (list, tuple, frozenset, set)):
@ -64,6 +69,36 @@ class MethodDispatcher(dict):
def __getitem__(self, key): def __getitem__(self, key):
return dict.get(self, key, self.default) return dict.get(self, key, self.default)
def __get__(self, instance, owner=None):
    """Descriptor hook: return a view of this dispatcher bound to `instance`.

    The result is a ``BoundMethodDispatcher`` wrapping `instance` and this
    dispatcher. `owner` is accepted for the descriptor protocol but is not
    used.
    """
    bound_view = BoundMethodDispatcher(instance, self)
    return bound_view
class BoundMethodDispatcher(Mapping):
    """Wraps a MethodDispatcher, binding its return values to `instance`.

    The wrapper is a read-only ``Mapping``: iteration, length and membership
    are answered by the wrapped dispatcher, while item access additionally
    binds the looked-up function to `instance`.
    """

    def __init__(self, instance, dispatcher):
        """Remember the `instance` to bind to and the `dispatcher` to wrap."""
        self.instance = instance
        self.dispatcher = dispatcher

    def __getitem__(self, key):
        """Return the dispatcher's entry for `key`, bound to the instance."""
        # see https://docs.python.org/3/reference/datamodel.html#object.__get__
        # on a function, __get__ is used to bind a function to an instance as a bound method
        return self.dispatcher[key].__get__(self.instance)

    def get(self, key, default=None):
        """Return the bound entry for `key`, or `default` if `key` is absent.

        `default` falls back to ``None`` so that the method honours the
        ``Mapping.get(key, default=None)`` signature; callers that pass it
        explicitly are unaffected.
        """
        if key in self.dispatcher:
            return self[key]
        return default

    def __iter__(self):
        """Iterate over the keys of the wrapped dispatcher."""
        return iter(self.dispatcher)

    def __len__(self):
        """Return the number of keys in the wrapped dispatcher."""
        return len(self.dispatcher)

    def __contains__(self, key):
        """Return whether `key` is an explicit key of the wrapped dispatcher."""
        return key in self.dispatcher
# Some utility functions to deal with weirdness around UCS2 vs UCS4 # Some utility functions to deal with weirdness around UCS2 vs UCS4
# python builds # python builds

20
lib/html5lib/filters/sanitizer.py

@ -1,6 +1,15 @@
"""Deprecated from html5lib 1.1.
See `here <https://github.com/html5lib/html5lib-python/issues/443>`_ for
information about its deprecation; `Bleach <https://github.com/mozilla/bleach>`_
is recommended as a replacement. Please let us know in the aforementioned issue
if Bleach is unsuitable for your needs.
"""
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
import re import re
import warnings
from xml.sax.saxutils import escape, unescape from xml.sax.saxutils import escape, unescape
from six.moves import urllib_parse as urlparse from six.moves import urllib_parse as urlparse
@ -11,6 +20,14 @@ from ..constants import namespaces, prefixes
__all__ = ["Filter"] __all__ = ["Filter"]
_deprecation_msg = (
"html5lib's sanitizer is deprecated; see " +
"https://github.com/html5lib/html5lib-python/issues/443 and please let " +
"us know if Bleach is unsuitable for your needs"
)
warnings.warn(_deprecation_msg, DeprecationWarning)
allowed_elements = frozenset(( allowed_elements = frozenset((
(namespaces['html'], 'a'), (namespaces['html'], 'a'),
(namespaces['html'], 'abbr'), (namespaces['html'], 'abbr'),
@ -750,6 +767,9 @@ class Filter(base.Filter):
""" """
super(Filter, self).__init__(source) super(Filter, self).__init__(source)
warnings.warn(_deprecation_msg, DeprecationWarning)
self.allowed_elements = allowed_elements self.allowed_elements = allowed_elements
self.allowed_attributes = allowed_attributes self.allowed_attributes = allowed_attributes
self.allowed_css_properties = allowed_css_properties self.allowed_css_properties = allowed_css_properties

720
lib/html5lib/html5parser.py

@ -2,7 +2,6 @@ from __future__ import absolute_import, division, unicode_literals
from six import with_metaclass, viewkeys from six import with_metaclass, viewkeys
import types import types
from collections import OrderedDict
from . import _inputstream from . import _inputstream
from . import _tokenizer from . import _tokenizer
@ -202,7 +201,7 @@ class HTMLParser(object):
DoctypeToken = tokenTypes["Doctype"] DoctypeToken = tokenTypes["Doctype"]
ParseErrorToken = tokenTypes["ParseError"] ParseErrorToken = tokenTypes["ParseError"]
for token in self.normalizedTokens(): for token in self.tokenizer:
prev_token = None prev_token = None
new_token = token new_token = token
while new_token is not None: while new_token is not None:
@ -260,10 +259,6 @@ class HTMLParser(object):
if reprocess: if reprocess:
assert self.phase not in phases assert self.phase not in phases
def normalizedTokens(self):
for token in self.tokenizer:
yield self.normalizeToken(token)
def parse(self, stream, *args, **kwargs): def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree """Parse a HTML document into a well-formed tree
@ -325,17 +320,6 @@ class HTMLParser(object):
if self.strict: if self.strict:
raise ParseError(E[errorcode] % datavars) raise ParseError(E[errorcode] % datavars)
def normalizeToken(self, token):
# HTML5 specific normalizations to the token stream
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
token["data"] = OrderedDict(raw)
if len(raw) > len(token["data"]):
# we had some duplicated attribute, fix so first wins
token["data"].update(raw[::-1])
return token
def adjustMathMLAttributes(self, token): def adjustMathMLAttributes(self, token):
adjust_attributes(token, adjustMathMLAttributes) adjust_attributes(token, adjustMathMLAttributes)
@ -442,10 +426,13 @@ def getPhases(debug):
class Phase(with_metaclass(getMetaclass(debug, log))): class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing """Base class for helper object that implements each phase of processing
""" """
__slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
def __init__(self, parser, tree): def __init__(self, parser, tree):
self.parser = parser self.parser = parser
self.tree = tree self.tree = tree
self.__startTagCache = {}
self.__endTagCache = {}
def processEOF(self): def processEOF(self):
raise NotImplementedError raise NotImplementedError
@ -465,7 +452,21 @@ def getPhases(debug):
self.tree.insertText(token["data"]) self.tree.insertText(token["data"])
def processStartTag(self, token): def processStartTag(self, token):
return self.startTagHandler[token["name"]](token) # Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
# (CPython 2.7, 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
if name in self.__startTagCache:
func = self.__startTagCache[name]
else:
func = self.__startTagCache[name] = self.startTagHandler[name]
# bound the cache size in case we get loads of unknown tags
while len(self.__startTagCache) > len(self.startTagHandler) * 1.1:
# this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
self.__startTagCache.pop(next(iter(self.__startTagCache)))
return func(token)
def startTagHtml(self, token): def startTagHtml(self, token):
if not self.parser.firstStartTag and token["name"] == "html": if not self.parser.firstStartTag and token["name"] == "html":
@ -478,9 +479,25 @@ def getPhases(debug):
self.parser.firstStartTag = False self.parser.firstStartTag = False
def processEndTag(self, token): def processEndTag(self, token):
return self.endTagHandler[token["name"]](token) # Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
# (CPython 2.7, 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
if name in self.__endTagCache:
func = self.__endTagCache[name]
else:
func = self.__endTagCache[name] = self.endTagHandler[name]
# bound the cache size in case we get loads of unknown tags
while len(self.__endTagCache) > len(self.endTagHandler) * 1.1:
# this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
self.__endTagCache.pop(next(iter(self.__endTagCache)))
return func(token)
class InitialPhase(Phase): class InitialPhase(Phase):
__slots__ = tuple()
def processSpaceCharacters(self, token): def processSpaceCharacters(self, token):
pass pass
@ -609,6 +626,8 @@ def getPhases(debug):
return True return True
class BeforeHtmlPhase(Phase): class BeforeHtmlPhase(Phase):
__slots__ = tuple()
# helper methods # helper methods
def insertHtmlElement(self): def insertHtmlElement(self):
self.tree.insertRoot(impliedTagToken("html", "StartTag")) self.tree.insertRoot(impliedTagToken("html", "StartTag"))
@ -644,19 +663,7 @@ def getPhases(debug):
return token return token
class BeforeHeadPhase(Phase): class BeforeHeadPhase(Phase):
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
self.startTagHead(impliedTagToken("head", "StartTag")) self.startTagHead(impliedTagToken("head", "StartTag"))
@ -689,28 +696,19 @@ def getPhases(debug):
self.parser.parseError("end-tag-after-implied-root", self.parser.parseError("end-tag-after-implied-root",
{"name": token["name"]}) {"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), endTagImplyHead)
])
endTagHandler.default = endTagOther
class InHeadPhase(Phase): class InHeadPhase(Phase):
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("title", self.startTagTitle),
(("noframes", "style"), self.startTagNoFramesStyle),
("noscript", self.startTagNoscript),
("script", self.startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
self.startTagBaseLinkCommand),
("meta", self.startTagMeta),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("head", self.endTagHead),
(("br", "html", "body"), self.endTagHtmlBodyBr)
])
self.endTagHandler.default = self.endTagOther
# the real thing # the real thing
def processEOF(self): def processEOF(self):
@ -792,22 +790,27 @@ def getPhases(debug):
def anythingElse(self): def anythingElse(self):
self.endTagHead(impliedTagToken("head")) self.endTagHead(impliedTagToken("head"))
class InHeadNoscriptPhase(Phase): startTagHandler = _utils.MethodDispatcher([
def __init__(self, parser, tree): ("html", startTagHtml),
Phase.__init__(self, parser, tree) ("title", startTagTitle),
(("noframes", "style"), startTagNoFramesStyle),
("noscript", startTagNoscript),
("script", startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
startTagBaseLinkCommand),
("meta", startTagMeta),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("head", endTagHead),
(("br", "html", "body"), endTagHtmlBodyBr)
])
endTagHandler.default = endTagOther
self.startTagHandler = _utils.MethodDispatcher([ class InHeadNoscriptPhase(Phase):
("html", self.startTagHtml), __slots__ = tuple()
(("basefont", "bgsound", "link", "meta", "noframes", "style"), self.startTagBaseLinkCommand),
(("head", "noscript"), self.startTagHeadNoscript),
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("noscript", self.endTagNoscript),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
self.parser.parseError("eof-in-head-noscript") self.parser.parseError("eof-in-head-noscript")
@ -856,23 +859,21 @@ def getPhases(debug):
# Caller must raise parse error first! # Caller must raise parse error first!
self.endTagNoscript(impliedTagToken("noscript")) self.endTagNoscript(impliedTagToken("noscript"))
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
(("head", "noscript"), startTagHeadNoscript),
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("noscript", endTagNoscript),
("br", endTagBr),
])
endTagHandler.default = endTagOther
class AfterHeadPhase(Phase): class AfterHeadPhase(Phase):
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
"style", "title"),
self.startTagFromHead),
("head", self.startTagHead)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
self.endTagHtmlBodyBr)])
self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
self.anythingElse() self.anythingElse()
@ -923,80 +924,30 @@ def getPhases(debug):
self.parser.phase = self.parser.phases["inBody"] self.parser.phase = self.parser.phases["inBody"]
self.parser.framesetOK = True self.parser.framesetOK = True
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("body", startTagBody),
("frameset", startTagFrameset),
(("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
"style", "title"),
startTagFromHead),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
endTagHtmlBodyBr)])
endTagHandler.default = endTagOther
class InBodyPhase(Phase): class InBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
# the really-really-really-very crazy mode # the really-really-really-very crazy mode
def __init__(self, parser, tree): __slots__ = ("processSpaceCharacters",)
Phase.__init__(self, parser, tree)
def __init__(self, *args, **kwargs):
super(InBodyPhase, self).__init__(*args, **kwargs)
# Set this to the default handler # Set this to the default handler
self.processSpaceCharacters = self.processSpaceCharactersNonPre self.processSpaceCharacters = self.processSpaceCharactersNonPre
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"),
self.startTagProcessInHead),
("body", self.startTagBody),
("frameset", self.startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details",
"dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"),
self.startTagCloseP),
(headingElements, self.startTagHeading),
(("pre", "listing"), self.startTagPreListing),
("form", self.startTagForm),
(("li", "dd", "dt"), self.startTagListItem),
("plaintext", self.startTagPlaintext),
("a", self.startTagA),
(("b", "big", "code", "em", "font", "i", "s", "small", "strike",
"strong", "tt", "u"), self.startTagFormatting),
("nobr", self.startTagNobr),
("button", self.startTagButton),
(("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
("xmp", self.startTagXmp),
("table", self.startTagTable),
(("area", "br", "embed", "img", "keygen", "wbr"),
self.startTagVoidFormatting),
(("param", "source", "track"), self.startTagParamSource),
("input", self.startTagInput),
("hr", self.startTagHr),
("image", self.startTagImage),
("isindex", self.startTagIsIndex),
("textarea", self.startTagTextarea),
("iframe", self.startTagIFrame),
("noscript", self.startTagNoscript),
(("noembed", "noframes"), self.startTagRawtext),
("select", self.startTagSelect),
(("rp", "rt"), self.startTagRpRt),
(("option", "optgroup"), self.startTagOpt),
(("math"), self.startTagMath),
(("svg"), self.startTagSvg),
(("caption", "col", "colgroup", "frame", "head",
"tbody", "td", "tfoot", "th", "thead",
"tr"), self.startTagMisplaced)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("body", self.endTagBody),
("html", self.endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center",
"details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
"section", "summary", "ul"), self.endTagBlock),
("form", self.endTagForm),
("p", self.endTagP),
(("dd", "dt", "li"), self.endTagListItem),
(headingElements, self.endTagHeading),
(("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
"strike", "strong", "tt", "u"), self.endTagFormatting),
(("applet", "marquee", "object"), self.endTagAppletMarqueeObject),
("br", self.endTagBr),
])
self.endTagHandler.default = self.endTagOther
def isMatchingFormattingElement(self, node1, node2): def isMatchingFormattingElement(self, node1, node2):
return (node1.name == node2.name and return (node1.name == node2.name and
node1.namespace == node2.namespace and node1.namespace == node2.namespace and
@ -1646,14 +1597,73 @@ def getPhases(debug):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
break break
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("base", "basefont", "bgsound", "command", "link", "meta",
"script", "style", "title"),
startTagProcessInHead),
("body", startTagBody),
("frameset", startTagFrameset),
(("address", "article", "aside", "blockquote", "center", "details",
"dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
"section", "summary", "ul"),
startTagCloseP),
(headingElements, startTagHeading),
(("pre", "listing"), startTagPreListing),
("form", startTagForm),
(("li", "dd", "dt"), startTagListItem),
("plaintext", startTagPlaintext),
("a", startTagA),
(("b", "big", "code", "em", "font", "i", "s", "small", "strike",
"strong", "tt", "u"), startTagFormatting),
("nobr", startTagNobr),
("button", startTagButton),
(("applet", "marquee", "object"), startTagAppletMarqueeObject),
("xmp", startTagXmp),
("table", startTagTable),
(("area", "br", "embed", "img", "keygen", "wbr"),
startTagVoidFormatting),
(("param", "source", "track"), startTagParamSource),
("input", startTagInput),
("hr", startTagHr),
("image", startTagImage),
("isindex", startTagIsIndex),
("textarea", startTagTextarea),
("iframe", startTagIFrame),
("noscript", startTagNoscript),
(("noembed", "noframes"), startTagRawtext),
("select", startTagSelect),
(("rp", "rt"), startTagRpRt),
(("option", "optgroup"), startTagOpt),
(("math"), startTagMath),
(("svg"), startTagSvg),
(("caption", "col", "colgroup", "frame", "head",
"tbody", "td", "tfoot", "th", "thead",
"tr"), startTagMisplaced)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("body", endTagBody),
("html", endTagHtml),
(("address", "article", "aside", "blockquote", "button", "center",
"details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
"footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
"section", "summary", "ul"), endTagBlock),
("form", endTagForm),
("p", endTagP),
(("dd", "dt", "li"), endTagListItem),
(headingElements, endTagHeading),
(("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
"strike", "strong", "tt", "u"), endTagFormatting),
(("applet", "marquee", "object"), endTagAppletMarqueeObject),
("br", endTagBr),
])
endTagHandler.default = endTagOther
class TextPhase(Phase): class TextPhase(Phase):
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("script", self.endTagScript)])
self.endTagHandler.default = self.endTagOther
def processCharacters(self, token): def processCharacters(self, token):
self.tree.insertText(token["data"]) self.tree.insertText(token["data"])
@ -1679,30 +1689,15 @@ def getPhases(debug):
self.tree.openElements.pop() self.tree.openElements.pop()
self.parser.phase = self.parser.originalPhase self.parser.phase = self.parser.originalPhase
startTagHandler = _utils.MethodDispatcher([])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("script", endTagScript)])
endTagHandler.default = endTagOther
class InTablePhase(Phase): class InTablePhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table # http://www.whatwg.org/specs/web-apps/current-work/#in-table
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("caption", self.startTagCaption),
("colgroup", self.startTagColgroup),
("col", self.startTagCol),
(("tbody", "tfoot", "thead"), self.startTagRowGroup),
(("td", "th", "tr"), self.startTagImplyTbody),
("table", self.startTagTable),
(("style", "script"), self.startTagStyleScript),
("input", self.startTagInput),
("form", self.startTagForm)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
# helper methods # helper methods
def clearStackToTableContext(self): def clearStackToTableContext(self):
@ -1824,9 +1819,32 @@ def getPhases(debug):
self.parser.phases["inBody"].processEndTag(token) self.parser.phases["inBody"].processEndTag(token)
self.tree.insertFromTable = False self.tree.insertFromTable = False
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("caption", startTagCaption),
("colgroup", startTagColgroup),
("col", startTagCol),
(("tbody", "tfoot", "thead"), startTagRowGroup),
(("td", "th", "tr"), startTagImplyTbody),
("table", startTagTable),
(("style", "script"), startTagStyleScript),
("input", startTagInput),
("form", startTagForm)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("table", endTagTable),
(("body", "caption", "col", "colgroup", "html", "tbody", "td",
"tfoot", "th", "thead", "tr"), endTagIgnore)
])
endTagHandler.default = endTagOther
class InTableTextPhase(Phase): class InTableTextPhase(Phase):
def __init__(self, parser, tree): __slots__ = ("originalPhase", "characterTokens")
Phase.__init__(self, parser, tree)
def __init__(self, *args, **kwargs):
super(InTableTextPhase, self).__init__(*args, **kwargs)
self.originalPhase = None self.originalPhase = None
self.characterTokens = [] self.characterTokens = []
@ -1871,23 +1889,7 @@ def getPhases(debug):
class InCaptionPhase(Phase): class InCaptionPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-caption # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableElement)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("caption", self.endTagCaption),
("table", self.endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
def ignoreEndTagCaption(self): def ignoreEndTagCaption(self):
return not self.tree.elementInScope("caption", variant="table") return not self.tree.elementInScope("caption", variant="table")
@ -1940,23 +1942,24 @@ def getPhases(debug):
def endTagOther(self, token): def endTagOther(self, token):
return self.parser.phases["inBody"].processEndTag(token) return self.parser.phases["inBody"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), startTagTableElement)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("caption", endTagCaption),
("table", endTagTable),
(("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
"thead", "tr"), endTagIgnore)
])
endTagHandler.default = endTagOther
class InColumnGroupPhase(Phase): class InColumnGroupPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-column # http://www.whatwg.org/specs/web-apps/current-work/#in-column
__slots__ = tuple()
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("col", self.startTagCol)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("colgroup", self.endTagColgroup),
("col", self.endTagCol)
])
self.endTagHandler.default = self.endTagOther
def ignoreEndTagColgroup(self): def ignoreEndTagColgroup(self):
return self.tree.openElements[-1].name == "html" return self.tree.openElements[-1].name == "html"
@ -2006,26 +2009,21 @@ def getPhases(debug):
if not ignoreEndTag: if not ignoreEndTag:
return token return token
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("col", startTagCol)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("colgroup", endTagColgroup),
("col", endTagCol)
])
endTagHandler.default = endTagOther
class InTableBodyPhase(Phase): class InTableBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-table0 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("tr", self.startTagTr),
(("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
("table", self.endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th",
"tr"), self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
# helper methods # helper methods
def clearStackToTableBodyContext(self): def clearStackToTableBodyContext(self):
@ -2104,26 +2102,26 @@ def getPhases(debug):
def endTagOther(self, token): def endTagOther(self, token):
return self.parser.phases["inTable"].processEndTag(token) return self.parser.phases["inTable"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
("tr", startTagTr),
(("td", "th"), startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
startTagTableOther)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("tbody", "tfoot", "thead"), endTagTableRowGroup),
("table", endTagTable),
(("body", "caption", "col", "colgroup", "html", "td", "th",
"tr"), endTagIgnore)
])
endTagHandler.default = endTagOther
class InRowPhase(Phase): class InRowPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-row # http://www.whatwg.org/specs/web-apps/current-work/#in-row
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("td", "th"), self.startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead",
"tr"), self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("tr", self.endTagTr),
("table", self.endTagTable),
(("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
(("body", "caption", "col", "colgroup", "html", "td", "th"),
self.endTagIgnore)
])
self.endTagHandler.default = self.endTagOther
# helper methods (XXX unify this with other table helper methods) # helper methods (XXX unify this with other table helper methods)
def clearStackToTableRowContext(self): def clearStackToTableRowContext(self):
@ -2193,23 +2191,26 @@ def getPhases(debug):
def endTagOther(self, token): def endTagOther(self, token):
return self.parser.phases["inTable"].processEndTag(token) return self.parser.phases["inTable"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("td", "th"), startTagTableCell),
(("caption", "col", "colgroup", "tbody", "tfoot", "thead",
"tr"), startTagTableOther)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("tr", endTagTr),
("table", endTagTable),
(("tbody", "tfoot", "thead"), endTagTableRowGroup),
(("body", "caption", "col", "colgroup", "html", "td", "th"),
endTagIgnore)
])
endTagHandler.default = endTagOther
class InCellPhase(Phase): class InCellPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-cell # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), self.startTagTableOther)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
(("td", "th"), self.endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
])
self.endTagHandler.default = self.endTagOther
# helper # helper
def closeCell(self): def closeCell(self):
@ -2269,26 +2270,22 @@ def getPhases(debug):
def endTagOther(self, token): def endTagOther(self, token):
return self.parser.phases["inBody"].processEndTag(token) return self.parser.phases["inBody"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
("html", Phase.startTagHtml),
(("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
"thead", "tr"), startTagTableOther)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("td", "th"), endTagTableCell),
(("body", "caption", "col", "colgroup", "html"), endTagIgnore),
(("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
])
endTagHandler.default = endTagOther
class InSelectPhase(Phase): class InSelectPhase(Phase):
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml),
("option", self.startTagOption),
("optgroup", self.startTagOptgroup),
("select", self.startTagSelect),
(("input", "keygen", "textarea"), self.startTagInput),
("script", self.startTagScript)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([
("option", self.endTagOption),
("optgroup", self.endTagOptgroup),
("select", self.endTagSelect)
])
self.endTagHandler.default = self.endTagOther
# http://www.whatwg.org/specs/web-apps/current-work/#in-select # http://www.whatwg.org/specs/web-apps/current-work/#in-select
def processEOF(self): def processEOF(self):
@ -2369,21 +2366,25 @@ def getPhases(debug):
self.parser.parseError("unexpected-end-tag-in-select", self.parser.parseError("unexpected-end-tag-in-select",
{"name": token["name"]}) {"name": token["name"]})
class InSelectInTablePhase(Phase): startTagHandler = _utils.MethodDispatcher([
def __init__(self, parser, tree): ("html", Phase.startTagHtml),
Phase.__init__(self, parser, tree) ("option", startTagOption),
("optgroup", startTagOptgroup),
self.startTagHandler = _utils.MethodDispatcher([ ("select", startTagSelect),
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), (("input", "keygen", "textarea"), startTagInput),
self.startTagTable) ("script", startTagScript)
]) ])
self.startTagHandler.default = self.startTagOther startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("option", endTagOption),
("optgroup", endTagOptgroup),
("select", endTagSelect)
])
endTagHandler.default = endTagOther
self.endTagHandler = _utils.MethodDispatcher([ class InSelectInTablePhase(Phase):
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), __slots__ = tuple()
self.endTagTable)
])
self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
self.parser.phases["inSelect"].processEOF() self.parser.phases["inSelect"].processEOF()
@ -2408,7 +2409,21 @@ def getPhases(debug):
def endTagOther(self, token): def endTagOther(self, token):
return self.parser.phases["inSelect"].processEndTag(token) return self.parser.phases["inSelect"].processEndTag(token)
startTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
startTagTable)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
endTagTable)
])
endTagHandler.default = endTagOther
class InForeignContentPhase(Phase): class InForeignContentPhase(Phase):
__slots__ = tuple()
breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
"center", "code", "dd", "div", "dl", "dt", "center", "code", "dd", "div", "dl", "dt",
"em", "embed", "h1", "h2", "h3", "em", "embed", "h1", "h2", "h3",
@ -2418,9 +2433,6 @@ def getPhases(debug):
"span", "strong", "strike", "sub", "sup", "span", "strong", "strike", "sub", "sup",
"table", "tt", "u", "ul", "var"]) "table", "tt", "u", "ul", "var"])
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
def adjustSVGTagNames(self, token): def adjustSVGTagNames(self, token):
replacements = {"altglyph": "altGlyph", replacements = {"altglyph": "altGlyph",
"altglyphdef": "altGlyphDef", "altglyphdef": "altGlyphDef",
@ -2524,16 +2536,7 @@ def getPhases(debug):
return new_token return new_token
class AfterBodyPhase(Phase): class AfterBodyPhase(Phase):
def __init__(self, parser, tree): __slots__ = tuple()
Phase.__init__(self, parser, tree)
self.startTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
# Stop parsing # Stop parsing
@ -2570,23 +2573,17 @@ def getPhases(debug):
self.parser.phase = self.parser.phases["inBody"] self.parser.phase = self.parser.phases["inBody"]
return token return token
class InFramesetPhase(Phase): startTagHandler = _utils.MethodDispatcher([
# http://www.whatwg.org/specs/web-apps/current-work/#in-frameset ("html", startTagHtml)
def __init__(self, parser, tree): ])
Phase.__init__(self, parser, tree) startTagHandler.default = startTagOther
self.startTagHandler = _utils.MethodDispatcher([ endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
("html", self.startTagHtml), endTagHandler.default = endTagOther
("frameset", self.startTagFrameset),
("frame", self.startTagFrame),
("noframes", self.startTagNoframes)
])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([ class InFramesetPhase(Phase):
("frameset", self.endTagFrameset) # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
]) __slots__ = tuple()
self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
if self.tree.openElements[-1].name != "html": if self.tree.openElements[-1].name != "html":
@ -2627,21 +2624,22 @@ def getPhases(debug):
self.parser.parseError("unexpected-end-tag-in-frameset", self.parser.parseError("unexpected-end-tag-in-frameset",
{"name": token["name"]}) {"name": token["name"]})
class AfterFramesetPhase(Phase): startTagHandler = _utils.MethodDispatcher([
# http://www.whatwg.org/specs/web-apps/current-work/#after3 ("html", Phase.startTagHtml),
def __init__(self, parser, tree): ("frameset", startTagFrameset),
Phase.__init__(self, parser, tree) ("frame", startTagFrame),
("noframes", startTagNoframes)
])
startTagHandler.default = startTagOther
self.startTagHandler = _utils.MethodDispatcher([ endTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml), ("frameset", endTagFrameset)
("noframes", self.startTagNoframes) ])
]) endTagHandler.default = endTagOther
self.startTagHandler.default = self.startTagOther
self.endTagHandler = _utils.MethodDispatcher([ class AfterFramesetPhase(Phase):
("html", self.endTagHtml) # http://www.whatwg.org/specs/web-apps/current-work/#after3
]) __slots__ = tuple()
self.endTagHandler.default = self.endTagOther
def processEOF(self): def processEOF(self):
# Stop parsing # Stop parsing
@ -2664,14 +2662,19 @@ def getPhases(debug):
self.parser.parseError("unexpected-end-tag-after-frameset", self.parser.parseError("unexpected-end-tag-after-frameset",
{"name": token["name"]}) {"name": token["name"]})
class AfterAfterBodyPhase(Phase): startTagHandler = _utils.MethodDispatcher([
def __init__(self, parser, tree): ("html", Phase.startTagHtml),
Phase.__init__(self, parser, tree) ("noframes", startTagNoframes)
])
startTagHandler.default = startTagOther
self.startTagHandler = _utils.MethodDispatcher([ endTagHandler = _utils.MethodDispatcher([
("html", self.startTagHtml) ("html", endTagHtml)
]) ])
self.startTagHandler.default = self.startTagOther endTagHandler.default = endTagOther
class AfterAfterBodyPhase(Phase):
__slots__ = tuple()
def processEOF(self): def processEOF(self):
pass pass
@ -2702,15 +2705,13 @@ def getPhases(debug):
self.parser.phase = self.parser.phases["inBody"] self.parser.phase = self.parser.phases["inBody"]
return token return token
class AfterAfterFramesetPhase(Phase): startTagHandler = _utils.MethodDispatcher([
def __init__(self, parser, tree): ("html", startTagHtml)
Phase.__init__(self, parser, tree) ])
startTagHandler.default = startTagOther
self.startTagHandler = _utils.MethodDispatcher([ class AfterAfterFramesetPhase(Phase):
("html", self.startTagHtml), __slots__ = tuple()
("noframes", self.startTagNoFrames)
])
self.startTagHandler.default = self.startTagOther
def processEOF(self): def processEOF(self):
pass pass
@ -2737,6 +2738,13 @@ def getPhases(debug):
def processEndTag(self, token): def processEndTag(self, token):
self.parser.parseError("expected-eof-but-got-end-tag", self.parser.parseError("expected-eof-but-got-end-tag",
{"name": token["name"]}) {"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("noframes", startTagNoFrames)
])
startTagHandler.default = startTagOther
# pylint:enable=unused-argument # pylint:enable=unused-argument
return { return {
@ -2770,8 +2778,8 @@ def getPhases(debug):
def adjust_attributes(token, replacements): def adjust_attributes(token, replacements):
needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) needs_adjustment = viewkeys(token['data']) & viewkeys(replacements)
if needs_adjustment: if needs_adjustment:
token['data'] = OrderedDict((replacements.get(k, k), v) token['data'] = type(token['data'])((replacements.get(k, k), v)
for k, v in token['data'].items()) for k, v in token['data'].items())
def impliedTagToken(name, type="EndTag", attributes=None, def impliedTagToken(name, type="EndTag", attributes=None,

27
lib/html5lib/treebuilders/etree.py

@ -5,6 +5,8 @@ from six import text_type
import re import re
from copy import copy
from . import base from . import base
from .. import _ihatexml from .. import _ihatexml
from .. import constants from .. import constants
@ -61,16 +63,17 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
return self._element.attrib return self._element.attrib
def _setAttributes(self, attributes): def _setAttributes(self, attributes):
# Delete existing attributes first el_attrib = self._element.attrib
# XXX - there may be a better way to do this... el_attrib.clear()
for key in list(self._element.attrib.keys()): if attributes:
del self._element.attrib[key] # calling .items _always_ allocates, and the above truthy check is cheaper than the
for key, value in attributes.items(): # allocation on average
if isinstance(key, tuple): for key, value in attributes.items():
name = "{%s}%s" % (key[2], key[1]) if isinstance(key, tuple):
else: name = "{%s}%s" % (key[2], key[1])
name = key else:
self._element.set(name, value) name = key
el_attrib[name] = value
attributes = property(_getAttributes, _setAttributes) attributes = property(_getAttributes, _setAttributes)
@ -129,8 +132,8 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
def cloneNode(self): def cloneNode(self):
element = type(self)(self.name, self.namespace) element = type(self)(self.name, self.namespace)
for name, value in self.attributes.items(): if self._element.attrib:
element.attributes[name] = value element._element.attrib = copy(self._element.attrib)
return element return element
def reparentChildren(self, newParent): def reparentChildren(self, newParent):

64
lib/html5lib/treebuilders/etree_lxml.py

@ -16,6 +16,11 @@ import warnings
import re import re
import sys import sys
try:
from collections.abc import MutableMapping
except ImportError:
from collections import MutableMapping
from . import base from . import base
from ..constants import DataLossWarning from ..constants import DataLossWarning
from .. import constants from .. import constants
@ -23,6 +28,7 @@ from . import etree as etree_builders
from .. import _ihatexml from .. import _ihatexml
import lxml.etree as etree import lxml.etree as etree
from six import PY3, binary_type
fullTree = True fullTree = True
@ -44,7 +50,11 @@ class Document(object):
self._childNodes = [] self._childNodes = []
def appendChild(self, element): def appendChild(self, element):
self._elementTree.getroot().addnext(element._element) last = self._elementTree.getroot()
for last in self._elementTree.getroot().itersiblings():
pass
last.addnext(element._element)
def _getChildNodes(self): def _getChildNodes(self):
return self._childNodes return self._childNodes
@ -185,26 +195,37 @@ class TreeBuilder(base.TreeBuilder):
infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True) infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
self.namespaceHTMLElements = namespaceHTMLElements self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict): class Attributes(MutableMapping):
def __init__(self, element, value=None): def __init__(self, element):
if value is None:
value = {}
self._element = element self._element = element
dict.__init__(self, value) # pylint:disable=non-parent-init-called
for key, value in self.items():
if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else:
name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value): def _coerceKey(self, key):
dict.__setitem__(self, key, value)
if isinstance(key, tuple): if isinstance(key, tuple):
name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1])) name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
else: else:
name = infosetFilter.coerceAttribute(key) name = infosetFilter.coerceAttribute(key)
self._element._element.attrib[name] = value return name
def __getitem__(self, key):
value = self._element._element.attrib[self._coerceKey(key)]
if not PY3 and isinstance(value, binary_type):
value = value.decode("ascii")
return value
def __setitem__(self, key, value):
self._element._element.attrib[self._coerceKey(key)] = value
def __delitem__(self, key):
del self._element._element.attrib[self._coerceKey(key)]
def __iter__(self):
return iter(self._element._element.attrib)
def __len__(self):
return len(self._element._element.attrib)
def clear(self):
return self._element._element.attrib.clear()
class Element(builder.Element): class Element(builder.Element):
def __init__(self, name, namespace): def __init__(self, name, namespace):
@ -225,8 +246,10 @@ class TreeBuilder(base.TreeBuilder):
def _getAttributes(self): def _getAttributes(self):
return self._attributes return self._attributes
def _setAttributes(self, attributes): def _setAttributes(self, value):
self._attributes = Attributes(self, attributes) attributes = self.attributes
attributes.clear()
attributes.update(value)
attributes = property(_getAttributes, _setAttributes) attributes = property(_getAttributes, _setAttributes)
@ -234,8 +257,11 @@ class TreeBuilder(base.TreeBuilder):
data = infosetFilter.coerceCharacters(data) data = infosetFilter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore) builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child): def cloneNode(self):
builder.Element.appendChild(self, child) element = type(self)(self.name, self.namespace)
if self._element.attrib:
element._element.attrib.update(self._element.attrib)
return element
class Comment(builder.Comment): class Comment(builder.Comment):
def __init__(self, data): def __init__(self, data):

4
lib/html5lib/treewalkers/etree_lxml.py

@ -1,6 +1,8 @@
from __future__ import absolute_import, division, unicode_literals from __future__ import absolute_import, division, unicode_literals
from six import text_type from six import text_type
from collections import OrderedDict
from lxml import etree from lxml import etree
from ..treebuilders.etree import tag_regexp from ..treebuilders.etree import tag_regexp
@ -163,7 +165,7 @@ class TreeWalker(base.NonRecursiveTreeWalker):
else: else:
namespace = None namespace = None
tag = ensure_str(node.tag) tag = ensure_str(node.tag)
attrs = {} attrs = OrderedDict()
for name, value in list(node.attrib.items()): for name, value in list(node.attrib.items()):
name = ensure_str(name) name = ensure_str(name)
value = ensure_str(value) value = ensure_str(value)

Loading…
Cancel
Save