You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1732 lines
75 KiB

12 years ago
from __future__ import absolute_import, division, unicode_literals

# On Python 2, rebind chr to unichr so calls to chr() below always produce
# unicode characters. On Python 3 unichr does not exist and the NameError
# is deliberately swallowed (chr already returns str).
try:
    chr = unichr  # flake8: noqa
except NameError:
    pass

from collections import deque

from .constants import spaceCharacters
from .constants import entities
from .constants import asciiLetters, asciiUpper2Lower
from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters

from .inputstream import HTMLInputStream
from .trie import Trie

# Prefix trie over all named character references; used by consumeEntity
# to find the longest entity name matching the consumed characters.
entitiesTrie = Trie(entities)
13 years ago
class HTMLTokenizer(object):
    """This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.
    * self.state
      Holds a bound reference to the state method to invoke next; each
      state method consumes input and returns True, or False at EOF.
    * self.stream
      Points to the HTMLInputStream object supplying decoded characters.
    """
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
             lowercaseElementName=True, lowercaseAttrName=True, parser=None):
    # Wrap the raw stream; encoding detection (BOM, <meta>, chardet) is
    # delegated entirely to HTMLInputStream.
    self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
    self.parser = parser

    # Perform case conversions?
    self.lowercaseElementName = lowercaseElementName
    self.lowercaseAttrName = lowercaseAttrName

    # Setup the initial tokenizer state
    self.escapeFlag = False
    self.lastFourChars = []
    self.state = self.dataState
    self.escape = False

    # The current token being created
    self.currentToken = None
    super(HTMLTokenizer, self).__init__()
def __iter__(self):
    """ This is where the magic happens.

    We do our usually processing through the states and when we have a token
    to return we yield the token which pauses processing until the next token
    is requested.
    """
    self.tokenQueue = deque([])
    # Start processing. When EOF is reached self.state will return False
    # instead of True and the loop will terminate.
    while self.state():
        # Stream-level errors (encoding problems etc.) are surfaced first,
        # in the order they were recorded by the input stream.
        while self.stream.errors:
            yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
        while self.tokenQueue:
            yield self.tokenQueue.popleft()
def consumeNumberEntity(self, isHex):
    """This function returns either U+FFFD or the character based on the
    decimal or hexadecimal representation. It also discards ";" if present.
    If not present self.tokenQueue.append({"type": tokenTypes["ParseError"]}) is invoked.
    """
    allowed = digits
    radix = 10
    if isHex:
        allowed = hexDigits
        radix = 16

    charStack = []

    # Consume all the characters that are in range while making sure we
    # don't hit an EOF.
    c = self.stream.char()
    while c in allowed and c is not EOF:
        charStack.append(c)
        c = self.stream.char()

    # Convert the set of characters consumed to an int.
    charAsInt = int("".join(charStack), radix)

    # Certain characters get replaced with others
    if charAsInt in replacementCharacters:
        char = replacementCharacters[charAsInt]
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "illegal-codepoint-for-numeric-entity",
                                "datavars": {"charAsInt": charAsInt}})
    elif ((0xD800 <= charAsInt <= 0xDFFF) or
          (charAsInt > 0x10FFFF)):
        # Surrogates and out-of-range code points become U+FFFD.
        char = "\uFFFD"
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "illegal-codepoint-for-numeric-entity",
                                "datavars": {"charAsInt": charAsInt}})
    else:
        # Non-characters and most control characters are a parse error but
        # are still emitted as-is.
        # Should speed up this check somehow (e.g. move the set to a constant)
        if ((0x0001 <= charAsInt <= 0x0008) or
            (0x000E <= charAsInt <= 0x001F) or
            (0x007F <= charAsInt <= 0x009F) or
            (0xFDD0 <= charAsInt <= 0xFDEF) or
            charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                    0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                    0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                    0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                    0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                    0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                    0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                    0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                    0xFFFFF, 0x10FFFE, 0x10FFFF])):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        try:
            # Try/except needed as UCS-2 Python builds' unichar only works
            # within the BMP.
            char = chr(charAsInt)
        except ValueError:
            # Encode supplementary-plane code points as a surrogate pair.
            v = charAsInt - 0x10000
            char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

    # Discard the ; if present. Otherwise, put it back on the queue and
    # invoke parseError on parser.
    if c != ";":
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "numeric-entity-without-semicolon"})
        self.stream.unget(c)

    return char
def consumeEntity(self, allowedChar=None, fromAttribute=False):
    """Consume a character reference that begins at a just-read "&".

    The decoded text is appended to the current attribute value when
    fromAttribute is true, otherwise emitted as a character token.
    allowedChar, when given, is a character after which "&" is literal
    (used for attribute values).
    """
    # Initialise to the default output for when no entity is matched
    output = "&"

    charStack = [self.stream.char()]
    if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&")
            or (allowedChar is not None and allowedChar == charStack[0])):
        # "&" is not the start of a reference here; reprocess the char.
        self.stream.unget(charStack[0])
    elif charStack[0] == "#":
        # Read the next character to see if it's hex or decimal
        hex = False
        charStack.append(self.stream.char())
        if charStack[-1] in ("x", "X"):
            hex = True
            charStack.append(self.stream.char())

        # charStack[-1] should be the first digit
        if (hex and charStack[-1] in hexDigits) \
                or (not hex and charStack[-1] in digits):
            # At least one digit found, so consume the whole number
            self.stream.unget(charStack[-1])
            output = self.consumeNumberEntity(hex)
        else:
            # No digits found
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "expected-numeric-entity"})
            self.stream.unget(charStack.pop())
            output = "&" + "".join(charStack)
    else:
        # At this point in the process might have named entity. Entities
        # are stored in the global variable "entities".
        #
        # Consume characters and compare to these to a substring of the
        # entity names in the list until the substring no longer matches.
        while (charStack[-1] is not EOF):
            if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                break
            charStack.append(self.stream.char())

        # At this point we have a string that starts with some characters
        # that may match an entity
        # Try to find the longest entity the string will match to take care
        # of &noti for instance.
        try:
            entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
            entityLength = len(entityName)
        except KeyError:
            entityName = None

        if entityName is not None:
            if entityName[-1] != ";":
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "named-entity-without-semicolon"})
            # In attributes, a semicolon-less entity followed by an
            # alphanumeric or "=" is treated as literal text (legacy rule).
            if (entityName[-1] != ";" and fromAttribute and
                (charStack[entityLength] in asciiLetters or
                 charStack[entityLength] in digits or
                 charStack[entityLength] == "=")):
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)
            else:
                output = entities[entityName]
                self.stream.unget(charStack.pop())
                output += "".join(charStack[entityLength:])
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-named-entity"})
            self.stream.unget(charStack.pop())
            output = "&" + "".join(charStack)

    if fromAttribute:
        self.currentToken["data"][-1][1] += output
    else:
        if output in spaceCharacters:
            tokenType = "SpaceCharacters"
        else:
            tokenType = "Characters"
        self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})
def processEntityInAttribute(self, allowedChar):
    """Consume a character reference inside an attribute value.

    Stands in for the spec's "character reference in attribute value
    state"; the decoded text is appended to the current attribute.
    """
    self.consumeEntity(fromAttribute=True, allowedChar=allowedChar)
def emitCurrentToken(self):
    """This method is a generic handler for emitting the tags. It also sets
    the state to "data" because that's what's needed after a token has been
    emitted.
    """
    token = self.currentToken
    # Add token to the queue to be yielded
    if (token["type"] in tagTokenTypes):
        if self.lowercaseElementName:
            token["name"] = token["name"].translate(asciiUpper2Lower)
        if token["type"] == tokenTypes["EndTag"]:
            # End tags may carry neither attributes nor a self-closing
            # flag; report both as parse errors but still emit the token.
            if token["data"]:
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "attributes-in-end-tag"})
            if token["selfClosing"]:
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "self-closing-flag-on-end-tag"})
    self.tokenQueue.append(token)
    self.state = self.dataState
# Below are the various tokenizer states worked out.
def dataState(self):
    """Default state: ordinary character data between tags."""
    data = self.stream.char()
    if data == "&":
        self.state = self.entityDataState
    elif data == "<":
        self.state = self.tagOpenState
    elif data == "\u0000":
        # NULL is a parse error, but in the data state it is passed
        # through unchanged (RCDATA/RAWTEXT substitute U+FFFD instead).
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\u0000"})
    elif data is EOF:
        # Tokenization ends.
        return False
    elif data in spaceCharacters:
        # Directly after emitting a token you switch back to the "data
        # state". At that point spaceCharacters are important so they are
        # emitted separately.
        self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                data + self.stream.charsUntil(spaceCharacters, True)})
        # No need to update lastFourChars here, since the first space will
        # have already been appended to lastFourChars and will have broken
        # any <!-- or --> sequences
    else:
        chars = self.stream.charsUntil(("&", "<", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                data + chars})
    return True
def entityDataState(self):
    """Handle "&" seen in the data state: decode a character reference."""
    # The next state never depends on the entity, so set it up front,
    # then let consumeEntity queue whatever tokens it produces.
    self.state = self.dataState
    self.consumeEntity()
    return True
12 years ago
13 years ago
def rcdataState(self):
    """RCDATA state (e.g. <title>): entities work, "<" only matters for
    a matching end tag."""
    data = self.stream.char()
    if data == "&":
        self.state = self.characterReferenceInRcdata
    elif data == "<":
        self.state = self.rcdataLessThanSignState
    elif data == EOF:
        # Tokenization ends.
        return False
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD in RCDATA.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif data in spaceCharacters:
        # Directly after emitting a token you switch back to the "data
        # state". At that point spaceCharacters are important so they are
        # emitted separately.
        self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                data + self.stream.charsUntil(spaceCharacters, True)})
        # No need to update lastFourChars here, since the first space will
        # have already been appended to lastFourChars and will have broken
        # any <!-- or --> sequences
    else:
        chars = self.stream.charsUntil(("&", "<", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                data + chars})
    return True
def characterReferenceInRcdata(self):
    """Handle "&" seen in RCDATA: decode the reference, then resume."""
    # consumeEntity never touches self.state, so the transition can be
    # recorded before the entity is consumed.
    self.state = self.rcdataState
    self.consumeEntity()
    return True
12 years ago
13 years ago
def rawtextState(self):
    """RAWTEXT state (e.g. <style>): no entities; "<" may only start a
    matching end tag."""
    data = self.stream.char()
    if data == "<":
        self.state = self.rawtextLessThanSignState
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif data == EOF:
        # Tokenization ends.
        return False
    else:
        chars = self.stream.charsUntil(("<", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                data + chars})
    return True
12 years ago
13 years ago
def scriptDataState(self):
    """Script data state (<script>): like RAWTEXT but with extra
    "<!--" escaping rules handled in the less-than-sign state."""
    data = self.stream.char()
    if data == "<":
        self.state = self.scriptDataLessThanSignState
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif data == EOF:
        # Tokenization ends.
        return False
    else:
        chars = self.stream.charsUntil(("<", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                data + chars})
    return True
12 years ago
13 years ago
def plaintextState(self):
    """PLAINTEXT state: everything up to EOF is literal character data."""
    data = self.stream.char()
    if data == EOF:
        # Tokenization ends.
        return False
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                data + self.stream.charsUntil("\u0000")})
    return True
def tagOpenState(self):
    """ "<" was seen in the data state; decide what kind of markup
    (comment/doctype, end tag, start tag, or literal text) follows."""
    data = self.stream.char()
    if data == "!":
        self.state = self.markupDeclarationOpenState
    elif data == "/":
        self.state = self.closeTagOpenState
    elif data in asciiLetters:
        self.currentToken = {"type": tokenTypes["StartTag"],
                             "name": data, "data": [],
                             "selfClosing": False,
                             "selfClosingAcknowledged": False}
        self.state = self.tagNameState
    elif data == ">":
        # XXX In theory it could be something besides a tag name. But
        # do we really care?
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name-but-got-right-bracket"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
        self.state = self.dataState
    elif data == "?":
        # XXX In theory it could be something besides a tag name. But
        # do we really care?
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name-but-got-question-mark"})
        self.stream.unget(data)
        self.state = self.bogusCommentState
    else:
        # XXX
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-tag-name"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.stream.unget(data)
        self.state = self.dataState
    return True
def closeTagOpenState(self):
    """ "</" was seen; expect a tag name to follow."""
    data = self.stream.char()
    if data in asciiLetters:
        self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
                             "data": [], "selfClosing": False}
        self.state = self.tagNameState
    elif data == ">":
        # "</>" produces no token at all, only a parse error.
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-right-bracket"})
        self.state = self.dataState
    elif data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-eof"})
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
        self.state = self.dataState
    else:
        # XXX data can be _'_...
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-closing-tag-but-got-char",
                                "datavars": {"data": data}})
        self.stream.unget(data)
        self.state = self.bogusCommentState
    return True
def tagNameState(self):
    """Accumulate the current tag's name one character at a time."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.beforeAttributeNameState
    elif data == ">":
        self.emitCurrentToken()
    elif data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-tag-name"})
        self.state = self.dataState
    elif data == "/":
        self.state = self.selfClosingStartTagState
    elif data == "\u0000":
        # NULL in a tag name becomes U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["name"] += "\uFFFD"
    else:
        self.currentToken["name"] += data
        # (Don't use charsUntil here, because tag names are
        # very short and it's faster to not do anything fancy)
    return True
12 years ago
13 years ago
def rcdataLessThanSignState(self):
    """ "<" was seen in RCDATA; check whether a "</" end tag follows."""
    ch = self.stream.char()
    if ch != "/":
        # Not an end tag after all: the "<" was literal text and ch is
        # reprocessed in the RCDATA state.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.stream.unget(ch)
        self.state = self.rcdataState
        return True
    self.temporaryBuffer = ""
    self.state = self.rcdataEndTagOpenState
    return True
12 years ago
13 years ago
def rcdataEndTagOpenState(self):
    """After "</" in RCDATA: a letter starts a candidate end-tag name."""
    data = self.stream.char()
    if data in asciiLetters:
        self.temporaryBuffer += data
        self.state = self.rcdataEndTagNameState
    else:
        # Not a tag name: "</" was literal text; reprocess data.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
        self.stream.unget(data)
        self.state = self.rcdataState
    return True
12 years ago
13 years ago
def rcdataEndTagNameState(self):
    """Collect a candidate end-tag name inside RCDATA.

    The buffered name only closes the element when it matches the
    current open tag (an "appropriate" end tag); otherwise the whole
    "</name" sequence is emitted as literal text.
    """
    appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
    data = self.stream.char()
    if data in spaceCharacters and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.state = self.beforeAttributeNameState
    elif data == "/" and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.state = self.selfClosingStartTagState
    elif data == ">" and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.emitCurrentToken()
        self.state = self.dataState
    elif data in asciiLetters:
        self.temporaryBuffer += data
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(data)
        self.state = self.rcdataState
    return True
12 years ago
13 years ago
def rawtextLessThanSignState(self):
    """ "<" was seen in RAWTEXT; check whether a "</" end tag follows."""
    ch = self.stream.char()
    if ch != "/":
        # Literal "<": emit it and reprocess ch in the RAWTEXT state.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.stream.unget(ch)
        self.state = self.rawtextState
        return True
    self.temporaryBuffer = ""
    self.state = self.rawtextEndTagOpenState
    return True
12 years ago
13 years ago
def rawtextEndTagOpenState(self):
    """After "</" in RAWTEXT: a letter starts a candidate end-tag name."""
    data = self.stream.char()
    if data in asciiLetters:
        self.temporaryBuffer += data
        self.state = self.rawtextEndTagNameState
    else:
        # Not a tag name: "</" was literal text; reprocess data.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
        self.stream.unget(data)
        self.state = self.rawtextState
    return True
12 years ago
13 years ago
def rawtextEndTagNameState(self):
    """Collect a candidate end-tag name inside RAWTEXT.

    Only an "appropriate" end tag (matching the current open tag)
    closes the element; otherwise "</name" is literal text.
    """
    appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
    data = self.stream.char()
    if data in spaceCharacters and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.state = self.beforeAttributeNameState
    elif data == "/" and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.state = self.selfClosingStartTagState
    elif data == ">" and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.emitCurrentToken()
        self.state = self.dataState
    elif data in asciiLetters:
        self.temporaryBuffer += data
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(data)
        self.state = self.rawtextState
    return True
12 years ago
13 years ago
def scriptDataLessThanSignState(self):
    """ "<" in script data: may begin "</" end tag or "<!--" escape."""
    ch = self.stream.char()
    if ch == "/":
        # Possible "</..." end tag.
        self.temporaryBuffer = ""
        self.state = self.scriptDataEndTagOpenState
        return True
    if ch == "!":
        # "<!" may begin an escaped section ("<!--").
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
        self.state = self.scriptDataEscapeStartState
        return True
    # Plain "<": emit it and reprocess ch in the script data state.
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
    self.stream.unget(ch)
    self.state = self.scriptDataState
    return True
12 years ago
13 years ago
def scriptDataEndTagOpenState(self):
    """After "</" in script data: a letter starts a candidate name."""
    data = self.stream.char()
    if data in asciiLetters:
        self.temporaryBuffer += data
        self.state = self.scriptDataEndTagNameState
    else:
        # Not a tag name: "</" was literal text; reprocess data.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
        self.stream.unget(data)
        self.state = self.scriptDataState
    return True
12 years ago
13 years ago
def scriptDataEndTagNameState(self):
    """Collect a candidate end-tag name inside script data.

    Only an "appropriate" end tag (matching the current open tag)
    closes the element; otherwise "</name" is literal text.
    """
    appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
    data = self.stream.char()
    if data in spaceCharacters and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.state = self.beforeAttributeNameState
    elif data == "/" and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.state = self.selfClosingStartTagState
    elif data == ">" and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.emitCurrentToken()
        self.state = self.dataState
    elif data in asciiLetters:
        self.temporaryBuffer += data
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(data)
        self.state = self.scriptDataState
    return True
12 years ago
13 years ago
def scriptDataEscapeStartState(self):
    """After "<!" in script data: a "-" continues toward "<!--"."""
    ch = self.stream.char()
    if ch != "-":
        # Escape did not materialise; reprocess in script data state.
        self.stream.unget(ch)
        self.state = self.scriptDataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
    self.state = self.scriptDataEscapeStartDashState
    return True
12 years ago
13 years ago
def scriptDataEscapeStartDashState(self):
    """After "<!-" in script data: a second "-" completes "<!--"."""
    ch = self.stream.char()
    if ch != "-":
        # Escape did not materialise; reprocess in script data state.
        self.stream.unget(ch)
        self.state = self.scriptDataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
    self.state = self.scriptDataEscapedDashDashState
    return True
12 years ago
13 years ago
def scriptDataEscapedState(self):
    """Inside a "<!--" escaped section of script data."""
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataEscapedDashState
    elif data == "<":
        self.state = self.scriptDataEscapedLessThanSignState
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif data == EOF:
        self.state = self.dataState
    else:
        chars = self.stream.charsUntil(("<", "-", "\u0000"))
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                data + chars})
    return True
12 years ago
13 years ago
def scriptDataEscapedDashState(self):
    """One "-" seen inside an escaped script section."""
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataEscapedDashDashState
    elif data == "<":
        self.state = self.scriptDataEscapedLessThanSignState
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataEscapedState
    elif data == EOF:
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.state = self.scriptDataEscapedState
    return True
12 years ago
13 years ago
def scriptDataEscapedDashDashState(self):
    """"--" seen inside an escaped script section; ">" ends the escape."""
    data = self.stream.char()
    if data == "-":
        # Additional dashes stay in this state.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
    elif data == "<":
        self.state = self.scriptDataEscapedLessThanSignState
    elif data == ">":
        # "-->" closes the escaped section.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
        self.state = self.scriptDataState
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataEscapedState
    elif data == EOF:
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.state = self.scriptDataEscapedState
    return True
12 years ago
13 years ago
def scriptDataEscapedLessThanSignState(self):
    """ "<" inside an escaped script section."""
    data = self.stream.char()
    if data == "/":
        self.temporaryBuffer = ""
        self.state = self.scriptDataEscapedEndTagOpenState
    elif data in asciiLetters:
        # A letter may begin a nested "<script" (double-escape detection).
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
        self.temporaryBuffer = data
        self.state = self.scriptDataDoubleEscapeStartState
    else:
        # Literal "<"; reprocess data in the escaped state.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.stream.unget(data)
        self.state = self.scriptDataEscapedState
    return True
12 years ago
13 years ago
def scriptDataEscapedEndTagOpenState(self):
    """After "</" in an escaped script section."""
    data = self.stream.char()
    if data in asciiLetters:
        self.temporaryBuffer = data
        self.state = self.scriptDataEscapedEndTagNameState
    else:
        # Not a tag name: "</" was literal text; reprocess data.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
        self.stream.unget(data)
        self.state = self.scriptDataEscapedState
    return True
12 years ago
13 years ago
def scriptDataEscapedEndTagNameState(self):
    """Collect a candidate end-tag name inside an escaped script section.

    Only an "appropriate" end tag (matching the current open tag)
    closes the element; otherwise "</name" is literal text.
    """
    appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
    data = self.stream.char()
    if data in spaceCharacters and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.state = self.beforeAttributeNameState
    elif data == "/" and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.state = self.selfClosingStartTagState
    elif data == ">" and appropriate:
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        self.emitCurrentToken()
        self.state = self.dataState
    elif data in asciiLetters:
        self.temporaryBuffer += data
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "</" + self.temporaryBuffer})
        self.stream.unget(data)
        self.state = self.scriptDataEscapedState
    return True
12 years ago
13 years ago
def scriptDataDoubleEscapeStartState(self):
    """Decide whether a nested "<script" double-escapes the section."""
    data = self.stream.char()
    if data in (spaceCharacters | frozenset(("/", ">"))):
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        # Name is complete; "script" (case-insensitive) double-escapes.
        if self.temporaryBuffer.lower() == "script":
            self.state = self.scriptDataDoubleEscapedState
        else:
            self.state = self.scriptDataEscapedState
    elif data in asciiLetters:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.temporaryBuffer += data
    else:
        self.stream.unget(data)
        self.state = self.scriptDataEscapedState
    return True
12 years ago
13 years ago
def scriptDataDoubleEscapedState(self):
    """Inside a double-escaped script section (nested <script> inside
    "<!--"); all markup characters are emitted as text."""
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataDoubleEscapedDashState
    elif data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
    elif data == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
    return True
12 years ago
13 years ago
def scriptDataDoubleEscapedDashState(self):
    """One "-" seen inside a double-escaped script section."""
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        self.state = self.scriptDataDoubleEscapedDashDashState
    elif data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataDoubleEscapedState
    elif data == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.state = self.scriptDataDoubleEscapedState
    return True
12 years ago
def scriptDataDoubleEscapedDashDashState(self):
    """"--" seen inside a double-escaped section; ">" ends the escape."""
    data = self.stream.char()
    if data == "-":
        # Additional dashes stay in this state.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
    elif data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif data == ">":
        # "-->" drops all the way back to plain script data.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
        self.state = self.scriptDataState
    elif data == "\u0000":
        # NULL is a parse error; substitute U+FFFD.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": "\uFFFD"})
        self.state = self.scriptDataDoubleEscapedState
    elif data == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.state = self.scriptDataDoubleEscapedState
    return True
12 years ago
13 years ago
def scriptDataDoubleEscapedLessThanSignState(self):
    """ "<" inside a double-escaped section; "/" may begin "</script"."""
    ch = self.stream.char()
    if ch != "/":
        # No end-tag candidate; reprocess ch in the double-escaped state.
        self.stream.unget(ch)
        self.state = self.scriptDataDoubleEscapedState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
    self.temporaryBuffer = ""
    self.state = self.scriptDataDoubleEscapeEndState
    return True
12 years ago
13 years ago
def scriptDataDoubleEscapeEndState(self):
    """Decide whether "</script" ends the double-escaped section."""
    data = self.stream.char()
    if data in (spaceCharacters | frozenset(("/", ">"))):
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        # Name is complete; "script" (case-insensitive) drops back to
        # the singly-escaped state.
        if self.temporaryBuffer.lower() == "script":
            self.state = self.scriptDataEscapedState
        else:
            self.state = self.scriptDataDoubleEscapedState
    elif data in asciiLetters:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.temporaryBuffer += data
    else:
        self.stream.unget(data)
        self.state = self.scriptDataDoubleEscapedState
    return True
def beforeAttributeNameState(self):
    """Before-attribute-name state: start a new attribute on the
    current tag token, close the tag, or report invalid characters.

    New attributes are stored as two-item [name, value] lists appended
    to ``self.currentToken["data"]``.
    """
    def _perror(code):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": code})

    char = self.stream.char()
    if char in spaceCharacters:
        # Consume the whole run of whitespace at once.
        self.stream.charsUntil(spaceCharacters, True)
        return True
    if char in asciiLetters:
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char == "/":
        self.state = self.selfClosingStartTagState
        return True
    if char in ("'", '"', "=", "<"):
        # Parse error, but the character still begins an attribute name.
        _perror("invalid-character-in-attribute-name")
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
        return True
    if char == "\u0000":
        # NULs are replaced by U+FFFD per the spec.
        _perror("invalid-codepoint")
        self.currentToken["data"].append(["\uFFFD", ""])
        self.state = self.attributeNameState
        return True
    if char is EOF:
        _perror("expected-attribute-name-but-got-eof")
        self.state = self.dataState
        return True
    self.currentToken["data"].append([char, ""])
    self.state = self.attributeNameState
    return True
def attributeNameState(self):
    """Attribute-name state: accumulate the name of the attribute that
    was started in beforeAttributeNameState.

    The name being built lives in ``self.currentToken["data"][-1][0]``.
    ``leavingThisState`` stays False while we keep appending characters;
    once it is True the finished name is (optionally) lowercased and
    checked against earlier attributes for duplicates.  ``emitToken``
    defers emitting on ">" until after that duplicate check runs.
    """
    data = self.stream.char()
    leavingThisState = True
    emitToken = False
    if data == "=":
        self.state = self.beforeAttributeValueState
    elif data in asciiLetters:
        # Grab the whole run of letters in one charsUntil call.
        self.currentToken["data"][-1][0] += data +\
            self.stream.charsUntil(asciiLetters, True)
        leavingThisState = False
    elif data == ">":
        # XXX If we emit here the attributes are converted to a dict
        # without being checked and when the code below runs we error
        # because data is a dict not a list
        emitToken = True
    elif data in spaceCharacters:
        self.state = self.afterAttributeNameState
    elif data == "/":
        self.state = self.selfClosingStartTagState
    elif data == "\u0000":
        # NUL is a parse error and becomes U+FFFD in the name.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"][-1][0] += "\uFFFD"
        leavingThisState = False
    elif data in ("'", '"', "<"):
        # Invalid in a name, but still appended after the parse error.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data":
                                "invalid-character-in-attribute-name"})
        self.currentToken["data"][-1][0] += data
        leavingThisState = False
    elif data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-attribute-name"})
        self.state = self.dataState
    else:
        self.currentToken["data"][-1][0] += data
        leavingThisState = False

    if leavingThisState:
        # Attributes are not dropped at this stage. That happens when the
        # start tag token is emitted so values can still be safely appended
        # to attributes, but we do want to report the parse error in time.
        if self.lowercaseAttrName:
            self.currentToken["data"][-1][0] = (
                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
        # Linear scan of the earlier attributes for a duplicate name.
        for name, value in self.currentToken["data"][:-1]:
            if self.currentToken["data"][-1][0] == name:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "duplicate-attribute"})
                break
        # XXX Fix for above XXX
        if emitToken:
            self.emitCurrentToken()
    return True
def afterAttributeNameState(self):
    """After-attribute-name state: whitespace has followed an attribute
    name.  Either a "=" introduces its value, the tag closes, or a new
    attribute begins on the current tag token.
    """
    def _perror(code):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": code})

    char = self.stream.char()
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
        return True
    if char == "=":
        self.state = self.beforeAttributeValueState
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char in asciiLetters:
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
        return True
    if char == "/":
        self.state = self.selfClosingStartTagState
        return True
    if char == "\u0000":
        # NUL starts a new attribute named U+FFFD.
        _perror("invalid-codepoint")
        self.currentToken["data"].append(["\uFFFD", ""])
        self.state = self.attributeNameState
        return True
    if char in ("'", '"', "<"):
        # Parse error, but the character still opens a new attribute.
        _perror("invalid-character-after-attribute-name")
        self.currentToken["data"].append([char, ""])
        self.state = self.attributeNameState
        return True
    if char is EOF:
        _perror("expected-end-of-tag-but-got-eof")
        self.state = self.dataState
        return True
    self.currentToken["data"].append([char, ""])
    self.state = self.attributeNameState
    return True
def beforeAttributeValueState(self):
    """Before-attribute-value state: choose the quoting flavour of the
    value that follows "=", or fall into the unquoted-value state.

    The value being built lives in ``self.currentToken["data"][-1][1]``.
    """
    def _perror(code):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": code})

    char = self.stream.char()
    if char in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
        return True
    if char == "\"":
        self.state = self.attributeValueDoubleQuotedState
        return True
    if char == "&":
        # The "&" belongs to the unquoted value; push it back so the
        # unquoted state can run entity processing on it.
        self.state = self.attributeValueUnQuotedState
        self.stream.unget(char)
        return True
    if char == "'":
        self.state = self.attributeValueSingleQuotedState
        return True
    if char == ">":
        _perror("expected-attribute-value-but-got-right-bracket")
        self.emitCurrentToken()
        return True
    if char == "\u0000":
        _perror("invalid-codepoint")
        self.currentToken["data"][-1][1] += "\uFFFD"
        self.state = self.attributeValueUnQuotedState
        return True
    if char in ("=", "<", "`"):
        # Parse error, but the character still starts the unquoted value.
        _perror("equals-in-unquoted-attribute-value")
        self.currentToken["data"][-1][1] += char
        self.state = self.attributeValueUnQuotedState
        return True
    if char is EOF:
        _perror("expected-attribute-value-but-got-eof")
        self.state = self.dataState
        return True
    self.currentToken["data"][-1][1] += char
    self.state = self.attributeValueUnQuotedState
    return True
def attributeValueDoubleQuotedState(self):
    """Attribute-value (double-quoted) state: accumulate the value up
    to the closing '"', expanding entities on "&" and replacing NUL
    with U+FFFD.
    """
    char = self.stream.char()
    if char == "\"":
        self.state = self.afterAttributeValueState
    elif char == "&":
        self.processEntityInAttribute('"')
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"][-1][1] += "\uFFFD"
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-attribute-value-double-quote"})
        self.state = self.dataState
    else:
        # Bulk-read plain characters until the next special one.
        rest = self.stream.charsUntil(("\"", "&", "\u0000"))
        self.currentToken["data"][-1][1] += char + rest
    return True
def attributeValueSingleQuotedState(self):
    """Attribute-value (single-quoted) state: mirror of the
    double-quoted state with "'" as the terminator.
    """
    char = self.stream.char()
    if char == "'":
        self.state = self.afterAttributeValueState
    elif char == "&":
        self.processEntityInAttribute("'")
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"][-1][1] += "\uFFFD"
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-attribute-value-single-quote"})
        self.state = self.dataState
    else:
        # Bulk-read plain characters until the next special one.
        rest = self.stream.charsUntil(("'", "&", "\u0000"))
        self.currentToken["data"][-1][1] += char + rest
    return True
def attributeValueUnQuotedState(self):
    """Attribute-value (unquoted) state: the value runs until
    whitespace or ">"; several characters are parse errors but are
    kept in the value anyway.
    """
    def _perror(code):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": code})

    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
        return True
    if char == "&":
        self.processEntityInAttribute(">")
        return True
    if char == ">":
        self.emitCurrentToken()
        return True
    if char in ('"', "'", "=", "<", "`"):
        # Parse error, but the character is still part of the value.
        _perror("unexpected-character-in-unquoted-attribute-value")
        self.currentToken["data"][-1][1] += char
        return True
    if char == "\u0000":
        _perror("invalid-codepoint")
        self.currentToken["data"][-1][1] += "\uFFFD"
        return True
    if char is EOF:
        _perror("eof-in-attribute-value-no-quotes")
        self.state = self.dataState
        return True
    # Bulk-read plain characters until the next special one.
    stoppers = frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters
    self.currentToken["data"][-1][1] += char + self.stream.charsUntil(stoppers)
    return True
def afterAttributeValueState(self):
    """After-attribute-value (quoted) state: a quoted value just
    closed; only whitespace, ">" or "/" may legally follow.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeAttributeNameState
    elif char == ">":
        self.emitCurrentToken()
    elif char == "/":
        self.state = self.selfClosingStartTagState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-EOF-after-attribute-value"})
        self.stream.unget(char)
        self.state = self.dataState
    else:
        # Anything else is reprocessed as the start of a new attribute.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-character-after-attribute-value"})
        self.stream.unget(char)
        self.state = self.beforeAttributeNameState
    return True
def selfClosingStartTagState(self):
    """Self-closing-start-tag state: a "/" was seen inside a tag; only
    ">" makes the tag self-closing, everything else is a parse error
    and gets reprocessed.
    """
    char = self.stream.char()
    if char == ">":
        self.currentToken["selfClosing"] = True
        self.emitCurrentToken()
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data":
                                "unexpected-EOF-after-solidus-in-tag"})
        self.stream.unget(char)
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                            "data": "unexpected-character-after-solidus-in-tag"})
    self.stream.unget(char)
    self.state = self.beforeAttributeNameState
    return True
def bogusCommentState(self):
    """Bogus-comment state: swallow everything up to the next ">" (or
    EOF) and emit it as a comment token, with NULs replaced by U+FFFD.
    """
    # charsUntil stops at ">" or EOF by itself; no explicit EOF check
    # is needed here.
    text = self.stream.charsUntil(">").replace("\u0000", "\uFFFD")
    self.tokenQueue.append(
        {"type": tokenTypes["Comment"], "data": text})
    # Consume the terminating ">" (or the EOF marker) itself.
    self.stream.char()
    self.state = self.dataState
    return True
def markupDeclarationOpenState(self):
    """Markup-declaration-open state: "<!" has been consumed.

    Looks ahead for "--" (comment), a case-insensitive "DOCTYPE", or
    "[CDATA[" (only valid when the current node is in a foreign
    namespace).  Characters read during a failed match are kept on
    ``charStack`` so they can be pushed back in reverse order before
    falling through to the bogus-comment state.
    """
    charStack = [self.stream.char()]
    if charStack[-1] == "-":
        charStack.append(self.stream.char())
        if charStack[-1] == "-":
            # "<!--": start a comment token.
            self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
            self.state = self.commentStartState
            return True
    elif charStack[-1] in ('d', 'D'):
        # Try to match the rest of "DOCTYPE" case-insensitively.
        matched = True
        for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                         ('y', 'Y'), ('p', 'P'), ('e', 'E')):
            charStack.append(self.stream.char())
            if charStack[-1] not in expected:
                matched = False
                break
        if matched:
            self.currentToken = {"type": tokenTypes["Doctype"],
                                 "name": "",
                                 "publicId": None, "systemId": None,
                                 "correct": True}
            self.state = self.doctypeState
            return True
    elif (charStack[-1] == "[" and
          self.parser is not None and
          self.parser.tree.openElements and
          self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
        # "[CDATA[" is only recognised inside foreign (SVG/MathML) content.
        matched = True
        for expected in ["C", "D", "A", "T", "A", "["]:
            charStack.append(self.stream.char())
            if charStack[-1] != expected:
                matched = False
                break
        if matched:
            self.state = self.cdataSectionState
            return True
    # No construct matched: report the error, push every consumed
    # character back (LIFO, restoring stream order) and reparse the
    # lot as a bogus comment.
    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                            "expected-dashes-or-doctype"})
    while charStack:
        self.stream.unget(charStack.pop())
    self.state = self.bogusCommentState
    return True
def commentStartState(self):
    """Comment-start state: first character after "<!--"."""
    char = self.stream.char()
    if char == "-":
        self.state = self.commentStartDashState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "\uFFFD"
    elif char == ">":
        # "<!-->" is an (incorrect) empty comment.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "incorrect-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += char
        self.state = self.commentState
    return True
12 years ago
13 years ago
def commentStartDashState(self):
    """Comment-start-dash state: a single "-" right after "<!--".

    The pending "-" is prepended to whatever is appended to the
    comment data here.
    """
    char = self.stream.char()
    if char == "-":
        self.state = self.commentEndState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "-\uFFFD"
    elif char == ">":
        # "<!--->" is an (incorrect) empty comment.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "incorrect-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += "-" + char
        self.state = self.commentState
    return True
def commentState(self):
    """Comment state: accumulate comment text until "-" (possible end)
    or EOF, replacing NUL with U+FFFD.
    """
    char = self.stream.char()
    if char == "-":
        self.state = self.commentEndDashState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "\uFFFD"
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        # Bulk-read plain text up to the next "-" or NUL.
        rest = self.stream.charsUntil(("-", "\u0000"))
        self.currentToken["data"] += char + rest
    return True
def commentEndDashState(self):
    """Comment-end-dash state: one "-" seen inside a comment; a second
    one may end it.  The pending "-" is folded back into the comment
    data if the comment continues.
    """
    char = self.stream.char()
    if char == "-":
        self.state = self.commentEndState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "-\uFFFD"
        self.state = self.commentState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment-end-dash"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += "-" + char
        self.state = self.commentState
    return True
def commentEndState(self):
    """Comment-end state: "--" has been seen; ">" finishes the comment.

    Any other character is a parse error; the pending "--" is folded
    back into the comment data where the comment continues.
    """
    def _perror(code):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": code})

    char = self.stream.char()
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == "\u0000":
        _perror("invalid-codepoint")
        self.currentToken["data"] += "--\uFFFD"
        self.state = self.commentState
        return True
    if char == "!":
        _perror("unexpected-bang-after-double-dash-in-comment")
        self.state = self.commentEndBangState
        return True
    if char == "-":
        # "---": keep one dash pending, absorb the extra one.
        _perror("unexpected-dash-after-double-dash-in-comment")
        self.currentToken["data"] += char
        return True
    if char is EOF:
        _perror("eof-in-comment-double-dash")
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    # XXX
    _perror("unexpected-char-in-comment")
    self.currentToken["data"] += "--" + char
    self.state = self.commentState
    return True
def commentEndBangState(self):
    """Comment-end-bang state: "--!" has been seen inside a comment.

    ">" still closes the comment; otherwise the literal "--!" is
    folded back into the comment data.
    """
    char = self.stream.char()
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char == "-":
        self.currentToken["data"] += "--!"
        self.state = self.commentEndDashState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["data"] += "--!\uFFFD"
        self.state = self.commentState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-comment-end-bang-state"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["data"] += "--!" + char
        self.state = self.commentState
    return True
def doctypeState(self):
    """DOCTYPE state: right after the "DOCTYPE" keyword; whitespace is
    expected before the name.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypeNameState
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "expected-doctype-name-but-got-eof"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    # Missing space is a parse error but the character still belongs
    # to the doctype name, so push it back.
    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                            "data": "need-space-after-doctype"})
    self.stream.unget(char)
    self.state = self.beforeDoctypeNameState
    return True
def beforeDoctypeNameState(self):
    """Before-DOCTYPE-name state: skip whitespace, then start the
    doctype name (or flag the doctype as incorrect).
    """
    def _perror(code):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": code})

    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == ">":
        _perror("expected-doctype-name-but-got-right-bracket")
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == "\u0000":
        _perror("invalid-codepoint")
        self.currentToken["name"] = "\uFFFD"
        self.state = self.doctypeNameState
        return True
    if char is EOF:
        _perror("expected-doctype-name-but-got-eof")
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.currentToken["name"] = char
    self.state = self.doctypeNameState
    return True
def doctypeNameState(self):
    """DOCTYPE-name state: accumulate the name; it is ASCII-lowercased
    whenever this state is left.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
        self.state = self.afterDoctypeNameState
    elif char == ">":
        self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["name"] += "\uFFFD"
        self.state = self.doctypeNameState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype-name"})
        self.currentToken["correct"] = False
        self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["name"] += char
    return True
def afterDoctypeNameState(self):
    """After-DOCTYPE-name state.

    Looks ahead for a case-insensitive "PUBLIC" or "SYSTEM" keyword.
    NOTE: the lookahead loops reuse and overwrite ``data``, so after a
    failed match only the last (mismatching) character is pushed back;
    the earlier letters are deliberately discarded as bogus-doctype
    garbage (see the comment below).
    """
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data is EOF:
        self.currentToken["correct"] = False
        self.stream.unget(data)
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-doctype"})
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        if data in ("p", "P"):
            # Try to complete "PUBLIC" case-insensitively.
            matched = True
            for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                             ("i", "I"), ("c", "C")):
                data = self.stream.char()
                if data not in expected:
                    matched = False
                    break
            if matched:
                self.state = self.afterDoctypePublicKeywordState
                return True
        elif data in ("s", "S"):
            # Try to complete "SYSTEM" case-insensitively.
            matched = True
            for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                             ("e", "E"), ("m", "M")):
                data = self.stream.char()
                if data not in expected:
                    matched = False
                    break
            if matched:
                self.state = self.afterDoctypeSystemKeywordState
                return True
        # All the characters read before the current 'data' will be
        # [a-zA-Z], so they're garbage in the bogus doctype and can be
        # discarded; only the latest character might be '>' or EOF
        # and needs to be ungetted
        self.stream.unget(data)
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-space-or-right-bracket-in-doctype", "datavars":
                                {"data": data}})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
    return True
12 years ago
13 years ago
def afterDoctypePublicKeywordState(self):
    """After-DOCTYPE-public-keyword state: whitespace should separate
    "PUBLIC" from the quoted identifier; a quote here is a parse error
    but is reprocessed in the next state.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypePublicIdentifierState
    elif char in ("'", '"'):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-char-in-doctype"})
        self.stream.unget(char)
        self.state = self.beforeDoctypePublicIdentifierState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.stream.unget(char)
        self.state = self.beforeDoctypePublicIdentifierState
    return True
def beforeDoctypePublicIdentifierState(self):
    """Before-DOCTYPE-public-identifier state: expect the opening
    quote of the public identifier.
    """
    def _perror(code):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": code})

    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == "\"":
        self.currentToken["publicId"] = ""
        self.state = self.doctypePublicIdentifierDoubleQuotedState
        return True
    if char == "'":
        self.currentToken["publicId"] = ""
        self.state = self.doctypePublicIdentifierSingleQuotedState
        return True
    if char == ">":
        _perror("unexpected-end-of-doctype")
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        _perror("eof-in-doctype")
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    _perror("unexpected-char-in-doctype")
    self.currentToken["correct"] = False
    self.state = self.bogusDoctypeState
    return True
def doctypePublicIdentifierDoubleQuotedState(self):
    """DOCTYPE public identifier (double-quoted): accumulate up to the
    closing '"'; ">" and EOF abort the doctype.
    """
    char = self.stream.char()
    if char == "\"":
        self.state = self.afterDoctypePublicIdentifierState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["publicId"] += "\uFFFD"
    elif char == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-end-of-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["publicId"] += char
    return True
def doctypePublicIdentifierSingleQuotedState(self):
    """DOCTYPE public identifier (single-quoted): mirror of the
    double-quoted state with "'" as terminator.
    """
    char = self.stream.char()
    if char == "'":
        self.state = self.afterDoctypePublicIdentifierState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["publicId"] += "\uFFFD"
    elif char == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-end-of-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["publicId"] += char
    return True
def afterDoctypePublicIdentifierState(self):
    """After-DOCTYPE-public-identifier state: a system identifier may
    follow; a quote without intervening whitespace is a parse error
    but still starts it.
    """
    def _perror(code):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": code})

    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.betweenDoctypePublicAndSystemIdentifiersState
        return True
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char == '"':
        _perror("unexpected-char-in-doctype")
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
        return True
    if char == "'":
        _perror("unexpected-char-in-doctype")
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
        return True
    if char is EOF:
        _perror("eof-in-doctype")
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    _perror("unexpected-char-in-doctype")
    self.currentToken["correct"] = False
    self.state = self.bogusDoctypeState
    return True
12 years ago
13 years ago
def betweenDoctypePublicAndSystemIdentifiersState(self):
    """Between-DOCTYPE-public-and-system-identifiers state.

    Whitespace is skipped; ">" finishes the doctype; a quote opens the
    system identifier; anything else flags the doctype as incorrect
    and falls through to the bogus-doctype state.
    """
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data == '"':
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif data == "'":
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    # Fix: use identity comparison with the EOF sentinel, matching every
    # sibling state ("data is EOF"); "==" worked only incidentally.
    elif data is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
    return True
12 years ago
13 years ago
def afterDoctypeSystemKeywordState(self):
    """After-DOCTYPE-system-keyword state: whitespace should separate
    "SYSTEM" from the quoted identifier; a quote here is a parse error
    but is reprocessed in the next state.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypeSystemIdentifierState
    elif char in ("'", '"'):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-char-in-doctype"})
        self.stream.unget(char)
        self.state = self.beforeDoctypeSystemIdentifierState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.stream.unget(char)
        self.state = self.beforeDoctypeSystemIdentifierState
    return True
12 years ago
13 years ago
def beforeDoctypeSystemIdentifierState(self):
    """Before-DOCTYPE-system-identifier state: expect the opening
    quote of the system identifier.
    """
    def _perror(code):
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": code})

    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == "\"":
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
        return True
    if char == "'":
        self.currentToken["systemId"] = ""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
        return True
    if char == ">":
        _perror("unexpected-char-in-doctype")
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        _perror("eof-in-doctype")
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    _perror("unexpected-char-in-doctype")
    self.currentToken["correct"] = False
    self.state = self.bogusDoctypeState
    return True
def doctypeSystemIdentifierDoubleQuotedState(self):
    """DOCTYPE system identifier (double-quoted): accumulate up to the
    closing '"'; ">" and EOF abort the doctype.
    """
    char = self.stream.char()
    if char == "\"":
        self.state = self.afterDoctypeSystemIdentifierState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["systemId"] += "\uFFFD"
    elif char == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-end-of-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["systemId"] += char
    return True
def doctypeSystemIdentifierSingleQuotedState(self):
    """DOCTYPE system identifier (single-quoted): mirror of the
    double-quoted state with "'" as terminator.
    """
    char = self.stream.char()
    if char == "'":
        self.state = self.afterDoctypeSystemIdentifierState
    elif char == "\u0000":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "invalid-codepoint"})
        self.currentToken["systemId"] += "\uFFFD"
    elif char == ">":
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-end-of-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        self.currentToken["systemId"] += char
    return True
def afterDoctypeSystemIdentifierState(self):
    """After-DOCTYPE-system-identifier state: only whitespace or ">"
    may follow.  Note: a stray character here is a parse error but
    does NOT mark the doctype incorrect (matches the other branches'
    existing behaviour).
    """
    char = self.stream.char()
    if char in spaceCharacters:
        return True
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                            "data": "unexpected-char-in-doctype"})
    self.state = self.bogusDoctypeState
    return True
def bogusDoctypeState(self):
    """Bogus-DOCTYPE state: silently discard characters until ">" or
    EOF, then emit the (already flagged) doctype token.
    """
    char = self.stream.char()
    if char == ">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif char is EOF:
        # XXX EMIT
        self.stream.unget(char)
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    return True
def cdataSectionState(self):
    """CDATA-section state: collect text until "]]>" or EOF and emit
    it as a single character token.

    The loop gathers text in pieces: everything up to a "]", then up
    to a ">", then inspects the ">" itself — the section only ends if
    the piece before that ">" ended in "]]".  NULs are reported and
    replaced here rather than in the parser.
    """
    data = []
    while True:
        data.append(self.stream.charsUntil("]"))
        data.append(self.stream.charsUntil(">"))
        char = self.stream.char()
        if char == EOF:
            break
        else:
            assert char == ">"
            if data[-1][-2:] == "]]":
                # Drop the trailing "]]" — it is part of the "]]>"
                # terminator, not of the CDATA text.
                data[-1] = data[-1][:-2]
                break
            else:
                # A ">" not preceded by "]]" is ordinary content.
                data.append(char)
    data = "".join(data)
    # Deal with null here rather than in the parser
    nullCount = data.count("\u0000")
    if nullCount > 0:
        # One parse error per NUL encountered.
        for i in range(nullCount):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
        data = data.replace("\u0000", "\uFFFD")
    if data:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": data})
    self.state = self.dataState
    return True