You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

608 lines
24 KiB

#!/usr/bin/env python
"""Python codec for CSS."""
__docformat__ = 'restructuredtext'
__author__ = 'Walter Doerwald'
__version__ = '$Id: util.py 1114 2008-03-05 13:22:59Z cthedot $'
import sys
import codecs
import marshal
# We're using bits to store all possible candidate encodings (or variants, i.e.
# we have two bits for the variants of UTF-16 and two for the
# variants of UTF-32).
#
# Prefixes for various CSS encodings
# UTF-8-SIG xEF xBB xBF
# UTF-16 (LE) xFF xFE ~x00|~x00
# UTF-16 (BE) xFE xFF
# UTF-16-LE @ x00 @ x00
# UTF-16-BE x00 @
# UTF-32 (LE) xFF xFE x00 x00
# UTF-32 (BE) x00 x00 xFE xFF
# UTF-32-LE @ x00 x00 x00
# UTF-32-BE x00 x00 x00 @
# CHARSET @ c h a ...
def chars(bytestring):
    """
    Return a string holding one character per item of ``bytestring``.

    Every item yielded by iterating ``bytestring`` is passed to ``chr()``,
    so the items must be integers (as produced by iterating a ``bytes``
    object on Python 3).
    """
    characters = [chr(value) for value in bytestring]
    return ''.join(characters)
def detectencoding_str(input, final=False):
    """
    Detect the encoding of the byte string ``input``, which contains the
    beginning of a CSS file.

    Up to four leading bytes are compared with the byte order marks of
    UTF-8/UTF-16/UTF-32 and with the byte patterns an initial ``@`` has in
    the BOM-less UTF-16/UTF-32 variants. If ``input`` is ASCII compatible
    and starts with a charset rule, the encoding name is taken from the rule.

    Returns a tuple ``(encoding, explicit)``:

    ``encoding``
        The detected encoding name, or ``None`` if ``input`` doesn't contain
        enough data to decide yet. ``"utf-8"`` is the fallback when nothing
        matches.
    ``explicit``
        True if the encoding was detected explicitly, i.e. the bytes start
        with a BOM or a charset rule; false if it was detected implicitly.

    ``final`` specifies whether more data will be available in later calls or
    not. If ``final`` is true, ``detectencoding_str()`` will never return
    ``None`` as the encoding.
    """
    # A bit for every candidate
    CANDIDATE_UTF_8_SIG = 1
    CANDIDATE_UTF_16_AS_LE = 2
    CANDIDATE_UTF_16_AS_BE = 4
    CANDIDATE_UTF_16_LE = 8
    CANDIDATE_UTF_16_BE = 16
    CANDIDATE_UTF_32_AS_LE = 32
    CANDIDATE_UTF_32_AS_BE = 64
    CANDIDATE_UTF_32_LE = 128
    CANDIDATE_UTF_32_BE = 256
    CANDIDATE_CHARSET = 512

    candidates = 1023  # all candidates

    # ``b"..."[0]`` has the same type as ``input[0]`` on Python 2 and 3,
    # so the comparisons below work on both.
    li = len(input)
    if li >= 1:
        # Check first byte
        c = input[0]
        if c != b"\xef"[0]:
            candidates &= ~CANDIDATE_UTF_8_SIG
        if c != b"\xff"[0]:
            candidates &= ~(CANDIDATE_UTF_32_AS_LE | CANDIDATE_UTF_16_AS_LE)
        if c != b"\xfe"[0]:
            candidates &= ~CANDIDATE_UTF_16_AS_BE
        if c != b"@"[0]:
            candidates &= ~(CANDIDATE_UTF_32_LE | CANDIDATE_UTF_16_LE | CANDIDATE_CHARSET)
        if c != b"\x00"[0]:
            candidates &= ~(CANDIDATE_UTF_32_AS_BE | CANDIDATE_UTF_32_BE | CANDIDATE_UTF_16_BE)
        if li >= 2:
            # Check second byte
            c = input[1]
            if c != b"\xbb"[0]:
                candidates &= ~CANDIDATE_UTF_8_SIG
            if c != b"\xfe"[0]:
                candidates &= ~(CANDIDATE_UTF_16_AS_LE | CANDIDATE_UTF_32_AS_LE)
            if c != b"\xff"[0]:
                candidates &= ~CANDIDATE_UTF_16_AS_BE
            if c != b"\x00"[0]:
                candidates &= ~(CANDIDATE_UTF_16_LE | CANDIDATE_UTF_32_AS_BE | CANDIDATE_UTF_32_LE | CANDIDATE_UTF_32_BE)
            if c != b"@"[0]:
                candidates &= ~CANDIDATE_UTF_16_BE
            if c != b"c"[0]:
                candidates &= ~CANDIDATE_CHARSET
            if li >= 3:
                # Check third byte
                c = input[2]
                if c != b"\xbf"[0]:
                    candidates &= ~CANDIDATE_UTF_8_SIG
                if c != b"c"[0]:
                    candidates &= ~CANDIDATE_UTF_16_LE
                if c != b"\x00"[0]:
                    candidates &= ~(CANDIDATE_UTF_32_AS_LE | CANDIDATE_UTF_32_LE | CANDIDATE_UTF_32_BE)
                if c != b"\xfe"[0]:
                    candidates &= ~CANDIDATE_UTF_32_AS_BE
                if c != b"h"[0]:
                    candidates &= ~CANDIDATE_CHARSET
                if li >= 4:
                    # Check fourth byte
                    c = input[3]
                    # xFF xFE x00 x00 is the UTF-32 BOM, not a UTF-16 one
                    if input[2:4] == b"\x00\x00"[0:2]:
                        candidates &= ~CANDIDATE_UTF_16_AS_LE
                    if c != b"\x00"[0]:
                        candidates &= ~(CANDIDATE_UTF_16_LE | CANDIDATE_UTF_32_AS_LE | CANDIDATE_UTF_32_LE)
                    if c != b"\xff"[0]:
                        candidates &= ~CANDIDATE_UTF_32_AS_BE
                    if c != b"@"[0]:
                        candidates &= ~CANDIDATE_UTF_32_BE
                    if c != b"a"[0]:
                        candidates &= ~CANDIDATE_CHARSET

    if candidates == 0:
        return ("utf-8", False)

    if not (candidates & (candidates - 1)):  # only one candidate remaining
        if candidates == CANDIDATE_CHARSET:
            if li >= 4:
                prefix = '@charset "'
                # One character per byte value, so the rule can be examined
                # with ``str`` methods and the name is returned as ``str``.
                charsinput = ''.join(chr(byte) for byte in input)
                head = charsinput[:len(prefix)]
                if head == prefix:
                    pos = charsinput.find('"', len(prefix))
                    if pos >= 0:
                        return (charsinput[len(prefix):pos], True)
                    # end of the encoding name not seen yet => fall through
                elif not prefix.startswith(head):
                    # Starts with "@cha" but can't turn out to be a charset
                    # rule any more, so waiting for more data is pointless.
                    return ("utf-8", False)
        else:
            # candidate -> (bytes required, encoding name, explicit flag)
            decided = {
                CANDIDATE_UTF_8_SIG: (3, "utf-8-sig", True),
                CANDIDATE_UTF_16_AS_LE: (2, "utf-16", True),
                CANDIDATE_UTF_16_AS_BE: (2, "utf-16", True),
                CANDIDATE_UTF_16_LE: (4, "utf-16-le", False),
                CANDIDATE_UTF_16_BE: (2, "utf-16-be", False),
                CANDIDATE_UTF_32_AS_LE: (4, "utf-32", True),
                CANDIDATE_UTF_32_AS_BE: (4, "utf-32", True),
                CANDIDATE_UTF_32_LE: (4, "utf-32-le", False),
                CANDIDATE_UTF_32_BE: (4, "utf-32-be", False),
            }
            (needed, encoding, explicit) = decided[candidates]
            if li >= needed:
                return (encoding, explicit)

    # if this is the last call, and we haven't determined an encoding yet,
    # we default to UTF-8
    if final:
        return ("utf-8", False)
    return (None, False)  # don't know yet
def detectencoding_unicode(input, final=False):
    """
    Detect the encoding of the unicode string ``input``, which contains the
    beginning of a CSS file. The encoding is detected from the charset rule
    at the beginning of ``input``. If there is no charset rule, ``"utf-8"``
    will be returned.

    Returns a tuple ``(encoding, explicit)``; ``explicit`` is true only if
    the name was taken from a charset rule.

    If the encoding can't be detected yet, ``None`` is returned as the
    encoding. ``final`` specifies whether more data will be available in
    later calls or not. If ``final`` is true, ``detectencoding_unicode()``
    will never return ``None`` as the encoding.
    """
    prefix = '@charset "'
    if input.startswith(prefix):
        pos = input.find('"', len(prefix))
        if pos >= 0:
            return (input[len(prefix):pos], True)
        # The closing quote hasn't been seen yet; whether to keep waiting
        # is decided by ``final`` below.
    elif not prefix.startswith(input):
        # the string definitely doesn't start with prefix => default to UTF-8
        return ("utf-8", False)
    if final:
        # this is the last call and we haven't determined an encoding
        # (this includes an unterminated charset rule) => default to UTF-8
        return ("utf-8", False)
    return (None, False)  # don't know yet
def _fixencoding(input, encoding, final=False):
"""
Replace the name of the encoding in the charset rule at the beginning of
``input`` with ``encoding``. If ``input`` doesn't starts with a charset
rule, ``input`` will be returned unmodified.
If the encoding can't be found yet, ``None`` is returned. ``final``
specifies whether more data will be available in later calls or not.
If ``final`` is true, ``_fixencoding()`` will never return ``None``.
"""
prefix = '@charset "'
if len(input) > len(prefix):
if input.startswith(prefix):
pos = input.find('"', len(prefix))
if pos >= 0:
if encoding.replace("_", "-").lower() == "utf-8-sig":
encoding = "utf-8"
return prefix + encoding + input[pos:]
# we haven't seen the end of the encoding name yet => fall through
else:
return input # doesn't start with prefix, so nothing to fix
elif not prefix.startswith(input) or final:
# can't turn out to be a @charset rule later (or there is no "later")
return input
if final:
return input
return None # don't know yet
def decode(input, errors="strict", encoding=None, force=True):
    """
    Decode the complete CSS byte input ``input`` (stateless decoder of the
    ``css`` codec).

    If ``encoding`` is ``None`` the encoding is detected from ``input`` with
    ``detectencoding_str()``. If ``force`` is false, an explicitly detected
    encoding (BOM or charset rule) takes precedence over ``encoding``.

    Returns ``(output, consumed)`` where ``output`` is the decoded text with
    the charset rule (if any) rewritten to name the encoding used, and
    ``consumed`` is the value reported by the underlying decoder.

    Raises ``ValueError`` if the detected encoding name is ``css``.
    """
    try:
        # objects offering ``tobytes()`` (e.g. memory views) are converted
        # to real bytes first
        raw = input.tobytes()
    except AttributeError:
        raw = input

    must_detect = encoding is None or not force
    if must_detect:
        (detected, explicit) = detectencoding_str(raw, True)
        if detected == "css":
            raise ValueError("css not allowed as encoding name")
        if encoding is None or (explicit and not force):
            # Take the encoding from the input
            encoding = detected

    decoder = codecs.getdecoder(encoding)
    (text, consumed) = decoder(raw, errors)
    return (_fixencoding(text, str(encoding), True), consumed)
def encode(input, errors="strict", encoding=None):
    """
    Encode the complete CSS text ``input`` (stateless encoder of the ``css``
    codec).

    If ``encoding`` is given, the charset rule in ``input`` (if any) is
    rewritten to name it. Otherwise the encoding is taken from the charset
    rule via ``detectencoding_unicode()``; a rule naming ``utf-8-sig`` is
    rewritten to ``utf-8``.

    Returns ``(output, consumed)`` where ``consumed`` is the length of the
    original ``input``.

    Raises ``ValueError`` if the encoding name is ``css``.
    """
    length = len(input)
    if encoding is not None:
        input = _fixencoding(input, str(encoding), True)
    else:
        encoding = detectencoding_unicode(input, True)[0]
        normalized = encoding.replace("_", "-").lower()
        if normalized == "utf-8-sig":
            input = _fixencoding(input, "utf-8", True)
    if encoding == "css":
        raise ValueError("css not allowed as encoding name")
    data = codecs.getencoder(encoding)(input, errors)[0]
    return (data, length)
def _bytes2int(bytes):
# Helper: convert an 8 bit string into an ``int``.
i = 0
for byte in bytes:
i = (i<<8) + ord(byte)
return i
def _int2bytes(i):
# Helper: convert an ``int`` into an 8-bit string.
v = []
while i:
v.insert(0, chr(i&0xff))
i >>= 8
return "".join(v)
if hasattr(codecs, "IncrementalDecoder"):
    class IncrementalDecoder(codecs.IncrementalDecoder):
        """
        Incremental decoder of the ``css`` codec.

        Bytes are buffered until ``detectencoding_str()`` can name the
        encoding (unless ``encoding`` is given and ``force`` is true); then
        decoding is delegated to the incremental decoder of that encoding.
        Decoded text is held back until ``_fixencoding()`` has been able to
        rewrite (or rule out) the charset rule at the start of the output.

        ``buffer`` holds undecoded bytes while no decoder exists and decoded
        text afterwards, until the header has been fixed.
        """

        def __init__(self, errors="strict", encoding=None, force=True):
            # ``decoder`` has to exist before the base class constructor
            # runs, because that stores ``errors`` through the ``errors``
            # property below, which looks at ``self.decoder``.
            self.decoder = None
            self.encoding = encoding
            self.force = force
            codecs.IncrementalDecoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            self.buffer = b""
            self.headerfixed = False

        def iterdecode(self, input):
            """Decode the byte chunks of the iterable ``input``, yielding the non-empty results."""
            for part in input:
                result = self.decode(part, False)
                if result:
                    yield result
            # Flush with an empty *byte* string: while no decoder exists it
            # is concatenated to the bytes in ``self.buffer``.
            result = self.decode(b"", True)
            if result:
                yield result

        def decode(self, input, final=False):
            """
            Decode the bytes ``input`` and return the decoded text, or ``""``
            while the encoding or the charset rule is still undecided.
            Raises ``ValueError`` if the detected encoding name is ``css``.
            """
            # We're doing basically the same as a ``BufferedIncrementalDecoder``,
            # but since the buffer is only relevant until the encoding has been
            # detected (in which case the buffer of the underlying codec might
            # kick in), we're implementing buffering ourselves to avoid some
            # overhead.
            if self.decoder is None:
                input = self.buffer + input
                # Do we have to detect the encoding from the input?
                if self.encoding is None or not self.force:
                    (encoding, explicit) = detectencoding_str(input, final)
                    if encoding is None:  # no encoding determined yet
                        self.buffer = input  # retry the complete input on the next call
                        return ""  # no encoding determined yet, so no output
                    elif encoding == "css":
                        raise ValueError("css not allowed as encoding name")
                    if (explicit and not self.force) or self.encoding is None:  # Take the encoding from the input
                        self.encoding = encoding
                self.buffer = ""  # drop buffer, as the decoder might keep its own
                decoder = codecs.getincrementaldecoder(self.encoding)
                self.decoder = decoder(self._errors)
            if self.headerfixed:
                return self.decoder.decode(input, final)
            # If we haven't fixed the header yet,
            # the content of ``self.buffer`` is decoded text
            output = self.buffer + self.decoder.decode(input, final)
            encoding = self.encoding
            if encoding.replace("_", "-").lower() == "utf-8-sig":
                encoding = "utf-8"
            newoutput = _fixencoding(output, str(encoding), final)
            if newoutput is None:
                # retry fixing the @charset rule (but keep the decoded stuff)
                self.buffer = output
                return ""
            self.headerfixed = True
            return newoutput

        def reset(self):
            """Forget the detected decoder and all buffered data."""
            codecs.IncrementalDecoder.reset(self)
            self.decoder = None
            self.buffer = b""
            self.headerfixed = False

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real decoder too
            if self.decoder is not None:
                self.decoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return ``(b"", state)``: the complete state (encoding, buffer,
            header flag and the state of the real decoder, if any) is
            marshalled and packed into the integer ``state``, so no buffered
            input has to be reported separately.
            """
            if self.decoder is not None:
                state = (self.encoding, self.buffer, self.headerfixed, True, self.decoder.getstate())
            else:
                state = (self.encoding, self.buffer, self.headerfixed, False, None)
            return (b"", _bytes2int(marshal.dumps(state)))

        def setstate(self, state):
            """Restore a state produced by ``getstate()``."""
            # Undo ``getstate()`` in reverse order: int -> bytes -> tuple.
            # state[0] (buffered input) is ignored, everything is in state[1].
            state = marshal.loads(_int2bytes(state[1]))
            self.encoding = state[0]
            self.buffer = state[1]
            self.headerfixed = state[2]
            # state[3] is the boolean "a real decoder existed" flag
            if state[3]:
                self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
                self.decoder.setstate(state[4])
            else:
                self.decoder = None
if hasattr(codecs, "IncrementalEncoder"):
    class IncrementalEncoder(codecs.IncrementalEncoder):
        """
        Incremental encoder of the ``css`` codec.

        Text is buffered until the target encoding is known: either the
        ``encoding`` passed to the constructor (the charset rule in the text
        is then rewritten to name it) or the encoding named by the charset
        rule itself (``"utf-8"`` if there is none). Afterwards encoding is
        delegated to the incremental encoder of that encoding.
        """

        def __init__(self, errors="strict", encoding=None):
            # ``encoder`` has to exist before the base class constructor
            # runs, because that stores ``errors`` through the ``errors``
            # property below, which looks at ``self.encoder``.
            self.encoder = None
            self.encoding = encoding
            codecs.IncrementalEncoder.__init__(self, errors)
            # Store ``errors`` somewhere else,
            # because we have to hide it in a property
            self._errors = errors
            self.buffer = ""

        def iterencode(self, input):
            """Encode the text chunks of the iterable ``input``, yielding the non-empty results."""
            for part in input:
                result = self.encode(part, False)
                if result:
                    yield result
            result = self.encode("", True)
            if result:
                yield result

        def encode(self, input, final=False):
            """
            Encode the text ``input`` and return the encoded bytes, or an
            empty byte string while the encoding is still undecided.
            Raises ``ValueError`` if the encoding name is ``css``.
            """
            if self.encoder is None:
                input = self.buffer + input
                if self.encoding is not None:
                    # Replace encoding in the @charset rule with the specified one
                    encoding = self.encoding
                    if encoding.replace("_", "-").lower() == "utf-8-sig":
                        encoding = "utf-8"
                    newinput = _fixencoding(input, str(encoding), final)
                    if newinput is None:  # @charset rule incomplete => Retry next time
                        self.buffer = input
                        return b""  # encoder output is bytes
                    input = newinput
                else:
                    # Use encoding from the @charset declaration
                    self.encoding = detectencoding_unicode(input, final)[0]
                if self.encoding is not None:
                    if self.encoding == "css":
                        raise ValueError("css not allowed as encoding name")
                    info = codecs.lookup(self.encoding)
                    if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                        input = _fixencoding(input, "utf-8", True)
                    self.encoder = info.incrementalencoder(self._errors)
                    self.buffer = ""
                else:
                    self.buffer = input
                    return b""  # encoder output is bytes
            return self.encoder.encode(input, final)

        def reset(self):
            """Forget the selected encoder and all buffered text."""
            codecs.IncrementalEncoder.reset(self)
            self.encoder = None
            self.buffer = ""

        def _geterrors(self):
            return self._errors

        def _seterrors(self, errors):
            # Setting ``errors`` must be done on the real encoder too
            if self.encoder is not None:
                self.encoder.errors = errors
            self._errors = errors
        errors = property(_geterrors, _seterrors)

        def getstate(self):
            """
            Return the complete state (encoding, buffer and the state of the
            real encoder, if any), marshalled and packed into one integer.
            """
            if self.encoder is not None:
                state = (self.encoding, self.buffer, True, self.encoder.getstate())
            else:
                state = (self.encoding, self.buffer, False, None)
            return _bytes2int(marshal.dumps(state))

        def setstate(self, state):
            """Restore a state produced by ``getstate()``."""
            # Undo ``getstate()`` in reverse order: int -> bytes -> tuple
            state = marshal.loads(_int2bytes(state))
            self.encoding = state[0]
            self.buffer = state[1]
            # state[2] is the boolean "a real encoder existed" flag and
            # state[3] the state of that encoder
            if state[2]:
                self.encoder = codecs.getincrementalencoder(self.encoding)(self._errors)
                self.encoder.setstate(state[3])
            else:
                self.encoder = None
class StreamWriter(codecs.StreamWriter):
    """
    Stream writer of the ``css`` codec.

    Text is buffered until the target encoding is known: either the
    ``encoding`` passed to the constructor (the charset rule in the text is
    then rewritten to name it) or the encoding named by the charset rule
    itself. Afterwards encoding is delegated to the stream writer of that
    encoding. ``header`` is accepted but not used by this class.
    """

    def __init__(self, stream, errors="strict", encoding=None, header=False):
        codecs.StreamWriter.__init__(self, stream, errors)
        self.streamwriter = None
        self.encoding = encoding
        self._errors = errors
        self.buffer = ""

    def encode(self, input, errors='strict'):
        """
        Encode the text ``input``; returns ``(output, consumed)``.

        While the encoding is still undecided the text is kept in
        ``self.buffer`` and ``(b"", 0)`` is returned. Raises ``ValueError``
        if the encoding name is ``css``.
        """
        li = len(input)
        if self.streamwriter is None:
            input = self.buffer + input
            li = len(input)
            if self.encoding is not None:
                # Replace encoding in the @charset rule with the specified one
                encoding = self.encoding
                if encoding.replace("_", "-").lower() == "utf-8-sig":
                    encoding = "utf-8"
                newinput = _fixencoding(input, str(encoding), False)
                if newinput is None:  # @charset rule incomplete => Retry next time
                    self.buffer = input
                    # The output ends up in ``stream.write()``, so it has to
                    # be a byte string even when there is nothing to write.
                    return (b"", 0)
                input = newinput
            else:
                # Use encoding from the @charset declaration
                self.encoding = detectencoding_unicode(input, False)[0]
            if self.encoding is not None:
                if self.encoding == "css":
                    raise ValueError("css not allowed as encoding name")
                self.streamwriter = codecs.getwriter(self.encoding)(self.stream, self._errors)
                if self.encoding.replace("_", "-").lower() == "utf-8-sig":
                    input = _fixencoding(input, "utf-8", True)
                self.buffer = ""
            else:
                self.buffer = input
                return (b"", 0)  # see above: output must be bytes
        return (self.streamwriter.encode(input, errors)[0], li)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # Setting ``errors`` must be done on the streamwriter too
        try:
            if self.streamwriter is not None:
                self.streamwriter.errors = errors
        except AttributeError:
            # ``self.streamwriter`` doesn't exist yet when ``errors`` is
            # assigned before ``__init__()`` has created it (the base class
            # constructor is called first).
            pass
        self._errors = errors
    errors = property(_geterrors, _seterrors)
class StreamReader(codecs.StreamReader):
    """
    Stream reader of the ``css`` codec.

    The encoding is detected from the input with ``detectencoding_str()``
    (unless ``encoding`` is given and ``force`` is true); decoding is then
    delegated to the stream reader of that encoding. The charset rule at the
    start of the decoded text is rewritten to name the encoding used.
    """

    def __init__(self, stream, errors="strict", encoding=None, force=True):
        codecs.StreamReader.__init__(self, stream, errors)
        self.streamreader = None
        self.encoding = encoding
        self.force = force
        self._errors = errors

    def decode(self, input, errors='strict'):
        """
        Decode the bytes ``input``; returns ``(output, consumed)``.

        ``("", 0)`` is returned while the encoding or the charset rule is
        still undecided. Raises ``ValueError`` if the detected encoding name
        is ``css``.
        """
        if self.streamreader is not None:
            # Encoding and header are settled: plain delegation
            return self.streamreader.decode(input, errors)

        if self.encoding is None or not self.force:
            (detected, explicit) = detectencoding_str(input, False)
            if detected is None:
                # no encoding determined yet, so no output
                return ("", 0)
            if detected == "css":
                raise ValueError("css not allowed as encoding name")
            if self.encoding is None or (explicit and not self.force):
                # Take the encoding from the input
                self.encoding = detected

        reader_class = codecs.getreader(self.encoding)
        candidate = reader_class(self.stream, self._errors)
        (text, consumed) = candidate.decode(input, errors)

        name = self.encoding
        if name.replace("_", "-").lower() == "utf-8-sig":
            name = "utf-8"
        fixed = _fixencoding(text, str(name), False)
        if fixed is None:
            # we will create a new streamreader on the next call
            return ("", 0)
        self.streamreader = candidate
        return (fixed, consumed)

    def _geterrors(self):
        return self._errors

    def _seterrors(self, errors):
        # The real streamreader (if there is one) has to get ``errors`` too
        try:
            if self.streamreader is not None:
                self.streamreader.errors = errors
        except AttributeError:
            # TODO: py3 only exception?
            pass
        self._errors = errors
    errors = property(_geterrors, _seterrors)
if hasattr(codecs, "CodecInfo"):
    # ``codecs.CodecInfo`` exists: we're running on Python 2.5 or better
    def search_function(name):
        """Codec search function: return the ``css`` codec for the name ``css``, else ``None``."""
        if name != "css":
            return None
        return codecs.CodecInfo(
            name="css",
            encode=encode,
            decode=decode,
            incrementalencoder=IncrementalEncoder,
            incrementaldecoder=IncrementalDecoder,
            streamwriter=StreamWriter,
            streamreader=StreamReader,
        )
else:
    # If we're running on Python 2.4, define the utf-8-sig codec here
    def utf8sig_encode(input, errors='strict'):
        """Encode ``input`` as UTF-8 and put the UTF-8 BOM in front."""
        payload = codecs.utf_8_encode(input, errors)[0]
        return (codecs.BOM_UTF8 + payload, len(input))

    def utf8sig_decode(input, errors='strict'):
        """Decode UTF-8 ``input``, skipping a leading BOM (counted as consumed)."""
        skipped = 0
        if input[:3] == codecs.BOM_UTF8:
            skipped = 3
            input = input[3:]
        decoded = codecs.utf_8_decode(input, errors, True)
        return (decoded[0], decoded[1] + skipped)

    class UTF8SigStreamWriter(codecs.StreamWriter):
        """Stream writer that emits the BOM with the first ``encode()`` call only."""

        def reset(self):
            codecs.StreamWriter.reset(self)
            # Remove the instance level override (if any), so the next
            # ``encode()`` call writes a BOM again.
            try:
                del self.encode
            except AttributeError:
                pass

        def encode(self, input, errors='strict'):
            # From now on plain UTF-8 encoding is used on this instance
            self.encode = codecs.utf_8_encode
            return utf8sig_encode(input, errors)

    class UTF8SigStreamReader(codecs.StreamReader):
        """Stream reader that strips the BOM in the first ``decode()`` call only."""

        def reset(self):
            codecs.StreamReader.reset(self)
            # Remove the instance level override (if any), so the next
            # ``decode()`` call looks for a BOM again.
            try:
                del self.decode
            except AttributeError:
                pass

        def decode(self, input, errors='strict'):
            undecided = len(input) < 3 and codecs.BOM_UTF8.startswith(input)
            if undecided:
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return ("", 0)
            # From now on plain UTF-8 decoding is used on this instance
            self.decode = codecs.utf_8_decode
            return utf8sig_decode(input, errors)

    def search_function(name):
        """Codec search function for the ``css`` and ``utf_8_sig`` codecs; ``None`` for other names."""
        import encodings
        normalized = encodings.normalize_encoding(name)
        if normalized == "css":
            return (encode, decode, StreamReader, StreamWriter)
        if normalized == "utf_8_sig":
            return (utf8sig_encode, utf8sig_decode, UTF8SigStreamReader, UTF8SigStreamWriter)
        return None
codecs.register(search_function)
# Error handler for CSS escaping
def cssescape(exc):
    """
    Codec error handler: replace every unencodable character with a CSS
    escape, i.e. a backslash followed by the code point as six lowercase
    hex digits.

    Returns ``(replacement, position)`` where ``position`` is ``exc.end``.
    Raises ``TypeError`` if ``exc`` isn't a ``UnicodeEncodeError``.
    """
    if isinstance(exc, UnicodeEncodeError):
        offending = exc.object[exc.start:exc.end]
        escapes = ["\\%06x" % ord(char) for char in offending]
        return ("".join(escapes), exc.end)
    raise TypeError("don't know how to handle %r" % exc)
codecs.register_error("cssescape", cssescape)