Browse Source

Update Chardet

pull/4513/head
Ruud 10 years ago
parent
commit
e1bb8c5419
  1. 2
      libs/chardet/__init__.py
  2. 64
      libs/chardet/chardetect.py
  3. 8
      libs/chardet/jpcntx.py
  4. 6
      libs/chardet/latin1prober.py
  5. 9
      libs/chardet/mbcssm.py
  6. 2
      libs/chardet/sjisprober.py
  7. 4
      libs/chardet/universaldetector.py

2
libs/chardet/__init__.py

@ -15,7 +15,7 @@
# 02110-1301 USA # 02110-1301 USA
######################### END LICENSE BLOCK ######################### ######################### END LICENSE BLOCK #########################
__version__ = "2.2.1" __version__ = "2.3.0"
from sys import version_info from sys import version_info

64
libs/chardet/chardetect.py

@ -12,34 +12,68 @@ Example::
If no paths are provided, it takes its input from stdin. If no paths are provided, it takes its input from stdin.
""" """
from __future__ import absolute_import, print_function, unicode_literals
import argparse
import sys
from io import open from io import open
from sys import argv, stdin
from chardet import __version__
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
def description_of(file, name='stdin'): def description_of(lines, name='stdin'):
"""Return a string describing the probable encoding of a file.""" """
Return a string describing the probable encoding of a file or
list of strings.
:param lines: The lines to get the encoding of.
:type lines: Iterable of bytes
:param name: Name of file or collection of lines
:type name: str
"""
u = UniversalDetector() u = UniversalDetector()
for line in file: for line in lines:
u.feed(line) u.feed(line)
u.close() u.close()
result = u.result result = u.result
if result['encoding']: if result['encoding']:
return '%s: %s with confidence %s' % (name, return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
result['encoding'], result['confidence'])
result['confidence'])
else: else:
return '%s: no result' % name return '{0}: no result'.format(name)
def main(): def main(argv=None):
if len(argv) <= 1: '''
print(description_of(stdin)) Handles command line arguments and gets things started.
else:
for path in argv[1:]: :param argv: List of arguments, as if specified on the command-line.
with open(path, 'rb') as f: If None, ``sys.argv[1:]`` is used instead.
print(description_of(f, path)) :type argv: list of str
'''
# Get command line arguments
parser = argparse.ArgumentParser(
description="Takes one or more file paths and reports their detected \
encodings",
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
conflict_handler='resolve')
parser.add_argument('input',
help='File whose encoding we would like to determine.',
type=argparse.FileType('rb'), nargs='*',
default=[sys.stdin])
parser.add_argument('--version', action='version',
version='%(prog)s {0}'.format(__version__))
args = parser.parse_args(argv)
for f in args.input:
if f.isatty():
print("You are running chardetect interactively. Press " +
"CTRL-D twice at the start of a blank line to signal the " +
"end of your input. If you want help, run chardetect " +
"--help\n", file=sys.stderr)
print(description_of(f, f.name))
if __name__ == '__main__': if __name__ == '__main__':

8
libs/chardet/jpcntx.py

@ -177,6 +177,12 @@ class JapaneseContextAnalysis:
return -1, 1 return -1, 1
class SJISContextAnalysis(JapaneseContextAnalysis): class SJISContextAnalysis(JapaneseContextAnalysis):
def __init__(self):
self.charset_name = "SHIFT_JIS"
def get_charset_name(self):
return self.charset_name
def get_order(self, aBuf): def get_order(self, aBuf):
if not aBuf: if not aBuf:
return -1, 1 return -1, 1
@ -184,6 +190,8 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
first_char = wrap_ord(aBuf[0]) first_char = wrap_ord(aBuf[0])
if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)): if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
charLen = 2 charLen = 2
if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
self.charset_name = "CP932"
else: else:
charLen = 1 charLen = 1

6
libs/chardet/latin1prober.py

@ -129,11 +129,11 @@ class Latin1Prober(CharSetProber):
if total < 0.01: if total < 0.01:
confidence = 0.0 confidence = 0.0
else: else:
confidence = ((self._mFreqCounter[3] / total) confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
- (self._mFreqCounter[1] * 20.0 / total)) / total)
if confidence < 0.0: if confidence < 0.0:
confidence = 0.0 confidence = 0.0
# lower the confidence of latin1 so that other more accurate # lower the confidence of latin1 so that other more accurate
# detector can take priority. # detector can take priority.
confidence = confidence * 0.5 confidence = confidence * 0.73
return confidence return confidence

9
libs/chardet/mbcssm.py

@ -353,7 +353,7 @@ SJIS_cls = (
2,2,2,2,2,2,2,2, # 68 - 6f 2,2,2,2,2,2,2,2, # 68 - 6f
2,2,2,2,2,2,2,2, # 70 - 77 2,2,2,2,2,2,2,2, # 70 - 77
2,2,2,2,2,2,2,1, # 78 - 7f 2,2,2,2,2,2,2,1, # 78 - 7f
3,3,3,3,3,3,3,3, # 80 - 87 3,3,3,3,3,2,2,3, # 80 - 87
3,3,3,3,3,3,3,3, # 88 - 8f 3,3,3,3,3,3,3,3, # 88 - 8f
3,3,3,3,3,3,3,3, # 90 - 97 3,3,3,3,3,3,3,3, # 90 - 97
3,3,3,3,3,3,3,3, # 98 - 9f 3,3,3,3,3,3,3,3, # 98 - 9f
@ -369,9 +369,8 @@ SJIS_cls = (
2,2,2,2,2,2,2,2, # d8 - df 2,2,2,2,2,2,2,2, # d8 - df
3,3,3,3,3,3,3,3, # e0 - e7 3,3,3,3,3,3,3,3, # e0 - e7
3,3,3,3,3,4,4,4, # e8 - ef 3,3,3,3,3,4,4,4, # e8 - ef
4,4,4,4,4,4,4,4, # f0 - f7 3,3,3,3,3,3,3,3, # f0 - f7
4,4,4,4,4,0,0,0 # f8 - ff 3,3,3,3,3,0,0,0) # f8 - ff
)
SJIS_st = ( SJIS_st = (
@ -571,5 +570,3 @@ UTF8SMModel = {'classTable': UTF8_cls,
'stateTable': UTF8_st, 'stateTable': UTF8_st,
'charLenTable': UTF8CharLenTable, 'charLenTable': UTF8CharLenTable,
'name': 'UTF-8'} 'name': 'UTF-8'}
# flake8: noqa

2
libs/chardet/sjisprober.py

@ -47,7 +47,7 @@ class SJISProber(MultiByteCharSetProber):
self._mContextAnalyzer.reset() self._mContextAnalyzer.reset()
def get_charset_name(self): def get_charset_name(self):
return "SHIFT_JIS" return self._mContextAnalyzer.get_charset_name()
def feed(self, aBuf): def feed(self, aBuf):
aLen = len(aBuf) aLen = len(aBuf)

4
libs/chardet/universaldetector.py

@ -71,9 +71,9 @@ class UniversalDetector:
if not self._mGotData: if not self._mGotData:
# If the data starts with BOM, we know it is UTF # If the data starts with BOM, we know it is UTF
if aBuf[:3] == codecs.BOM: if aBuf[:3] == codecs.BOM_UTF8:
# EF BB BF UTF-8 with BOM # EF BB BF UTF-8 with BOM
self.result = {'encoding': "UTF-8", 'confidence': 1.0} self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
elif aBuf[:4] == codecs.BOM_UTF32_LE: elif aBuf[:4] == codecs.BOM_UTF32_LE:
# FF FE 00 00 UTF-32, little-endian BOM # FF FE 00 00 UTF-32, little-endian BOM
self.result = {'encoding': "UTF-32LE", 'confidence': 1.0} self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}

Loading…
Cancel
Save