Update Chardet

10 years ago · e1bb8c5419
7 changed files with 67 additions and 28 deletions
--- a/libs/chardet/__init__.py
+++ b/libs/chardet/__init__.py
@@ -15,7 +15,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
-__version__ = "2.2.1"
+__version__ = "2.3.0"
 from sys import version_info
--- a/libs/chardet/chardetect.py
+++ b/libs/chardet/chardetect.py
@@ -12,34 +12,68 @@ Example::
 If no paths are provided, it takes its input from stdin.
 """
 from __future__ import absolute_import, print_function, unicode_literals
 import argparse
 import sys
 from io import open
 from sys import argv, stdin
 from chardet import __version__
 from chardet.universaldetector import UniversalDetector
-def description_of(file, name='stdin'):
+def description_of(lines, name='stdin'):
-    """Return a string describing the probable encoding of a file."""
+    """
+    Return a string describing the probable encoding of a file or
+    list of strings.
+    :param lines: The lines to get the encoding of.
+    :type lines: Iterable of bytes
+    :param name: Name of file or collection of lines
+    :type name: str
+    """
    u = UniversalDetector()
-    for line in file:
+    for line in lines:
        u.feed(line)
    u.close()
    result = u.result
    if result['encoding']:
-        return '%s: %s with confidence %s' % (name,
+        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
-                                              result['encoding'],
+                                                     result['confidence'])
-                                              result['confidence'])
    else:
-        return '%s: no result' % name
+        return '{0}: no result'.format(name)
-def main():
+def main(argv=None):
-    if len(argv) <= 1:
+    '''
-        print(description_of(stdin))
+    Handles command line arguments and gets things started.
-    else:
+
-        for path in argv[1:]:
+    :param argv: List of arguments, as if specified on the command-line.
-            with open(path, 'rb') as f:
+                 If None, ``sys.argv[1:]`` is used instead.
-                print(description_of(f, path))
+    :type argv: list of str
+    '''
+    # Get command line arguments
+    parser = argparse.ArgumentParser(
+        description="Takes one or more file paths and reports their detected \
+                     encodings",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        conflict_handler='resolve')
+    parser.add_argument('input',
+                        help='File whose encoding we would like to determine.',
+                        type=argparse.FileType('rb'), nargs='*',
+                        default=[sys.stdin])
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s {0}'.format(__version__))
+    args = parser.parse_args(argv)
+    for f in args.input:
+        if f.isatty():
+            print("You are running chardetect interactively. Press " +
+                  "CTRL-D twice at the start of a blank line to signal the " +
+                  "end of your input. If you want help, run chardetect " +
+                  "--help\n", file=sys.stderr)
+        print(description_of(f, f.name))
 if __name__ == '__main__':
--- a/libs/chardet/jpcntx.py
+++ b/libs/chardet/jpcntx.py
@@ -177,6 +177,12 @@ class JapaneseContextAnalysis:
        return -1, 1
 class SJISContextAnalysis(JapaneseContextAnalysis):
+    def __init__(self):
+        self.charset_name = "SHIFT_JIS"
+    def get_charset_name(self):
+        return self.charset_name
    def get_order(self, aBuf):
        if not aBuf:
            return -1, 1
@@ -184,6 +190,8 @@ class SJISContextAnalysis(JapaneseContextAnalysis):
        first_char = wrap_ord(aBuf[0])
        if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
            charLen = 2
+            if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
+                self.charset_name = "CP932"
        else:
            charLen = 1
--- a/libs/chardet/latin1prober.py
+++ b/libs/chardet/latin1prober.py
@@ -129,11 +129,11 @@ class Latin1Prober(CharSetProber):
        if total < 0.01:
            confidence = 0.0
        else:
-            confidence = ((self._mFreqCounter[3] / total)
+            confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
-                          - (self._mFreqCounter[1] * 20.0 / total))
+                          / total)
        if confidence < 0.0:
            confidence = 0.0
        # lower the confidence of latin1 so that other more accurate
        # detector can take priority.
-        confidence = confidence * 0.5
+        confidence = confidence * 0.73
        return confidence
--- a/libs/chardet/mbcssm.py
+++ b/libs/chardet/mbcssm.py
@@ -353,7 +353,7 @@ SJIS_cls = (
    2,2,2,2,2,2,2,2,  # 68 - 6f
    2,2,2,2,2,2,2,2,  # 70 - 77
    2,2,2,2,2,2,2,1,  # 78 - 7f
-    3,3,3,3,3,3,3,3,  # 80 - 87
+    3,3,3,3,3,2,2,3,  # 80 - 87
    3,3,3,3,3,3,3,3,  # 88 - 8f
    3,3,3,3,3,3,3,3,  # 90 - 97
    3,3,3,3,3,3,3,3,  # 98 - 9f
@@ -369,9 +369,8 @@ SJIS_cls = (
    2,2,2,2,2,2,2,2,  # d8 - df
    3,3,3,3,3,3,3,3,  # e0 - e7
    3,3,3,3,3,4,4,4,  # e8 - ef
-    4,4,4,4,4,4,4,4,  # f0 - f7
+    3,3,3,3,3,3,3,3,  # f0 - f7
-    4,4,4,4,4,0,0,0   # f8 - ff
+    3,3,3,3,3,0,0,0)  # f8 - ff
-)
 SJIS_st = (
@@ -571,5 +570,3 @@ UTF8SMModel = {'classTable': UTF8_cls,
               'stateTable': UTF8_st,
               'charLenTable': UTF8CharLenTable,
               'name': 'UTF-8'}
-# flake8: noqa
--- a/libs/chardet/sjisprober.py
+++ b/libs/chardet/sjisprober.py
@@ -47,7 +47,7 @@ class SJISProber(MultiByteCharSetProber):
        self._mContextAnalyzer.reset()
    def get_charset_name(self):
-        return "SHIFT_JIS"
+        return self._mContextAnalyzer.get_charset_name()
    def feed(self, aBuf):
        aLen = len(aBuf)
--- a/libs/chardet/universaldetector.py
+++ b/libs/chardet/universaldetector.py
@@ -71,9 +71,9 @@ class UniversalDetector:
        if not self._mGotData:
            # If the data starts with BOM, we know it is UTF
-            if aBuf[:3] == codecs.BOM:
+            if aBuf[:3] == codecs.BOM_UTF8:
                # EF BB BF  UTF-8 with BOM
-                self.result = {'encoding': "UTF-8", 'confidence': 1.0}
+                self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
            elif aBuf[:4] == codecs.BOM_UTF32_LE:
                # FF FE 00 00  UTF-32, little-endian BOM
                self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}