38 changed files with 1491 additions and 1159 deletions
@ -0,0 +1,46 @@ |
|||
#!/usr/bin/env python |
|||
""" |
|||
Script which takes one or more file paths and reports on their detected |
|||
encodings |
|||
|
|||
Example:: |
|||
|
|||
% chardetect somefile someotherfile |
|||
somefile: windows-1252 with confidence 0.5 |
|||
someotherfile: ascii with confidence 1.0 |
|||
|
|||
If no paths are provided, it takes its input from stdin. |
|||
|
|||
""" |
|||
from io import open |
|||
from sys import argv, stdin |
|||
|
|||
from chardet.universaldetector import UniversalDetector |
|||
|
|||
|
|||
def description_of(file, name='stdin'):
    """Build a one-line report of the encoding detected for ``file``.

    Each item obtained by iterating over ``file`` is fed to a fresh
    ``UniversalDetector``. ``name`` is only used as the label at the
    start of the report.

    Returns ``'<name>: <encoding> with confidence <confidence>'`` when
    the detector reports an encoding, and ``'<name>: no result'``
    otherwise.
    """
    detector = UniversalDetector()
    for chunk in file:
        detector.feed(chunk)
    detector.close()

    guess = detector.result
    encoding = guess['encoding']
    if not encoding:
        return '%s: no result' % name

    report_fields = (name, encoding, guess['confidence'])
    return '%s: %s with confidence %s' % report_fields
|||
|
|||
|
|||
def main():
    """Print an encoding report for every path in ``argv``, or for stdin.

    With no command-line arguments the data is read from standard input
    and labelled with ``description_of``'s default name. Otherwise each
    argument is treated as a file path, opened in binary mode, and
    reported under its own path.
    """
    if len(argv) <= 1:
        # Files below are opened with 'rb', so hand stdin over as bytes too.
        # On Python 3 ``sys.stdin`` is a text stream whose raw bytes live on
        # ``.buffer``; Python 2's stdin has no such attribute and already
        # yields byte strings, hence the fallback to ``stdin`` itself.
        print(description_of(getattr(stdin, 'buffer', stdin)))
    else:
        for path in argv[1:]:
            with open(path, 'rb') as f:
                print(description_of(f, path))


if __name__ == '__main__':
    main()
@ -0,0 +1,34 @@ |
|||
######################## BEGIN LICENSE BLOCK ######################## |
|||
# Contributor(s): |
|||
# Ian Cordasco - port to Python |
|||
# |
|||
# This library is free software; you can redistribute it and/or |
|||
# modify it under the terms of the GNU Lesser General Public |
|||
# License as published by the Free Software Foundation; either |
|||
# version 2.1 of the License, or (at your option) any later version. |
|||
# |
|||
# This library is distributed in the hope that it will be useful, |
|||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
# Lesser General Public License for more details. |
|||
# |
|||
# You should have received a copy of the GNU Lesser General Public |
|||
# License along with this library; if not, write to the Free Software |
|||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
|||
# 02110-1301 USA |
|||
######################### END LICENSE BLOCK ######################### |
|||
|
|||
import sys |
|||
|
|||
|
|||
# String-like types of the running interpreter, usable with isinstance():
# (str, unicode) on Python 2 and (bytes, str) on Python 3. The conditional
# expression only evaluates the selected branch, so the Python 2-only name
# ``unicode`` is never looked up on Python 3.
base_str = (str, unicode) if sys.version_info < (3, 0) else (bytes, str)
|||
|
|||
|
|||
def wrap_ord(a):
    """Return ``ord(a)`` for string input on Python 2, else ``a`` itself.

    On Python 3, and for any value that is not an instance of
    ``base_str``, the argument is returned unchanged.
    """
    running_py2 = sys.version_info < (3, 0)
    # The version check comes first so ``base_str`` is only consulted on
    # Python 2.
    if running_py2 and isinstance(a, base_str):
        return ord(a)
    return a
@ -0,0 +1,44 @@ |
|||
######################## BEGIN LICENSE BLOCK ######################## |
|||
# The Original Code is mozilla.org code. |
|||
# |
|||
# The Initial Developer of the Original Code is |
|||
# Netscape Communications Corporation. |
|||
# Portions created by the Initial Developer are Copyright (C) 1998 |
|||
# the Initial Developer. All Rights Reserved. |
|||
# |
|||
# Contributor(s): |
|||
# Mark Pilgrim - port to Python |
|||
# |
|||
# This library is free software; you can redistribute it and/or |
|||
# modify it under the terms of the GNU Lesser General Public |
|||
# License as published by the Free Software Foundation; either |
|||
# version 2.1 of the License, or (at your option) any later version. |
|||
# |
|||
# This library is distributed in the hope that it will be useful, |
|||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
# Lesser General Public License for more details. |
|||
# |
|||
# You should have received a copy of the GNU Lesser General Public |
|||
# License along with this library; if not, write to the Free Software |
|||
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
|||
# 02110-1301 USA |
|||
######################### END LICENSE BLOCK ######################### |
|||
|
|||
from .mbcharsetprober import MultiByteCharSetProber |
|||
from .codingstatemachine import CodingStateMachine |
|||
from .chardistribution import EUCKRDistributionAnalysis |
|||
from .mbcssm import CP949SMModel |
|||
|
|||
|
|||
class CP949Prober(MultiByteCharSetProber):
    """Multi-byte charset prober reporting the ``CP949`` charset."""

    def __init__(self):
        """Set up the CP949 state machine and distribution analyzer."""
        MultiByteCharSetProber.__init__(self)
        state_machine = CodingStateMachine(CP949SMModel)
        self._mCodingSM = state_machine
        # NOTE: CP949 is a superset of EUC-KR, so the character
        # distribution should be the same and the EUC-KR analysis is
        # reused here.
        distribution = EUCKRDistributionAnalysis()
        self._mDistributionAnalyzer = distribution
        self.reset()

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "CP949"
Loading…
Reference in new issue