You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

96 lines
3.1 KiB

#!/usr/bin/env python
import binascii, codecs, encodings, locale, os, sys, zlib
import argparse
def listcodecs(dir):
    """Return the codec names found among the ``.py`` files in directory *dir*.

    Each file name ending in ``.py`` is a candidate; the name with that
    extension removed is kept unless ``codecs.lookup`` reports it as unknown.
    Names are returned in the order ``os.listdir`` yields them.
    """
    def is_codec(candidate):
        try:
            codecs.lookup(candidate)
        except LookupError:
            # No codec is registered under this name.
            return False
        except Exception:
            # The lookup failed for some other reason (probably an error
            # raised while importing the codec); the name still counts.
            return True
        return True

    suffix = '.py'
    stems = (entry[:-len(suffix)]
             for entry in os.listdir(dir)
             if entry.endswith(suffix))
    return [stem for stem in stems if is_codec(stem)]
def listem():
    """Return the codec names found in the first directory of ``encodings.__path__``."""
    encodings_dir = encodings.__path__[0]
    return listcodecs(encodings_dir)
def _canonical_encoding(encoding):
    """Normalise an encoding name and verify that it can be used to encode text.

    ``None`` is treated as 'utf-8'. The name is lower-cased, then "cp65001"
    is mapped to 'utf-8' and "us-ascii" / "646" are mapped to 'ascii'.

    Raises AssertionError if the resulting name cannot be used to encode a
    short test string.
    """
    aliases = {
        "cp65001": 'utf-8',
        "us-ascii": 'ascii',
        "646": 'ascii',
    }
    lowered = 'utf-8' if encoding is None else encoding.lower()
    canonical = aliases.get(lowered, lowered)

    # Python sometimes reports an encoding name that it cannot actually
    # convert with, so probe it now and fail early.
    try:
        u"test".encode(canonical)
    except (LookupError, AttributeError):
        raise AssertionError("The character encoding '%s' is not supported for conversion." % (canonical,))
    return canonical
def get_output_encoding():
    """Return the canonical name of the encoding reported for stdout.

    ``sys.stdout.encoding`` is used when it is set; otherwise the value of
    ``locale.getpreferredencoding()`` is used instead.
    """
    detected = sys.stdout.encoding
    if not detected:
        detected = locale.getpreferredencoding()
    return _canonical_encoding(detected)
def get_argv_encoding():
    """Return the encoding name to use for decoding command-line arguments.

    On win32 this is always 'ascii'; elsewhere it is the output encoding.
    """
    on_windows = sys.platform == 'win32'
    # Unicode arguments are not supported on Windows yet; see Tahoe-LAFS
    # tickets #565 and #1074.
    return 'ascii' if on_windows else get_output_encoding()
# Both encodings are resolved once, when the module is loaded.
# output_encoding is read by main() when encoding decoded text for stdout;
# argv_encoding is read by type_unicode() when decoding an argument.
output_encoding = get_output_encoding()
argv_encoding = get_argv_encoding()
def type_unicode(argstr):
    """argparse ``type=`` converter that yields a command-line argument as text.

    Byte strings (what Python 2 passes in) are decoded with the module-level
    ``argv_encoding``. A value that is already text (what Python 3 passes in,
    where ``str`` has no ``decode`` method) is returned unchanged.

    Raises UnicodeDecodeError if a byte string is not valid in
    ``argv_encoding``.
    """
    if isinstance(argstr, bytes):
        return argstr.decode(argv_encoding)
    return argstr
def main():
    """Try to decode the input bytes with every known codec and print the hits.

    Command line: ``try_decoding INF [-t T] [-a]``.

    For each codec name returned by ``listem()`` the whole input is decoded.
    Codecs that fail with one of the expected decoding errors are skipped
    silently. For a codec that returns text, one line is written to stdout:
    the codec name right-aligned in 19 columns, `` : ``, then the text encoded
    in ``output_encoding``. If ``--target`` is given, only codecs whose result
    equals the target are printed. Codecs that return something other than
    text are printed only with ``--accept-bytes``, marked with ``!!!``.

    Runs on both Python 2 and Python 3.
    """
    parser = argparse.ArgumentParser(prog="try_decoding", description="Try decoding some bytes with all sorts of different codecs and print out any that decode.")
    parser.add_argument('inputfile', help='file to decode or "-" for stdin', type=argparse.FileType('rb'), metavar='INF')
    parser.add_argument('-t', '--target', help='unicode string to match against (if any)', type=type_unicode, metavar='T')
    parser.add_argument('-a', '--accept-bytes', help='include codecs which return bytes instead of returning unicode (they will be marked with "!!!" in the output)', action='store_true')
    args = parser.parse_args()

    infile = args.inputfile
    try:
        inb = infile.read()
    finally:
        # argparse.FileType returns the process's own stdin for "-";
        # close only files that were opened for this run.
        if infile is not sys.stdin and infile is not getattr(sys.stdin, 'buffer', None):
            infile.close()

    # type(u"") is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u"")
    # Output lines are assembled as bytes. Python 3 exposes the byte stream
    # as sys.stdout.buffer; on Python 2 sys.stdout itself takes byte strings.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    # LookupError is included because listcodecs() keeps names whose lookup
    # failed with an unexpected error, and because Python 3 raises it for
    # codecs that cannot handle the given input type.
    decode_failures = (UnicodeDecodeError, IOError, TypeError, IndexError,
                       UnicodeError, ValueError, LookupError,
                       zlib.error, binascii.Error)

    for codec in listem():
        try:
            # codecs.decode() also accepts bytes-to-bytes codecs, which
            # bytes.decode() refuses on Python 3.
            u = codecs.decode(inb, codec)
        except decode_failures:
            continue

        if isinstance(u, text_type):
            if args.target and args.target != u:
                continue
            label = "%19s : " % codec
            # Characters the output encoding cannot represent are escaped
            # instead of aborting the whole listing with UnicodeEncodeError.
            payload = u.encode(output_encoding, 'backslashreplace')
        else:
            if not args.accept_bytes:
                continue
            label = "%19s !!!  : " % codec
            payload = bytes(u)

        if not isinstance(label, bytes):
            label = label.encode('ascii', 'backslashreplace')
        out.write(label + payload + b"\n")
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()