sabnzbd/sabnzbd/encoding.py


								#!/usr/bin/python -OO

								# Copyright 2007-2019 The SABnzbd-Team <team@sabnzbd.org>

								#

								# This program is free software; you can redistribute it and/or

								# modify it under the terms of the GNU General Public License

								# as published by the Free Software Foundation; either version 2

								# of the License, or (at your option) any later version.

								#

								# This program is distributed in the hope that it will be useful,

								# but WITHOUT ANY WARRANTY; without even the implied warranty of

								# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

								# GNU General Public License for more details.

								#

								# You should have received a copy of the GNU General Public License

								# along with this program; if not, write to the Free Software

								# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.


								"""

								sabnzbd.encoding - Unicoded filename support

								"""


								import locale

								import string

								from xml.sax.saxutils import escape

								from Cheetah.Filters import Filter


								import sabnzbd


								# Module-wide flag: True when byte strings should be treated as UTF-8.
								# It is (re)assigned by auto_fsys() and change_fsys() and consulted by
								# the conversion helpers in this module.
								# NOTE(review): several functions in this file reference a module-level
								# name `codepage` that is not assigned anywhere in the file as shown;
								# confirm where that name is expected to come from.
								gUTF = False


								def auto_fsys():
								    """ Detect whether UTF-8 should be assumed and store the result in gUTF.

								        On OSX (sabnzbd.DARWIN) the flag is always True. Elsewhere it is
								        True only when the encoding reported by locale.getdefaultlocale()
								        contains 'utf' (case-insensitive). Any failure while detecting
								        leaves the flag False.
								    """
								    global gUTF
								    try:
								        if sabnzbd.DARWIN:
								            gUTF = True
								        else:
								            encoding = locale.getdefaultlocale()[1]
								            # The encoding may be None when it cannot be determined
								            gUTF = bool(encoding) and 'utf' in encoding.lower()
								    except Exception:
								        # Incorrect locale implementation, assume the worst.
								        # Only real errors are caught, so KeyboardInterrupt and
								        # SystemExit are no longer swallowed here.
								        gUTF = False


								def change_fsys(value):
								    """ Override the gUTF flag on platforms other than Windows and OSX.

								        value 1 forces gUTF to False, value 2 forces it to True and any
								        other value re-runs the automatic detection of auto_fsys().
								        On Windows and OSX nothing is changed.
								    """
								    global gUTF
								    if sabnzbd.WIN32 or sabnzbd.DARWIN:
								        # The setting only applies to the remaining platforms
								        return

								    if value == 1:
								        gUTF = False
								    elif value == 2:
								        gUTF = True
								    else:
								        auto_fsys()


								def platform_encode(p):
								    """ Return the Unicode form of `p`.

								        Values that are not of type `str` are handed back untouched.
								        A `str` is decoded as UTF-8; when that fails it is decoded with
								        `codepage` using 'replace', after which every '?' becomes '!'.
								    """
								    if not isinstance(p, str):
								        return p

								    try:
								        decoded = p.decode('utf-8')
								    except:
								        # Not valid UTF-8: fall back to the configured code page
								        decoded = p.decode(codepage, errors='replace').replace('?', '!')
								    return decoded


								def yenc_name_fixer(p):
								    """ Decode an 8bit name to Unicode.

								        UTF-8 is tried first. When that fails the name is decoded as
								        cp1252 with 'replace', after which every '?' becomes '!'.
								    """
								    try:
								        fixed = p.decode('utf-8')
								    except:
								        # Not UTF-8, so treat the bytes as cp1252
								        as_cp1252 = p.decode('cp1252', errors='replace')
								        fixed = as_cp1252.replace('?', '!')
								    return fixed


								def special_fixer(p):
								    """ Turn an incoming header value into a Unicode string.

								        Backslash-quote pairs are first reduced to plain quotes. Empty
								        values and values that already are Unicode are handed back as
								        they are. Anything else is decoded as UTF-8 when it is valid
								        UTF-8 (on OSX, values containing '&#' go through fixup_ff4
								        first); on any failure the value is decoded with `codepage`.
								    """
								    if p:
								        # Incoming headers may carry \" constructions, keep only the quote
								        p = p.replace(r'\"', r'"')
								    if not p:
								        return p
								    if isinstance(p, unicode):
								        return p

								    try:
								        # Probe only: this raises when the bytes are not valid UTF-8
								        p.decode('utf-8')
								        needs_entity_fix = sabnzbd.DARWIN and '&#' in p
								        if needs_entity_fix:
								            p = fixup_ff4(p)
								        result = p.decode('utf-8')
								    except:
								        # Now assume it's 8bit ASCII
								        result = p.decode(codepage)
								    return result


								def unicoder(p, force=False):
								    """ Always hand back a Unicode string.

								        Unicode input is returned unchanged. A `str` is decoded as UTF-8
								        when gUTF is set or `force` is True, falling back to `codepage`
								        with 'replace' when that fails; otherwise `codepage` is used
								        directly. Any other object is converted through str() first.
								    """
								    if isinstance(p, unicode):
								        return p
								    if not isinstance(p, str):
								        return unicode(str(p))

								    if gUTF or force:
								        try:
								            return p.decode('utf-8')
								        except:
								            # Not valid UTF-8, continue with the code page below
								            pass
								    return p.decode(codepage, 'replace')


								def xml_name(p, keep_escape=False, encoding=None):
								    """ Prepare a name for use in an HTML/XML context.

								        A `str` is decoded as UTF-8 on OSX, when `encoding` is 'utf-8' or
								        when gUTF is set, and with `codepage` otherwise (always with
								        'replace'). Unicode is used as is and other objects are passed
								        through str(). The result is encoded to ASCII with character
								        references; unless `keep_escape` is True it is first run through
								        xml.sax.saxutils.escape.
								    """
								    if not isinstance(p, unicode):
								        if isinstance(p, str):
								            if sabnzbd.DARWIN or encoding == 'utf-8' or gUTF:
								                source_codec = 'utf-8'
								            else:
								                source_codec = codepage
								            p = p.decode(source_codec, 'replace')
								        else:
								            p = str(p)

								    text = p if keep_escape else escape(p)
								    return text.encode('ascii', 'xmlcharrefreplace')


								class LatinFilter(Filter):
								    """ Cheetah filter that converts every value to a Unicode string """

								    def filter(self, val, str=str, **kw):
								        """ Return `val` as Unicode.

								            Unicode passes through and None becomes an empty string.
								            Byte strings are decoded with `codepage` on Windows and as
								            UTF-8 elsewhere, falling back to `codepage` with 'replace'
								            on failure. Other objects are converted through `str`.
								        """
								        if isinstance(val, unicode):
								            return val
								        if val is None:
								            return u''
								        if not isinstance(val, basestring):
								            return unicode(str(val))

								        try:
								            if sabnzbd.WIN32:
								                return val.decode(codepage)
								            return val.decode('utf-8')
								        except:
								            return val.decode(codepage, 'replace')


								class EmailFilter(Filter):
								    """ Cheetah filter that converts every value to a Unicode string,
								        trying UTF-8 before the 8bit code page
								    """

								    def filter(self, val, str=str, **kw):
								        """ Return `val` as Unicode.

								            Unicode passes through and None becomes an empty string.
								            Byte strings are decoded as UTF-8, falling back to
								            `codepage` with 'replace' on failure. Other objects are
								            converted through `str`.
								        """
								        if isinstance(val, unicode):
								            return val
								        if val is None:
								            return u''
								        if not isinstance(val, basestring):
								            return unicode(str(val))

								        try:
								            decoded = val.decode('utf-8')
								        except:
								            decoded = val.decode(codepage, 'replace')
								        return decoded


								################################################################################
								#
								# Translation tables for turning CodePage-850 console output (for example
								# from unrar and par2) into plain 8bit Python strings.
								#
								# TAB_850 lists the source characters: every code from 0x80 up to and
								# including 0xFF, in ascending order.
								#

								TAB_850 = ''.join(chr(code) for code in range(0x80, 0x100))


								# Replacement characters, position for position, for the codes 0x80-0xFF
								# of CodePage-850. Each row covers 16 consecutive source codes.
								# Characters without an 8bit equivalent are mapped to '~' (0x7E).
								TAB_LATIN = (
								    # 0x80-0x8F
								    "\xC7\xFC\xE9\xE2\xE4\xE0\xE5\xE7\xEA\xEB\xE8\xEF\xEE\xEC\xC4\xC5"
								    # 0x90-0x9F
								    "\xC9\xE6\xC6\xF4\xF6\xF2\xFB\xF9\xFF\xD6\xDC\xF8\xA3\xD8\xD7\x83"
								    # 0xA0-0xAF; 0xAB is the one-half sign and therefore maps to 0xBD
								    # (it used to be 0xDB, which is the target of 0xEA further down)
								    "\xE1\xED\xF3\xFA\xF1\xD1\xAA\xBA\xBF\xAE\xAC\xBD\xBC\xA1\xAB\xBB"
								    # 0xB0-0xBF
								    "\x7E\x7E\x7E\x7E\x7E\xC1\xC2\xC0\xA9\x7E\x7E\x7E\x7E\xA2\xA5\x7E"
								    # 0xC0-0xCF
								    "\x7E\x7E\x7E\x7E\x7E\x7E\xE3\xC3\x7E\x7E\x7E\x7E\x7E\x7E\x7E\xA4"
								    # 0xD0-0xDF
								    "\xF0\xD0\xCA\xCB\xC8\x7E\xCD\xCE\xCF\x7E\x7E\x7E\x7E\xA6\xCC\x7E"
								    # 0xE0-0xEF
								    "\xD3\xDF\xD4\xD2\xF5\xD5\xB5\xFE\xDE\xDA\xDB\xD9\xFD\xDD\xAF\xB4"
								    # 0xF0-0xFF
								    "\xAD\xB1\x5F\xBE\xB6\xA7\xF7\xB8\xB0\xA8\xB7\xB9\xB3\xB2\x7E\xA0"
								)


								def TRANS(p):
								    """ Convert console output to Unicode.

								        On Windows the CP850 characters listed in TAB_850 are replaced by
								        their TAB_LATIN counterparts and the result is decoded as cp1252;
								        an empty or None input gives ''. On other platforms the value is
								        simply passed to unicoder().
								    """
								    if not sabnzbd.WIN32:
								        return unicoder(p)
								    if not p:
								        # translate() fails on empty or None strings
								        return ''

								    table = string.maketrans(TAB_850, TAB_LATIN)
								    return p.translate(table).decode('cp1252', 'replace')


								def fixup_ff4(p):
								    """ Fix incompatibility between CherryPy and Firefox-4 on OSX,
								        where a filename contains &#xx; encodings

								        Every complete decimal reference of the form &#<digits>; is
								        replaced by the UTF-8 encoding of that character. All other text
								        is kept exactly as it was, including loose '&' characters,
								        doubled '&&', references that miss their closing ';' and
								        references without digits.
								    """
								    import re

								    def _expand(match):
								        # Replace one numeric reference by its UTF-8 bytes
								        try:
								            return unichr(int(match.group(1))).encode('utf8')
								        except (ValueError, OverflowError):
								            # Number is outside the range unichr() accepts: keep the text
								            return match.group(0)

								    return re.sub(r'&#([0-9]+);', _expand, p)


								# Characters that html_escape() replaces, with their replacement.
								# '&', '>' and '<' are deliberately left out for now: texts need to be
								# cleaned from HTML first.
								_HTML_TABLE = {
								    '"': '&quot;',
								    "'": '&apos;',
								}


								def html_escape(txt):
								    """ Replace the characters listed in _HTML_TABLE with &-constructs.

								        When `txt` contains none of them, `txt` itself is returned.
								    """
								    # Replacement for inefficient xml.sax.saxutils.escape function
								    for meta in _HTML_TABLE:
								        if meta in txt:
								            break
								    else:
								        # Nothing to replace
								        return txt
								    return ''.join([_HTML_TABLE.get(ch, ch) for ch in txt])


								def deunicode(p):
								    """ Return an 8bit string in the encoding selected by gUTF:
								        UTF-8 when gUTF is set, `codepage` otherwise.

								        Unicode input is encoded directly. Byte strings are checked
								        first: with gUTF set, valid UTF-8 is kept and anything else is
								        converted from `codepage` to UTF-8; without gUTF, valid UTF-8 is
								        converted to `codepage` and anything else is kept as it is.
								        Other objects are converted with str().
								    """
								    if isinstance(p, unicode):
								        return p.encode('utf-8') if gUTF else p.encode(codepage, 'replace')
								    if not isinstance(p, basestring):
								        return str(p)

								    if gUTF:
								        try:
								            # Probe only: raises when the bytes are not valid UTF-8
								            p.decode('utf-8')
								        except:
								            return p.decode(codepage).encode('utf-8')
								        return p

								    try:
								        return p.decode('utf-8').encode(codepage, 'replace')
								    except:
								        return p


								# Run the detection once when this module is loaded, so gUTF has its
								# detected value before any of the functions above are used
								auto_fsys()