sabnzbd/sabnzbd/assembler.py

#!/usr/bin/python -OO
# Copyright 2008-2012 The SABnzbd-Team <team@sabnzbd.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.

"""
sabnzbd.assembler - threaded assembly/decoding of files
"""

import os
import Queue
import binascii
import logging
import struct
import re
from threading import Thread
from time import sleep
try:
    import hashlib
    new_md5 = hashlib.md5
except:
    import md5
    new_md5 = md5.new

import sabnzbd
from sabnzbd.misc import get_filepath, sanitize_filename, get_unique_path, renamer, \
                         set_permissions, flag_file, long_path
from sabnzbd.constants import QCHECK_FILE
import sabnzbd.cfg as cfg
from sabnzbd.articlecache import ArticleCache
from sabnzbd.postproc import PostProcessor
import sabnzbd.downloader
from sabnzbd.utils.rarfile import RarFile, is_rarfile
from sabnzbd.encoding import latin1, unicoder, is_utf8


#------------------------------------------------------------------------------
class Assembler(Thread):
    do = None # Link to the instance of this method

    def __init__ (self, queue = None):
        Thread.__init__(self)

        if queue:
            self.queue = queue
        else:
            self.queue = Queue.Queue()
        Assembler.do = self

    def stop(self):
        self.process(None)

    def process(self, job):
        self.queue.put(job)

    def run(self):
        import sabnzbd.nzbqueue
        while 1:
            job = self.queue.get()
            if not job:
                logging.info("Shutting down")
                break

            nzo, nzf = job

            if nzf:
                sabnzbd.CheckFreeSpace()
                filename = sanitize_filename(nzf.filename)
                nzf.filename = filename

                dupe = nzo.check_for_dupe(nzf)

                filepath = get_filepath(long_path(cfg.download_dir.get_path()), nzo, filename)

                if filepath:
                    logging.info('Decoding %s %s', filepath, nzf.type)
                    try:
                        filepath = _assemble(nzf, filepath, dupe)
                    except IOError, (errno, strerror):
                        if nzo.deleted:
                            # Job was deleted, ignore error
                            pass
                        else:
                            # 28 == disk full => pause downloader
                            if errno == 28:
                                logging.error(Ta('Disk full! Forcing Pause'))
                            else:
                                logging.error(T('Disk error on creating file %s'), filepath)
                            # Pause without saving
                            sabnzbd.downloader.Downloader.do.pause(save=False)
                    except:
                        logging.error('Fatal error in Assembler', exc_info = True)
                        break

                    nzf.remove_admin()
                    setname = nzf.setname
                    if nzf.is_par2 and (nzo.md5packs.get(setname) is None):
                        pack = GetMD5Hashes(filepath)[0]
                        if pack:
                            nzo.md5packs[setname] = pack
                            logging.debug('Got md5pack for set %s', setname)

                    if check_encrypted_rar(nzo, filepath):
                        if cfg.pause_on_pwrar() == 1:
                            logging.warning(Ta('WARNING: Paused job "%s" because of encrypted RAR file'), latin1(nzo.final_name))
                            nzo.pause()
                        else:
                            logging.warning(Ta('WARNING: Aborted job "%s" because of encrypted RAR file'), latin1(nzo.final_name))
                            nzo.fail_msg = T('Aborted, encryption detected')
                            import sabnzbd.nzbqueue
                            sabnzbd.nzbqueue.NzbQueue.do.end_job(nzo)

                    unwanted = rar_contains_unwanted_file(nzo, filepath)
                    if unwanted:
                        logging.warning(Ta('WARNING: In "%s" unwanted extension in RAR file. Unwanted file is %s '), latin1(nzo.final_name), unwanted)
                        if cfg.action_on_unwanted_extensions() == 1:
                            logging.debug('Unwanted extension ... pausing')
                            nzo.unwanted_ext = True
                            nzo.pause()
                        if cfg.action_on_unwanted_extensions() == 2:
                            logging.debug('Unwanted extension ... aborting')
                            nzo.fail_msg = T('Aborted, unwanted extension detected')
                            import sabnzbd.nzbqueue
                            sabnzbd.nzbqueue.NzbQueue.do.end_job(nzo)


                    nzf.completed = True
            else:
                sabnzbd.nzbqueue.NzbQueue.do.remove(nzo.nzo_id, add_to_history=False, cleanup=False)
                PostProcessor.do.process(nzo)


def _assemble(nzf, path, dupe):
    if os.path.exists(path):
        unique_path = get_unique_path(path, create_dir = False)
        if dupe:
            path = unique_path
        else:
            renamer(path, unique_path)

    fout = open(path, 'ab')

    if cfg.quick_check():
        md5 = new_md5()
    else:
        md5 = None

    _type = nzf.type
    decodetable = nzf.decodetable

    for articlenum in decodetable:
        sleep(0.001)
        article = decodetable[articlenum]

        data = ArticleCache.do.load_article(article)

        if not data:
            logging.info(Ta('%s missing'), article)
        else:
            # yenc data already decoded, flush it out
            if _type == 'yenc':
                fout.write(data)
                if md5: md5.update(data)
            # need to decode uu data now
            elif _type == 'uu':
                data = data.split('\r\n')

                chunks = []
                for line in data:
                    if not line:
                        continue

                    if line == '-- ' or line.startswith('Posted via '):
                        continue
                    try:
                        tmpdata = binascii.a2b_uu(line)
                        chunks.append(tmpdata)
                    except binascii.Error, msg:
                        ## Workaround for broken uuencoders by
                        ##/Fredrik Lundh
                        nbytes = (((ord(line[0])-32) & 63) * 4 + 5) / 3
                        try:
                            tmpdata = binascii.a2b_uu(line[:nbytes])
                            chunks.append(tmpdata)
                        except binascii.Error, msg:
                            logging.info('Decode failed in part %s: %s', article.article, msg)
                data = ''.join(chunks)
                fout.write(data)
                if md5: md5.update(data)

    fout.flush()
    fout.close()
    set_permissions(path)
    if md5:
        nzf.md5sum = md5.digest()
        del md5

    return path


def file_has_articles(nzf):
    """ Do a quick check to see if any articles are present for this file.
        Destructive: only to be used to differentiate between unknown encoding and no articles.
    """
    has = False
    decodetable = nzf.decodetable
    for articlenum in decodetable:
        sleep(0.01)
        article = decodetable[articlenum]
        data = ArticleCache.do.load_article(article)
        if data:
            has = True
    return has


# For a full description of the par2 specification, visit:
# http://parchive.sourceforge.net/docs/specifications/parity-volume-spec/article-spec.html

def GetMD5Hashes(fname, force=False):
    """ Get the hash table from a PAR2 file
        Return as dictionary, indexed on names and True for utf8-encoded names
    """
    new_encoding = True
    table = {}
    if force or not flag_file(os.path.split(fname)[0], QCHECK_FILE):
        try:
            f = open(fname, 'rb')
        except:
            return table, new_encoding

        new_encoding = False
        try:
            header = f.read(8)
            while header:
                name, hash = ParseFilePacket(f, header)
                new_encoding |= is_utf8(name)
                if name:
                    table[name] = hash
                header = f.read(8)

        except (struct.error, IndexError):
            logging.info('Cannot use corrupt par2 file for QuickCheck, "%s"', fname)
            table = {}
        except:
            logging.debug('QuickCheck parser crashed in file %s', fname)
            logging.info('Traceback: ', exc_info = True)
            table = {}

        f.close()
    return table, new_encoding


def ParseFilePacket(f, header):
    """ Look up and analyse a FileDesc package """

    nothing = None, None

    if header != 'PAR2\0PKT':
        return nothing

    # Length must be multiple of 4 and at least 20
    len = struct.unpack('<Q', f.read(8))[0]
    if int(len/4)*4 != len or len < 20:
        return nothing

    # Next 16 bytes is md5sum of this packet
    md5sum = f.read(16)

    # Read and check the data
    data = f.read(len-32)
    md5 = new_md5()
    md5.update(data)
    if md5sum != md5.digest():
        return nothing

    # The FileDesc packet looks like:
    # 16 : "PAR 2.0\0FileDesc"
    # 16 : FileId
    # 16 : Hash for full file **
    # 16 : Hash for first 16K
    #  8 : File length
    # xx : Name (multiple of 4, padded with \0 if needed) **

    # See if it's the right packet and get name + hash
    for offset in range(0, len, 8):
        if data[offset:offset+16] == "PAR 2.0\0FileDesc":
            hash = data[offset+32:offset+48]
            filename = data[offset+72:].strip('\0')
            return filename, hash

    return nothing


RE_SUBS = re.compile(r'\W+subs(?![a-z])', re.I)
def is_cloaked(path, names):
    """ Return True if this is likely to be a cloaked encrypted post """
    fname = unicoder(os.path.split(path)[1]).lower()
    fname = os.path.splitext(fname)[0]
    for name in names:
        name = os.path.split(name.lower())[1]
        name, ext = os.path.splitext(unicoder(name))
        if ext == u'.rar' and fname.startswith(name) and (len(fname) - len(name)) < 8 and not RE_SUBS.search(fname):
            logging.debug('File %s is probably encrypted due to RAR with same name inside this RAR', fname)
            return True
        elif 'password' in name:
            logging.debug('RAR %s is probably encrypted: "password" in filename %s', fname, name)
            return True
    return False


def check_encrypted_rar(nzo, filepath):
    """ Check if file is rar and is encrypted """
    encrypted = False
    if  not nzo.password and not nzo.meta.get('password') and cfg.pause_on_pwrar() and is_rarfile(filepath):
        try:
            zf = RarFile(filepath, all_names=True)
            encrypted = zf.encrypted or is_cloaked(filepath, zf.namelist())
            if encrypted and int(nzo.encrypted) < 2 and not nzo.reuse:
                nzo.encrypted = 1
            else:
                encrypted = False
            zf.close()
            del zf
        except:
            logging.debug('RAR file %s cannot be inspected', filepath)
    return encrypted


def rar_contains_unwanted_file(nzo, filepath):
    # checks for unwanted extensions in the rar file 'filepath'
    # ... unwanted extensions are defined in global variable cfg.unwanted_extensions()
    # returns False if no unwanted extensions are found in the rar file
    # returns name of file if unwanted extension is found in the rar file
    unwanted = False
    if is_rarfile(filepath):
        #logging.debug('rar file to check: %s',filepath)
        #logging.debug('unwanted extensions are: %s', cfg.unwanted_extensions())
        try:
            zf = RarFile(filepath, all_names=True)
            #logging.debug('files in rar file: %s', zf.namelist())
            for somefile in zf.namelist() :
                logging.debug('file in rar file: %s', somefile)
                if os.path.splitext(somefile)[1].replace('.', '').lower() in cfg.unwanted_extensions():
                    logging.debug('Unwanted file %s', somefile)
                    unwanted = True
                    zf.close()
        except:
            logging.debug('RAR file %s cannot be inspected.', filepath)
    return unwanted