#!/usr/bin/python -OO
# Copyright 2008-2015 The SABnzbd-Team <team@sabnzbd.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
sabnzbd.urlgrabber - Queue for grabbing NZB files from websites
"""
import os
import sys
import time
import re
import logging
import Queue
import urllib2
from threading import Thread
import sabnzbd
from sabnzbd.constants import FUTURE_Q_FOLDER, Status
from sabnzbd.encoding import unicoder
import sabnzbd.misc as misc
import sabnzbd.dirscanner as dirscanner
from sabnzbd.nzbqueue import NzbQueue
import sabnzbd.cfg as cfg
import sabnzbd.emailer as emailer
import sabnzbd.notifier as notifier
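
# URL fragments for which gzip transfer must not be requested
# (run() skips the 'Accept-encoding: gzip' header for matching URLs)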
_BAD_GZ_HOSTS = ('.zip', 'nzbsa.co.za', 'newshost.za.net')
# Indexer rating headers that are copied verbatim into nzo_info
_RATING_FIELDS = ('x-rating-id', 'x-rating-url', 'x-rating-host', 'x-rating-video', 'x-rating-videocnt', 'x-rating-audio', 'x-rating-audiocnt',
                  'x-rating-voteup', 'x-rating-votedown', 'x-rating-spam', 'x-rating-confirmed-spam', 'x-rating-passworded', 'x-rating-confirmed-passworded')

class URLGrabber(Thread):
    do = None  # Link to instance of the thread

    def __init__(self):
        Thread.__init__(self)
        self.queue = Queue.Queue()
        for tup in NzbQueue.do.get_urls():
            url, nzo = tup
            self.queue.put((url, nzo))
        self.shutdown = False
        URLGrabber.do = self

    def add(self, url, future_nzo, when=None):
        """ Add a URL to the URLGrabber queue, 'when' is seconds from now """
        if when and future_nzo:
            future_nzo.wait = time.time() + when
        self.queue.put((url, future_nzo))

    def stop(self):
        logging.info('URLGrabber shutting down')
        self.shutdown = True
        self.add(None, None)
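
    # Worker loop: poll the queue every 5 seconds, fetch the URL and hand the
    # downloaded file to the dirscanner for further processing.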
    def run(self):
        logging.info('URLGrabber starting up')
        self.shutdown = False

        while not self.shutdown:
            # Don't pound the website!
            time.sleep(5.0)

            (url, future_nzo) = self.queue.get()

            if not url:
                # stop signal, go test self.shutdown
                continue
            if future_nzo and future_nzo.wait and future_nzo.wait > time.time():
                # Re-queue when too early and still active
                self.add(url, future_nzo)
                continue
            url = url.replace(' ', '')

            try:
                if future_nzo:
                    # If nzo entry deleted, give up
                    try:
                        deleted = future_nzo.deleted
                    except AttributeError:
                        deleted = True
                    if deleted:
                        logging.debug('Dropping URL %s, job entry missing', url)
                        continue

                logging.info('Grabbing URL %s', url)
                req = urllib2.Request(url)
                req.add_header('User-Agent', 'SABnzbd+/%s' % sabnzbd.version.__version__)
                if not any(item in url for item in _BAD_GZ_HOSTS):
                    req.add_header('Accept-encoding', 'gzip')

                filename = None
                category = None
                gzipped = False
                nzo_info = {}
                wait = 0
                retry = True
                fn = None
                try:
                    fn = urllib2.urlopen(req)
                except:
                    # Cannot list exceptions here, because of unpredictability over platforms
                    error0 = str(sys.exc_info()[0]).lower()
                    error1 = str(sys.exc_info()[1]).lower()
                    logging.debug('Error "%s" trying to get the url %s', error1, url)
                    if 'certificate_verify_failed' in error1 or 'certificateerror' in error0:
                        msg = T('Server %s uses an untrusted HTTPS certificate') % ''
                        retry = False
                    elif 'nodename nor servname provided' in error1:
                        msg = T('Server name does not resolve')
                        retry = False
                    elif '401' in error1 or 'unauthorized' in error1:
                        msg = T('Unauthorized access')
                        retry = False
                    elif '404' in error1:
                        msg = T('File not on server')
                        retry = False

                new_url = dereferring(url, fn)
                if new_url:
                    self.add(new_url, future_nzo)
                    continue
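                # Mine the indexer response headers for category, naming,
                # rating and retry hints (DNZB-style 'x-dnzb-*' headers).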
                if fn:
                    for hdr in fn.headers:
                        try:
                            item = hdr.lower()
                            value = fn.headers[hdr]
                        except:
                            continue
                        if item in ('content-encoding',) and value == 'gzip':
                            gzipped = True
                        if item in ('category_id', 'x-dnzb-category'):
                            category = value
                        elif item in ('x-dnzb-moreinfo',):
                            nzo_info['more_info'] = value
                        elif item in ('x-dnzb-name',):
                            filename = value
                            if not filename.endswith('.nzb'):
                                filename += '.nzb'
                        elif item == 'x-dnzb-propername':
                            nzo_info['propername'] = value
                        elif item == 'x-dnzb-episodename':
                            nzo_info['episodename'] = value
                        elif item == 'x-dnzb-year':
                            nzo_info['year'] = value
                        elif item == 'x-dnzb-failure':
                            nzo_info['failure'] = value
                        elif item == 'x-dnzb-details':
                            nzo_info['details'] = value
                        elif item == 'x-dnzb-password':
                            nzo_info['password'] = value
                        elif item == 'retry-after':
                            # For NZBFinder
                            wait = misc.int_conv(value)

                        # Rating fields
                        if item in _RATING_FIELDS:
                            nzo_info[item] = value

                        if not filename and "filename=" in value:
                            filename = value[value.index("filename=") + 9:].strip(';').strip('"')
                if wait:
                    # For sites that have a rate-limiting attribute
                    msg = ''
                    retry = True
                    fn = None
                elif retry:
                    fn, msg, retry, wait, data = _analyse(fn, url)

                if not fn:
                    if retry:
                        logging.info('Retry URL %s', url)
                        self.add(url, future_nzo, wait)
                    else:
                        bad_fetch(future_nzo, url, msg)
                    continue

                if not filename:
                    filename = os.path.basename(url)
                elif '&nzbname=' in filename:
                    # Sometimes the filename contains the full URL, duh!
                    filename = filename[filename.find('&nzbname=') + 9:]
                pp = future_nzo.pp
                script = future_nzo.script
                cat = future_nzo.cat
                if (cat is None or cat == '*') and category:
                    cat = misc.cat_convert(category)
                priority = future_nzo.priority
                nzbname = future_nzo.custom_name

                # process data
                if gzipped:
                    filename += '.gz'
                if not data:
                    data = fn.read()
                fn.close()

                if '<nzb' in data and misc.get_ext(filename) != '.nzb':
                    filename += '.nzb'

                # Sanitize filename first
                filename = misc.sanitize_filename(filename)

                # Write data to temp file
                path = os.path.join(cfg.admin_dir.get_path(), FUTURE_Q_FOLDER)
                path = os.path.join(path, filename)
                f = open(path, 'wb')
                f.write(data)
                f.close()
                del data
                # Check if nzb file
                if misc.get_ext(filename) in ('.nzb', '.gz', '.bz2'):
                    res = dirscanner.ProcessSingleFile(filename, path, pp=pp, script=script, cat=cat, priority=priority,
                                                       nzbname=nzbname, nzo_info=nzo_info, url=future_nzo.url, keep=False,
                                                       nzo_id=future_nzo.nzo_id)[0]
                    if res:
                        if res == -2:
                            logging.info('Incomplete NZB, retry after 5 min %s', url)
                            when = 300
                        elif res == -1:
                            # Error, but no reason to retry. Warning is already given
                            NzbQueue.do.remove(future_nzo.nzo_id, add_to_history=False)
                            continue
                        else:
                            logging.info('Unknown error fetching NZB, retry after 2 min %s', url)
                            when = 120
                        self.add(url, future_nzo, when)
                # Check if a supported archive
                else:
                    status, zf, exp_ext = dirscanner.is_archive(path)
                    if status == 0:
                        if misc.get_ext(filename) not in ('.rar', '.zip', '.7z'):
                            filename = filename + exp_ext
                            os.rename(path, path + exp_ext)
                            path = path + exp_ext

                        dirscanner.ProcessArchiveFile(filename, path, pp, script, cat, priority=priority,
                                                      nzbname=nzbname, url=future_nzo.url, keep=False,
                                                      nzo_id=future_nzo.nzo_id)
                    else:
                        # Not a supported filetype, not an nzb (text/html etc.)
                        try:
                            os.remove(path)
                        except:
                            pass
                        logging.info('Unknown filetype when fetching NZB, retry after 30s %s', url)
                        self.add(url, future_nzo, 30)
            except:
                logging.error(T('URLGRABBER CRASHED'), exc_info=True)
                logging.debug("URLGRABBER Traceback: ", exc_info=True)

def _analyse(fn, url):
    """ Analyze response of indexer
        returns fn|None, error-message|None, retry, wait-seconds, data
    """
    data = None
    if not fn or fn.code != 200:
        logging.debug('No usable response from indexer, retry after 60 sec')
        if fn:
            msg = fn.msg
        else:
            msg = ''
        return None, msg, True, 60, data

    # Check for an error response
    if not fn or fn.msg != 'OK':
        logging.debug('Received nothing from indexer, retry after 60 sec')
        return None, fn.msg, True, 60, data

    if '.oznzb.com' in url and 'login?' in fn.url:
        return None, T('Unauthorized access'), False, 0, data

    return fn, fn.msg, False, 0, data
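
# derefer.me serves a small HTML page whose <meta> refresh tag points at the
# real download, e.g. (illustrative):
#     <meta http-equiv="refresh" content="0; url=http://example.com/some.nzb">
# dereferring() extracts that target URL so it can be fetched directly.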
def dereferring(url, fn):
    """ Find out if we're being diverted to another location.
        If so, return new url else None
    """
    if 'derefer.me' in url:
        _RE_DEREFER = re.compile(r'content=".*url=([^"]+)">')
        data = fn.read()
        for line in data.split('\n'):
            if '<meta' in line:
                m = _RE_DEREFER.search(data)
                if m:
                    return m.group(1)
    return None

def bad_fetch(nzo, url, msg='', content=False):
    """ Create History entry for failed URL Fetch
        msg     : message to be logged
        content : report in history that cause is a bad NZB file
    """
    if msg:
        msg = unicoder(msg)
    else:
        msg = ''

    nzo.status = Status.FAILED

    if url:
        nzo.filename = url
        nzo.final_name = url.strip()

    if content:
        # Bad content
        msg = T('Unusable NZB file')
    else:
        # Failed fetch
        msg = T('URL Fetching failed; %s') % msg

    nzo.fail_msg = msg

    notifier.send_notification(T('URL Fetching failed; %s') % '', '%s\n%s' % (msg, url), 'other')
    if cfg.email_endjob() > 0:
        emailer.badfetch_mail(msg, url)

    NzbQueue.do.remove(nzo.nzo_id, add_to_history=True)