#!/usr/bin/python3 -OO
# Copyright 2007-2021 The SABnzbd-Team <team@sabnzbd.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
sabnzbd.urlgrabber - Queue for grabbing NZB files from websites
"""
import os
import sys
import time
import logging
import queue
import urllib.request
import urllib.error
import urllib.parse
from http.client import IncompleteRead, HTTPResponse
from threading import Thread
import base64
from typing import Tuple, Optional
import sabnzbd
from sabnzbd.constants import DEF_TIMEOUT, FUTURE_Q_FOLDER, VALID_NZB_FILES, Status, VALID_ARCHIVES
import sabnzbd.misc as misc
import sabnzbd.filesystem
import sabnzbd.cfg as cfg
import sabnzbd.emailer as emailer
import sabnzbd.notifier as notifier
from sabnzbd.encoding import ubtou, utob
from sabnzbd.nzbstuff import NzbObject
_RATING_FIELDS = (
"x-rating-id",
"x-rating-url",
"x-rating-host",
"x-rating-video",
"x-rating-videocnt",
"x-rating-audio",
"x-rating-audiocnt",
"x-rating-voteup",
"x-rating-votedown",
"x-rating-spam",
"x-rating-confirmed-spam",
"x-rating-passworded",
"x-rating-confirmed-passworded",
)
class URLGrabber(Thread):
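"""Thread that fetches the queued NZB URLs and hands the downloaded files to the download queue"""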
def __init__(self):
super().__init__()
self.queue: queue.Queue[Tuple[Optional[str], Optional[NzbObject]]] = queue.Queue()
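# Re-queue URL jobs that were still pending in the saved queue when the program was restarted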
for url_nzo_tup in sabnzbd.NzbQueue.get_urls():
self.queue.put(url_nzo_tup)
self.shutdown = False
def add(self, url: str, future_nzo: NzbObject, when: Optional[int] = None):
"""Add an URL to the URLGrabber queue, 'when' is seconds from now"""
if future_nzo and when:
# Always increase counter
future_nzo.url_tries += 1
# Too many tries? Cancel
if future_nzo.url_tries > cfg.max_url_retries():
self.fail_to_history(future_nzo, url, T("Maximum retries"))
return
future_nzo.url_wait = time.time() + when
self.queue.put((url, future_nzo))
def stop(self):
self.shutdown = True
self.queue.put((None, None))
def run(self):
self.shutdown = False
while not self.shutdown:
# Set the NzbObject to None so the reference held by this thread
# does not keep the object alive in the future (see #1628)
future_nzo = None
url, future_nzo = self.queue.get()
if not url:
# stop signal, go test self.shutdown
continue
if future_nzo:
# Re-queue when too early and still active
if future_nzo.url_wait and future_nzo.url_wait > time.time():
self.add(url, future_nzo)
time.sleep(1.0)
continue
# Paused
if future_nzo.status == Status.PAUSED:
self.add(url, future_nzo)
time.sleep(1.0)
continue
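# Strip stray spaces from the URL before fetching it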
url = url.replace(" ", "")
try:
if future_nzo:
# If nzo entry deleted, give up
try:
deleted = future_nzo.deleted
except AttributeError:
deleted = True
if deleted:
logging.debug("Dropping URL %s, job entry missing", url)
continue
filename = None
category = None
nzo_info = {}
wait = 0
retry = True
fetch_request = None
logging.info("Grabbing URL %s", url)
try:
fetch_request = _build_request(url)
except Exception as e:
# Cannot list specific exceptions here, because they differ unpredictably between platforms
error0 = str(sys.exc_info()[0]).lower()
error1 = str(sys.exc_info()[1]).lower()
logging.debug('Error "%s" trying to get the url %s', error1, url)
if "certificate_verify_failed" in error1 or "certificateerror" in error0:
msg = T("Server %s uses an untrusted HTTPS certificate") % ""
msg += " - https://sabnzbd.org/certificate-errors"
retry = False
elif "nodename nor servname provided" in error1:
msg = T("Server name does not resolve")
retry = False
elif "401" in error1 or "unauthorized" in error1:
msg = T("Unauthorized access")
retry = False
elif "404" in error1:
msg = T("File not on server")
retry = False
elif hasattr(e, "headers") and "retry-after" in e.headers:
# Catch the case where the server sends Retry-After (e.headers is case-insensitive)
wait = misc.int_conv(e.headers["retry-after"])
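# Walk the response headers to pick up indexer (DNZB) metadata such as category, filename, password and ratings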
if fetch_request:
for hdr in fetch_request.headers:
try:
item = hdr.lower()
value = fetch_request.headers[hdr]
except:
continue
if item in ("category_id", "x-dnzb-category"):
category = value
elif item in ("x-dnzb-moreinfo",):
nzo_info["more_info"] = value
elif item in ("x-dnzb-name",):
filename = value
if not filename.endswith(".nzb"):
filename += ".nzb"
elif item == "x-dnzb-propername":
nzo_info["propername"] = value
elif item == "x-dnzb-episodename":
nzo_info["episodename"] = value
elif item == "x-dnzb-year":
nzo_info["year"] = value
elif item == "x-dnzb-failure":
nzo_info["failure"] = value
elif item == "x-dnzb-details":
nzo_info["details"] = value
elif item == "x-dnzb-password":
nzo_info["password"] = value
elif item == "retry-after":
wait = misc.int_conv(value)
# Rating fields
if item in _RATING_FIELDS:
nzo_info[item] = value
# Get filename from Content-Disposition header
if not filename and "filename=" in value:
filename = value[value.index("filename=") + 9 :].strip(";").strip('"')
if wait:
# For sites that have a rate-limiting attribute
msg = ""
retry = True
fetch_request = None
elif retry:
fetch_request, msg, retry, wait, data = _analyse(fetch_request, future_nzo)
if not fetch_request:
if retry:
logging.info("Retry URL %s", url)
self.add(url, future_nzo, wait)
else:
self.fail_to_history(future_nzo, url, msg)
continue
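# No filename reported by the indexer, so derive one from the (possibly redirected) URL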
if not filename:
filename = os.path.basename(urllib.parse.unquote(url))
# URL was redirected, maybe the redirect has better filename?
# Check if the original URL has extension
if (
url != fetch_request.geturl()
and sabnzbd.filesystem.get_ext(filename) not in VALID_NZB_FILES + VALID_ARCHIVES
):
filename = os.path.basename(urllib.parse.unquote(fetch_request.geturl()))
elif "&nzbname=" in filename:
# Sometimes the filename contains the full URL, duh!
filename = filename[filename.find("&nzbname=") + 9 :]
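# Take over the job options that were set on the placeholder job in the queue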
pp = future_nzo.pp
script = future_nzo.script
cat = future_nzo.cat
if (cat is None or cat == "*") and category:
cat = misc.cat_convert(category)
priority = future_nzo.priority
nzbname = future_nzo.custom_name
# process data
if not data:
try:
data = fetch_request.read()
except (IncompleteRead, IOError):
self.fail_to_history(future_nzo, url, T("Server could not complete request"))
fetch_request.close()
continue
fetch_request.close()
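# The data looks like a raw NZB file but the name lacks the extension, so add it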
if b"<nzb" in data and sabnzbd.filesystem.get_ext(filename) != ".nzb":
filename += ".nzb"
# Sanitize filename first (also removing forbidden Windows-names)
filename = sabnzbd.filesystem.sanitize_filename(filename)
# If no filename, make one
if not filename:
filename = sabnzbd.get_new_id("url", os.path.join(cfg.admin_dir.get_path(), FUTURE_Q_FOLDER))
# Write data to temp file
path = os.path.join(cfg.admin_dir.get_path(), FUTURE_Q_FOLDER, filename)
with open(path, "wb") as temp_nzb:
temp_nzb.write(data)
# Check whether it is an NZB file or an archive that may contain one
if sabnzbd.filesystem.get_ext(filename) in VALID_ARCHIVES + VALID_NZB_FILES:
res, _ = sabnzbd.add_nzbfile(
path,
pp=pp,
script=script,
cat=cat,
priority=priority,
nzbname=nzbname,
nzo_info=nzo_info,
url=future_nzo.url,
keep=False,
password=future_nzo.password,
nzo_id=future_nzo.nzo_id,
)
# -2==Error/retry, -1==Error, 0==OK, 1==Empty
if res == -2:
logging.info("Incomplete NZB, retry after 5 min %s", url)
self.add(url, future_nzo, when=300)
elif res == -1:
# Error already thrown
self.fail_to_history(future_nzo, url)
elif res == 1:
# No NZB-files inside archive
self.fail_to_history(future_nzo, url, T("Empty NZB file %s") % filename)
else:
logging.info("Unknown filetype when fetching NZB, retry after 30s %s", url)
self.add(url, future_nzo, 30)
# Always clean up what we wrote to disk
try:
sabnzbd.filesystem.remove_file(path)
except:
pass
except:
logging.error(T("URLGRABBER CRASHED"), exc_info=True)
logging.debug("URLGRABBER Traceback: ", exc_info=True)
@staticmethod
def fail_to_history(nzo: NzbObject, url: str, msg="", content=False):
"""Create History entry for failed URL Fetch
msg: message to be logged
content: report in history that cause is a bad NZB file
"""
# Remove the "Trying to fetch" part
if url:
nzo.filename = url
nzo.final_name = url.strip()
if content:
# Bad content
msg = T("Unusable NZB file")
else:
# Failed fetch
msg = T("URL Fetching failed; %s") % msg
# Mark as failed
nzo.set_unpack_info("Source", msg)
nzo.fail_msg = msg
notifier.send_notification(T("URL Fetching failed; %s") % "", "%s\n%s" % (msg, url), "failed", nzo.cat)
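# Optionally inform the user by e-mail about the failed fetch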
if cfg.email_endjob() > 0:
emailer.badfetch_mail(msg, url)
# Parse category to make sure script is set correctly after a grab
nzo.cat, _, nzo.script, _ = misc.cat_to_opts(nzo.cat, script=nzo.script)
# Add to history and run script if desired
sabnzbd.NzbQueue.remove(nzo.nzo_id)
sabnzbd.PostProcessor.process(nzo)
def _build_request(url: str) -> HTTPResponse:
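"""Build the request for the URL, moving credentials embedded in the URL into a Basic Authorization header, and return the opened response"""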
# Detect basic auth
# Adapted from python-feedparser
user_passwd = None
u = urllib.parse.urlparse(url)
if u.username is not None or u.password is not None:
if u.username and u.password:
user_passwd = "%s:%s" % (u.username, u.password)
host_port = u.hostname
if u.port:
host_port += ":" + str(u.port)
url = urllib.parse.urlunparse(u._replace(netloc=host_port))
# Start request
req = urllib.request.Request(url)
# Add headers
req.add_header("User-Agent", "SABnzbd/%s" % sabnzbd.__version__)
req.add_header("Accept-encoding", "gzip")
if user_passwd:
req.add_header("Authorization", "Basic " + ubtou(base64.b64encode(utob(user_passwd))).strip())
return urllib.request.urlopen(req)
def _analyse(fetch_request: HTTPResponse, future_nzo: NzbObject):
"""Analyze response of indexer
returns fetch_request|None, error-message|None, retry, wait-seconds, data
"""
data = None
if not fetch_request or fetch_request.getcode() != 200:
if fetch_request:
msg = fetch_request.msg
else:
msg = ""
# Increasing wait-time in steps for standard errors
when = DEF_TIMEOUT * (future_nzo.url_tries + 1)
logging.debug("No usable response from indexer, retry after %s sec", when)
return None, msg, True, when, data
return fetch_request, fetch_request.msg, False, 0, data