#!/usr/bin/python3 -OO
# Copyright 2007-2020 The SABnzbd-Team <team@sabnzbd.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
sabnzbd.rss - rss client functionality
"""
import re
import logging
import time
import datetime
import threading
import sabnzbd
from sabnzbd.constants import RSS_FILE_NAME, DEFAULT_PRIORITY, NORMAL_PRIORITY, DUP_PRIORITY
from sabnzbd.decorators import synchronized
import sabnzbd.config as config
import sabnzbd.cfg as cfg
from sabnzbd.misc import cat_convert, wildcard_to_re, cat_to_opts, match_str, from_units, int_conv, get_base_url
import sabnzbd.emailer as emailer
import feedparser
__RSS = None # Global pointer to RSS-scanner instance
##############################################################################
# Wrapper functions
##############################################################################
def init():
    global __RSS
    __RSS = RSSQueue()


def stop():
    global __RSS
    if __RSS:
        __RSS.stop()
        try:
            __RSS.join()
        except:
            pass


def run_feed(feed, download, ignoreFirst=False, force=False, readout=True):
    global __RSS
    if __RSS:
        return __RSS.run_feed(feed, download, ignoreFirst, force=force, readout=readout)


def show_result(feed):
    global __RSS
    if __RSS:
        return __RSS.show_result(feed)


def flag_downloaded(feed, fid):
    global __RSS
    if __RSS:
        __RSS.flag_downloaded(feed, fid)


def lookup_url(feed, fid):
    global __RSS
    if __RSS:
        return __RSS.lookup_url(feed, fid)


def run_method():
    global __RSS
    if __RSS:
        return __RSS.run()
    else:
        return None


def next_run(t=None):
    global __RSS
    if __RSS:
        if t:
            __RSS.next_run = t
        else:
            return __RSS.next_run
    else:
        return time.time()


def save():
    global __RSS
    if __RSS:
        __RSS.save()


def clear_feed(feed):
    global __RSS
    if __RSS:
        __RSS.clear_feed(feed)


def clear_downloaded(feed):
    global __RSS
    if __RSS:
        __RSS.clear_downloaded(feed)
##############################################################################
def notdefault(item):
    """ Return True if not 'Default|''|*' """
    return bool(item) and str(item).lower() not in ("default", "*", "", str(DEFAULT_PRIORITY))
def convert_filter(text):
    """ Return compiled regex.
        If string starts with re: it's a real regex
        else quote all regex specials, replace '*' by '.*'
    """
    text = text.strip().lower()
    if text.startswith("re:"):
        txt = text[3:].strip()
    else:
        txt = wildcard_to_re(text)
    try:
        return re.compile(txt, re.I)
    except:
        logging.debug("Could not compile regex: %s", text)
        return None
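
# Usage sketch for convert_filter() (illustrative values, not part of the original file):
#   convert_filter(r"re:S\d+E\d+")  # explicit regex form: the text after "re:" is compiled verbatim
#   convert_filter("*720p*")        # wildcard form: regex specials are quoted, '*' becomes '.*'
# Both return a case-insensitive compiled pattern, or None when compilation fails.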
def remove_obsolete(jobs, new_jobs):
    """ Expire G/B links that are not in new_jobs (mark them 'X')
        Expired links older than 3 days are removed from 'jobs'
    """
    now = time.time()
    limit = now - 259200  # 3 days (3x24x3600)
    olds = list(jobs.keys())
    for old in olds:
        tm = jobs[old]["time"]
        if old not in new_jobs:
            if jobs[old].get("status", " ")[0] in ("G", "B"):
                jobs[old]["status"] = "X"
        if jobs[old]["status"] == "X" and tm < limit:
            logging.debug("Purging link %s", old)
            del jobs[old]
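
# Example of the expiry rule above (hypothetical data): an entry with
# {"status": "G", "time": time.time() - 4 * 24 * 3600} whose link is absent from
# new_jobs is first flagged "X", then purged because it is past the 3-day limit.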
LOCK = threading.RLock()
_RE_SP = re.compile(r"s*(\d+)[ex](\d+)", re.I)
_RE_SIZE1 = re.compile(r"Size:\s*(\d+\.\d+\s*[KMG]{0,1})B\W*", re.I)
_RE_SIZE2 = re.compile(r"\W*(\d+\.\d+\s*[KMG]{0,1})B\W*", re.I)

class RSSQueue:
    def __init__(self):
        self.jobs = {}
        self.next_run = time.time()
        self.shutdown = False

        try:
            self.jobs = sabnzbd.load_admin(RSS_FILE_NAME)
            if self.jobs:
                for feed in self.jobs:
                    remove_obsolete(self.jobs[feed], list(self.jobs[feed].keys()))
        except:
            logging.warning(T("Cannot read %s"), RSS_FILE_NAME)
            logging.info("Traceback: ", exc_info=True)

        # Storage needs to be dict
        if not self.jobs:
            self.jobs = {}

        # jobs is a NAME-indexed dictionary
        #    Each element is a link-indexed dictionary
        #        Each element is another dictionary:
        #           status : 'D', 'G', 'B', 'X' (downloaded, good-match, bad-match, obsolete)
        #               '*' added means: from the initial batch
        #               '-' added to 'D' means downloaded, but not displayed anymore
        #           title : Title
        #           url : URL
        #           cat : category
        #           orgcat : category as read from feed
        #           pp : pp
        #           script : script
        #           prio : priority
        #           time : timestamp (used for time-based clean-up)
        #           size : size in bytes
        #           age : age in datetime format as specified by feed
        #           season : season number (if applicable)
        #           episode : episode number (if applicable)
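        # Illustrative entry for the layout above (hypothetical values, added for clarity):
        # self.jobs["MyFeed"]["https://indexer.example/get/abc123"] = {
        #     "status": "G*", "title": "Show.Name.S01E02.720p",
        #     "url": "https://indexer.example/get/abc123",
        #     "infourl": "https://indexer.example/details/abc123",
        #     "cat": "tv", "orgcat": "TV > HD", "pp": None, "script": None,
        #     "prio": "0", "time": 1577836800.0, "size": 1610612736,
        #     "age": datetime.datetime(2020, 1, 1, 12, 0), "season": "1", "episode": "2",
        # }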
        # Patch feedparser
        patch_feedparser()

    def stop(self):
        self.shutdown = True
    @synchronized(LOCK)
    def run_feed(self, feed=None, download=False, ignoreFirst=False, force=False, readout=True):
        """ Run the query for one URI and apply filters """
        self.shutdown = False

        if not feed:
            return "No such feed"

        newlinks = []
        new_downloads = []

        # Preparations, get options
        try:
            feeds = config.get_rss()[feed]
        except KeyError:
            logging.error(T('Incorrect RSS feed description "%s"'), feed)
            logging.info("Traceback: ", exc_info=True)
            return T('Incorrect RSS feed description "%s"') % feed

        uris = feeds.uri()
        defCat = feeds.cat()
        import sabnzbd.api

        if not notdefault(defCat) or defCat not in sabnzbd.api.list_cats(default=False):
            defCat = None
        defPP = feeds.pp()
        if not notdefault(defPP):
            defPP = None
        defScript = feeds.script()
        if not notdefault(defScript):
            defScript = None
        defPrio = feeds.priority()
        if not notdefault(defPrio):
            defPrio = None

        # Preparations, convert filters to regex's
        regexes = []
        reTypes = []
        reCats = []
        rePPs = []
        rePrios = []
        reScripts = []
        reEnabled = []
        for feed_filter in feeds.filters():
            reCat = feed_filter[0]
            if defCat in ("", "*"):
                reCat = None
            reCats.append(reCat)
            rePPs.append(feed_filter[1])
            reScripts.append(feed_filter[2])
            reTypes.append(feed_filter[3])
            if feed_filter[3] in ("<", ">", "F", "S"):
                regexes.append(feed_filter[4])
            else:
                regexes.append(convert_filter(feed_filter[4]))
            rePrios.append(feed_filter[5])
            reEnabled.append(feed_filter[6] != "0")
        regcount = len(regexes)

        # Set first if this is the very first scan of this URI
        first = (feed not in self.jobs) and ignoreFirst

        # Add SABnzbd's custom User Agent
        feedparser.USER_AGENT = "SABnzbd+/%s" % sabnzbd.version.__version__

        # Read the RSS feed
        msg = None
        entries = None
        if readout:
            all_entries = []
            for uri in uris:
                uri = uri.replace(" ", "%20")
                logging.debug("Running feedparser on %s", uri)
                feed_parsed = feedparser.parse(uri.replace("feed://", "http://"))
                logging.debug("Done parsing %s", uri)

                if not feed_parsed:
                    msg = T("Failed to retrieve RSS from %s: %s") % (uri, "?")
                    logging.info(msg)

                status = feed_parsed.get("status", 999)
                if status in (401, 402, 403):
                    msg = T("Do not have valid authentication for feed %s") % uri
                    logging.info(msg)

                if 500 <= status <= 599:
                    msg = T("Server side error (server code %s); could not get %s on %s") % (status, feed, uri)
                    logging.info(msg)

                entries = feed_parsed.get("entries")
                if "bozo_exception" in feed_parsed and not entries:
                    msg = str(feed_parsed["bozo_exception"])
                    if "CERTIFICATE_VERIFY_FAILED" in msg:
                        msg = T("Server %s uses an untrusted HTTPS certificate") % get_base_url(uri)
                        msg += " - https://sabnzbd.org/certificate-errors"
                        logging.error(msg)
                    elif "href" in feed_parsed and feed_parsed["href"] != uri and "login" in feed_parsed["href"]:
                        # Redirect to login page!
                        msg = T("Do not have valid authentication for feed %s") % uri
                    else:
                        msg = T("Failed to retrieve RSS from %s: %s") % (uri, msg)
                    logging.info(msg)

                if not entries and not msg:
                    msg = T("RSS Feed %s was empty") % uri
                    logging.info(msg)
                all_entries.extend(entries)
            entries = all_entries

        # In case of a new feed
        if feed not in self.jobs:
            self.jobs[feed] = {}
        jobs = self.jobs[feed]
        # Error in readout or no new readout
        if readout:
            if not entries:
                return msg
        else:
            entries = jobs
        # Filter out valid new links
        for entry in entries:
            if self.shutdown:
                return

            if readout:
                try:
                    link, infourl, category, size, age, season, episode = _get_link(entry)
                except (AttributeError, IndexError):
                    logging.info(T("Incompatible feed") + " " + uri)
                    logging.info("Traceback: ", exc_info=True)
                    return T("Incompatible feed")
                title = entry.title

                # If there's multiple feeds, remove the duplicates based on title and size
                if len(uris) > 1:
                    skip_job = False
                    for job_link, job in jobs.items():
                        # Allow 5% size deviation because indexers might have small differences for same release
                        if (
                            job.get("title") == title
                            and link != job_link
                            and (job.get("size") * 0.95) < size < (job.get("size") * 1.05)
                        ):
                            logging.info("Ignoring job %s from other feed", title)
                            skip_job = True
                            break
                    if skip_job:
                        continue
            else:
                link = entry
                infourl = jobs[link].get("infourl", "")
                category = jobs[link].get("orgcat", "")
                if category in ("", "*"):
                    category = None
                title = jobs[link].get("title", "")
                size = jobs[link].get("size", 0)
                age = jobs[link].get("age")
                season = jobs[link].get("season", 0)
                episode = jobs[link].get("episode", 0)

            if link:
                # Make sure spaces are quoted in the URL
                link = link.strip().replace(" ", "%20")

                newlinks.append(link)

                if link in jobs:
                    jobstat = jobs[link].get("status", " ")[0]
                else:
                    jobstat = "N"
                if jobstat in "NGB" or (jobstat == "X" and readout):
                    # Match this title against all filters
                    logging.debug("Trying title %s", title)
                    result = False
                    myCat = defCat
                    myPP = defPP
                    myScript = defScript
                    myPrio = defPrio
                    n = 0
                    if ("F" in reTypes or "S" in reTypes) and (not season or not episode):
                        season, episode = sabnzbd.newsunpack.analyse_show(title)[1:3]
                    # Match against all filters until a positive or negative match
logging.debug("Size %s", size)
for n in range(regcount):
if reEnabled[n]:
if category and reTypes[n] == "C":
found = re.search(regexes[n], category)
if not found:
logging.debug("Filter rejected on rule %d", n)
result = False
break
elif reTypes[n] == "<" and size and from_units(regexes[n]) < size:
# "Size at most" : too large
logging.debug("Filter rejected on rule %d", n)
result = False
break
elif reTypes[n] == ">" and size and from_units(regexes[n]) > size:
# "Size at least" : too small
logging.debug("Filter rejected on rule %d", n)
result = False
break
elif reTypes[n] == "F" and not ep_match(season, episode, regexes[n]):
# "Starting from SxxEyy", too early episode
logging.debug("Filter requirement match on rule %d", n)
result = False
break
elif (
reTypes[n] == "S"
and season
and episode
and ep_match(season, episode, regexes[n], title)
):
logging.debug("Filter matched on rule %d", n)
result = True
break
else:
if regexes[n]:
found = re.search(regexes[n], title)
else:
found = False
if reTypes[n] == "M" and not found:
logging.debug("Filter rejected on rule %d", n)
result = False
break
if found and reTypes[n] == "A":
logging.debug("Filter matched on rule %d", n)
result = True
break
if found and reTypes[n] == "R":
logging.debug("Filter rejected on rule %d", n)
result = False
break
if len(reCats):
if not result and defCat:
# Apply Feed-category on non-matched items
myCat = defCat
elif result and notdefault(reCats[n]):
# Use the matched info
myCat = reCats[n]
elif category and not defCat:
# No result and no Feed-category
myCat = cat_convert(category)
if myCat:
myCat, catPP, catScript, catPrio = cat_to_opts(myCat)
else:
myCat = catPP = catScript = catPrio = None
if notdefault(rePPs[n]):
myPP = rePPs[n]
elif not (reCats[n] or category):
myPP = catPP
if notdefault(reScripts[n]):
myScript = reScripts[n]
elif not (notdefault(reCats[n]) or category):
myScript = catScript
if rePrios[n] not in (str(DEFAULT_PRIORITY), ""):
myPrio = rePrios[n]
elif not ((rePrios[n] != str(DEFAULT_PRIORITY)) or category):
myPrio = catPrio
if cfg.no_dupes() and self.check_duplicate(title):
if cfg.no_dupes() == 1:
# Dupe-detection: Discard
logging.info("Ignoring duplicate job %s", title)
continue
elif cfg.no_dupes() == 3:
# Dupe-detection: Fail
# We accept it so the Queue can send it to the History
logging.info("Found duplicate job %s", title)
else:
# Dupe-detection: Pause
myPrio = DUP_PRIORITY
act = download and not first
if link in jobs:
act = act and not jobs[link].get("status", "").endswith("*")
act = act or force
star = first or jobs[link].get("status", "").endswith("*")
else:
star = first
if result:
_HandleLink(
jobs,
link,
infourl,
title,
size,
age,
season,
episode,
"G",
category,
myCat,
myPP,
myScript,
act,
star,
priority=myPrio,
rule=n,
)
if act:
new_downloads.append(title)
else:
_HandleLink(
jobs,
link,
infourl,
title,
size,
age,
season,
episode,
"B",
category,
myCat,
myPP,
myScript,
False,
star,
priority=myPrio,
rule=n,
)
# Send email if wanted and not "forced"
if new_downloads and cfg.email_rss() and not force:
emailer.rss_mail(feed, new_downloads)
remove_obsolete(jobs, newlinks)
return msg
    def run(self):
        """ Run all the URI's and filters """
        if not sabnzbd.PAUSED_ALL:
            active = False
            if self.next_run < time.time():
                self.next_run = time.time() + cfg.rss_rate.get() * 60
            feeds = config.get_rss()
            try:
                for feed in feeds:
                    if feeds[feed].enable.get():
                        logging.info('Starting scheduled RSS read-out for "%s"', feed)
                        active = True
                        self.run_feed(feed, download=True, ignoreFirst=True)
                        # Wait 15 seconds, else sites may get irritated
                        for unused in range(15):
                            if self.shutdown:
                                return
                            else:
                                time.sleep(1.0)
            except (KeyError, RuntimeError):
                # Feed must have been deleted
                logging.info("RSS read-out crashed, feed must have been deleted or edited")
                logging.debug("Traceback: ", exc_info=True)

            if active:
                self.save()
                logging.info("Finished scheduled RSS read-outs")
    @synchronized(LOCK)
    def show_result(self, feed):
        if feed in self.jobs:
            try:
                return self.jobs[feed]
            except:
                return {}
        else:
            return {}

    @synchronized(LOCK)
    def save(self):
        sabnzbd.save_admin(self.jobs, RSS_FILE_NAME)

    @synchronized(LOCK)
    def delete(self, feed):
        if feed in self.jobs:
            del self.jobs[feed]

    @synchronized(LOCK)
    def flag_downloaded(self, feed, fid):
        if feed in self.jobs:
            lst = self.jobs[feed]
            for link in lst:
                if lst[link].get("url", "") == fid:
                    lst[link]["status"] = "D"
                    lst[link]["time_downloaded"] = time.localtime()

    @synchronized(LOCK)
    def lookup_url(self, feed, url):
        if url and feed in self.jobs:
            lst = self.jobs[feed]
            for link in lst:
                if lst[link].get("url") == url:
                    return lst[link]
        return None

    @synchronized(LOCK)
    def clear_feed(self, feed):
        # Remove any previous references to this feed name, and start fresh
        if feed in self.jobs:
            del self.jobs[feed]

    @synchronized(LOCK)
    def clear_downloaded(self, feed):
        # Mark downloaded jobs, so that they won't be displayed any more.
        if feed in self.jobs:
            for item in self.jobs[feed]:
                if self.jobs[feed][item]["status"] == "D":
                    self.jobs[feed][item]["status"] = "D-"

    def check_duplicate(self, title):
        """ Check if this title was in this or other feeds
            Return matching feed name
        """
        title = title.lower()
        for fd in self.jobs:
            for lk in self.jobs[fd]:
                item = self.jobs[fd][lk]
                if item.get("status", " ")[0] == "D" and item.get("title", "").lower() == title:
                    return fd
        return ""

def patch_feedparser():
    """ Apply options that work for SABnzbd
        Add additional parsing of attributes
    """
    feedparser.SANITIZE_HTML = 0
    feedparser.PARSE_MICROFORMATS = 0

    # Add our own namespace
    feedparser._FeedParserMixin.namespaces["http://www.newznab.com/DTD/2010/feeds/attributes/"] = "newznab"

    # Add parsers for the namespace
    def _start_newznab_attr(self, attrsD):
        context = self._getContext()
        # Add the dict
        if "newznab" not in context:
            context["newznab"] = {}
        # Don't crash when it fails
        try:
            # Add keys
            context["newznab"][attrsD["name"]] = attrsD["value"]
            # Try to get date-object
            if attrsD["name"] == "usenetdate":
                context["newznab"][attrsD["name"] + "_parsed"] = feedparser._parse_date(attrsD["value"])
        except KeyError:
            pass

    feedparser._FeedParserMixin._start_newznab_attr = _start_newznab_attr
    feedparser._FeedParserMixin._start_nZEDb_attr = _start_newznab_attr
    feedparser._FeedParserMixin._start_nzedb_attr = _start_newznab_attr
    feedparser._FeedParserMixin._start_nntmux_attr = _start_newznab_attr
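
# After patch_feedparser(), entries from a newznab-style feed expose the namespaced
# attributes as a plain dict (hypothetical values, shown for illustration only):
#   entry["newznab"] == {"season": "1", "episode": "2",
#                        "usenetdate": "Wed, 01 Jan 2020 12:00:00 +0000",
#                        "usenetdate_parsed": <time.struct_time for that date>}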

def _HandleLink(
    jobs,
    link,
    infourl,
    title,
    size,
    age,
    season,
    episode,
    flag,
    orgcat,
    cat,
    pp,
    script,
    download,
    star,
    priority=NORMAL_PRIORITY,
    rule=0,
):
    """ Process one link """
    if script == "":
        script = None
    if pp == "":
        pp = None

    jobs[link] = {}
    jobs[link]["title"] = title
    jobs[link]["url"] = link
    jobs[link]["infourl"] = infourl
    jobs[link]["cat"] = cat
    jobs[link]["pp"] = pp
    jobs[link]["script"] = script
    jobs[link]["prio"] = str(priority)
    jobs[link]["orgcat"] = orgcat
    jobs[link]["size"] = size
    jobs[link]["age"] = age
    jobs[link]["time"] = time.time()
    jobs[link]["rule"] = str(rule)
    jobs[link]["season"] = season
    jobs[link]["episode"] = episode

    if special_rss_site(link):
        nzbname = None
    else:
        nzbname = title

    if download:
        jobs[link]["status"] = "D"
        jobs[link]["time_downloaded"] = time.localtime()
        logging.info("Adding %s (%s) to queue", link, title)
        sabnzbd.add_url(link, pp=pp, script=script, cat=cat, priority=priority, nzbname=nzbname)
    else:
        if star:
            jobs[link]["status"] = flag + "*"
        else:
            jobs[link]["status"] = flag

def _get_link(entry):
    """ Retrieve the post link from this entry
        Returns (link, infourl, category, size, age, season, episode)
    """
    size = 0
    age = datetime.datetime.now()

    # Try standard link and enclosures first
    link = entry.link
    if not link:
        link = entry.links[0].href
    if "enclosures" in entry:
        try:
            link = entry.enclosures[0]["href"]
            size = int(entry.enclosures[0]["length"])
        except:
            pass

    # GUID usually has URL to result on page
    infourl = None
    if entry.get("id") and entry.id != link and entry.id.startswith("http"):
        infourl = entry.id

    if size == 0:
        # Try to find size in Description
        try:
            desc = entry.description.replace("\n", " ").replace("&nbsp;", " ")
            m = _RE_SIZE1.search(desc) or _RE_SIZE2.search(desc)
            if m:
                size = from_units(m.group(1))
        except:
            pass

    # Try newznab attribute first, this is the correct one
    try:
        # Convert it to format that calc_age understands
        age = datetime.datetime(*entry["newznab"]["usenetdate_parsed"][:6])
    except:
        # Date from feed (usually lags behind)
        try:
            # Convert it to format that calc_age understands
            age = datetime.datetime(*entry.published_parsed[:6])
        except:
            pass
    finally:
        # We need to convert it to local timezone, feedparser always returns UTC
        age = age - datetime.timedelta(seconds=time.timezone)

    # Maybe the newznab also provided SxxExx info
    try:
        season = re.findall(r"\d+", entry["newznab"]["season"])[0]
        episode = re.findall(r"\d+", entry["newznab"]["episode"])[0]
    except (KeyError, IndexError):
        season = episode = 0

    if link and "http" in link.lower():
        try:
            category = entry.cattext
        except AttributeError:
            try:
                category = entry.category
            except AttributeError:
                try:  # nzb.su
                    category = entry.tags[0]["term"]
                except (AttributeError, KeyError):
                    try:
                        category = entry.description
                    except AttributeError:
                        category = ""

        return link, infourl, category, size, age, season, episode
    else:
        logging.warning(T("Empty RSS entry found (%s)"), link)
        return None, None, "", 0, None, 0, 0

def special_rss_site(url):
    """ Return True if url describes an RSS site with odd titles """
    return cfg.rss_filenames() or match_str(url, cfg.rss_odd_titles())

def ep_match(season, episode, expr, title=None):
    """ Return True if season, episode is at or above expected
        Optionally `title` can be matched
    """
    m = _RE_SP.search(expr)
    if m:
        # Make sure they are all integers for comparison
        req_season = int(m.group(1))
        req_episode = int(m.group(2))
        season = int_conv(season)
        episode = int_conv(episode)
        if season > req_season or (season == req_season and episode >= req_episode):
            if title:
                show = expr[: m.start()].replace(".", " ").replace("_", " ").strip()
                show = show.replace(" ", "[._ ]+")
                return bool(re.search(show, title, re.I))
            else:
                return True
        else:
            return False
    else:
        return True
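
# Illustrative calls (not part of the original file):
#   ep_match(2, 5, "Show Name S02E03")                      -> True  (S02E05 is at/after S02E03)
#   ep_match(1, 9, "Show Name S02E03")                      -> False (season 1 is before season 2)
#   ep_match(2, 5, "Show.Name.S02E03", "Show.Name.S02E05")  -> True  (title matches the show part)
#   ep_match(2, 5, "No episode marker")                     -> True  (expr without SxxEyy always passes)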