# -*- coding: utf-8 -*-
# Copyright 2012 Nicolas Wack <wackou@gmail.com>
#
# This file is part of subliminal.
#
# subliminal is free software; you can redistribute it and/or modify it under
# the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# subliminal is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with subliminal. If not, see <http://www.gnu.org/licenses/>.
from . import ServiceBase
from ..cache import cachedmethod
from ..language import language_set, Language
from ..subtitles import get_subtitle_path, ResultSubtitle
from ..utils import get_keywords
from ..videos import Episode
from bs4 import BeautifulSoup
import logging
import re

logger = logging.getLogger("subliminal")


def match(pattern, string):
    """Return the first capture group of pattern found in string, or None."""
    try:
        return re.search(pattern, string).group(1)
    except AttributeError:
        logger.debug(u'Could not match %r on %r' % (pattern, string))
        return None
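
# For example, match(r'tvshow-([0-9]+)\.html', 'tvshow-42.html') returns '42',
# while a non-matching pattern logs a debug message and returns None.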


class TvSubtitles(ServiceBase):
    server_url = 'http://www.tvsubtitles.net'
    site_url = 'http://www.tvsubtitles.net'
    api_based = False
    languages = language_set(['ar', 'bg', 'cs', 'da', 'de', 'el', 'en', 'es', 'fi', 'fr', 'hu',
                              'it', 'ja', 'ko', 'nl', 'pl', 'pt', 'ro', 'ru', 'sv', 'tr', 'uk',
                              'zh', 'pb'])
    #TODO: Find more exceptions
    language_map = {'gr': Language('gre'), 'cz': Language('cze'), 'ua': Language('ukr'),
                    'cn': Language('chi'), 'br': Language('pob')}
    videos = [Episode]
    require_video = False
    required_features = ['lxml']  # ['permissive']
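
    # Scraping flow implemented below: POST the series name to search.php to
    # get a show id, fetch tvshow-<id>-<season>.html to map episode numbers to
    # episode ids, fetch episode-<id>.html to list subtitle ids, then fetch
    # download-<id>.html to retrieve the subtitle zip.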

    @cachedmethod
    def get_likely_series_id(self, name):
        """Search the site for the series name and return the most likely show id."""
        r = self.session.post('%s/search.php' % self.server_url, data={'q': name})
        soup = BeautifulSoup(r.text, self.required_features[0])
        maindiv = soup.find('div', 'left')
        results = []
        for elem in maindiv.find_all('li'):
            sid = int(match(r'tvshow-([0-9]+)\.html', elem.a['href']))
            show_name = match(r'(.*) \(', elem.a.text)
            results.append((show_name, sid))
        if results:
            #TODO: pick up the best one in a smart way
            result = results[0]
            return result[1]

    @cachedmethod
    def get_episode_id(self, series_id, season, number):
        """Get the TvSubtitles id for the given episode. Raises KeyError if none
        could be found."""
        # download the page of the season, contains ids for all episodes
        episode_id = None
        r = self.session.get('%s/tvshow-%d-%d.html' % (self.server_url, series_id, season))
        soup = BeautifulSoup(r.text, self.required_features[0])
        table = soup.find('table', id='table5')
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                continue
            episode_number = match('x([0-9]+)', cells[0].text)
            if not episode_number:
                continue
            episode_number = int(episode_number)
            episode_id = int(match('episode-([0-9]+)', cells[1].a['href']))
            # we could just return the id of the queried episode, but as we
            # already downloaded the whole page we might as well fill in the
            # information for all the episodes of the season
            self.cache_for(self.get_episode_id, args=(series_id, season, episode_number), result=episode_id)
        # raises KeyError if not found
        return self.cached_value(self.get_episode_id, args=(series_id, season, number))
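
    # The loop above warm-fills the @cachedmethod cache for every episode of
    # the season, so the final cached_value lookup is a plain cache read and
    # raises KeyError when the requested episode never appeared on the page.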

    # Do not cache this method in order to always check for the most recent
    # subtitles
    def get_sub_ids(self, episode_id):
        subids = []
        r = self.session.get('%s/episode-%d.html' % (self.server_url, episode_id))
        epsoup = BeautifulSoup(r.text, self.required_features[0])
        for subdiv in epsoup.find_all('a'):
            if 'href' not in subdiv.attrs or not subdiv['href'].startswith('/subtitle'):
                continue
            subid = int(match('([0-9]+)', subdiv['href']))
            lang = self.get_language(match('flags/(.*).gif', subdiv.img['src']))
            result = {'subid': subid, 'language': lang}
            for p in subdiv.find_all('p'):
                if 'alt' in p.attrs and p['alt'] == 'rip':
                    result['rip'] = p.text.strip()
                if 'alt' in p.attrs and p['alt'] == 'release':
                    result['release'] = p.text.strip()
            subids.append(result)
        return subids
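
    # Each dict produced by get_sub_ids above has, per the parsing code, the
    # shape {'subid': <int>, 'language': <Language>}, plus 'rip' and 'release'
    # strings when the episode page provides them.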

    def list_checked(self, video, languages):
        return self.query(video.path or video.release, languages, get_keywords(video.guess),
                          video.series, video.season, video.episode)

    def query(self, filepath, languages, keywords, series, season, episode):
        logger.debug(u'Getting subtitles for %s season %d episode %d with languages %r' % (series, season, episode, languages))
        self.init_cache()
        sid = self.get_likely_series_id(series.lower())
        try:
            ep_id = self.get_episode_id(sid, season, episode)
        except (KeyError, TypeError):
            logger.debug(u'Could not find episode id for %s season %d episode %d' % (series, season, episode))
            return []
        subids = self.get_sub_ids(ep_id)
        # filter the subtitles with our queried languages
        subtitles = []
        for subid in subids:
            language = subid['language']
            if language not in languages:
                continue
            path = get_subtitle_path(filepath, language, self.config.multi)
            subtitle = ResultSubtitle(path, language, self.__class__.__name__.lower(),
                                      '%s/download-%d.html' % (self.server_url, subid['subid']),
                                      keywords=[subid['rip'], subid['release']])
            subtitles.append(subtitle)
        return subtitles

    def download(self, subtitle):
        self.download_zip_file(subtitle.link, subtitle.path)
        return subtitle


Service = TvSubtitles
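
# Minimal usage sketch, assuming subliminal's ServiceBase init()/terminate()
# lifecycle; the file path, series name, and keywords are hypothetical:
#
#     service = TvSubtitles()
#     service.init()
#     try:
#         found = service.query('/tv/Show.S01E01.mkv', language_set(['en']),
#                               ['hdtv'], 'Show', 1, 1)
#         if found:
#             service.download(found[0])
#     finally:
#         service.terminate()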