SickGear/lib/guessit/__init__.py

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

from __future__ import unicode_literals

# Package version string.
__version__ = '0.7.dev0'
# Public names exported by ``from guessit import *``. Because of
# ``unicode_literals`` these are unicode strings on Python 2; the Python 2
# compatibility branch below converts them back to native strings.
__all__ = ['Guess', 'Language',
           'guess_file_info', 'guess_video_info',
           'guess_movie_info', 'guess_episode_info']


# Do python3 detection before importing any other module, to be sure that
# it will then always be available
# with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/
import sys
if sys.version_info[0] >= 3:
    # Python 3: text is ``str`` everywhere, so most helpers are pass-throughs.
    PY3 = True
    unicode_text_type = str
    native_text_type = str
    base_text_type = str

    def u(x):
        """Return ``x`` as text, decoding it as UTF-8 when it is ``bytes``."""
        # str(b'abc') would produce the repr "b'abc'" instead of the text, so
        # byte strings are decoded exactly like in the Python 2 branch.
        if isinstance(x, bytes):
            return x.decode('utf-8')
        return str(x)

    def s(x):
        """Return ``x`` unchanged; native strings already are text here."""
        return x

    class UnicodeMixin(object):
        """Mixin whose ``__str__`` delegates to the subclass' ``__unicode__``."""
        __str__ = lambda x: x.__unicode__()

    import binascii

    def to_hex(x):
        """Return the hexadecimal digits of the bytes ``x`` as text."""
        return binascii.hexlify(x).decode('utf-8')

else:
    # Python 2: ``str`` is a byte string and ``unicode`` is the text type.
    PY3 = False
    # fix imports for python2: ``__all__`` must hold native strings. ``map``
    # is used so no loop variable leaks into the module namespace.
    __all__ = list(map(str, __all__))
    unicode_text_type = unicode
    native_text_type = str
    base_text_type = basestring

    def u(x):
        """Return ``x`` as ``unicode``, decoding byte strings as UTF-8."""
        if isinstance(x, str):
            return x.decode('utf-8')
        return unicode(x)

    def s(x):
        """Return ``x`` with every ``unicode`` inside encoded to UTF-8 bytes.

        Lists, tuples and dicts are converted recursively; any other object
        is returned as is.
        """
        if isinstance(x, unicode):
            return x.encode('utf-8')
        if isinstance(x, list):
            return [s(y) for y in x]
        if isinstance(x, tuple):
            return tuple(s(y) for y in x)
        if isinstance(x, dict):
            return dict((s(key), s(value)) for key, value in x.items())
        return x

    class UnicodeMixin(object):
        """Mixin whose ``__str__`` is the UTF-8 encoding of ``unicode(self)``."""
        __str__ = lambda x: unicode(x).encode('utf-8')

    def to_hex(x):
        """Return the hexadecimal digits of the byte string ``x``."""
        return x.encode('hex')


from guessit.guess import Guess, merge_all
from guessit.language import Language
from guessit.matcher import IterativeMatcher
from guessit.textutils import clean_string
import logging

# Module-level logger, named after this package through ``__name__``.
log = logging.getLogger(__name__)


class NullHandler(logging.Handler):
    """Logging handler that discards every record handed to it."""

    def emit(self, record):
        """Ignore ``record``; nothing is formatted or written."""
        return None

# let's be a nicely behaving library: register a handler that discards every
# record on this module's logger
h = NullHandler()
log.addHandler(h)


def _guess_filename(filename, filetype):
    """Guess properties from ``filename``, arbitrating title/language clashes.

    The filename is parsed with ``IterativeMatcher``. When that guess contains
    a language, the filename is parsed a second time with language and country
    detection disabled; if the two parses disagree on the title, a series of
    heuristics decides which of the two guesses is returned.

    :param filename: the file name to analyse
    :param filetype: file type, passed through to ``IterativeMatcher``
    :return: the guess of the first parse, or the guess of the language-less
        parse when the heuristics favour its title
    """
    def find_nodes(tree, props):
        """Yields all nodes containing any of the given props."""
        if isinstance(props, base_text_type):
            props = [props]
        for node in tree.nodes():
            if any(prop in node.guess for prop in props):
                yield node

    def warning(title):
        """Log ``title`` together with both guesses, then keep the first one."""
        log.warning('%s, guesses: %s - %s',
                    title, m.nice_string(), m2.nice_string())
        return m

    mtree = IterativeMatcher(filename, filetype=filetype)

    # if there are multiple possible years found, we assume the first one is
    # part of the title, reparse the tree taking this into account
    years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
    if len(years) >= 2:
        mtree = IterativeMatcher(filename, filetype=filetype,
                                 opts=['skip_first_year'])

    m = mtree.matched()

    if 'language' not in m and 'subtitleLanguage' not in m:
        return m

    # without a title there is nothing to arbitrate, so do not pay for a
    # second full parse of the filename
    if m.get('title') is None:
        return m

    # if we found some language, make sure we didn't cut a title or sth...
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

    if m.get('title') == m2.get('title'):
        return m

    # next() is given a default so that a parse without any title node (e.g.
    # the language-less parse found no title at all) does not leak a bare
    # StopIteration to the caller
    title = next(find_nodes(mtree.match_tree, 'title'), None)
    title2 = next(find_nodes(mtree2.match_tree, 'title'), None)
    if title is None or title2 is None:
        # there is no competing title to compare with: keep the first guess
        return m

    langs = list(find_nodes(mtree.match_tree, ['language', 'subtitleLanguage']))
    if not langs:
        return warning('A weird error happened with language detection')

    # find the language that is likely more relevant
    for lng in langs:
        if lng.value in title2.value:
            # if the language was detected as part of a potential title,
            # look at this one in particular
            lang = lng
            break
    else:
        # pick the first one if we don't have a better choice
        lang = langs[0]

    # language code are rarely part of a title, and those
    # should be handled by the Language exceptions anyway
    if len(lang.value) <= 3:
        return m

    # if filetype is subtitle and the language appears last, just before
    # the extension, then it is likely a subtitle language
    parts = clean_string(title.root.value).split()
    try:
        if (m['type'] in ['moviesubtitle', 'episodesubtitle'] and
                parts.index(lang.value) == len(parts) - 2):
            return m
    except ValueError:
        # the language text is not one of the cleaned filename words
        pass

    # if the language was in the middle of the other potential title,
    # keep the other title (eg: The Italian Job), except if it is at the
    # very beginning, in which case we consider it an error
    if m2['title'].startswith(lang.value):
        return m
    elif lang.value in title2.value:
        return m2

    # if a node is in an explicit group, then the correct title is probably
    # the other one
    if title.root.node_at(title.node_idx[:2]).is_explicit():
        return m2
    elif title2.root.node_at(title2.node_idx[:2]).is_explicit():
        return m

    return warning('Not sure of the title because of the language position')


def guess_file_info(filename, filetype, info=None):
    """info can contain the names of the various plugins, such as 'filename' to
    detect filename info, or 'hash_md5' to get the md5 hash of the file.

    >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1'])
    {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'}

    :param filename: the file to analyse; it is converted to text with ``u()``
    :param filetype: file type handed to the filename guesser
    :param info: a single info type or a list of them; defaults to
        ``['filename']``. Supported values are ``'filename'``, ``'hash_mpc'``,
        ``'hash_ed2k'`` and ``'hash_<name>'`` where ``<name>`` is a hash
        constructor of the ``hashlib`` module.
    :return: the merge (``merge_all``) of every guess that could be computed.
        Hashes that cannot be computed are logged as warnings and left out.
    """
    result = []
    hashers = []

    # Force unicode as soon as possible
    filename = u(filename)

    if info is None:
        info = ['filename']

    if isinstance(info, base_text_type):
        info = [info]

    for infotype in info:
        if infotype == 'filename':
            result.append(_guess_filename(filename, filetype))

        elif infotype == 'hash_mpc':
            from guessit.hash_mpc import hash_file
            try:
                result.append(Guess({'hash_mpc': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute MPC-style hash because: %s' % e)

        elif infotype == 'hash_ed2k':
            from guessit.hash_ed2k import hash_file
            try:
                result.append(Guess({'hash_ed2k': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute ed2k hash because: %s' % e)

        elif infotype.startswith('hash_'):
            import hashlib
            # a hash requested twice is computed only once: a second hasher
            # object for the same name would never be fed and would report
            # the digest of empty input
            if any(infotype == known for known, _ in hashers):
                continue
            hashname = infotype[5:]
            try:
                # TypeError covers hashlib attributes that exist but are not
                # zero-argument hash constructors (e.g. 'new')
                hasher = getattr(hashlib, hashname)()
            except (AttributeError, TypeError):
                log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname)
            else:
                hashers.append((infotype, hasher))

        else:
            log.warning('Invalid infotype: %s' % infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192

            with open(filename, 'rb') as f:
                chunk = f.read(blocksize)
                while chunk:
                    for _, hasher in hashers:
                        hasher.update(chunk)
                    chunk = f.read(blocksize)

            for infotype, hasher in hashers:
                result.append(Guess({infotype: hasher.hexdigest()},
                                    confidence=1.0))
        except Exception as e:
            # best effort: an unreadable file only costs the hash properties
            log.warning('Could not compute hash because: %s' % e)

    result = merge_all(result)

    # last minute adjustments

    # if country is in the guessed properties, make it part of the filename
    if 'series' in result and 'country' in result:
        result['series'] += ' (%s)' % result['country'].alpha2.upper()

    return result


def guess_video_info(filename, info=None):
    """Run ``guess_file_info`` on ``filename`` with the 'autodetect' file type."""
    return guess_file_info(filename, filetype='autodetect', info=info)


def guess_movie_info(filename, info=None):
    """Run ``guess_file_info`` on ``filename`` with the 'movie' file type."""
    return guess_file_info(filename, filetype='movie', info=info)


def guess_episode_info(filename, info=None):
    """Run ``guess_file_info`` on ``filename`` with the 'episode' file type."""
    return guess_file_info(filename, filetype='episode', info=info)
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather its series data, allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also, this edition is based on the code we used in our XEM edition, so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Before using this with your existing database (sickbeard.db), please make a backup copy of it and delete any other database files such as cache.db and failed.db if present. We HIGHLY recommend starting out with no database files at all to make this a fresh start, but the choice is at your own risk! Enjoy! 11 years ago			`#!/usr/bin/env python2`
			`# -- coding: utf-8 --`
			`#`
			`# GuessIt - A library for guessing information from filenames`
			`# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>`
			`#`
			`# GuessIt is free software; you can redistribute it and/or modify it under`
			`# the terms of the Lesser GNU General Public License as published by`
			`# the Free Software Foundation; either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# GuessIt is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# Lesser GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the Lesser GNU General Public License`
			`# along with this program. If not, see <http://www.gnu.org/licenses/>.`
			`#`

			`from __future__ import unicode_literals`

			`__version__ = '0.7.dev0'`
			`__all__ = ['Guess', 'Language',`
			`'guess_file_info', 'guess_video_info',`
			`'guess_movie_info', 'guess_episode_info']`


			`# Do python3 detection before importing any other module, to be sure that`
			`# it will then always be available`
			`# with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/`
			`import sys`
			`if sys.version_info[0] >= 3:`
			`PY3 = True`
			`unicode_text_type = str`
			`native_text_type = str`
			`base_text_type = str`
			`def u(x):`
			`return str(x)`
			`def s(x):`
			`return x`
			`class UnicodeMixin(object):`
			`__str__ = lambda x: x.__unicode__()`
			`import binascii`
			`def to_hex(x):`
			`return binascii.hexlify(x).decode('utf-8')`

			`else:`
			`PY3 = False`
			`__all__ = [ str(s) for s in __all__ ] # fix imports for python2`
			`unicode_text_type = unicode`
			`native_text_type = str`
			`base_text_type = basestring`
			`def u(x):`
			`if isinstance(x, str):`
			`return x.decode('utf-8')`
			`return unicode(x)`
			`def s(x):`
			`if isinstance(x, unicode):`
			`return x.encode('utf-8')`
			`if isinstance(x, list):`
			`return [ s(y) for y in x ]`
			`if isinstance(x, tuple):`
			`return tuple(s(y) for y in x)`
			`if isinstance(x, dict):`
			`return dict((s(key), s(value)) for key, value in x.items())`
			`return x`
			`class UnicodeMixin(object):`
			`__str__ = lambda x: unicode(x).encode('utf-8')`
			`def to_hex(x):`
			`return x.encode('hex')`


			`from guessit.guess import Guess, merge_all`
			`from guessit.language import Language`
			`from guessit.matcher import IterativeMatcher`
			`from guessit.textutils import clean_string`
			`import logging`

			`log = logging.getLogger(__name__)`



			`class NullHandler(logging.Handler):`
			`def emit(self, record):`
			`pass`

			`# let's be a nicely behaving library`
			`h = NullHandler()`
			`log.addHandler(h)`


			`def _guess_filename(filename, filetype):`
			`def find_nodes(tree, props):`
			`"""Yields all nodes containing any of the given props."""`
			`if isinstance(props, base_text_type):`
			`props = [props]`
			`for node in tree.nodes():`
			`if any(prop in node.guess for prop in props):`
			`yield node`

			`def warning(title):`
			`log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))`
			`return m`

			`mtree = IterativeMatcher(filename, filetype=filetype)`

			`# if there are multiple possible years found, we assume the first one is`
			`# part of the title, reparse the tree taking this into account`
			`years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))`
			`if len(years) >= 2:`
			`mtree = IterativeMatcher(filename, filetype=filetype,`
			`opts=['skip_first_year'])`


			`m = mtree.matched()`

			`if 'language' not in m and 'subtitleLanguage' not in m:`
			`return m`

			`# if we found some language, make sure we didn't cut a title or sth...`
			`mtree2 = IterativeMatcher(filename, filetype=filetype,`
			`opts=['nolanguage', 'nocountry'])`
			`m2 = mtree2.matched()`


			`if m.get('title') is None:`
			`return m`

			`if m.get('title') != m2.get('title'):`
			`title = next(find_nodes(mtree.match_tree, 'title'))`
			`title2 = next(find_nodes(mtree2.match_tree, 'title'))`

			`langs = list(find_nodes(mtree.match_tree, ['language', 'subtitleLanguage']))`
			`if not langs:`
			`return warning('A weird error happened with language detection')`

			`# find the language that is likely more relevant`
			`for lng in langs:`
			`if lng.value in title2.value:`
			`# if the language was detected as part of a potential title,`
			`# look at this one in particular`
			`lang = lng`
			`break`
			`else:`
			`# pick the first one if we don't have a better choice`
			`lang = langs[0]`


			`# language code are rarely part of a title, and those`
			`# should be handled by the Language exceptions anyway`
			`if len(lang.value) <= 3:`
			`return m`


			`# if filetype is subtitle and the language appears last, just before`
			`# the extension, then it is likely a subtitle language`
			`parts = clean_string(title.root.value).split()`
Change core system to improve performance and facilitate multi TV info sources. Change migrate core objects TVShow and TVEpisode and everywhere that these objects affect. Add message to logs and disable ui backlog buttons when no media provider has active and/or scheduled searching enabled. Change views for py3 compat. Change set default runtime of 5 mins if none is given for layout Day by Day. Add OpenSubtitles authentication support to config/Subtitles/Subtitles Plugin. Add "Enforce media hash match" to config/Subtitles Plugin/Opensubtitles for accurate subs if enabled, but if disabled, search failures will fallback to use less reliable subtitle results. Add Apprise 0.8.0 (6aa52c3). Add hachoir_py3 3.0a6 (5b9e05a). Add sgmllib3k 1.0.0 Update soupsieve 1.9.1 (24859cc) to soupsieve_py2 1.9.5 (6a38398) Add soupsieve_py3 2.0.0.dev (69194a2). Add Tornado_py3 Web Server 6.0.3 (ff985fe). Add xmlrpclib_to 0.1.1 (c37db9e). Remove ancient Growl lib 0.1 Remove xmltodict library. Change requirements.txt for Cheetah3 to minimum 3.2.4 Change update sabToSickBeard. Change update autoProcessTV. Change remove Twitter notifier. Update NZBGet Process Media extension, SickGear-NG 1.7 → 2.4 Update Kodi addon 1.0.3 → 1.0.4 Update ADBA for py3. Update Beautiful Soup 4.8.0 (r526) to 4.8.1 (r531). Update Send2Trash 1.3.0 (a568370) to 1.5.0 (66afce7). Update soupsieve 1.9.1 (24859cc) to 1.9.5 (6a38398). Change use GNTP (Growl Notification Transport Protocol) from Apprise. Change add multi host support to Growl notifier. Fix Growl notifier when using empty password. Change update links for Growl notifications. Change deprecate config/Notifications/Growl password field as these are now stored with host setting. Fix prevent infinite memoryError from a particular jpg data structure. Change subliminal for py3. Change enzyme for py3. Change browser_ua for py3. Change feedparser for py3 (sgmllib is no longer available on py3 as standardlib so added ext lib) Fix Guessit. Fix parse_xml for py3. 
Fix name parser with multi eps for py3. Fix tvdb_api fixes for py3 (search show). Fix config/media process to only display "pattern is invalid" qtip on "Episode naming" tab if the associated field is actually visible. Also, if the field becomes hidden due to a setting change, hide any previously displayed qtip. Note for Javascript::getelementbyid (or $('tag[id="<name>"')) is required when an id is being searched in the dom due to ":" used in a show's id name. Change download anidb xml files to main cache folder and use adba lib folder as a last resort. Change create get anidb show groups as centralised helper func and consolidate dupe code. Change move anidb related functions to newly renamed anime.py (from blacklistandwhitelist.py). Change str encode hex no longer exists in py3, use codecs.encode(...) instead. Change fix b64decode on py3 returns bytestrings. Change use binary read when downloading log file via browser to prevent any encoding issues. Change add case insensitive ordering to anime black/whitelist. Fix anime groups list not excluding whitelisted stuff. Change add Windows utf8 fix ... see: ytdl-org/youtube-dl#820 Change if no qualities are wanted, exit manual search thread. Fix keepalive for py3 process media. Change add a once a month update of tvinfo show mappings to the daily updater. Change autocorrect ids of new shows by updating from -8 to 31 days of the airdate of episode one. Add next run time to Manage/Show Tasks/Daily show update. Change when fetching imdb data, if imdb id is an episode id then try to find and use real show id. Change delete diskcache db in imdbpie when value error (due to change in Python version). Change during startup, cleanup any _cleaner.pyc/o to prevent issues when switching python versions. Add .pyc cleaner if python version is switched. Change replace deprecated gettz_db_metadata() and gettz. Change rebrand "SickGear PostProcessing script" to "SickGear Process Media extension". 
Change improve setup guide to use the NZBGet version to minimise displayed text based on version. Change NZBGet versions prior to v17 now told to upgrade as those version are no longer supported - code has actually exit on start up for some time but docs were outdated. Change comment out code and unused option sg_base_path. Change supported Python version 2.7.9-2.7.18 inclusive expanded to 3.7.1-3.8.1 inclusive. Change pidfile creation under Linux 0o644. Make logger accept lists to output continuously using the log_lock instead of split up by other processes. Fix long path issues with Windows process media. 6 years ago			`try:`
			`if (m['type'] in ['moviesubtitle', 'episodesubtitle'] and`
			`parts.index(lang.value) == len(parts) - 2):`
			`return m`
			`except ValueError:`
			`pass`
Welcome to our SickBeard-TVRage Edition ... This version of SickBeard uses both TVDB and TVRage to search and gather its series data, allowing you to now have access to and download shows that you couldn't before because of being locked into only what TheTVDB had to offer. Also, this edition is based on the code we used in our XEM edition, so it does come with scene numbering support as well as all the other features our XEM edition has to offer. Before using this with your existing database (sickbeard.db), please make a backup copy of it and delete any other database files such as cache.db and failed.db if present. We HIGHLY recommend starting out with no database files at all to make this a fresh start, but the choice is at your own risk! Enjoy! 11 years ago
			`# if the language was in the middle of the other potential title,`
			`# keep the other title (eg: The Italian Job), except if it is at the`
			`# very beginning, in which case we consider it an error`
			`if m2['title'].startswith(lang.value):`
			`return m`
			`elif lang.value in title2.value:`
			`return m2`

			`# if a node is in an explicit group, then the correct title is probably`
			`# the other one`
			`if title.root.node_at(title.node_idx[:2]).is_explicit():`
			`return m2`
			`elif title2.root.node_at(title2.node_idx[:2]).is_explicit():`
			`return m`

			`return warning('Not sure of the title because of the language position')`


			`return m`


			`def guess_file_info(filename, filetype, info=None):`
			`"""info can contain the names of the various plugins, such as 'filename' to`
			`detect filename info, or 'hash_md5' to get the md5 hash of the file.`

			`>>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1'])`
			`{'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'}`
			`"""`
			`result = []`
			`hashers = []`

			`# Force unicode as soon as possible`
			`filename = u(filename)`

			`if info is None:`
			`info = ['filename']`

			`if isinstance(info, base_text_type):`
			`info = [info]`

			`for infotype in info:`
			`if infotype == 'filename':`
			`result.append(_guess_filename(filename, filetype))`

			`elif infotype == 'hash_mpc':`
			`from guessit.hash_mpc import hash_file`
			`try:`
			`result.append(Guess({'hash_mpc': hash_file(filename)},`
			`confidence=1.0))`
			`except Exception as e:`
			`log.warning('Could not compute MPC-style hash because: %s' % e)`

			`elif infotype == 'hash_ed2k':`
			`from guessit.hash_ed2k import hash_file`
			`try:`
			`result.append(Guess({'hash_ed2k': hash_file(filename)},`
			`confidence=1.0))`
			`except Exception as e:`
			`log.warning('Could not compute ed2k hash because: %s' % e)`

			`elif infotype.startswith('hash_'):`
			`import hashlib`
			`hashname = infotype[5:]`
			`try:`
			`hasher = getattr(hashlib, hashname)()`
			`hashers.append((infotype, hasher))`
			`except AttributeError:`
			`log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname)`

			`else:`
			`log.warning('Invalid infotype: %s' % infotype)`

			`# do all the hashes now, but on a single pass`
			`if hashers:`
			`try:`
			`blocksize = 8192`
			`hasherobjs = dict(hashers).values()`

			`with open(filename, 'rb') as f:`
			`chunk = f.read(blocksize)`
			`while chunk:`
			`for hasher in hasherobjs:`
			`hasher.update(chunk)`
			`chunk = f.read(blocksize)`

			`for infotype, hasher in hashers:`
			`result.append(Guess({infotype: hasher.hexdigest()},`
			`confidence=1.0))`
			`except Exception as e:`
			`log.warning('Could not compute hash because: %s' % e)`

			`result = merge_all(result)`

			`# last minute adjustments`

			`# if country is in the guessed properties, make it part of the filename`
			`if 'series' in result and 'country' in result:`
			`result['series'] += ' (%s)' % result['country'].alpha2.upper()`


			`return result`


			`def guess_video_info(filename, info=None):`
			`return guess_file_info(filename, 'autodetect', info)`


			`def guess_movie_info(filename, info=None):`
			`return guess_file_info(filename, 'movie', info)`


			`def guess_episode_info(filename, info=None):`
			`return guess_file_info(filename, 'episode', info)`