diff --git a/libs/guessit/__init__.py b/libs/guessit/__init__.py index 386aa7f..ce14024 100755 --- a/libs/guessit/__init__.py +++ b/libs/guessit/__init__.py @@ -20,7 +20,7 @@ from __future__ import unicode_literals -__version__ = '0.6-dev' +__version__ = '0.7-dev' __all__ = ['Guess', 'Language', 'guess_file_info', 'guess_video_info', 'guess_movie_info', 'guess_episode_info'] @@ -91,7 +91,28 @@ log.addHandler(h) def _guess_filename(filename, filetype): + def find_nodes(tree, props): + """Yields all nodes containing any of the given props.""" + if isinstance(props, base_text_type): + props = [props] + for node in tree.nodes(): + if any(prop in node.guess for prop in props): + yield node + + def warning(title): + log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string())) + return m + mtree = IterativeMatcher(filename, filetype=filetype) + + # if there are multiple possible years found, we assume the first one is + # part of the title, reparse the tree taking this into account + years = set(n.value for n in find_nodes(mtree.match_tree, 'year')) + if len(years) >= 2: + mtree = IterativeMatcher(filename, filetype=filetype, + opts=['skip_first_year']) + + m = mtree.matched() if 'language' not in m and 'subtitleLanguage' not in m: @@ -102,20 +123,10 @@ def _guess_filename(filename, filetype): opts=['nolanguage', 'nocountry']) m2 = mtree2.matched() - def find_nodes(tree, props): - """Yields all nodes containing any of the given props.""" - if isinstance(props, base_text_type): - props = [props] - for node in tree.nodes(): - if any(prop in node.guess for prop in props): - yield node - - def warning(title): - log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string())) + if m.get('title') is None: return m - if m.get('title') != m2.get('title'): title = next(find_nodes(mtree.match_tree, 'title')) title2 = next(find_nodes(mtree2.match_tree, 'title')) diff --git a/libs/guessit/fileutils.py b/libs/guessit/fileutils.py index 45f07e8..dc077e6 100755 --- a/libs/guessit/fileutils.py +++ b/libs/guessit/fileutils.py @@ -77,12 +77,12 @@ def file_in_same_dir(ref_file, desired_file): def load_file_in_same_dir(ref_file, filename): """Load a given file. Works even when the file is contained inside a zip.""" - path = split_path(ref_file)[:-1] + [str(filename)] + path = split_path(ref_file)[:-1] + [filename] for i, p in enumerate(path): - if p[-4:] == '.zip': + if p.endswith('.zip'): zfilename = os.path.join(*path[:i + 1]) zfile = zipfile.ZipFile(zfilename) return zfile.read('/'.join(path[i + 1:])) - return u(io.open(os.path.join(*path), encoding = 'utf-8').read()) + return u(io.open(os.path.join(*path), encoding='utf-8').read()) diff --git a/libs/guessit/guess.py b/libs/guessit/guess.py index 62385e8..33d3651 100755 --- a/libs/guessit/guess.py +++ b/libs/guessit/guess.py @@ -295,7 +295,7 @@ def merge_all(guesses, append=None): # then merge the remaining ones dups = set(result) & set(g) if dups: - log.warning('duplicate properties %s in merged result...' % dups) + log.warning('duplicate properties %s in merged result...' % [ (result[p], g[p]) for p in dups] ) result.update_highest_confidence(g) diff --git a/libs/guessit/language.py b/libs/guessit/language.py index 3b3a86a..2714c6e 100755 --- a/libs/guessit/language.py +++ b/libs/guessit/language.py @@ -326,7 +326,7 @@ def search_language(string, lang_filter=None): 'la', 'el', 'del', 'por', 'mar', # other 'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii', - 'vi', 'ben', 'da' + 'vi', 'ben', 'da', 'lt' ]) sep = r'[](){} \._-+' diff --git a/libs/guessit/matcher.py b/libs/guessit/matcher.py index cc77b81..4337819 100755 --- a/libs/guessit/matcher.py +++ b/libs/guessit/matcher.py @@ -128,12 +128,14 @@ class IterativeMatcher(object): apply_transfo(name) # more guessers for both movies and episodes - for name in ['guess_bonus_features', 'guess_year']: - apply_transfo(name) + apply_transfo('guess_bonus_features') + apply_transfo('guess_year', skip_first_year=('skip_first_year' in opts)) if 'nocountry' not in opts: apply_transfo('guess_country') + apply_transfo('guess_idnumber') + # split into '-' separated subgroups (with required separator chars # around the dash) diff --git a/libs/guessit/matchtree.py b/libs/guessit/matchtree.py index 2853c3a..0725e83 100755 --- a/libs/guessit/matchtree.py +++ b/libs/guessit/matchtree.py @@ -275,7 +275,7 @@ class MatchTree(BaseMatchTree): for string_part in ('title', 'series', 'container', 'format', 'releaseGroup', 'website', 'audioCodec', 'videoCodec', 'screenSize', 'episodeFormat', - 'audioChannels'): + 'audioChannels', 'idNumber'): merge_similar_guesses(parts, string_part, choose_string) # 2- merge the rest, potentially discarding information not properly diff --git a/libs/guessit/patterns.py b/libs/guessit/patterns.py index a8a0607..ed3982b 100755 --- a/libs/guessit/patterns.py +++ b/libs/guessit/patterns.py @@ -43,13 +43,13 @@ episode_rexps = [ # ... Season 2 ... (r'saison (?P[0-9]+)', 1.0, (0, 0)), # ... s02e13 ... - (r'[Ss](?P[0-9]{1,2}).?(?P(?:[Ee-][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)), + (r'[Ss](?P[0-9]{1,3})[^0-9]?(?P(?:-?[eE-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)), - # ... s03-x02 ... - (r'[Ss](?P[0-9]{1,2}).?(?P(?:[Xx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)), + # ... s03-x02 ... # FIXME: redundant? remove it? + #(r'[Ss](?P[0-9]{1,3})[^0-9]?(?P(?:-?[xX-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)), # ... 2x13 ... - (r'[^0-9](?P[0-9]{1,2}).?(?P(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)), + (r'[^0-9](?P[0-9]{1,2})[^0-9]?(?P(?:-?[xX][0-9]{1,3})+)[^0-9]', 1.0, (1, -1)), # ... s02 ... #(sep + r's(?P[0-9]{1,2})' + sep, 0.6, (1, -1)), @@ -122,20 +122,25 @@ prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ], 'VHS': [ 'VHS' ], 'WEB-DL': [ 'WEB-DL' ] }, - 'screenSize': { '480p': [ '480p?' ], - '720p': [ '720p?' ], - '1080p': [ '1080p?' ] }, + 'screenSize': { '480p': [ '480[pi]?' ], + '720p': [ '720[pi]?' ], + '1080p': [ '1080[pi]?' ] }, 'videoCodec': { 'XviD': [ 'Xvid' ], 'DivX': [ 'DVDivX', 'DivX' ], 'h264': [ '[hx]-264' ], - 'Rv10': [ 'Rv10' ] }, + 'Rv10': [ 'Rv10' ], + 'Mpeg2': [ 'Mpeg2' ] }, + + # has nothing to do here (or on filenames for that matter), but some + # releases use it and it helps to identify release groups, so we adapt + 'videoApi': { 'DXVA': [ 'DXVA' ] }, 'audioCodec': { 'AC3': [ 'AC3' ], 'DTS': [ 'DTS' ], 'AAC': [ 'He-AAC', 'AAC-He', 'AAC' ] }, - 'audioChannels': { '5.1': [ r'5\.1', 'DD5\.1', '5ch' ] }, + 'audioChannels': { '5.1': [ r'5\.1', 'DD5[\._ ]1', '5ch' ] }, 'episodeFormat': { 'Minisode': [ 'Minisodes?' ] } @@ -143,14 +148,21 @@ prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ], # prop_single dict of { property_name: [ canonical_form ] } prop_single = { 'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', r'\[XCT\]', 'iNT', 'PUKKA', - 'CHD', 'ViTE', 'TLF', 'DEiTY', 'FLAiTE', - 'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', 'FiNaLe', - 'UnSeeN', 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL', - 'SiNNERS', 'DiRTY', 'REWARD', 'ECI', 'KiNGS', 'CLUE', - 'CtrlHD', 'POD', 'WiKi', 'DIMENSION', 'IMMERSE', 'FQM', - '2HD', 'REPTiLE', 'CTU', 'HALCYON', 'EbP', 'SiTV', - 'SAiNTS', 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV', - 'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3' ], + 'CHD', 'ViTE', 'TLF', 'FLAiTE', + 'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', + 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL', + 'CtrlHD', 'POD', 'WiKi','IMMERSE', 'FQM', + '2HD', 'CTU', 'HALCYON', 'EbP', 'SiTV', + 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV', + 'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3', + 'TrollHD', 'ECI' + ], + + # potentially confusing release group names (they are words) + 'weakReleaseGroup': [ 'DEiTY', 'FiNaLe', 'UnSeeN', 'KiNGS', 'CLUE', 'DIMENSION', + 'SAiNTS', 'ARROW', 'EuReKA', 'SiNNERS', 'DiRTY', 'REWARD', + 'REPTiLE', + ], 'other': [ 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'Audiofixed', 'R5', 'complete', 'classic', # not so sure about these ones, could appear in a title @@ -179,6 +191,10 @@ properties_rexps.update(dict((type, dict((canonical_form, [ _to_rexp(canonical_f def find_properties(string): result = [] for property_name, props in properties_rexps.items(): + # FIXME: this should be done in a more flexible way... + if property_name in ['weakReleaseGroup']: + continue + for canonical_form, rexps in props.items(): for value_rexp in rexps: match = value_rexp.search(string) diff --git a/libs/guessit/transfo/guess_episodes_rexps.py b/libs/guessit/transfo/guess_episodes_rexps.py index 4ebfb54..29562be 100755 --- a/libs/guessit/transfo/guess_episodes_rexps.py +++ b/libs/guessit/transfo/guess_episodes_rexps.py @@ -28,7 +28,13 @@ import logging log = logging.getLogger(__name__) def number_list(s): - return list(re.sub('[^0-9]+', ' ', s).split()) + l = [ int(n) for n in re.sub('[^0-9]+', ' ', s).split() ] + + if len(l) == 2: + # it is an episode interval, return all numbers in between + return range(l[0], l[1]+1) + + return l def guess_episodes_rexps(string): for rexp, confidence, span_adjust in episode_rexps: @@ -38,23 +44,23 @@ def guess_episodes_rexps(string): span = (match.start() + span_adjust[0], match.end() + span_adjust[1]) - # episodes which have a season > 25 are most likely errors + # episodes which have a season > 30 are most likely errors # (Simpsons is at 24!) - if int(guess.get('season', 0)) > 25: + if int(guess.get('season', 0)) > 30: continue # decide whether we have only a single episode number or an # episode list if guess.get('episodeNumber'): eplist = number_list(guess['episodeNumber']) - guess.set('episodeNumber', int(eplist[0]), confidence=confidence) + guess.set('episodeNumber', eplist[0], confidence=confidence) if len(eplist) > 1: - guess.set('episodeList', list(map(int, eplist)), confidence=confidence) + guess.set('episodeList', eplist, confidence=confidence) if guess.get('bonusNumber'): eplist = number_list(guess['bonusNumber']) - guess.set('bonusNumber', int(eplist[0]), confidence=confidence) + guess.set('bonusNumber', eplist[0], confidence=confidence) return guess, span diff --git a/libs/guessit/transfo/guess_idnumber.py b/libs/guessit/transfo/guess_idnumber.py new file mode 100755 index 0000000..0e15af5 --- /dev/null +++ b/libs/guessit/transfo/guess_idnumber.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# GuessIt - A library for guessing information from filenames +# Copyright (c) 2013 Nicolas Wack +# +# GuessIt is free software; you can redistribute it and/or modify it under +# the terms of the Lesser GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# GuessIt is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# Lesser GNU General Public License for more details. +# +# You should have received a copy of the Lesser GNU General Public License +# along with this program. If not, see . +# + +from __future__ import unicode_literals +from guessit.transfo import SingleNodeGuesser +from guessit.patterns import find_properties +import re +import logging + +log = logging.getLogger(__name__) + + +def guess_properties(string): + try: + prop, value, pos, end = find_properties(string)[0] + return { prop: value }, (pos, end) + except IndexError: + return None, None + +_idnum = re.compile(r'(?P[a-zA-Z0-9-]{10,})') # 1.0, (0, 0)) + +def guess_idnumber(string): + match = _idnum.search(string) + if match is not None: + result = match.groupdict() + switch_count = 0 + DIGIT = 0 + LETTER = 1 + OTHER = 2 + last = LETTER + for c in result['idNumber']: + if c in '0123456789': + ci = DIGIT + elif c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ': + ci = LETTER + else: + ci = OTHER + + if ci != last: + switch_count += 1 + + last = ci + + switch_ratio = float(switch_count) / len(result['idNumber']) + + # only return the result as probable if we alternate often between + # char type (more likely for hash values than for common words) + if switch_ratio > 0.4: + return result, match.span() + + return None, None + +def process(mtree): + SingleNodeGuesser(guess_idnumber, 0.4, log).process(mtree) diff --git a/libs/guessit/transfo/guess_release_group.py b/libs/guessit/transfo/guess_release_group.py index 2ff237d..b72c736 100755 --- a/libs/guessit/transfo/guess_release_group.py +++ b/libs/guessit/transfo/guess_release_group.py @@ -31,16 +31,22 @@ def get_patterns(property_name): CODECS = get_patterns('videoCodec') FORMATS = get_patterns('format') +VAPIS = get_patterns('videoApi') -GROUP_NAMES = [ r'(?P' + codec + r')-?(?P.*?)[ \.]' +# RG names following a codec or format, with a potential space or dash inside the name +GROUP_NAMES = [ r'(?P' + codec + r')[ \.-](?P.+?([- \.].*?)??)[ \.]' for codec in CODECS ] -GROUP_NAMES += [ r'(?P' + fmt + r')-?(?P.*?)[ \.]' +GROUP_NAMES += [ r'(?P' + fmt + r')[ \.-](?P.+?([- \.].*?)??)[ \.]' for fmt in FORMATS ] +GROUP_NAMES += [ r'(?P' + api + r')[ \.-](?P.+?([- \.].*?)??)[ \.]' + for api in VAPIS ] GROUP_NAMES2 = [ r'\.(?P' + codec + r')-(?P.*?)(-(.*?))?[ \.]' for codec in CODECS ] -GROUP_NAMES2 += [ r'\.(?P' + fmt + r')-(?P.*?)(-(.*?))?[ \.]' +GROUP_NAMES2 += [ r'\.(?P' + fmt + r')-(?P.*?)(-(.*?))?[ \.]' for fmt in FORMATS ] +GROUP_NAMES2 += [ r'\.(?P' + vapi + r')-(?P.*?)(-(.*?))?[ \.]' + for vapi in VAPIS ] GROUP_NAMES = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES ] GROUP_NAMES2 = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES2 ] @@ -54,12 +60,17 @@ def guess_release_group(string): # first try to see whether we have both a known codec and a known release group for rexp in GROUP_NAMES: match = rexp.search(string) - if match: + while match: metadata = match.groupdict() - release_group = compute_canonical_form('releaseGroup', metadata['releaseGroup']) + # make sure this is an actual release group we caught + release_group = (compute_canonical_form('releaseGroup', metadata['releaseGroup']) or + compute_canonical_form('weakReleaseGroup', metadata['releaseGroup'])) if release_group: return adjust_metadata(metadata), (match.start(1), match.end(2)) + # we didn't find anything conclusive, keep searching + match = rexp.search(string, match.span()[0]+1) + # pick anything as releaseGroup as long as we have a codec in front # this doesn't include a potential dash ('-') ending the release group # eg: [...].X264-HiS@SiLUHD-English.[...] diff --git a/libs/guessit/transfo/guess_year.py b/libs/guessit/transfo/guess_year.py index 4bc9b86..c193af7 100755 --- a/libs/guessit/transfo/guess_year.py +++ b/libs/guessit/transfo/guess_year.py @@ -33,6 +33,18 @@ def guess_year(string): else: return None, None +def guess_year_skip_first(string): + year, span = search_year(string) + if year: + year2, span2 = guess_year(string[span[1]:]) + if year2: + return year2, (span2[0]+span[1], span2[1]+span[1]) + + return None, None -def process(mtree): - SingleNodeGuesser(guess_year, 1.0, log).process(mtree) + +def process(mtree, skip_first_year=False): + if skip_first_year: + SingleNodeGuesser(guess_year_skip_first, 1.0, log).process(mtree) + else: + SingleNodeGuesser(guess_year, 1.0, log).process(mtree)