Browse Source

Update GuessIt

pull/2167/merge
Ruud 12 years ago
parent
commit
ad01a3da4d
  1. 35
      libs/guessit/__init__.py
  2. 6
      libs/guessit/fileutils.py
  3. 2
      libs/guessit/guess.py
  4. 2
      libs/guessit/language.py
  5. 6
      libs/guessit/matcher.py
  6. 2
      libs/guessit/matchtree.py
  7. 50
      libs/guessit/patterns.py
  8. 18
      libs/guessit/transfo/guess_episodes_rexps.py
  9. 71
      libs/guessit/transfo/guess_idnumber.py
  10. 21
      libs/guessit/transfo/guess_release_group.py
  11. 16
      libs/guessit/transfo/guess_year.py

35
libs/guessit/__init__.py

@ -20,7 +20,7 @@
from __future__ import unicode_literals
__version__ = '0.6-dev'
__version__ = '0.7-dev'
__all__ = ['Guess', 'Language',
'guess_file_info', 'guess_video_info',
'guess_movie_info', 'guess_episode_info']
@ -91,7 +91,28 @@ log.addHandler(h)
def _guess_filename(filename, filetype):
def find_nodes(tree, props):
"""Yields all nodes containing any of the given props."""
if isinstance(props, base_text_type):
props = [props]
for node in tree.nodes():
if any(prop in node.guess for prop in props):
yield node
def warning(title):
log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
return m
mtree = IterativeMatcher(filename, filetype=filetype)
# if there are multiple possible years found, we assume the first one is
# part of the title, reparse the tree taking this into account
years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
if len(years) >= 2:
mtree = IterativeMatcher(filename, filetype=filetype,
opts=['skip_first_year'])
m = mtree.matched()
if 'language' not in m and 'subtitleLanguage' not in m:
@ -102,20 +123,10 @@ def _guess_filename(filename, filetype):
opts=['nolanguage', 'nocountry'])
m2 = mtree2.matched()
def find_nodes(tree, props):
"""Yields all nodes containing any of the given props."""
if isinstance(props, base_text_type):
props = [props]
for node in tree.nodes():
if any(prop in node.guess for prop in props):
yield node
def warning(title):
log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
if m.get('title') is None:
return m
if m.get('title') != m2.get('title'):
title = next(find_nodes(mtree.match_tree, 'title'))
title2 = next(find_nodes(mtree2.match_tree, 'title'))

6
libs/guessit/fileutils.py

@ -77,12 +77,12 @@ def file_in_same_dir(ref_file, desired_file):
def load_file_in_same_dir(ref_file, filename):
    """Load a given file. Works even when the file is contained inside a zip.

    The file is looked up next to ``ref_file``: the last component of
    ``split_path(ref_file)`` is replaced by ``filename``.

    If any component of the resulting path ends with ``.zip``, that archive
    is opened and the remaining components, joined with ``/``, are read as a
    member of it; the result of ``ZipFile.read`` is returned as-is.
    Otherwise the file is read from disk as UTF-8 text and passed through
    ``u()``.

    Both the archive and the plain file are closed before returning.
    """
    path = split_path(ref_file)[:-1] + [filename]

    for i, p in enumerate(path):
        if p.endswith('.zip'):
            zfilename = os.path.join(*path[:i + 1])
            # the member name is built with '/' regardless of os.sep
            member = '/'.join(path[i + 1:])
            with zipfile.ZipFile(zfilename) as zfile:
                return zfile.read(member)

    with io.open(os.path.join(*path), encoding='utf-8') as f:
        return u(f.read())

2
libs/guessit/guess.py

@ -295,7 +295,7 @@ def merge_all(guesses, append=None):
# then merge the remaining ones
dups = set(result) & set(g)
if dups:
log.warning('duplicate properties %s in merged result...' % dups)
log.warning('duplicate properties %s in merged result...' % [ (result[p], g[p]) for p in dups] )
result.update_highest_confidence(g)

2
libs/guessit/language.py

@ -326,7 +326,7 @@ def search_language(string, lang_filter=None):
'la', 'el', 'del', 'por', 'mar',
# other
'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii',
'vi', 'ben', 'da'
'vi', 'ben', 'da', 'lt'
])
sep = r'[](){} \._-+'

6
libs/guessit/matcher.py

@ -128,12 +128,14 @@ class IterativeMatcher(object):
apply_transfo(name)
# more guessers for both movies and episodes
for name in ['guess_bonus_features', 'guess_year']:
apply_transfo(name)
apply_transfo('guess_bonus_features')
apply_transfo('guess_year', skip_first_year=('skip_first_year' in opts))
if 'nocountry' not in opts:
apply_transfo('guess_country')
apply_transfo('guess_idnumber')
# split into '-' separated subgroups (with required separator chars
# around the dash)

2
libs/guessit/matchtree.py

@ -275,7 +275,7 @@ class MatchTree(BaseMatchTree):
for string_part in ('title', 'series', 'container', 'format',
'releaseGroup', 'website', 'audioCodec',
'videoCodec', 'screenSize', 'episodeFormat',
'audioChannels'):
'audioChannels', 'idNumber'):
merge_similar_guesses(parts, string_part, choose_string)
# 2- merge the rest, potentially discarding information not properly

50
libs/guessit/patterns.py

@ -43,13 +43,13 @@ episode_rexps = [ # ... Season 2 ...
(r'saison (?P<season>[0-9]+)', 1.0, (0, 0)),
# ... s02e13 ...
(r'[Ss](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[Ee-][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
(r'[Ss](?P<season>[0-9]{1,3})[^0-9]?(?P<episodeNumber>(?:-?[eE-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)),
# ... s03-x02 ...
(r'[Ss](?P<season>[0-9]{1,2}).?(?P<bonusNumber>(?:[Xx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
# ... s03-x02 ... # FIXME: redundant? remove it?
#(r'[Ss](?P<season>[0-9]{1,3})[^0-9]?(?P<bonusNumber>(?:-?[xX-][0-9]{1,3})+)[^0-9]', 1.0, (0, -1)),
# ... 2x13 ...
(r'[^0-9](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)),
(r'[^0-9](?P<season>[0-9]{1,2})[^0-9]?(?P<episodeNumber>(?:-?[xX][0-9]{1,3})+)[^0-9]', 1.0, (1, -1)),
# ... s02 ...
#(sep + r's(?P<season>[0-9]{1,2})' + sep, 0.6, (1, -1)),
@ -122,20 +122,25 @@ prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ],
'VHS': [ 'VHS' ],
'WEB-DL': [ 'WEB-DL' ] },
'screenSize': { '480p': [ '480p?' ],
'720p': [ '720p?' ],
'1080p': [ '1080p?' ] },
'screenSize': { '480p': [ '480[pi]?' ],
'720p': [ '720[pi]?' ],
'1080p': [ '1080[pi]?' ] },
'videoCodec': { 'XviD': [ 'Xvid' ],
'DivX': [ 'DVDivX', 'DivX' ],
'h264': [ '[hx]-264' ],
'Rv10': [ 'Rv10' ] },
'Rv10': [ 'Rv10' ],
'Mpeg2': [ 'Mpeg2' ] },
# has nothing to do here (or on filenames for that matter), but some
# releases use it and it helps to identify release groups, so we adapt
'videoApi': { 'DXVA': [ 'DXVA' ] },
'audioCodec': { 'AC3': [ 'AC3' ],
'DTS': [ 'DTS' ],
'AAC': [ 'He-AAC', 'AAC-He', 'AAC' ] },
'audioChannels': { '5.1': [ r'5\.1', 'DD5\.1', '5ch' ] },
'audioChannels': { '5.1': [ r'5\.1', 'DD5[\._ ]1', '5ch' ] },
'episodeFormat': { 'Minisode': [ 'Minisodes?' ] }
@ -143,14 +148,21 @@ prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ],
# prop_single dict of { property_name: [ canonical_form ] }
prop_single = { 'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', r'\[XCT\]', 'iNT', 'PUKKA',
'CHD', 'ViTE', 'TLF', 'DEiTY', 'FLAiTE',
'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', 'FiNaLe',
'UnSeeN', 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
'SiNNERS', 'DiRTY', 'REWARD', 'ECI', 'KiNGS', 'CLUE',
'CtrlHD', 'POD', 'WiKi', 'DIMENSION', 'IMMERSE', 'FQM',
'2HD', 'REPTiLE', 'CTU', 'HALCYON', 'EbP', 'SiTV',
'SAiNTS', 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV',
'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3' ],
'CHD', 'ViTE', 'TLF', 'FLAiTE',
'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS',
'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
'CtrlHD', 'POD', 'WiKi','IMMERSE', 'FQM',
'2HD', 'CTU', 'HALCYON', 'EbP', 'SiTV',
'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV',
'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3',
'TrollHD', 'ECI'
],
# potentially confusing release group names (they are words)
'weakReleaseGroup': [ 'DEiTY', 'FiNaLe', 'UnSeeN', 'KiNGS', 'CLUE', 'DIMENSION',
'SAiNTS', 'ARROW', 'EuReKA', 'SiNNERS', 'DiRTY', 'REWARD',
'REPTiLE',
],
'other': [ 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'Audiofixed', 'R5',
'complete', 'classic', # not so sure about these ones, could appear in a title
@ -179,6 +191,10 @@ properties_rexps.update(dict((type, dict((canonical_form, [ _to_rexp(canonical_f
def find_properties(string):
result = []
for property_name, props in properties_rexps.items():
# FIXME: this should be done in a more flexible way...
if property_name in ['weakReleaseGroup']:
continue
for canonical_form, rexps in props.items():
for value_rexp in rexps:
match = value_rexp.search(string)

18
libs/guessit/transfo/guess_episodes_rexps.py

@ -28,7 +28,13 @@ import logging
log = logging.getLogger(__name__)
def number_list(s):
    """Return the list of integers found in ``s``.

    Every maximal run of digits in ``s`` becomes one integer, in order of
    appearance. When exactly two numbers are found in ascending order they
    are treated as an episode interval and every number in between is
    included as well (``'e01e03'`` -> ``[1, 2, 3]``).

    The result is always a real ``list`` (never a ``range`` object), and a
    descending pair is returned unchanged instead of collapsing into an
    empty sequence, so callers can safely index the first element whenever
    ``s`` contains at least one digit.
    """
    numbers = [int(n) for n in re.sub('[^0-9]+', ' ', s).split()]

    if len(numbers) == 2 and numbers[0] <= numbers[1]:
        # it is an episode interval, return all numbers in between
        return list(range(numbers[0], numbers[1] + 1))

    return numbers
def guess_episodes_rexps(string):
for rexp, confidence, span_adjust in episode_rexps:
@ -38,23 +44,23 @@ def guess_episodes_rexps(string):
span = (match.start() + span_adjust[0],
match.end() + span_adjust[1])
# episodes which have a season > 25 are most likely errors
# episodes which have a season > 30 are most likely errors
# (Simpsons is at 24!)
if int(guess.get('season', 0)) > 25:
if int(guess.get('season', 0)) > 30:
continue
# decide whether we have only a single episode number or an
# episode list
if guess.get('episodeNumber'):
eplist = number_list(guess['episodeNumber'])
guess.set('episodeNumber', int(eplist[0]), confidence=confidence)
guess.set('episodeNumber', eplist[0], confidence=confidence)
if len(eplist) > 1:
guess.set('episodeList', list(map(int, eplist)), confidence=confidence)
guess.set('episodeList', eplist, confidence=confidence)
if guess.get('bonusNumber'):
eplist = number_list(guess['bonusNumber'])
guess.set('bonusNumber', int(eplist[0]), confidence=confidence)
guess.set('bonusNumber', eplist[0], confidence=confidence)
return guess, span

71
libs/guessit/transfo/guess_idnumber.py

@ -0,0 +1,71 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2013 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import unicode_literals
from guessit.transfo import SingleNodeGuesser
from guessit.patterns import find_properties
import re
import logging
log = logging.getLogger(__name__)
def guess_properties(string):
    """Return the first property reported by ``find_properties``.

    The result is ``({prop: value}, (pos, end))`` built from the first
    4-tuple that ``find_properties(string)`` yields, or ``(None, None)``
    when an ``IndexError`` signals there is no first entry.
    """
    try:
        first_hit = find_properties(string)[0]
        name, canonical, start, stop = first_hit
    except IndexError:
        return None, None
    return {name: canonical}, (start, stop)
# candidate IDs: runs of at least 10 ASCII letters, digits or dashes
_idnum = re.compile(r'(?P<idNumber>[a-zA-Z0-9-]{10,})')


def guess_idnumber(string):
    """Look for a hash-like identifier in ``string``.

    Every run matched by ``_idnum`` is examined in order. For each one we
    count how often consecutive characters change class (digit / letter /
    other) and divide by the length of the run.

    Returns ``({'idNumber': text}, (start, end))`` for the first run whose
    switch ratio is above 0.4, or ``(None, None)`` when no run qualifies.

    All candidates are scanned, not just the first one, so that a plain
    long word earlier in the string does not hide a real ID further on.
    """
    DIGIT = 0
    LETTER = 1
    OTHER = 2

    for match in _idnum.finditer(string):
        candidate = match.group('idNumber')

        switch_count = 0
        # start in the LETTER state: a leading letter is not a switch
        last = LETTER
        for c in candidate:
            if c in '0123456789':
                ci = DIGIT
            elif c in 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ':
                ci = LETTER
            else:
                ci = OTHER

            if ci != last:
                switch_count += 1

            last = ci

        # candidate has at least 10 characters, so no division by zero
        switch_ratio = float(switch_count) / len(candidate)

        # only return the result as probable if we alternate often between
        # char type (more likely for hash values than for common words)
        if switch_ratio > 0.4:
            return match.groupdict(), match.span()

    return None, None
def process(mtree):
    """Apply ``guess_idnumber`` to ``mtree`` through a ``SingleNodeGuesser``."""
    # NOTE(review): 0.4 is presumably the confidence given to matches -
    # confirm against SingleNodeGuesser
    guesser = SingleNodeGuesser(guess_idnumber, 0.4, log)
    guesser.process(mtree)

21
libs/guessit/transfo/guess_release_group.py

@ -31,16 +31,22 @@ def get_patterns(property_name):
CODECS = get_patterns('videoCodec')
FORMATS = get_patterns('format')
VAPIS = get_patterns('videoApi')
GROUP_NAMES = [ r'(?P<videoCodec>' + codec + r')-?(?P<releaseGroup>.*?)[ \.]'
# RG names following a codec or format, with a potential space or dash inside the name
GROUP_NAMES = [ r'(?P<videoCodec>' + codec + r')[ \.-](?P<releaseGroup>.+?([- \.].*?)??)[ \.]'
for codec in CODECS ]
GROUP_NAMES += [ r'(?P<format>' + fmt + r')-?(?P<releaseGroup>.*?)[ \.]'
GROUP_NAMES += [ r'(?P<format>' + fmt + r')[ \.-](?P<releaseGroup>.+?([- \.].*?)??)[ \.]'
for fmt in FORMATS ]
GROUP_NAMES += [ r'(?P<videoApi>' + api + r')[ \.-](?P<releaseGroup>.+?([- \.].*?)??)[ \.]'
for api in VAPIS ]
GROUP_NAMES2 = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
for codec in CODECS ]
GROUP_NAMES2 += [ r'\.(?P<format>' + fmt + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
GROUP_NAMES2 += [ r'\.(?P<format>' + fmt + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
for fmt in FORMATS ]
GROUP_NAMES2 += [ r'\.(?P<videoApi>' + vapi + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
for vapi in VAPIS ]
GROUP_NAMES = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES ]
GROUP_NAMES2 = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES2 ]
@ -54,12 +60,17 @@ def guess_release_group(string):
# first try to see whether we have both a known codec and a known release group
for rexp in GROUP_NAMES:
match = rexp.search(string)
if match:
while match:
metadata = match.groupdict()
release_group = compute_canonical_form('releaseGroup', metadata['releaseGroup'])
# make sure this is an actual release group we caught
release_group = (compute_canonical_form('releaseGroup', metadata['releaseGroup']) or
compute_canonical_form('weakReleaseGroup', metadata['releaseGroup']))
if release_group:
return adjust_metadata(metadata), (match.start(1), match.end(2))
# we didn't find anything conclusive, keep searching
match = rexp.search(string, match.span()[0]+1)
# pick anything as releaseGroup as long as we have a codec in front
# this doesn't include a potential dash ('-') ending the release group
# eg: [...].X264-HiS@SiLUHD-English.[...]

16
libs/guessit/transfo/guess_year.py

@ -33,6 +33,18 @@ def guess_year(string):
else:
return None, None
def guess_year_skip_first(string):
    """Guess a year while ignoring the first one found in ``string``.

    ``search_year`` locates the first year; ``guess_year`` is then run on
    the text after it. The span that is returned is shifted back so that
    it is relative to ``string``. Returns ``(None, None)`` when there is
    no first year, or nothing is found after it.
    """
    first_year, first_span = search_year(string)
    if not first_year:
        return None, None

    # everything up to the end of the first year is skipped
    offset = first_span[1]
    second_year, second_span = guess_year(string[offset:])
    if not second_year:
        return None, None

    return second_year, (second_span[0]+offset, second_span[1]+offset)
def process(mtree):
SingleNodeGuesser(guess_year, 1.0, log).process(mtree)
def process(mtree, skip_first_year=False):
    """Run the year guesser over ``mtree``.

    When ``skip_first_year`` is true, ``guess_year_skip_first`` is used
    instead of ``guess_year``; everything else is identical.
    """
    guesser = guess_year_skip_first if skip_first_year else guess_year
    SingleNodeGuesser(guesser, 1.0, log).process(mtree)

Loading…
Cancel
Save