Browse Source

Update guessit with unicode fix

pull/1867/head
Ruud 12 years ago
parent
commit
bc8d8dcd04
  1. 0
      libs/guessit/ISO-3166-1_utf8.txt
  2. 0
      libs/guessit/ISO-639-2_utf-8.txt
  3. 92
      libs/guessit/__init__.py
  4. 115
      libs/guessit/__main__.py
  5. 0
      libs/guessit/country.py
  6. 0
      libs/guessit/date.py
  7. 7
      libs/guessit/fileutils.py
  8. 52
      libs/guessit/guess.py
  9. 0
      libs/guessit/hash_ed2k.py
  10. 0
      libs/guessit/hash_mpc.py
  11. 30
      libs/guessit/language.py
  12. 52
      libs/guessit/matcher.py
  13. 28
      libs/guessit/matchtree.py
  14. 154
      libs/guessit/patterns.py
  15. 56
      libs/guessit/slogging.py
  16. 22
      libs/guessit/textutils.py
  17. 2
      libs/guessit/transfo/__init__.py
  18. 0
      libs/guessit/transfo/guess_bonus_features.py
  19. 24
      libs/guessit/transfo/guess_country.py
  20. 0
      libs/guessit/transfo/guess_date.py
  21. 0
      libs/guessit/transfo/guess_episode_info_from_position.py
  22. 0
      libs/guessit/transfo/guess_episodes_rexps.py
  23. 6
      libs/guessit/transfo/guess_filetype.py
  24. 15
      libs/guessit/transfo/guess_language.py
  25. 1
      libs/guessit/transfo/guess_movie_title_from_position.py
  26. 0
      libs/guessit/transfo/guess_properties.py
  27. 50
      libs/guessit/transfo/guess_release_group.py
  28. 0
      libs/guessit/transfo/guess_video_rexps.py
  29. 0
      libs/guessit/transfo/guess_weak_episodes_rexps.py
  30. 0
      libs/guessit/transfo/guess_website.py
  31. 0
      libs/guessit/transfo/guess_year.py
  32. 19
      libs/guessit/transfo/post_process.py
  33. 0
      libs/guessit/transfo/split_explicit_groups.py
  34. 10
      libs/guessit/transfo/split_on_dash.py
  35. 0
      libs/guessit/transfo/split_path_components.py

0
libs/guessit/ISO-3166-1_utf8.txt

0
libs/guessit/ISO-639-2_utf-8.txt

92
libs/guessit/__init__.py

@ -18,8 +18,9 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
# #
from __future__ import unicode_literals
__version__ = '0.5.2' __version__ = '0.6-dev'
__all__ = ['Guess', 'Language', __all__ = ['Guess', 'Language',
'guess_file_info', 'guess_video_info', 'guess_file_info', 'guess_video_info',
'guess_movie_info', 'guess_episode_info'] 'guess_movie_info', 'guess_episode_info']
@ -73,6 +74,7 @@ else:
from guessit.guess import Guess, merge_all from guessit.guess import Guess, merge_all
from guessit.language import Language from guessit.language import Language
from guessit.matcher import IterativeMatcher from guessit.matcher import IterativeMatcher
from guessit.textutils import clean_string
import logging import logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -88,6 +90,86 @@ h = NullHandler()
log.addHandler(h) log.addHandler(h)
def _guess_filename(filename, filetype):
    """Guess the properties of ``filename``, double-checking detected languages.

    The filename is first matched normally. If a language or subtitle
    language was found, the filename is matched a second time with language
    and country detection disabled, and both results are compared to decide
    whether the detected language was in fact part of the title
    (eg: The Italian Job).

    Returns the guess considered the most likely one. When no decision can be
    made, a warning is logged and the first guess is returned.
    """
    mtree = IterativeMatcher(filename, filetype=filetype)
    m = mtree.matched()

    if 'language' not in m and 'subtitleLanguage' not in m:
        return m

    # if we found some language, make sure we didn't cut a title or something
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

    def find_nodes(tree, props):
        """Yields all nodes containing any of the given props."""
        if isinstance(props, base_text_type):
            props = [props]
        for node in tree.nodes():
            if any(prop in node.guess for prop in props):
                yield node

    def warning(title):
        """Log the ambiguity and fall back to the first guess."""
        log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
        return m

    if m.get('title') == m2.get('title'):
        return m

    # one of the trees may hold no title node at all (the titles only have to
    # differ to get here), so never call next() without a default
    title = next(find_nodes(mtree.match_tree, 'title'), None)
    title2 = next(find_nodes(mtree2.match_tree, 'title'), None)
    if title is None or title2 is None:
        return warning('Not sure of the title because one of the guesses has no title')

    langs = list(find_nodes(mtree.match_tree, ['language', 'subtitleLanguage']))
    if not langs:
        return warning('A weird error happened with language detection')

    # find the language that is likely more relevant
    for lng in langs:
        if lng.value in title2.value:
            # if the language was detected as part of a potential title,
            # look at this one in particular
            lang = lng
            break
    else:
        # pick the first one if we don't have a better choice
        lang = langs[0]

    # language codes are rarely part of a title, and those
    # should be handled by the Language exceptions anyway
    if len(lang.value) <= 3:
        return m

    # if filetype is subtitle and the language appears last, just before
    # the extension, then it is likely a subtitle language
    # (the membership test keeps list.index() from raising ValueError when
    # the language text is not a standalone word of the cleaned string)
    parts = clean_string(title.root.value).split()
    if (m.get('type') in ['moviesubtitle', 'episodesubtitle'] and
            lang.value in parts and
            parts.index(lang.value) == len(parts) - 2):
        return m

    # if the language was in the middle of the other potential title,
    # keep the other title (eg: The Italian Job), except if it is at the
    # very beginning, in which case we consider it an error
    if (m2.get('title') or '').startswith(lang.value):
        return m
    elif lang.value in title2.value:
        return m2

    # if a node is in an explicit group, then the correct title is probably
    # the other one
    if title.root.node_at(title.node_idx[:2]).is_explicit():
        return m2
    elif title2.root.node_at(title2.node_idx[:2]).is_explicit():
        return m

    return warning('Not sure of the title because of the language position')
def guess_file_info(filename, filetype, info=None): def guess_file_info(filename, filetype, info=None):
"""info can contain the names of the various plugins, such as 'filename' to """info can contain the names of the various plugins, such as 'filename' to
detect filename info, or 'hash_md5' to get the md5 hash of the file. detect filename info, or 'hash_md5' to get the md5 hash of the file.
@ -98,6 +180,9 @@ def guess_file_info(filename, filetype, info=None):
result = [] result = []
hashers = [] hashers = []
# Force unicode as soon as possible
filename = u(filename)
if info is None: if info is None:
info = ['filename'] info = ['filename']
@ -106,8 +191,7 @@ def guess_file_info(filename, filetype, info=None):
for infotype in info: for infotype in info:
if infotype == 'filename': if infotype == 'filename':
m = IterativeMatcher(filename, filetype=filetype) result.append(_guess_filename(filename, filetype))
result.append(m.matched())
elif infotype == 'hash_mpc': elif infotype == 'hash_mpc':
from guessit.hash_mpc import hash_file from guessit.hash_mpc import hash_file
@ -161,7 +245,7 @@ def guess_file_info(filename, filetype, info=None):
# last minute adjustments # last minute adjustments
# if country is in the guessed properties, make it part of the filename # if country is in the guessed properties, make it part of the filename
if 'country' in result: if 'series' in result and 'country' in result:
result['series'] += ' (%s)' % result['country'].alpha2.upper() result['series'] += ' (%s)' % result['country'].alpha2.upper()

115
libs/guessit/__main__.py

@ -0,0 +1,115 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import unicode_literals
from __future__ import print_function
from guessit import u
from guessit import slogging, guess_file_info
from optparse import OptionParser
import logging
def detect_filename(filename, filetype, info=None):
    """Print the information GuessIt finds for ``filename``.

    ``filetype`` and ``info`` are passed on to guess_file_info; ``info`` is
    the list of requested information types and defaults to ['filename'].
    """
    # a None sentinel avoids sharing a single mutable default list between calls
    if info is None:
        info = ['filename']

    filename = u(filename)

    print('For:', filename)
    print('GuessIt found:', guess_file_info(filename, filetype, info).nice_string())
def run_demo(episodes=True, movies=True):
    """Run detect_filename over a few builtin sample paths.

    Episode samples are processed when ``episodes`` is true and movie samples
    when ``movies`` is true; a separator line is printed before each sample.
    """
    # NOTE: tests should not be added here but rather in the tests/ folder
    # this is just intended as a quick example
    episode_samples = (
        'Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.[tvu.org.ru].avi',
        'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi',
        'Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.[tvu.org.ru].avi',
        'Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi',
        'Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi',
        'Series/Simpsons/The_simpsons_s13e18_-_i_am_furious_yellow.mpg',
        'Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.[tvu.org.ru].avi',
        'Series/Dr._Slump_-_002_DVB-Rip_Catalan_by_kelf.avi',
        'Series/Kaamelott/Kaamelott - Livre V - Second Volet - HD 704x396 Xvid 2 pass - Son 5.1 - TntRip by Slurm.avi',
    )

    movie_samples = (
        'Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv',
        'Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi',
        'Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director\'s.Cut).CD1.DVDRip.XviD.AC3-WAF.avi',
        'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv',
        'Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv',
        'Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi',  # FIXME: PROPER and R5 get overwritten
        '[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv',  # FIXME: title gets overwritten
        'Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi',
        'Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.English.srt',
        'Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv',
        'Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv',
        'Movies/Pirates of the Caribbean: The Curse of the Black Pearl (2003)/Pirates.Of.The.Carribean.DC.2003.iNT.DVDRip.XviD.AC3-NDRT.CD1.avi',
        'Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi',
        'Movies/The NeverEnding Story (1984)/The.NeverEnding.Story.1.1984.DVDRip.AC3.Xvid-Monteque.avi',
        'Movies/Juno (2007)/Juno KLAXXON.avi',
        'Movies/Chat noir, chat blanc (1998)/Chat noir, Chat blanc - Emir Kusturica (VO - VF - sub FR - Chapters).mkv',
        'Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.srt',
        'Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi',
        'testsmewt_bugs/movies/Baraka_Edition_Collector.avi',
    )

    def show_all(samples, kind):
        # one separator line, then the detection output, for every sample
        for sample in samples:
            print('-'*80)
            detect_filename(sample, filetype=kind)

    if episodes:
        show_all(episode_samples, 'episode')

    if movies:
        show_all(movie_samples, 'movie')
def main():
    """Command-line entry point.

    Sets up logging, parses the command-line options, then either runs the
    builtin demo, analyzes every file given as argument, or prints the help
    when no file was given.
    """
    slogging.setupLogging()

    cli = OptionParser(usage = 'usage: %prog [options] file1 [file2...]')
    cli.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
                   help = 'display debug output')
    cli.add_option('-i', '--info', dest = 'info', default = 'filename',
                   help = 'the desired information type: filename, hash_mpc or a hash from python\'s '
                          'hashlib module, such as hash_md5, hash_sha1, ...; or a list of any of '
                          'them, comma-separated')
    cli.add_option('-t', '--type', dest = 'filetype', default = 'autodetect',
                   help = 'the suggested file type: movie, episode or autodetect')
    cli.add_option('-d', '--demo', action='store_true', dest='demo', default=False,
                   help = 'run a few builtin tests instead of analyzing a file')

    options, args = cli.parse_args()
    if options.verbose:
        logging.getLogger('guessit').setLevel(logging.DEBUG)

    # demo mode ignores any file given on the command line
    if options.demo:
        run_demo(episodes=True, movies=True)
        return

    # nothing to analyze: show the usage instead
    if not args:
        cli.print_help()
        return

    for path in args:
        detect_filename(path,
                        filetype = options.filetype,
                        info = options.info.split(','))
if __name__ == '__main__':
main()

0
libs/guessit/country.py

0
libs/guessit/date.py

7
libs/guessit/fileutils.py

@ -22,6 +22,7 @@ from __future__ import unicode_literals
from guessit import s, u from guessit import s, u
import os.path import os.path
import zipfile import zipfile
import io
def split_path(path): def split_path(path):
@ -76,12 +77,12 @@ def file_in_same_dir(ref_file, desired_file):
def load_file_in_same_dir(ref_file, filename): def load_file_in_same_dir(ref_file, filename):
"""Load a given file. Works even when the file is contained inside a zip.""" """Load a given file. Works even when the file is contained inside a zip."""
path = split_path(ref_file)[:-1] + [filename] path = split_path(ref_file)[:-1] + [str(filename)]
for i, p in enumerate(path): for i, p in enumerate(path):
if p.endswith('.zip'): if p[-4:] == '.zip':
zfilename = os.path.join(*path[:i + 1]) zfilename = os.path.join(*path[:i + 1])
zfile = zipfile.ZipFile(zfilename) zfile = zipfile.ZipFile(zfilename)
return zfile.read('/'.join(path[i + 1:])) return zfile.read('/'.join(path[i + 1:]))
return u(open(os.path.join(*path)).read()) return u(io.open(os.path.join(*path), encoding = 'utf-8').read())

52
libs/guessit/guess.py

@ -253,48 +253,26 @@ def merge_similar_guesses(guesses, prop, choose):
merge_similar_guesses(guesses, prop, choose) merge_similar_guesses(guesses, prop, choose)
def merge_append_guesses(guesses, prop):
    """Take a list of guesses and merge those which have the same properties by
    appending them in a list.

    The first guess containing ``prop`` receives the values of ``prop`` from
    all the other guesses containing it (as a list) as well as their other
    properties; those other guesses are then removed from ``guesses``.
    The list is modified in place and nothing is returned.

    DEPRECATED, remove with old guessers
    """
    similar = [guess for guess in guesses if prop in guess]
    if not similar:
        return

    merged = similar[0]
    merged[prop] = [merged[prop]]
    # TODO: what to do with global confidence? mean of them all?

    for m in similar[1:]:
        for prop2 in m:
            if prop == prop2:
                merged[prop].append(m[prop])
            else:
                # only warn when a value already present in the merged guess
                # is really being replaced (prop2 is always in m here)
                if prop2 in merged:
                    log.warning('overwriting property "%s" with value %s' % (prop2, m[prop2]))
                merged[prop2] = m[prop2]
                # TODO: confidence also

        guesses.remove(m)
def merge_all(guesses, append=None): def merge_all(guesses, append=None):
"""Merge all the guesses in a single result, remove very unlikely values, """Merge all the guesses in a single result, remove very unlikely values,
and return it. and return it.
You can specify a list of properties that should be appended into a list You can specify a list of properties that should be appended into a list
instead of being merged. instead of being merged.
>>> s(merge_all([ Guess({ 'season': 2 }, confidence = 0.6), >>> s(merge_all([ Guess({'season': 2}, confidence=0.6),
... Guess({ 'episodeNumber': 13 }, confidence = 0.8) ])) ... Guess({'episodeNumber': 13}, confidence=0.8) ]))
{'season': 2, 'episodeNumber': 13} {'season': 2, 'episodeNumber': 13}
>>> s(merge_all([ Guess({ 'episodeNumber': 27 }, confidence = 0.02), >>> s(merge_all([ Guess({'episodeNumber': 27}, confidence=0.02),
... Guess({ 'season': 1 }, confidence = 0.2) ])) ... Guess({'season': 1}, confidence=0.2) ]))
{'season': 1} {'season': 1}
>>> s(merge_all([ Guess({'other': 'PROPER'}, confidence=0.8),
... Guess({'releaseGroup': '2HD'}, confidence=0.8) ],
... append=['other']))
{'releaseGroup': '2HD', 'other': ['PROPER']}
""" """
if not guesses: if not guesses:
return Guess() return Guess()
@ -328,7 +306,13 @@ def merge_all(guesses, append=None):
# make sure our appendable properties contain unique values # make sure our appendable properties contain unique values
for prop in append: for prop in append:
if prop in result: try:
result[prop] = list(set(result[prop])) value = result[prop]
if isinstance(value, list):
result[prop] = list(set(value))
else:
result[prop] = [ value ]
except KeyError:
pass
return result return result

0
libs/guessit/hash_ed2k.py

0
libs/guessit/hash_mpc.py

30
libs/guessit/language.py

@ -21,13 +21,14 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from guessit import UnicodeMixin, base_text_type, u, s from guessit import UnicodeMixin, base_text_type, u, s
from guessit.fileutils import load_file_in_same_dir from guessit.fileutils import load_file_in_same_dir
from guessit.textutils import find_words
from guessit.country import Country from guessit.country import Country
import re import re
import logging import logging
__all__ = [ 'is_iso_language', 'is_language', 'lang_set', 'Language', __all__ = [ 'is_iso_language', 'is_language', 'lang_set', 'Language',
'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED', 'ALL_LANGUAGES', 'ALL_LANGUAGES_NAMES', 'UNDETERMINED',
'search_language' ] 'search_language', 'guess_language' ]
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -317,7 +318,7 @@ def search_language(string, lang_filter=None):
'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to', 'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to',
'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan', 'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan',
'fry', 'cop', 'zen', 'gay', 'fat', 'cherokee', 'got', 'an', 'as', 'fry', 'cop', 'zen', 'gay', 'fat', 'cherokee', 'got', 'an', 'as',
'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi',
# french words # french words
'bas', 'de', 'le', 'son', 'vo', 'vf', 'ne', 'ca', 'ce', 'et', 'que', 'bas', 'de', 'le', 'son', 'vo', 'vf', 'ne', 'ca', 'ce', 'et', 'que',
'mal', 'est', 'vol', 'or', 'mon', 'se', 'mal', 'est', 'vol', 'or', 'mon', 'se',
@ -325,7 +326,7 @@ def search_language(string, lang_filter=None):
'la', 'el', 'del', 'por', 'mar', 'la', 'el', 'del', 'por', 'mar',
# other # other
'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii', 'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii',
'vi' 'vi', 'ben', 'da'
]) ])
sep = r'[](){} \._-+' sep = r'[](){} \._-+'
@ -334,7 +335,8 @@ def search_language(string, lang_filter=None):
slow = ' %s ' % string.lower() slow = ' %s ' % string.lower()
confidence = 1.0 # for all of them confidence = 1.0 # for all of them
for lang in lng_all_names:
for lang in set(find_words(slow)) & lng_all_names:
if lang in lng_common_words: if lang in lng_common_words:
continue continue
@ -351,7 +353,7 @@ def search_language(string, lang_filter=None):
if lang_filter and language not in lang_filter: if lang_filter and language not in lang_filter:
continue continue
# only allow those languages that have a 2-letter code, those who # only allow those languages that have a 2-letter code, those that
# don't are too esoteric and probably false matches # don't are too esoteric and probably false matches
if language.lang not in lng3_to_lng2: if language.lang not in lng3_to_lng2:
continue continue
@ -364,9 +366,25 @@ def search_language(string, lang_filter=None):
else: else:
# Note: we could either be really confident that we found a # Note: we could either be really confident that we found a
# language or assume that full language names are too # language or assume that full language names are too
# common words # common words and lower their confidence accordingly
confidence = 0.3 # going with the low-confidence route here confidence = 0.3 # going with the low-confidence route here
return language, (pos - 1, end - 1), confidence return language, (pos - 1, end - 1), confidence
return None, None, None return None, None, None
def guess_language(text):
    """Guess the language in which a body of text is written.

    The detection is delegated to the external guess-language python module.
    When that module cannot be imported, errors are logged and UNDETERMINED
    is returned instead.
    """
    try:
        from guess_language import guessLanguage as detect
        detected = Language(detect(text))
    except ImportError:
        for message in ('Cannot detect the language of the given text body, missing dependency: guess-language',
                        'Please install it from PyPI, by doing eg: pip install guess-language'):
            log.error(message)
        detected = UNDETERMINED

    return detected

52
libs/guessit/matcher.py

@ -19,18 +19,16 @@
# #
from __future__ import unicode_literals from __future__ import unicode_literals
from guessit import PY3, u from guessit import PY3, u, base_text_type
from guessit.matchtree import MatchTree from guessit.matchtree import MatchTree
from guessit.guess import (merge_similar_guesses, merge_all, from guessit.textutils import normalize_unicode
choose_int, choose_string)
import copy
import logging import logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
class IterativeMatcher(object): class IterativeMatcher(object):
def __init__(self, filename, filetype='autodetect'): def __init__(self, filename, filetype='autodetect', opts=None):
"""An iterative matcher tries to match different patterns that appear """An iterative matcher tries to match different patterns that appear
in the filename. in the filename.
@ -76,6 +74,14 @@ class IterativeMatcher(object):
raise ValueError("filetype needs to be one of %s" % valid_filetypes) raise ValueError("filetype needs to be one of %s" % valid_filetypes)
if not PY3 and not isinstance(filename, unicode): if not PY3 and not isinstance(filename, unicode):
log.warning('Given filename to matcher is not unicode...') log.warning('Given filename to matcher is not unicode...')
filename = filename.decode('utf-8')
filename = normalize_unicode(filename)
if opts is None:
opts = []
elif isinstance(opts, base_text_type):
opts = opts.split()
self.match_tree = MatchTree(filename) self.match_tree = MatchTree(filename)
mtree = self.match_tree mtree = self.match_tree
@ -84,7 +90,7 @@ class IterativeMatcher(object):
def apply_transfo(transfo_name, *args, **kwargs): def apply_transfo(transfo_name, *args, **kwargs):
transfo = __import__('guessit.transfo.' + transfo_name, transfo = __import__('guessit.transfo.' + transfo_name,
globals=globals(), locals=locals(), globals=globals(), locals=locals(),
fromlist=['process'], level=-1) fromlist=['process'], level=0)
transfo.process(mtree, *args, **kwargs) transfo.process(mtree, *args, **kwargs)
# 1- first split our path into dirs + basename + ext # 1- first split our path into dirs + basename + ext
@ -115,13 +121,20 @@ class IterativeMatcher(object):
'guess_properties', 'guess_language', 'guess_properties', 'guess_language',
'guess_video_rexps' ] 'guess_video_rexps' ]
if 'nolanguage' in opts:
strategy.remove('guess_language')
for name in strategy: for name in strategy:
apply_transfo(name) apply_transfo(name)
# more guessers for both movies and episodes # more guessers for both movies and episodes
for name in ['guess_bonus_features', 'guess_year', 'guess_country']: for name in ['guess_bonus_features', 'guess_year']:
apply_transfo(name) apply_transfo(name)
if 'nocountry' not in opts:
apply_transfo('guess_country')
# split into '-' separated subgroups (with required separator chars # split into '-' separated subgroups (with required separator chars
# around the dash) # around the dash)
apply_transfo('split_on_dash') apply_transfo('split_on_dash')
@ -139,27 +152,4 @@ class IterativeMatcher(object):
log.debug('Found match tree:\n%s' % u(mtree)) log.debug('Found match tree:\n%s' % u(mtree))
def matched(self): def matched(self):
# we need to make a copy here, as the merge functions work in place and return self.match_tree.matched()
# calling them on the match tree would modify it
parts = [node.guess for node in self.match_tree.nodes() if node.guess]
parts = copy.deepcopy(parts)
# 1- try to merge similar information together and give it a higher
# confidence
for int_part in ('year', 'season', 'episodeNumber'):
merge_similar_guesses(parts, int_part, choose_int)
for string_part in ('title', 'series', 'container', 'format',
'releaseGroup', 'website', 'audioCodec',
'videoCodec', 'screenSize', 'episodeFormat',
'audioChannels'):
merge_similar_guesses(parts, string_part, choose_string)
# 2- merge the rest, potentially discarding information not properly
# merged before
result = merge_all(parts,
append=['language', 'subtitleLanguage', 'other'])
log.debug('Final result: ' + result.nice_string())
return result

28
libs/guessit/matchtree.py

@ -22,6 +22,9 @@ from __future__ import unicode_literals
from guessit import UnicodeMixin, base_text_type, Guess from guessit import UnicodeMixin, base_text_type, Guess
from guessit.textutils import clean_string, str_fill from guessit.textutils import clean_string, str_fill
from guessit.patterns import group_delimiters from guessit.patterns import group_delimiters
from guessit.guess import (merge_similar_guesses, merge_all,
choose_int, choose_string)
import copy
import logging import logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -257,3 +260,28 @@ class MatchTree(BaseMatchTree):
"""Return whether the group was explicitly enclosed by """Return whether the group was explicitly enclosed by
parentheses/square brackets/etc.""" parentheses/square brackets/etc."""
return (self.value[0] + self.value[-1]) in group_delimiters return (self.value[0] + self.value[-1]) in group_delimiters
def matched(self):
    """Merge the guesses of all the nodes of the tree into a single result.

    Returns the merged guess, in which 'language', 'subtitleLanguage' and
    'other' are appended into lists instead of being merged.
    """
    # the merge functions work in place, so operate on a deep copy of the
    # guesses rather than on the ones held by the match tree
    guesses = copy.deepcopy([node.guess for node in self.nodes() if node.guess])

    # 1- try to merge similar information together and give it a higher
    #    confidence
    merge_plan = (
        (('year', 'season', 'episodeNumber'), choose_int),
        (('title', 'series', 'container', 'format',
          'releaseGroup', 'website', 'audioCodec',
          'videoCodec', 'screenSize', 'episodeFormat',
          'audioChannels'), choose_string),
    )
    for prop_names, chooser in merge_plan:
        for prop_name in prop_names:
            merge_similar_guesses(guesses, prop_name, chooser)

    # 2- merge the rest, potentially discarding information not properly
    #    merged before
    result = merge_all(guesses,
                       append=['language', 'subtitleLanguage', 'other'])

    log.debug('Final result: ' + result.nice_string())
    return result

154
libs/guessit/patterns.py

@ -20,9 +20,10 @@
# #
from __future__ import unicode_literals from __future__ import unicode_literals
import re
subtitle_exts = [ 'srt', 'idx', 'sub', 'ssa', 'txt' ] subtitle_exts = [ 'srt', 'idx', 'sub', 'ssa' ]
video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2', video_exts = ['3g2', '3gp', '3gp2', 'asf', 'avi', 'divx', 'flv', 'm4v', 'mk2',
'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm', 'mka', 'mkv', 'mov', 'mp4', 'mp4a', 'mpeg', 'mpg', 'ogg', 'ogm',
@ -42,13 +43,13 @@ episode_rexps = [ # ... Season 2 ...
(r'saison (?P<season>[0-9]+)', 1.0, (0, 0)), (r'saison (?P<season>[0-9]+)', 1.0, (0, 0)),
# ... s02e13 ... # ... s02e13 ...
(r'[Ss](?P<season>[0-9]{1,2}).{,3}(?P<episodeNumber>(?:[Ee][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)), (r'[Ss](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[Ee-][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
# ... s03-x02 ... # ... s03-x02 ...
(r'[Ss](?P<season>[0-9]{1,2}).{,3}(?P<bonusNumber>(?:[Xx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)), (r'[Ss](?P<season>[0-9]{1,2}).?(?P<bonusNumber>(?:[Xx][0-9]{1,2})+)[^0-9]', 1.0, (0, -1)),
# ... 2x13 ... # ... 2x13 ...
(r'[^0-9](?P<season>[0-9]{1,2})(?P<episodeNumber>(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)), (r'[^0-9](?P<season>[0-9]{1,2}).?(?P<episodeNumber>(?:[xX][0-9]{1,2})+)[^0-9]', 0.8, (1, -1)),
# ... s02 ... # ... s02 ...
#(sep + r's(?P<season>[0-9]{1,2})' + sep, 0.6, (1, -1)), #(sep + r's(?P<season>[0-9]{1,2})' + sep, 0.6, (1, -1)),
@ -61,7 +62,7 @@ episode_rexps = [ # ... Season 2 ...
('ep' + sep + r'(?P<episodeNumber>[0-9]{1,2})[^0-9]', 0.7, (0, -1)), ('ep' + sep + r'(?P<episodeNumber>[0-9]{1,2})[^0-9]', 0.7, (0, -1)),
# ... e13 ... for a mini-series without a season number # ... e13 ... for a mini-series without a season number
(r'e(?P<episodeNumber>[0-9]{1,2})[^0-9]', 0.6, (0, -1)) (sep + r'e(?P<episodeNumber>[0-9]{1,2})' + sep, 0.6, (1, -1))
] ]
@ -99,92 +100,129 @@ video_rexps = [ # cd number
(r'f(?P<filmNumber>[0-9]{1,2})', 1.0, (0, 0)) (r'f(?P<filmNumber>[0-9]{1,2})', 1.0, (0, 0))
] ]
websites = [ 'tvu.org.ru', 'emule-island.com', 'UsaBit.com', 'www.divx-overnet.com', 'sharethefiles.com' ] websites = [ 'tvu.org.ru', 'emule-island.com', 'UsaBit.com', 'www.divx-overnet.com',
'sharethefiles.com' ]
unlikely_series = ['series'] unlikely_series = [ 'series' ]
properties = { 'format': [ 'DVDRip', 'HD-DVD', 'HDDVD', 'HDDVDRip', 'BluRay', 'Blu-ray', 'BDRip', 'BRRip',
'HDRip', 'DVD', 'DVDivX', 'HDTV', 'DVB', 'DVBRip', 'PDTV', 'WEBRip',
'DVDSCR', 'Screener', 'VHS', 'VIDEO_TS', 'WEB-DL', 'WEBDL' ],
'screenSize': [ '720p', '720', '1080p', '1080' ], # prop_multi is a dict of { property_name: { canonical_form: [ pattern ] } }
# pattern is a string considered as a regexp, with the addition that dashes are
# replaced with '[-\. _]?' which matches more types of separators (or none)
# note: simpler patterns need to be at the end of the list to not shadow more
# complete ones, eg: 'AAC' needs to come after 'He-AAC'
# ie: from most specific to less specific
prop_multi = { 'format': { 'DVD': [ 'DVD', 'DVD-Rip', 'VIDEO-TS', 'DVDivX' ],
'HD-DVD': [ 'HD-(?:DVD)?-Rip', 'HD-DVD' ],
'BluRay': [ 'Blu-ray', 'B[DR]Rip' ],
'HDTV': [ 'HD-TV' ],
'DVB': [ 'DVB-Rip', 'DVB', 'PD-TV' ],
'WEBRip': [ 'WEB-Rip' ],
'Screener': [ 'DVD-SCR', 'Screener' ],
'VHS': [ 'VHS' ],
'WEB-DL': [ 'WEB-DL' ] },
'videoCodec': [ 'XviD', 'DivX', 'x264', 'h264', 'Rv10' ], 'screenSize': { '480p': [ '480p?' ],
'720p': [ '720p?' ],
'1080p': [ '1080p?' ] },
'audioCodec': [ 'AC3', 'DTS', 'He-AAC', 'AAC-He', 'AAC' ], 'videoCodec': { 'XviD': [ 'Xvid' ],
'DivX': [ 'DVDivX', 'DivX' ],
'h264': [ '[hx]-264' ],
'Rv10': [ 'Rv10' ] },
'audioChannels': [ '5.1' ], 'audioCodec': { 'AC3': [ 'AC3' ],
'DTS': [ 'DTS' ],
'AAC': [ 'He-AAC', 'AAC-He', 'AAC' ] },
'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', '[XCT]', 'iNT', 'PUKKA', 'audioChannels': { '5.1': [ r'5\.1', 'DD5\.1', '5ch' ] },
'CHD', 'ViTE', 'TLF', 'DEiTY', 'FLAiTE',
'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', ' FiNaLe',
'UnSeeN', 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
'SiNNERS', 'DiRTY', 'REWARD', 'ECI', 'KiNGS', 'CLUE',
'CtrlHD', 'POD', 'WiKi', 'DIMENSION', 'IMMERSE', 'FQM',
'2HD', 'REPTiLE', 'CTU', 'HALCYON', 'EbP', 'SiTV', 'SAiNTS',
'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV' ],
'episodeFormat': [ 'Minisode', 'Minisodes' ], 'episodeFormat': { 'Minisode': [ 'Minisodes?' ] }
'other': [ '5ch', 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'iNTERNAL', 'Audiofixed', 'R5',
'complete', 'classic', # not so sure about these ones, could appear in a title
'ws', # widescreen
],
} }
# prop_single dict of { property_name: [ canonical_form ] }
prop_single = { 'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', r'\[XCT\]', 'iNT', 'PUKKA',
'CHD', 'ViTE', 'TLF', 'DEiTY', 'FLAiTE',
'MDX', 'GM4F', 'DVL', 'SVD', 'iLUMiNADOS', 'FiNaLe',
'UnSeeN', 'aXXo', 'KLAXXON', 'NoTV', 'ZeaL', 'LOL',
'SiNNERS', 'DiRTY', 'REWARD', 'ECI', 'KiNGS', 'CLUE',
'CtrlHD', 'POD', 'WiKi', 'DIMENSION', 'IMMERSE', 'FQM',
'2HD', 'REPTiLE', 'CTU', 'HALCYON', 'EbP', 'SiTV',
'SAiNTS', 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV',
'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3' ],
def find_properties(filename): 'other': [ 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'Audiofixed', 'R5',
'complete', 'classic', # not so sure about these ones, could appear in a title
'ws' ] # widescreen
}
_dash = '-'
_psep = '[-\. _]?'
def _to_rexp(prop):
return re.compile(prop.replace(_dash, _psep), re.IGNORECASE)
# properties_rexps is a dict of { property_name: { canonical_form: [ rexp ] } }
# holding the regexps compiled from both prop_multi and prop_single

# prop_multi: every canonical form comes with its own list of patterns
properties_rexps = dict(
    (prop_name, dict((canonical, [_to_rexp(rx) for rx in rx_list])
                     for canonical, rx_list in forms.items()))
    for prop_name, forms in prop_multi.items())

# prop_single: the canonical form is its own (and only) pattern
properties_rexps.update(
    (prop_name, dict((canonical, [_to_rexp(canonical)])
                     for canonical in forms))
    for prop_name, forms in prop_single.items())
def find_properties(string):
result = [] result = []
clow = filename.lower() for property_name, props in properties_rexps.items():
for prop, values in properties.items(): for canonical_form, rexps in props.items():
for value in values: for value_rexp in rexps:
pos = clow.find(value.lower()) match = value_rexp.search(string)
if pos != -1: if match:
end = pos + len(value) start, end = match.span()
# make sure our word is always surrounded by separators # make sure our word is always surrounded by separators
if ((pos > 0 and clow[pos - 1] not in sep) or
(end < len(clow) and clow[end] not in sep)):
# note: sep is a regexp, but in this case using it as # note: sep is a regexp, but in this case using it as
# a sequence achieves the same goal # a char sequence achieves the same goal
continue if ((start > 0 and string[start-1] not in sep) or
(end < len(string) and string[end] not in sep)):
continue
result.append((prop, value, pos, end)) result.append((property_name, canonical_form, start, end))
return result return result
property_synonyms = { 'DVD': [ 'DVDRip', 'VIDEO_TS' ], property_synonyms = { 'Special Edition': [ 'Special' ],
'HD-DVD': [ 'HDDVD', 'HDDVDRip' ],
'BluRay': [ 'BDRip', 'BRRip', 'Blu-ray' ],
'WEB-DL': [ 'WEBDL' ],
'DVB': [ 'DVBRip', 'PDTV' ],
'Screener': [ 'DVDSCR' ],
'DivX': [ 'DVDivX' ],
'h264': [ 'x264' ],
'720p': [ '720' ],
'1080p': [ '1080' ],
'AAC': [ 'He-AAC', 'AAC-He' ],
'Special Edition': [ 'Special' ],
'Collector Edition': [ 'Collector' ], 'Collector Edition': [ 'Collector' ],
'Criterion Edition': [ 'Criterion' ], 'Criterion Edition': [ 'Criterion' ]
'Minisode': [ 'Minisodes' ]
} }
def revert_synonyms(): def revert_synonyms():
reverse = {} reverse = {}
for _, values in properties.items():
for value in values:
reverse[value.lower()] = value
for canonical, synonyms in property_synonyms.items(): for canonical, synonyms in property_synonyms.items():
for synonym in synonyms: for synonym in synonyms:
reverse[synonym.lower()] = canonical reverse[synonym.lower()] = canonical
return reverse return reverse
reverse_synonyms = revert_synonyms() reverse_synonyms = revert_synonyms()
def canonical_form(string): def canonical_form(string):
return reverse_synonyms.get(string.lower(), string) return reverse_synonyms.get(string.lower(), string)
def compute_canonical_form(property_name, value):
"""Return the canonical form of a property given its type if it is a valid
one, None otherwise."""
for canonical_form, rexps in properties_rexps[property_name].items():
for rexp in rexps:
if rexp.match(value):
return canonical_form
return None

56
libs/guessit/slogging.py

@ -21,6 +21,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import logging import logging
import sys import sys
import os, os.path
GREEN_FONT = "\x1B[0;32m" GREEN_FONT = "\x1B[0;32m"
YELLOW_FONT = "\x1B[0;33m" YELLOW_FONT = "\x1B[0;33m"
@ -29,33 +31,57 @@ RED_FONT = "\x1B[0;31m"
RESET_FONT = "\x1B[0m" RESET_FONT = "\x1B[0m"
def setupLogging(colored=True): def setupLogging(colored=True, with_time=False, with_thread=False, filename=None):
"""Set up a nice colored logger as the main application logger.""" """Set up a nice colored logger as the main application logger."""
class SimpleFormatter(logging.Formatter): class SimpleFormatter(logging.Formatter):
def __init__(self): def __init__(self, with_time, with_thread):
self.fmt = '%(levelname)-8s %(module)s:%(funcName)s -- %(message)s' self.fmt = (('%(asctime)s ' if with_time else '') +
'%(levelname)-8s ' +
'[%(name)s:%(funcName)s]' +
('[%(threadName)s]' if with_thread else '') +
' -- %(message)s')
logging.Formatter.__init__(self, self.fmt) logging.Formatter.__init__(self, self.fmt)
class ColoredFormatter(logging.Formatter): class ColoredFormatter(logging.Formatter):
def __init__(self): def __init__(self, with_time, with_thread):
self.fmt = ('%(levelname)-8s ' + self.fmt = (('%(asctime)s ' if with_time else '') +
BLUE_FONT + '%(name)s:%(funcName)s' + '-CC-%(levelname)-8s ' +
RESET_FONT + ' -- %(message)s') BLUE_FONT + '[%(name)s:%(funcName)s]' +
RESET_FONT + ('[%(threadName)s]' if with_thread else '') +
' -- %(message)s')
logging.Formatter.__init__(self, self.fmt) logging.Formatter.__init__(self, self.fmt)
def format(self, record): def format(self, record):
modpath = record.name.split('.')
record.mname = modpath[0]
record.mmodule = '.'.join(modpath[1:])
result = logging.Formatter.format(self, record) result = logging.Formatter.format(self, record)
if record.levelno in (logging.DEBUG, logging.INFO): if record.levelno == logging.DEBUG:
return GREEN_FONT + result color = BLUE_FONT
elif record.levelno == logging.INFO:
color = GREEN_FONT
elif record.levelno == logging.WARNING: elif record.levelno == logging.WARNING:
return YELLOW_FONT + result color = YELLOW_FONT
else: else:
return RED_FONT + result color = RED_FONT
ch = logging.StreamHandler() result = result.replace('-CC-', color)
if colored and sys.platform != 'win32': return result
ch.setFormatter(ColoredFormatter())
if filename is not None:
# make sure we can write to our log file
logdir = os.path.dirname(filename)
if not os.path.exists(logdir):
os.makedirs(logdir)
ch = logging.FileHandler(filename, mode='w')
ch.setFormatter(SimpleFormatter(with_time, with_thread))
else: else:
ch.setFormatter(SimpleFormatter()) ch = logging.StreamHandler()
if colored and sys.platform != 'win32':
ch.setFormatter(ColoredFormatter(with_time, with_thread))
else:
ch.setFormatter(SimpleFormatter(with_time, with_thread))
logging.getLogger().addHandler(ch) logging.getLogger().addHandler(ch)

22
libs/guessit/textutils.py

@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# #
# Smewt - A smart collection manager # Smewt - A smart collection manager
# Copyright (c) 2008 Nicolas Wack <wackou@gmail.com> # Copyright (c) 2008-2012 Nicolas Wack <wackou@gmail.com>
# #
# Smewt is free software; you can redistribute it and/or modify # Smewt is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by # it under the terms of the GNU General Public License as published by
@ -23,10 +23,13 @@ from guessit import s
from guessit.patterns import sep from guessit.patterns import sep
import functools import functools
import unicodedata import unicodedata
import copy import re
# string-related functions # string-related functions
def normalize_unicode(s):
return unicodedata.normalize('NFC', s)
def strip_brackets(s): def strip_brackets(s):
if not s: if not s:
@ -55,6 +58,21 @@ def clean_string(s):
return result return result
_words_rexp = re.compile('\w+', re.UNICODE)
def find_words(s):
return _words_rexp.findall(s.replace('_', ' '))
def reorder_title(title):
ltitle = title.lower()
if ltitle[-4:] == ',the':
return title[-3:] + ' ' + title[:-4]
if ltitle[-5:] == ', the':
return title[-3:] + ' ' + title[:-5]
return title
def str_replace(string, pos, c): def str_replace(string, pos, c):
return string[:pos] + c + string[pos+1:] return string[:pos] + c + string[pos+1:]

2
libs/guessit/transfo/__init__.py

@ -45,7 +45,7 @@ def format_guess(guess):
elif isinstance(value, base_text_type): elif isinstance(value, base_text_type):
if prop in ('edition',): if prop in ('edition',):
value = clean_string(value) value = clean_string(value)
guess[prop] = canonical_form(value) guess[prop] = canonical_form(value).replace('\\', '')
return guess return guess

0
libs/guessit/transfo/guess_bonus_features.py

24
libs/guessit/transfo/guess_country.py

@ -19,24 +19,30 @@
# #
from __future__ import unicode_literals from __future__ import unicode_literals
#from guessit.transfo import SingleNodeGuesser
#from guessit.date import search_year
from guessit.country import Country from guessit.country import Country
from guessit import Guess from guessit import Guess
import logging import logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# list of common words which could be interpreted as countries, but which
# are far too common to be able to say they represent a country
country_common_words = frozenset([ 'bt', 'bb' ])
def process(mtree): def process(mtree):
for node in mtree.unidentified_leaves(): for node in mtree.unidentified_leaves():
# only keep explicit groups (enclosed in parentheses/brackets)
if len(node.node_idx) == 2: if len(node.node_idx) == 2:
try: c = node.value[1:-1].lower()
country = Country(node.value[1:-1], strict=True) if c in country_common_words:
if node.value[0] + node.value[-1] not in ['()', '[]', '{}']: continue
continue
node.guess = Guess(country=country, confidence=1.0) # only keep explicit groups (enclosed in parentheses/brackets)
if node.value[0] + node.value[-1] not in ['()', '[]', '{}']:
continue
try:
country = Country(c, strict=True)
except ValueError: except ValueError:
pass continue
node.guess = Guess(country=country, confidence=1.0)

0
libs/guessit/transfo/guess_date.py

0
libs/guessit/transfo/guess_episode_info_from_position.py

0
libs/guessit/transfo/guess_episodes_rexps.py

6
libs/guessit/transfo/guess_filetype.py

@ -21,7 +21,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from guessit import Guess from guessit import Guess
from guessit.patterns import (subtitle_exts, video_exts, episode_rexps, from guessit.patterns import (subtitle_exts, video_exts, episode_rexps,
find_properties, canonical_form) find_properties, compute_canonical_form)
from guessit.date import valid_year from guessit.date import valid_year
from guessit.textutils import clean_string from guessit.textutils import clean_string
import os.path import os.path
@ -89,7 +89,7 @@ def guess_filetype(mtree, filetype):
# check whether we are in a 'Movies', 'Tv Shows', ... folder # check whether we are in a 'Movies', 'Tv Shows', ... folder
folder_rexps = [ (r'Movies?', upgrade_movie), folder_rexps = [ (r'Movies?', upgrade_movie),
(r'Tv ?Shows?', upgrade_episode), (r'Tv[ _-]?Shows?', upgrade_episode),
(r'Series', upgrade_episode) (r'Series', upgrade_episode)
] ]
for frexp, upgrade_func in folder_rexps: for frexp, upgrade_func in folder_rexps:
@ -142,7 +142,7 @@ def guess_filetype(mtree, filetype):
upgrade_episode() upgrade_episode()
break break
elif canonical_form(value) == 'DVB': elif compute_canonical_form('format', value) == 'DVB':
upgrade_episode() upgrade_episode()
break break

15
libs/guessit/transfo/guess_language.py

@ -22,7 +22,7 @@ from __future__ import unicode_literals
from guessit import Guess from guessit import Guess
from guessit.transfo import SingleNodeGuesser from guessit.transfo import SingleNodeGuesser
from guessit.language import search_language from guessit.language import search_language
from guessit.textutils import clean_string from guessit.textutils import clean_string, find_words
import logging import logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -31,18 +31,13 @@ log = logging.getLogger(__name__)
def guess_language(string): def guess_language(string):
language, span, confidence = search_language(string) language, span, confidence = search_language(string)
if language: if language:
# is it a subtitle language? return (Guess({'language': language},
if 'sub' in clean_string(string[:span[0]]).lower().split(' '): confidence=confidence),
return (Guess({'subtitleLanguage': language}, span)
confidence=confidence),
span)
else:
return (Guess({'language': language},
confidence=confidence),
span)
return None, None return None, None
def process(mtree): def process(mtree):
SingleNodeGuesser(guess_language, None, log).process(mtree) SingleNodeGuesser(guess_language, None, log).process(mtree)
# Note: 'language' is promoted to 'subtitleLanguage' in the post_process transfo

1
libs/guessit/transfo/guess_movie_title_from_position.py

@ -20,6 +20,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from guessit import Guess from guessit import Guess
import unicodedata
import logging import logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)

0
libs/guessit/transfo/guess_properties.py

50
libs/guessit/transfo/guess_release_group.py

@ -20,49 +20,51 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from guessit.transfo import SingleNodeGuesser from guessit.transfo import SingleNodeGuesser
from guessit.patterns import properties, canonical_form from guessit.patterns import prop_multi, compute_canonical_form, _dash, _psep
import re import re
import logging import logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def get_patterns(property_name):
return [ p.replace(_dash, _psep) for patterns in prop_multi[property_name].values() for p in patterns ]
CODECS = properties['videoCodec'] CODECS = get_patterns('videoCodec')
FORMATS = properties['format'] FORMATS = get_patterns('format')
GROUP_NAMES = [ r'(?P<videoCodec>' + codec + r')-?(?P<releaseGroup>.*?)[ \.]'
for codec in CODECS ]
GROUP_NAMES += [ r'(?P<format>' + fmt + r')-?(?P<releaseGroup>.*?)[ \.]'
for fmt in FORMATS ]
GROUP_NAMES2 = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
for codec in CODECS ]
GROUP_NAMES2 += [ r'\.(?P<format>' + fmt + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
for fmt in FORMATS ]
GROUP_NAMES = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES ]
GROUP_NAMES2 = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES2 ]
def adjust_metadata(md): def adjust_metadata(md):
codec = canonical_form(md['videoCodec']) return dict((property_name, compute_canonical_form(property_name, value) or value)
if codec in FORMATS: for property_name, value in md.items())
md['format'] = codec
del md['videoCodec']
return md
def guess_release_group(string): def guess_release_group(string):
group_names = [ r'\.(Xvid)-(?P<releaseGroup>.*?)[ \.]',
r'\.(DivX)-(?P<releaseGroup>.*?)[\. ]',
r'\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]',
]
# first try to see whether we have both a known codec and a known release group # first try to see whether we have both a known codec and a known release group
group_names = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)[ \.]' for rexp in GROUP_NAMES:
for codec in (CODECS + FORMATS) ] match = rexp.search(string)
for rexp in group_names:
match = re.search(rexp, string, re.IGNORECASE)
if match: if match:
metadata = match.groupdict() metadata = match.groupdict()
if canonical_form(metadata['releaseGroup']) in properties['releaseGroup']: release_group = compute_canonical_form('releaseGroup', metadata['releaseGroup'])
if release_group:
return adjust_metadata(metadata), (match.start(1), match.end(2)) return adjust_metadata(metadata), (match.start(1), match.end(2))
# pick anything as releaseGroup as long as we have a codec in front # pick anything as releaseGroup as long as we have a codec in front
# this doesn't include a potential dash ('-') ending the release group # this doesn't include a potential dash ('-') ending the release group
# eg: [...].X264-HiS@SiLUHD-English.[...] # eg: [...].X264-HiS@SiLUHD-English.[...]
group_names = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]' for rexp in GROUP_NAMES2:
for codec in (CODECS + FORMATS) ] match = rexp.search(string)
for rexp in group_names:
match = re.search(rexp, string, re.IGNORECASE)
if match: if match:
return adjust_metadata(match.groupdict()), (match.start(1), match.end(2)) return adjust_metadata(match.groupdict()), (match.start(1), match.end(2))

0
libs/guessit/transfo/guess_video_rexps.py

0
libs/guessit/transfo/guess_weak_episodes_rexps.py

0
libs/guessit/transfo/guess_website.py

0
libs/guessit/transfo/guess_year.py

19
libs/guessit/transfo/post_process.py

@ -20,6 +20,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from guessit.patterns import subtitle_exts from guessit.patterns import subtitle_exts
from guessit.textutils import reorder_title, find_words
import logging import logging
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -45,6 +46,15 @@ def process(mtree):
node == mtree.leaves()[-2]): node == mtree.leaves()[-2]):
promote_subtitle() promote_subtitle()
# - if we find the word 'sub' before the language, and in the same explicit
# group, then upgrade the language
explicit_group = mtree.node_at(node.node_idx[:2])
group_str = explicit_group.value.lower()
if ('sub' in find_words(group_str) and
0 <= group_str.find('sub') < (node.span[0] - explicit_group.span[0])):
promote_subtitle()
# - if a language is in an explicit group just preceded by "st", # - if a language is in an explicit group just preceded by "st",
# it is a subtitle language (eg: '...st[fr-eng]...') # it is a subtitle language (eg: '...st[fr-eng]...')
try: try:
@ -60,11 +70,4 @@ def process(mtree):
if 'series' not in node.guess: if 'series' not in node.guess:
continue continue
series = node.guess['series'] node.guess['series'] = reorder_title(node.guess['series'])
lseries = series.lower()
if lseries[-4:] == ',the':
node.guess['series'] = 'The ' + series[:-4]
if lseries[-5:] == ', the':
node.guess['series'] = 'The ' + series[:-5]

0
libs/guessit/transfo/split_explicit_groups.py

10
libs/guessit/transfo/split_on_dash.py

@ -38,15 +38,5 @@ def process(mtree):
indices.extend([ span[0], span[1] ]) indices.extend([ span[0], span[1] ])
match = pattern.search(node.value, span[1]) match = pattern.search(node.value, span[1])
didx = node.value.find('-')
while didx > 0:
if (didx > 10 and
(didx - 1 not in indices and
didx + 2 not in indices)):
indices.extend([ didx, didx + 1 ])
didx = node.value.find('-', didx + 1)
if indices: if indices:
node.partition(indices) node.partition(indices)

0
libs/guessit/transfo/split_path_components.py

Loading…
Cancel
Save