You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 

289 lines
10 KiB

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import unicode_literals
# Version string of this release.
__version__ = '0.6.2'
# Public names exported by a star-import of this module. Because of the
# ``unicode_literals`` import above, these are unicode strings on Python 2
# (they are converted back to native strings in the Python 2 branch below).
__all__ = ['Guess', 'Language',
           'guess_file_info', 'guess_video_info',
           'guess_movie_info', 'guess_episode_info']
# Do python3 detection before importing any other module, to be sure that
# it will then always be available
# with code from http://lucumr.pocoo.org/2011/1/22/forwards-compatible-python/
import sys
if sys.version_info[0] >= 3:
    PY3 = True
    unicode_text_type = str
    native_text_type = str
    base_text_type = str

    def u(x):
        """Return ``x`` as a text (unicode) string.

        Byte strings are decoded as UTF-8, mirroring what the Python 2
        branch does; calling ``str()`` on them would yield their repr
        (``"b'...'"``) instead of their content. Any other object is
        converted with ``str()``.
        """
        if isinstance(x, bytes):
            return x.decode('utf-8')
        return str(x)

    def s(x):
        """Return ``x`` unchanged: native strings already are text."""
        return x

    class UnicodeMixin(object):
        """Mixin deriving ``__str__`` from the class' ``__unicode__``."""

        def __str__(self):
            return self.__unicode__()

    import binascii

    def to_hex(x):
        """Return the hexadecimal representation of the bytes ``x`` as text."""
        return binascii.hexlify(x).decode('utf-8')

else:
    PY3 = False
    # fix imports for python2: __all__ must contain native (byte) strings.
    # map() is used rather than a list comprehension so that no loop
    # variable leaks into the module namespace.
    __all__ = list(map(str, __all__))
    unicode_text_type = unicode
    native_text_type = str
    base_text_type = basestring

    def u(x):
        """Return ``x`` as a unicode string, decoding byte strings as UTF-8."""
        if isinstance(x, str):
            return x.decode('utf-8')
        return unicode(x)

    def s(x):
        """Return ``x`` with every unicode string encoded to UTF-8 bytes.

        Lists, tuples and dicts (keys and values) are converted
        recursively; any other object is returned as is.
        """
        if isinstance(x, unicode):
            return x.encode('utf-8')
        if isinstance(x, list):
            return [s(y) for y in x]
        if isinstance(x, tuple):
            return tuple(s(y) for y in x)
        if isinstance(x, dict):
            return dict((s(key), s(value)) for key, value in x.items())
        return x

    class UnicodeMixin(object):
        """Mixin deriving a UTF-8 encoded ``__str__`` from ``unicode(self)``."""

        def __str__(self):
            return unicode(self).encode('utf-8')

    def to_hex(x):
        """Return the hexadecimal representation of the byte string ``x``."""
        return x.encode('hex')
from guessit.guess import Guess, merge_all
from guessit.language import Language
from guessit.matcher import IterativeMatcher
from guessit.textutils import clean_string
import logging
import json
log = logging.getLogger(__name__)


class NullHandler(logging.Handler):
    """Logging handler that discards every record it receives."""

    def emit(self, record):
        """Ignore ``record``; nothing is written anywhere."""
        return None


# let's be a nicely behaving library: register a do-nothing handler on the
# module logger
h = NullHandler()
log.addHandler(h)
def _guess_filename(filename, filetype):
    """Guess the properties of ``filename`` by running the iterative matcher.

    A first pass is run with default options. Its match tree is then
    inspected and a second pass is run if either:

    - at least two distinct years were found (the first one is then assumed
      to be part of the title), or
    - some language nodes should be ignored, because they touch a
      title/series node or duplicate an already found language.

    Finally, if both a language and a title were found, the file is parsed
    once more with language and country detection disabled, to make sure
    that language detection did not cut the title.

    :param filename: the file name to analyze
    :param filetype: the file type, passed as is to ``IterativeMatcher``
    :return: the result of ``matched()`` for the parse that was selected
    """

    def find_nodes(tree, props):
        """Yields all nodes containing any of the given props."""
        if isinstance(props, base_text_type):
            props = [props]
        for node in tree.nodes():
            if any(prop in node.guess for prop in props):
                yield node

    mtree = IterativeMatcher(filename, filetype=filetype)
    m = mtree.matched()

    second_pass_opts = []
    second_pass_transfo_opts = {}

    # if there are multiple possible years found, we assume the first one is
    # part of the title, reparse the tree taking this into account
    years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
    if len(years) >= 2:
        second_pass_opts.append('skip_first_year')

    to_skip_language_nodes = []

    # Index title/series nodes by both ends of their span, so that a language
    # node sharing a boundary with one of them can be detected.
    title_spans = {}
    for title_node in find_nodes(mtree.match_tree, ['title', 'series']):
        title_spans[title_node.span[0]] = title_node
        title_spans[title_node.span[1]] = title_node

    for lang_key in ('language', 'subtitleLanguage'):
        langs = {}
        for lang_node in set(find_nodes(mtree.match_tree, lang_key)):
            lang = lang_node.guess.get(lang_key, None)
            touches_title = (lang_node.span[0] in title_spans or
                             lang_node.span[1] in title_spans)

            if len(lang_node.value) > 3 and touches_title:
                # Language is next or before title, and is not a language
                # code. Add to skip for 2nd pass.

                # if filetype is subtitle and the language appears last, just
                # before the extension, then it is likely a subtitle language
                parts = clean_string(lang_node.root.value).split()
                is_subtitle = m.get('type') in ('moviesubtitle',
                                                'episodesubtitle')
                # The membership test avoids a ValueError from index() when
                # the node text is not one of the cleaned tokens.
                if (is_subtitle and lang_node.value in parts and
                        parts.index(lang_node.value) == len(parts) - 2):
                    continue

                to_skip_language_nodes.append(lang_node)

            elif lang not in langs:
                langs[lang] = lang_node

            else:
                # The same language was found. Keep the more confident one,
                # and add others to skip for 2nd pass.
                # The confidence is looked up for the property being
                # processed (lang_key), not always for 'language'.
                existing_lang_node = langs[lang]
                if (existing_lang_node.guess.confidence(lang_key) >=
                        lang_node.guess.confidence(lang_key)):
                    # lang_node is to remove
                    to_skip = lang_node
                else:
                    # existing_lang_node is to remove
                    langs[lang] = lang_node
                    to_skip = existing_lang_node
                to_skip_language_nodes.append(to_skip)

    if to_skip_language_nodes:
        second_pass_transfo_opts['guess_language'] = (
            ((), {'skip': [{'node_idx': node.parent.node_idx,
                            'span': node.span}
                           for node in to_skip_language_nodes]}))

    if second_pass_opts or second_pass_transfo_opts:
        # 2nd pass is needed
        log.info("Running 2nd pass with options: %s" % second_pass_opts)
        log.info("Transfo options: %s" % second_pass_transfo_opts)
        mtree = IterativeMatcher(filename, filetype=filetype,
                                 opts=second_pass_opts,
                                 transfo_opts=second_pass_transfo_opts)
        m = mtree.matched()

    # Nothing to cross-check unless both a language and a title were found.
    if (('language' not in m and 'subtitleLanguage' not in m) or
            'title' not in m):
        return m

    # if we found some language, make sure we didn't cut a title or sth...
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

    if m.get('title') != m2.get('title'):
        # A default is given to next() so that a tree without any title node
        # does not raise StopIteration.
        title = next(find_nodes(mtree.match_tree, 'title'), None)

        # if the title of the language-aware parse is in an explicit group,
        # then the correct title is probably the other one
        if (title is not None and
                title.root.node_at(title.node_idx[:2]).is_explicit()):
            return m2

    # In every other case the language-aware guess is kept.
    return m
def guess_file_info(filename, filetype, info=None):
    """info can contain the names of the various plugins, such as 'filename' to
    detect filename info, or 'hash_md5' to get the md5 hash of the file.

    >>> guess_file_info('tests/dummy.srt', 'autodetect', info = ['hash_md5', 'hash_sha1'])
    {'hash_md5': 'e781de9b94ba2753a8e2945b2c0a123d', 'hash_sha1': 'bfd18e2f4e5d59775c2bc14d80f56971891ed620'}

    :param filename: name (path) of the file; converted to unicode first
    :param filetype: file type passed to the filename matcher
    :param info: a single info type or a list of them; defaults to
        ``['filename']``. Supported values are 'filename', 'hash_mpc',
        'hash_ed2k' and 'hash_<name>', where <name> is an algorithm known to
        ``hashlib``. Unknown values are logged and ignored.
    :return: the merge (``merge_all``) of all the individual guesses

    Hash computation is best-effort: failures are logged as warnings and the
    corresponding properties are left out of the result.
    """
    result = []
    hashers = []

    # Force unicode as soon as possible
    filename = u(filename)

    if info is None:
        info = ['filename']

    if isinstance(info, base_text_type):
        info = [info]

    for infotype in info:
        if infotype == 'filename':
            result.append(_guess_filename(filename, filetype))

        elif infotype == 'hash_mpc':
            from guessit.hash_mpc import hash_file
            try:
                result.append(Guess({'hash_mpc': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute MPC-style hash because: %s' % e)

        elif infotype == 'hash_ed2k':
            from guessit.hash_ed2k import hash_file
            try:
                result.append(Guess({'hash_ed2k': hash_file(filename)},
                                    confidence=1.0))
            except Exception as e:
                log.warning('Could not compute ed2k hash because: %s' % e)

        elif infotype.startswith('hash_'):
            # A hash requested several times is computed only once; a second
            # hasher for the same infotype would never be fed any data.
            if any(infotype == requested for requested, _ in hashers):
                continue

            import hashlib
            hashname = infotype[5:]
            try:
                # hashlib.new() only accepts algorithm names, whereas an
                # attribute lookup on the module could return something that
                # is not a hash constructor (e.g. ``hashlib.new`` itself).
                hashers.append((infotype, hashlib.new(hashname)))
            except (ValueError, TypeError):
                log.warning('Could not compute %s hash because it is not available from python\'s hashlib module' % hashname)

        else:
            log.warning('Invalid infotype: %s' % infotype)

    # do all the hashes now, but on a single pass
    if hashers:
        try:
            blocksize = 8192
            with open(filename, 'rb') as f:
                for chunk in iter(lambda: f.read(blocksize), b''):
                    for _, hasher in hashers:
                        hasher.update(chunk)

            for infotype, hasher in hashers:
                result.append(Guess({infotype: hasher.hexdigest()},
                                    confidence=1.0))
        except Exception as e:
            # deliberately broad: hashing must never make the guess fail
            log.warning('Could not compute hash because: %s' % e)

    result = merge_all(result)

    # last minute adjustments

    # if country is in the guessed properties, make it part of the series name
    if 'series' in result and 'country' in result:
        result['series'] += ' (%s)' % result['country'].alpha2.upper()

    return result
def guess_video_info(filename, info=None):
    """Guess info about ``filename`` using the 'autodetect' file type.

    Thin wrapper around ``guess_file_info``; ``info`` is forwarded unchanged.
    """
    return guess_file_info(filename, filetype='autodetect', info=info)
def guess_movie_info(filename, info=None):
    """Guess info about ``filename`` using the 'movie' file type.

    Thin wrapper around ``guess_file_info``; ``info`` is forwarded unchanged.
    """
    return guess_file_info(filename, filetype='movie', info=info)
def guess_episode_info(filename, info=None):
    """Guess info about ``filename`` using the 'episode' file type.

    Thin wrapper around ``guess_file_info``; ``info`` is forwarded unchanged.
    """
    return guess_file_info(filename, filetype='episode', info=info)