#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

from guessit import fileutils, textutils
from guessit.guess import Guess, merge_similar_guesses, merge_all, choose_int, choose_string
from guessit.date import search_date, search_year
from guessit.language import search_language
from guessit.patterns import video_exts, subtitle_exts, sep, deleted, video_rexps, websites, episode_rexps, weak_episode_rexps, non_episode_title, properties, canonical_form
from guessit.matchtree import get_group, find_group, leftover_valid_groups, tree_to_string
from guessit.textutils import find_first_level_groups, split_on_groups, blank_region, clean_string, to_utf8
from guessit.fileutils import split_path_components
import datetime
import os.path
import re
import copy
import logging

log = logging.getLogger("guessit.matcher")

def split_explicit_groups(string):
    """return the string split into explicit groups, that is, those either
    between parentheses, square brackets or curly braces, and those separated
    by a dash."""
    result = find_first_level_groups(string, '()')
    result = reduce(lambda l, x: l + find_first_level_groups(x, '[]'), result, [])
    result = reduce(lambda l, x: l + find_first_level_groups(x, '{}'), result, [])
    # do not do this at the moment, it is not strong enough and can break other
    # patterns, such as dates, etc...
    #result = reduce(lambda l, x: l + x.split('-'), result, [])

    return result
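
# A rough illustration (hypothetical example; the exact pieces depend on
# textutils.find_first_level_groups): a name such as
# '[XCT].Le.Prestige.(The.Prestige).DVDRip' gets cut around its first-level
# bracketed parts, giving groups roughly like
# ['[XCT]', '.Le.Prestige.', '(The.Prestige)', '.DVDRip'].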


def format_guess(guess):
    """Format all the found values to their natural type.
    For instance, a year would be stored as an int value, etc...

    Note that this modifies the dictionary given as input.
    """
    for prop, value in guess.items():
        if prop in ('season', 'episodeNumber', 'year', 'cdNumber', 'cdNumberTotal'):
            guess[prop] = int(guess[prop])
        elif isinstance(value, basestring):
            if prop in ('edition',):
                value = clean_string(value)
            guess[prop] = canonical_form(value)

    return guess
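
# For example, a guess such as { 'season': '2', 'episodeNumber': '05' } coming
# from a regexp match becomes { 'season': 2, 'episodeNumber': 5 }, and string
# values (codec names, formats, ...) are mapped to their canonical form.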


def guess_groups(string, result, filetype):
    # add sentinels so we can match a separator char at either end of
    # our groups, even when they are at the beginning or end of the string;
    # we will adjust the spans accordingly later
    #
    # filetype can either be movie, moviesubtitle, episode, episodesubtitle
    current = ' ' + string + ' '

    regions = [] # list of (span, guess) pairs for the matched regions

    def guessed(match_dict, confidence):
        guess = format_guess(Guess(match_dict, confidence = confidence))
        result.append(guess)
        log.debug('Found with confidence %.2f: %s' % (confidence, guess))
        return guess

    def update_found(string, guess, span, span_adjust = (0,0)):
        span = (span[0] + span_adjust[0],
                span[1] + span_adjust[1])
        regions.append((span, guess))
        return blank_region(string, span)

    # try to find dates first, as they are very specific
    date, span = search_date(current)
    if date:
        guess = guessed({ 'date': date }, confidence = 1.0)
        current = update_found(current, guess, span)

    # for non episodes only, look for year information
    if filetype not in ('episode', 'episodesubtitle'):
        year, span = search_year(current)
        if year:
            guess = guessed({ 'year': year }, confidence = 1.0)
            current = update_found(current, guess, span)

    # specific regexps (ie: cd number, season X episode, ...)
    for rexp, confidence, span_adjust in video_rexps:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            # is this the best place to put it? (maybe, as it is at least the soonest that we can catch it)
            if 'cdNumberTotal' in metadata and metadata['cdNumberTotal'] is None:
                del metadata['cdNumberTotal']

            guess = guessed(metadata, confidence = confidence)
            current = update_found(current, guess, match.span(), span_adjust)

    if filetype in ('episode', 'episodesubtitle'):
        for rexp, confidence, span_adjust in episode_rexps:
            match = re.search(rexp, current, re.IGNORECASE)
            if match:
                metadata = match.groupdict()
                guess = guessed(metadata, confidence = confidence)
                current = update_found(current, guess, match.span(), span_adjust)

    # Now websites, but as exact strings instead of regexps
    confidence = 1.0 # sites are matched as exact strings, so use full confidence
    clow = current.lower()
    for site in websites:
        pos = clow.find(site.lower())
        if pos != -1:
            guess = guessed({ 'website': site }, confidence = confidence)
            current = update_found(current, guess, (pos, pos+len(site)))
            clow = current.lower()

    # release groups have certain constraints, so they cannot be included in the previous general regexps
    group_names = [ r'\.(Xvid)-(?P<releaseGroup>.*?)[ \.]',
                    r'\.(DivX)-(?P<releaseGroup>.*?)[\. ]',
                    r'\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]',
                    ]
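    # e.g. in a name containing '.XviD-GROUP.', the first pattern above captures
    # 'XviD' as the videoCodec and 'GROUP' as the releaseGroup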
    for rexp in group_names:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            metadata.update({ 'videoCodec': match.group(1) })
            guess = guessed(metadata, confidence = 0.8)
            current = update_found(current, guess, match.span(), span_adjust = (1, -1))

    # common well-defined words and regexps
    clow = current.lower()
    confidence = 1.0 # for all of them
    for prop, values in properties.items():
        for value in values:
            pos = clow.find(value.lower())
            if pos != -1:
                end = pos + len(value)
                # make sure our word is always surrounded by separators
                if clow[pos-1] not in sep or clow[end] not in sep:
                    # note: sep is a regexp, but in this case using it as
                    # a sequence achieves the same goal
                    continue

                guess = guessed({ prop: value }, confidence = confidence)
                current = update_found(current, guess, (pos, end))
                clow = current.lower()

    # weak guesses for episode number, only run it if we don't have an estimate already
    if filetype in ('episode', 'episodesubtitle'):
        if not any('episodeNumber' in match for match in result):
            for rexp, _, span_adjust in weak_episode_rexps:
                match = re.search(rexp, current, re.IGNORECASE)
                if match:
                    metadata = match.groupdict()
                    epnum = int(metadata['episodeNumber'])
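                    # e.g. a bare number such as 312 is read as season 3, episode 12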
                    if epnum > 100:
                        guess = guessed({ 'season': epnum // 100,
                                          'episodeNumber': epnum % 100 }, confidence = 0.6)
                    else:
                        guess = guessed(metadata, confidence = 0.3)
                    current = update_found(current, guess, match.span(), span_adjust)

    # try to find languages now
    language, span, confidence = search_language(current)
    while language:
        # is it a subtitle language?
        if 'sub' in clean_string(current[:span[0]]).lower().split(' '):
            guess = guessed({ 'subtitleLanguage': language }, confidence = confidence)
        else:
            guess = guessed({ 'language': language }, confidence = confidence)
        current = update_found(current, guess, span)

        language, span, confidence = search_language(current)

    # remove our sentinels now and adjust spans accordingly
    assert(current[0] == ' ' and current[-1] == ' ')
    current = current[1:-1]
    regions = [ ((start-1, end-1), guess) for (start, end), guess in regions ]

    # split into '-' separated subgroups (with required separator chars
    # around the dash)
    didx = current.find('-')
    while didx > 0:
        regions.append(((didx, didx), None))
        didx = current.find('-', didx+1)

    # cut our final groups, and rematch the guesses to the group that created
    # it, or None if it is a leftover group
    region_spans = [ span for span, guess in regions ]
    string_groups = split_on_groups(string, region_spans)
    remaining_groups = split_on_groups(current, region_spans)
    guesses = []

    pos = 0
    for group in string_groups:
        found = False
        for span, guess in regions:
            if span[0] == pos:
                guesses.append(guess)
                found = True
        if not found:
            guesses.append(None)

        pos += len(group)
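
    # the result is a list of (string_group, remaining_group, guess) triplets:
    # the original text of each subgroup, the same text with the matched parts
    # blanked out, and the Guess that matched there (or None for leftover groups)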
    return zip(string_groups,
               remaining_groups,
               guesses)


def match_from_epnum_position(match_tree, epnum_pos, guessed, update_found):
    """guessed is a callback function to call with the guessed group;
    update_found is a callback to update the match group and return the leftover groups."""
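    # epnum_pos is a (path index, explicit-group index, group index) triplet, i.e.
    # the same (pidx, eidx, gidx) coordinates used everywhere in the match_tree
    # (see the IterativeMatcher docstring below)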
    pidx, eidx, gidx = epnum_pos

    # a few helper functions to be able to filter using high-level semantics
    def same_pgroup_before(group):
        _, (ppidx, eeidx, ggidx) = group
        return ppidx == pidx and (eeidx, ggidx) < (eidx, gidx)

    def same_pgroup_after(group):
        _, (ppidx, eeidx, ggidx) = group
        return ppidx == pidx and (eeidx, ggidx) > (eidx, gidx)

    def same_egroup_before(group):
        _, (ppidx, eeidx, ggidx) = group
        return ppidx == pidx and eeidx == eidx and ggidx < gidx

    def same_egroup_after(group):
        _, (ppidx, eeidx, ggidx) = group
        return ppidx == pidx and eeidx == eidx and ggidx > gidx

    leftover = leftover_valid_groups(match_tree)

    # if we have at least 1 valid group before the episodeNumber, then it's probably
    # the series name
    series_candidates = filter(same_pgroup_before, leftover)
    if len(series_candidates) >= 1:
        guess = guessed({ 'series': series_candidates[0][0] }, confidence = 0.7)
        leftover = update_found(leftover, series_candidates[0][1], guess)

    # if there's only 1 group after (in the same path group), it's probably the episode title
    title_candidates = filter(lambda g: g[0].lower() not in non_episode_title,
                              filter(same_pgroup_after, leftover))
    if len(title_candidates) == 1:
        guess = guessed({ 'title': title_candidates[0][0] }, confidence = 0.5)
        leftover = update_found(leftover, title_candidates[0][1], guess)
    else:
        # try in the same explicit group, with lower confidence
        title_candidates = filter(lambda g: g[0].lower() not in non_episode_title,
                                  filter(same_egroup_after, leftover))
        if len(title_candidates) == 1:
            guess = guessed({ 'title': title_candidates[0][0] }, confidence = 0.4)
            leftover = update_found(leftover, title_candidates[0][1], guess)

    # if the epnumber is the first group and there are only 2 after it in the same path group
    # -> series title - episode title
    already_has_title = (find_group(match_tree, 'title') != [])

    title_candidates = filter(lambda g: g[0].lower() not in non_episode_title,
                              filter(same_pgroup_after, leftover))
    if (not already_has_title and                    # no title
        not filter(same_pgroup_before, leftover) and # no groups before
        len(title_candidates) == 2):                 # only 2 groups after

        guess = guessed({ 'series': title_candidates[0][0] }, confidence = 0.4)
        leftover = update_found(leftover, title_candidates[0][1], guess)
        guess = guessed({ 'title': title_candidates[1][0] }, confidence = 0.4)
        leftover = update_found(leftover, title_candidates[1][1], guess)

    # if we only have 1 remaining valid group in the pathpart before the filename,
    # then it's likely that it is the series name
    series_candidates = [ group for group in leftover if group[1][0] == pidx-1 ]
    if len(series_candidates) == 1:
        guess = guessed({ 'series': series_candidates[0][0] }, confidence = 0.5)
        leftover = update_found(leftover, series_candidates[0][1], guess)

    return match_tree


class IterativeMatcher(object):
    def __init__(self, filename, filetype = 'autodetect'):
        """An iterative matcher tries to match different patterns that appear
        in the filename.

        The 'filetype' argument indicates which type of file you want to match.
        If it is 'autodetect', the matcher will try to see whether it can guess
        that the file corresponds to an episode, or otherwise will assume it is
        a movie.

        The recognized 'filetype' values are:
        [ autodetect, subtitle, movie, moviesubtitle, episode, episodesubtitle ]


        The IterativeMatcher works mainly in 2 steps:

        First, it splits the filename into a match_tree, which is a tree of groups
        which have a semantic meaning, such as episode number, movie title,
        etc...

        The match_tree created looks like the following:

        0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
        0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
        0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
        __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___
        xxxxxttttttttttttt               ffffff  vvvv    xxxxxx  ll lll     xx xxx         ccc
        [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv

        The first 3 lines give, for each character of the filename, the indices
        of the group it belongs to. So for instance, 'x264' is the group (0, 4, 1),
        and it corresponds to a video codec, denoted by the letter 'v' in the line
        of letters underneath.
        (for more info, see guessit.matchtree.tree_to_string)


        Second, it tries to merge all this information into a single object
        containing all the found properties, and does some (basic) conflict
        resolution when conflicts arise.
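
        As a rough usage sketch (the exact properties found depend on the
        pattern tables defined in guessit.patterns):

            matcher = IterativeMatcher(u'Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.avi')
            guess = matcher.matched()

        would typically produce a guess containing, among other properties, the
        series name, the season and episode numbers, the video codec and the
        container.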
        """

        if filetype not in ('autodetect', 'subtitle', 'movie', 'moviesubtitle',
                            'episode', 'episodesubtitle'):
            raise ValueError, "filetype needs to be one of ('autodetect', 'subtitle', 'movie', 'moviesubtitle', 'episode', 'episodesubtitle')"
        if not isinstance(filename, unicode):
            log.debug('WARNING: given filename to matcher is not unicode...')

        match_tree = []
        result = [] # list of found metadata

        def guessed(match_dict, confidence):
            guess = format_guess(Guess(match_dict, confidence = confidence))
            result.append(guess)
            log.debug('Found with confidence %.2f: %s' % (confidence, guess))
            return guess

        def update_found(leftover, group_pos, guess):
            pidx, eidx, gidx = group_pos
            group = match_tree[pidx][eidx][gidx]
            match_tree[pidx][eidx][gidx] = (group[0],
                                            deleted * len(group[0]),
                                            guess)
            return [ g for g in leftover if g[1] != group_pos ]

        # 1- first split our path into dirs + basename + ext
        match_tree = split_path_components(filename)

        fileext = match_tree.pop(-1)[1:].lower()
        if fileext in subtitle_exts:
            if 'movie' in filetype:
                filetype = 'moviesubtitle'
            elif 'episode' in filetype:
                filetype = 'episodesubtitle'
            else:
                filetype = 'subtitle'
            extguess = guessed({ 'container': fileext }, confidence = 1.0)
        elif fileext in video_exts:
            extguess = guessed({ 'container': fileext }, confidence = 1.0)
        else:
            extguess = guessed({ 'extension': fileext }, confidence = 1.0)

        # TODO: depending on the extension, we could already grab some info and maybe use
        # specialized guessers, eg: a lang parser for idx files, an automatic detection of
        # the language for srt files, a video metadata extractor for avi, mkv, ...

        # if we are on autodetect, try to do it now so we can tell the
        # guess_groups function what type of info it should be looking for
        if filetype in ('autodetect', 'subtitle'):
            for rexp, confidence, span_adjust in episode_rexps:
                match = re.search(rexp, filename, re.IGNORECASE)
                if match:
                    if filetype == 'autodetect':
                        filetype = 'episode'
                    elif filetype == 'subtitle':
                        filetype = 'episodesubtitle'
                    break

            # if no episode info was found, assume it's a movie
            if filetype == 'autodetect':
                filetype = 'movie'
            elif filetype == 'subtitle':
                filetype = 'moviesubtitle'

        guessed({ 'type': filetype }, confidence = 1.0)

        # 2- split each of those into explicit groups, if any
        # note: be careful, as this might split apart some groups that match regexps
        # with more confidence, such as Alfleni-Team or [XCT], or split a date such
        # as (14-01-2008)
        match_tree = [ split_explicit_groups(part) for part in match_tree ]

        # 3- try to match information in decreasing order of confidence and
        # blank the matching group in the string if we found something
        for pathpart in match_tree:
            for gidx, explicit_group in enumerate(pathpart):
                pathpart[gidx] = guess_groups(explicit_group, result, filetype = filetype)

        # 4- try to identify the remaining unknown groups by looking at their position
        # relative to other known elements

        if filetype in ('episode', 'episodesubtitle'):
            eps = find_group(match_tree, 'episodeNumber')
            if eps:
                match_tree = match_from_epnum_position(match_tree, eps[0], guessed, update_found)

            leftover = leftover_valid_groups(match_tree)

            if not eps:
                # if we don't have the episode number, but at least 2 groups in the
                # last path group, then it's probably series - eptitle
                title_candidates = filter(lambda g: g[0].lower() not in non_episode_title,
                                          filter(lambda g: g[1][0] == len(match_tree)-1,
                                                 leftover_valid_groups(match_tree)))
                if len(title_candidates) >= 2:
                    guess = guessed({ 'series': title_candidates[0][0] }, confidence = 0.4)
                    leftover = update_found(leftover, title_candidates[0][1], guess)
                    guess = guessed({ 'title': title_candidates[1][0] }, confidence = 0.4)
                    leftover = update_found(leftover, title_candidates[1][1], guess)

            # if there's a path group that only contains the season info, then the previous one
            # is most likely the series title (ie: .../series/season X/...)
            eps = [ gpos for gpos in find_group(match_tree, 'season')
                    if 'episodeNumber' not in get_group(match_tree, gpos)[2] ]

            if eps:
                pidx, eidx, gidx = eps[0]
                previous = [ group for group in leftover if group[1][0] == pidx - 1 ]
                if len(previous) == 1:
                    guess = guessed({ 'series': previous[0][0] }, confidence = 0.5)
                    leftover = update_found(leftover, previous[0][1], guess)

        elif filetype in ('movie', 'moviesubtitle'):
            leftover_all = leftover_valid_groups(match_tree)

            # specific cases:
            #  - movies/tttttt (yyyy)/tttttt.ccc
            try:
                if match_tree[-3][0][0][0].lower() == 'movies':
                    # Note: too generic, might solve all the unittests as they all contain 'movies'
                    # in their path
                    #
                    #if len(match_tree[-2][0]) == 1:
                    #    title = match_tree[-2][0][0]
                    #    guess = guessed({ 'title': clean_string(title[0]) }, confidence = 0.7)
                    #    update_found(leftover_all, title, guess)

                    year_group = filter(lambda gpos: gpos[0] == len(match_tree)-2,
                                        find_group(match_tree, 'year'))[0]
                    leftover = leftover_valid_groups(match_tree,
                                                     valid = lambda g: ((g[0] and g[0][0] not in sep) and
                                                                        g[1][0] == len(match_tree) - 2))
                    if len(match_tree[-2]) == 2 and year_group[1] == 1:
                        title = leftover[0]
                        guess = guessed({ 'title': clean_string(title[0]) },
                                        confidence = 0.8)
                        update_found(leftover_all, title[1], guess)
                        raise Exception # to exit the try block now

                leftover = [ g for g in leftover_all if (g[1][0] == year_group[0] and
                                                         g[1][1] < year_group[1] and
                                                         g[1][2] < year_group[2]) ]
                leftover = sorted(leftover, key = lambda x: x[1])
                title = leftover[0]
                guess = guessed({ 'title': title[0] }, confidence = 0.8)
                leftover = update_found(leftover, title[1], guess)
            except:
                pass

            # if we have either the format or the videoCodec in the folder containing the file
            # or one of its parents, then we should probably look for the title in
            # there rather than in the basename
            props = filter(lambda g: g[0] <= len(match_tree) - 2,
                           find_group(match_tree, 'videoCodec') +
                           find_group(match_tree, 'format') +
                           find_group(match_tree, 'language'))
            leftover = None
            if props and all(g[0] == props[0][0] for g in props):
                leftover = [ g for g in leftover_all if g[1][0] == props[0][0] ]

            if props and leftover:
                guess = guessed({ 'title': leftover[0][0] }, confidence = 0.7)
                leftover = update_found(leftover, leftover[0][1], guess)

            else:
                # the first leftover group in the last path part sounds like a good candidate for the title,
                # except if it's only one word and the first group before it has at least 3 words in it
                # (case where the filename contains a short 8-char name and the movie title is
                # actually in the parent directory name)
                leftover = [ g for g in leftover_all if g[1][0] == len(match_tree)-1 ]
                if leftover:
                    title, (pidx, eidx, gidx) = leftover[0]
                    previous_pgroup_leftover = filter(lambda g: g[1][0] == pidx-1, leftover_all)

                    if (title.count(' ') == 0 and
                            previous_pgroup_leftover and
                            previous_pgroup_leftover[0][0].count(' ') >= 2):

                        guess = guessed({ 'title': previous_pgroup_leftover[0][0] }, confidence = 0.6)
                        leftover = update_found(leftover, previous_pgroup_leftover[0][1], guess)

                    else:
                        guess = guessed({ 'title': title }, confidence = 0.6)
                        leftover = update_found(leftover, leftover[0][1], guess)
                else:
                    # if there were no leftover groups in the last path part, look in the one before that
                    previous_pgroup_leftover = filter(lambda g: g[1][0] == len(match_tree)-2, leftover_all)
                    if previous_pgroup_leftover:
                        guess = guessed({ 'title': previous_pgroup_leftover[0][0] }, confidence = 0.6)
                        leftover = update_found(leftover, previous_pgroup_leftover[0][1], guess)

        # 5- perform some post-processing steps

        # 5.1- try to promote language to subtitle language where it makes sense
        for pidx, eidx, gidx in find_group(match_tree, 'language'):
            string, remaining, guess = get_group(match_tree, (pidx, eidx, gidx))

            def promote_subtitle():
                guess.set('subtitleLanguage', guess['language'], confidence = guess.confidence('language'))
                del guess['language']

            # - if we matched a language in a file with a sub extension and the group
            #   is the last group of the filename, it is probably the language of the subtitle
            #   (eg: 'xxx.english.srt')
            if (fileext in subtitle_exts and
                    pidx == len(match_tree) - 1 and
                    eidx == len(match_tree[pidx]) - 1):
                promote_subtitle()

            # - if a language is in an explicit group just preceded by "st", it is a subtitle
            #   language (eg: '...st[fr-eng]...')
            if eidx > 0:
                previous = get_group(match_tree, (pidx, eidx-1, -1))
                if previous[0][-2:].lower() == 'st':
                    promote_subtitle()

        # re-append the extension now
        match_tree.append([[(fileext, deleted*len(fileext), extguess)]])

        self.parts = result
        self.match_tree = match_tree

        if filename.startswith('/'):
            filename = ' ' + filename

        log.debug('Found match tree:\n%s\n%s' % (to_utf8(tree_to_string(match_tree)),
                                                 to_utf8(filename)))

    def matched(self):
        # we need to make a copy here, as the merge functions work in place and
        # calling them directly on self.parts would modify it
        parts = copy.deepcopy(self.parts)

        # 1- start by doing some common preprocessing tasks

        # 1.1- ", the" at the end of a series title should be prepended to it
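        # (e.g. a series guessed as 'Sopranos, The' becomes 'The Sopranos')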
        for part in parts:
            if 'series' not in part:
                continue

            series = part['series']
            lseries = series.lower()

            if lseries[-4:] == ',the':
                part['series'] = 'The ' + series[:-4]

            if lseries[-5:] == ', the':
                part['series'] = 'The ' + series[:-5]

        # 2- try to merge similar information together and give it a higher confidence
        for int_part in ('year', 'season', 'episodeNumber'):
            merge_similar_guesses(parts, int_part, choose_int)

        for string_part in ('title', 'series', 'container', 'format', 'releaseGroup', 'website',
                            'audioCodec', 'videoCodec', 'screenSize', 'episodeFormat'):
            merge_similar_guesses(parts, string_part, choose_string)

        result = merge_all(parts, append = ['language', 'subtitleLanguage', 'other'])

        # 3- some last minute post-processing
        if (result['type'] == 'episode' and
                'season' not in result and
                result.get('episodeFormat', '') == 'Minisode'):
            result['season'] = 0

        log.debug('Final result: ' + result.nice_string())
        return result