CouchPotatoServer/couchpotato/core/plugins/scanner.py

import os
import re
import threading
import time
import traceback

from couchpotato import get_db
from couchpotato.core.event import fireEvent, addEvent
from couchpotato.core.helpers.encoding import toUnicode, simplifyString, sp
from couchpotato.core.helpers.variable import getExt, getImdb, tryInt, \
    splitString, getIdentifier
from couchpotato.core.logger import CPLog
from couchpotato.core.plugins.base import Plugin
from enzyme.exceptions import NoParserError, ParseError
from guessit import guess_movie_info
from scandir import scandir
from subliminal.videos import Video
import enzyme
from six.moves import filter, map, zip


log = CPLog(__name__)

autoload = 'Scanner'


class Scanner(Plugin):

    ignored_in_path = [os.path.sep + 'extracted' + os.path.sep, 'extracting', '_unpack', '_failed_', '_unknown_', '_exists_', '_failed_remove_',
                       '_failed_rename_', '.appledouble', '.appledb', '.appledesktop', os.path.sep + '._', '.ds_store', 'cp.cpnfo',
                       'thumbs.db', 'ehthumbs.db', 'desktop.ini']  #unpacking, smb-crap, hidden files
    ignore_names = ['extract', 'extracting', 'extracted', 'movie', 'movies', 'film', 'films', 'download', 'downloads', 'video_ts', 'audio_ts', 'bdmv', 'certificate']
    extensions = {
        'movie': ['mkv', 'wmv', 'avi', 'mpg', 'mpeg', 'mp4', 'm2ts', 'iso', 'img', 'mdf', 'ts', 'm4v'],
        'movie_extra': ['mds'],
        'dvd': ['vts_*', 'vob'],
        'nfo': ['nfo', 'txt', 'tag'],
        'subtitle': ['sub', 'srt', 'ssa', 'ass'],
        'subtitle_extra': ['idx'],
        'trailer': ['mov', 'mp4', 'flv']
    }

    file_types = {
        'subtitle': ('subtitle', 'subtitle'),
        'subtitle_extra': ('subtitle', 'subtitle_extra'),
        'trailer': ('video', 'trailer'),
        'nfo': ('nfo', 'nfo'),
        'movie': ('video', 'movie'),
        'movie_extra': ('movie', 'movie_extra'),
        'backdrop': ('image', 'backdrop'),
        'poster': ('image', 'poster'),
        'thumbnail': ('image', 'thumbnail'),
        'leftover': ('leftover', 'leftover'),
    }

    file_sizes = {  # in MB
        'movie': {'min': 300},
        'trailer': {'min': 2, 'max': 250},
        'backdrop': {'min': 0, 'max': 5},
    }

    codecs = {
        'audio': ['dts', 'ac3', 'ac3d', 'mp3'],
        'video': ['x264', 'h264', 'divx', 'xvid']
    }

    audio_codec_map = {
        0x2000: 'ac3',
        0x2001: 'dts',
        0x0055: 'mp3',
        0x0050: 'mp2',
        0x0001: 'pcm',
        0x003: 'pcm',
        0x77a1: 'tta1',
        0x5756: 'wav',
        0x6750: 'vorbis',
        0xF1AC: 'flac',
        0x00ff: 'aac',
    }

    source_media = {
        'bluray': ['bluray', 'blu-ray', 'brrip', 'br-rip'],
        'hddvd': ['hddvd', 'hd-dvd'],
        'dvd': ['dvd'],
        'hdtv': ['hdtv']
    }

    clean = '[ _\,\.\(\)\[\]\-]?(3d|hsbs|sbs|extended.cut|directors.cut|french|swedisch|danish|dutch|swesub|spanish|german|ac3|dts|custom|dc|divx|divx5|dsr|dsrip|dutch|dvd|dvdr|dvdrip|dvdscr|dvdscreener|screener|dvdivx|cam|fragment|fs|hdtv|hdrip' \
            '|hdtvrip|internal|limited|multisubs|ntsc|ogg|ogm|pal|pdtv|proper|repack|rerip|retail|r3|r5|bd5|se|svcd|swedish|german|read.nfo|nfofix|unrated|ws|telesync|ts|telecine|tc|brrip|bdrip|video_ts|audio_ts|480p|480i|576p|576i|720p|720i|1080p|1080i|hrhd|hrhdtv|hddvd|bluray|x264|h264|xvid|xvidvd|xxx|www.www|cd[1-9]|\[.*\])([ _\,\.\(\)\[\]\-]|$)'
    multipart_regex = [
        '[ _\.-]+cd[ _\.-]*([0-9a-d]+)',  #*cd1
        '[ _\.-]+dvd[ _\.-]*([0-9a-d]+)',  #*dvd1
        '[ _\.-]+part[ _\.-]*([0-9a-d]+)',  #*part1
        '[ _\.-]+dis[ck][ _\.-]*([0-9a-d]+)',  #*disk1
        'cd[ _\.-]*([0-9a-d]+)$',  #cd1.ext
        'dvd[ _\.-]*([0-9a-d]+)$',  #dvd1.ext
        'part[ _\.-]*([0-9a-d]+)$',  #part1.mkv
        'dis[ck][ _\.-]*([0-9a-d]+)$',  #disk1.mkv
        '()[ _\.-]+([0-9]*[abcd]+)(\.....?)$',
        '([a-z])([0-9]+)(\.....?)$',
        '()([ab])(\.....?)$'  #*a.mkv
    ]

    cp_imdb = '(.cp.(?P<id>tt[0-9{7}]+).)'

    def __init__(self):

        addEvent('scanner.create_file_identifier', self.createStringIdentifier)
        addEvent('scanner.remove_cptag', self.removeCPTag)

        addEvent('scanner.scan', self.scan)
        addEvent('scanner.name_year', self.getReleaseNameYear)
        addEvent('scanner.partnumber', self.getPartNumber)

    def scan(self, folder = None, files = None, release_download = None, simple = False, newer_than = 0, return_ignored = True, on_found = None):

        folder = sp(folder)

        if not folder or not os.path.isdir(folder):
            log.error('Folder doesn\'t exists: %s', folder)
            return {}

        # Get movie "master" files
        movie_files = {}
        leftovers = []

        # Scan all files of the folder if no files are set
        if not files:
            check_file_date = True
            try:
                files = []
                for root, dirs, walk_files in scandir.walk(folder):
                    files.extend([sp(os.path.join(root, filename)) for filename in walk_files])

                    # Break if CP wants to shut down
                    if self.shuttingDown():
                        break

            except:
                log.error('Failed getting files from %s: %s', (folder, traceback.format_exc()))

            log.debug('Found %s files to scan and group in %s', (len(files), folder))
        else:
            check_file_date = False
            files = [sp(x) for x in files]

        for file_path in files:

            if not os.path.exists(file_path):
                continue

            # Remove ignored files
            if self.isSampleFile(file_path):
                leftovers.append(file_path)
                continue
            elif not self.keepFile(file_path):
                continue

            is_dvd_file = self.isDVDFile(file_path)
            if self.filesizeBetween(file_path, self.file_sizes['movie']) or is_dvd_file: # Minimal 300MB files or is DVD file

                # Normal identifier
                identifier = self.createStringIdentifier(file_path, folder, exclude_filename = is_dvd_file)
                identifiers = [identifier]

                # Identifier with quality
                quality = fireEvent('quality.guess', [file_path], single = True) if not is_dvd_file else {'identifier':'dvdr'}
                if quality:
                    identifier_with_quality = '%s %s' % (identifier, quality.get('identifier', ''))
                    identifiers = [identifier_with_quality, identifier]

                if not movie_files.get(identifier):
                    movie_files[identifier] = {
                        'unsorted_files': [],
                        'identifiers': identifiers,
                        'is_dvd': is_dvd_file,
                    }

                movie_files[identifier]['unsorted_files'].append(file_path)
            else:
                leftovers.append(file_path)

            # Break if CP wants to shut down
            if self.shuttingDown():
                break

        # Cleanup
        del files

        # Sort reverse, this prevents "Iron man 2" from getting grouped with "Iron man" as the "Iron Man 2"
        # files will be grouped first.
        leftovers = set(sorted(leftovers, reverse = True))

        # Group files minus extension
        ignored_identifiers = []
        for identifier, group in movie_files.items():
            if identifier not in group['identifiers'] and len(identifier) > 0: group['identifiers'].append(identifier)

            log.debug('Grouping files: %s', identifier)

            has_ignored = 0
            for file_path in list(group['unsorted_files']):
                ext = getExt(file_path)
                wo_ext = file_path[:-(len(ext) + 1)]
                found_files = set([i for i in leftovers if wo_ext in i])
                group['unsorted_files'].extend(found_files)
                leftovers = leftovers - found_files

                has_ignored += 1 if ext == 'ignore' else 0

            if has_ignored == 0:
                for file_path in list(group['unsorted_files']):
                    ext = getExt(file_path)
                    has_ignored += 1 if ext == 'ignore' else 0

            if has_ignored > 0:
                ignored_identifiers.append(identifier)

            # Break if CP wants to shut down
            if self.shuttingDown():
                break


        # Create identifiers for all leftover files
        path_identifiers = {}
        for file_path in leftovers:
            identifier = self.createStringIdentifier(file_path, folder)

            if not path_identifiers.get(identifier):
                path_identifiers[identifier] = []

            path_identifiers[identifier].append(file_path)


        # Group the files based on the identifier
        delete_identifiers = []
        for identifier, found_files in path_identifiers.items():
            log.debug('Grouping files on identifier: %s', identifier)

            group = movie_files.get(identifier)
            if group:
                group['unsorted_files'].extend(found_files)
                delete_identifiers.append(identifier)

                # Remove the found files from the leftover stack
                leftovers = leftovers - set(found_files)

            # Break if CP wants to shut down
            if self.shuttingDown():
                break

        # Cleaning up used
        for identifier in delete_identifiers:
            if path_identifiers.get(identifier):
                del path_identifiers[identifier]
        del delete_identifiers

        # Group based on folder
        delete_identifiers = []
        for identifier, found_files in path_identifiers.items():
            log.debug('Grouping files on foldername: %s', identifier)

            for ff in found_files:
                new_identifier = self.createStringIdentifier(os.path.dirname(ff), folder)

                group = movie_files.get(new_identifier)
                if group:
                    group['unsorted_files'].extend([ff])
                    delete_identifiers.append(identifier)

                    # Remove the found files from the leftover stack
                    leftovers -= leftovers - set([ff])

            # Break if CP wants to shut down
            if self.shuttingDown():
                break

        # leftovers should be empty
        if leftovers:
            log.debug('Some files are still left over: %s', leftovers)

        # Cleaning up used
        for identifier in delete_identifiers:
            if path_identifiers.get(identifier):
                del path_identifiers[identifier]
        del delete_identifiers

        # Make sure we remove older / still extracting files
        valid_files = {}
        while True and not self.shuttingDown():
            try:
                identifier, group = movie_files.popitem()
            except:
                break

            # Check if movie is fresh and maybe still unpacking, ignore files newer than 1 minute
            if check_file_date:
                files_too_new, time_string = self.checkFilesChanged(group['unsorted_files'])
                if files_too_new:
                    log.info('Files seem to be still unpacking or just unpacked (created on %s), ignoring for now: %s', (time_string, identifier))

                    # Delete the unsorted list
                    del group['unsorted_files']

                    continue

            # Only process movies newer than x
            if newer_than and newer_than > 0:
                has_new_files = False
                for cur_file in group['unsorted_files']:
                    file_time = self.getFileTimes(cur_file)
                    if file_time[0] > newer_than or file_time[1] > newer_than:
                        has_new_files = True
                        break

                if not has_new_files:
                    log.debug('None of the files have changed since %s for %s, skipping.', (time.ctime(newer_than), identifier))

                    # Delete the unsorted list
                    del group['unsorted_files']

                    continue

            valid_files[identifier] = group

        del movie_files

        total_found = len(valid_files)

        # Make sure only one movie was found if a download ID is provided
        if release_download and total_found == 0:
            log.info('Download ID provided (%s), but no groups found! Make sure the download contains valid media files (fully extracted).', release_download.get('imdb_id'))
        elif release_download and total_found > 1:
            log.info('Download ID provided (%s), but more than one group found (%s). Ignoring Download ID...', (release_download.get('imdb_id'), len(valid_files)))
            release_download = None

        # Determine file types
        processed_movies = {}
        while True and not self.shuttingDown():
            try:
                identifier, group = valid_files.popitem()
            except:
                break

            if return_ignored is False and identifier in ignored_identifiers:
                log.debug('Ignore file found, ignoring release: %s', identifier)
                continue

            # Group extra (and easy) files first
            group['files'] = {
                'movie_extra': self.getMovieExtras(group['unsorted_files']),
                'subtitle': self.getSubtitles(group['unsorted_files']),
                'subtitle_extra': self.getSubtitlesExtras(group['unsorted_files']),
                'nfo': self.getNfo(group['unsorted_files']),
                'trailer': self.getTrailers(group['unsorted_files']),
                'leftover': set(group['unsorted_files']),
            }

            # Media files
            if group['is_dvd']:
                group['files']['movie'] = self.getDVDFiles(group['unsorted_files'])
            else:
                group['files']['movie'] = self.getMediaFiles(group['unsorted_files'])

            if len(group['files']['movie']) == 0:
                log.error('Couldn\'t find any movie files for %s', identifier)
                continue

            log.debug('Getting metadata for %s', identifier)
            group['meta_data'] = self.getMetaData(group, folder = folder, release_download = release_download)

            # Subtitle meta
            group['subtitle_language'] = self.getSubtitleLanguage(group) if not simple else {}

            # Get parent dir from movie files
            for movie_file in group['files']['movie']:
                group['parentdir'] = os.path.dirname(movie_file)
                group['dirname'] = None

                folder_names = group['parentdir'].replace(folder, '').split(os.path.sep)
                folder_names.reverse()

                # Try and get a proper dirname, so no "A", "Movie", "Download" etc
                for folder_name in folder_names:
                    if folder_name.lower() not in self.ignore_names and len(folder_name) > 2:
                        group['dirname'] = folder_name
                        break

                break

            # Leftover "sorted" files
            for file_type in group['files']:
                if not file_type is 'leftover':
                    group['files']['leftover'] -= set(group['files'][file_type])
                    group['files'][file_type] = list(group['files'][file_type])
            group['files']['leftover'] = list(group['files']['leftover'])

            # Delete the unsorted list
            del group['unsorted_files']

            # Determine movie
            group['media'] = self.determineMedia(group, release_download = release_download)
            if not group['media']:
                log.error('Unable to determine media: %s', group['identifiers'])
            else:
                group['identifier'] = getIdentifier(group['media']) or group['media']['info'].get('imdb')

            processed_movies[identifier] = group

            # Notify parent & progress on something found
            if on_found:
                on_found(group, total_found, total_found - len(processed_movies))

            # Wait for all the async events calm down a bit
            while threading.activeCount() > 100 and not self.shuttingDown():
                log.debug('Too many threads active, waiting a few seconds')
                time.sleep(10)

        if len(processed_movies) > 0:
            log.info('Found %s movies in the folder %s', (len(processed_movies), folder))
        else:
            log.debug('Found no movies in the folder %s', folder)

        return processed_movies

    def getMetaData(self, group, folder = '', release_download = None):

        data = {}
        files = list(group['files']['movie'])

        for cur_file in files:
            if not self.filesizeBetween(cur_file, self.file_sizes['movie']): continue  # Ignore smaller files

            meta = self.getMeta(cur_file)

            try:
                data['video'] = meta.get('video', self.getCodec(cur_file, self.codecs['video']))
                data['audio'] = meta.get('audio', self.getCodec(cur_file, self.codecs['audio']))
                data['resolution_width'] = meta.get('resolution_width', 720)
                data['resolution_height'] = meta.get('resolution_height', 480)
                data['audio_channels'] = meta.get('audio_channels', 2.0)
                data['aspect'] = round(float(meta.get('resolution_width', 720)) / meta.get('resolution_height', 480), 2)
            except:
                log.debug('Error parsing metadata: %s %s', (cur_file, traceback.format_exc()))
                pass

            if data.get('audio'): break

        # Use the quality guess first, if that failes use the quality we wanted to download
        data['quality'] = None
        if release_download and release_download.get('quality'):
            data['quality'] = fireEvent('quality.single', release_download.get('quality'), single = True)

        if not data['quality']:
            data['quality'] = fireEvent('quality.guess', files = files, extra = data, single = True)

            if not data['quality']:
                data['quality'] = fireEvent('quality.single', 'dvdr' if group['is_dvd'] else 'dvdrip', single = True)

        data['quality_type'] = 'HD' if data.get('resolution_width', 0) >= 1280 or data['quality'].get('hd') else 'SD'

        filename = re.sub('(.cp\(tt[0-9{7}]+\))', '', files[0])
        data['group'] = self.getGroup(filename[len(folder):])
        data['source'] = self.getSourceMedia(filename)

        return data

    def getMeta(self, filename):

        try:
            p = enzyme.parse(filename)

            # Video codec
            vc = ('h264' if p.video[0].codec == 'AVC1' else p.video[0].codec).lower()

            # Audio codec
            ac = p.audio[0].codec
            try: ac = self.audio_codec_map.get(p.audio[0].codec)
            except: pass

            return {
                'video': vc,
                'audio': ac,
                'resolution_width': tryInt(p.video[0].width),
                'resolution_height': tryInt(p.video[0].height),
                'audio_channels': p.audio[0].channels,
            }
        except ParseError:
            log.debug('Failed to parse meta for %s', filename)
        except NoParserError:
            log.debug('No parser found for %s', filename)
        except:
            log.debug('Failed parsing %s', filename)

        return {}

    def getSubtitleLanguage(self, group):
        detected_languages = {}

        # Subliminal scanner
        paths = None
        try:
            paths = group['files']['movie']
            scan_result = []
            for p in paths:
                if not group['is_dvd']:
                    video = Video.from_path(toUnicode(p))
                    video_result = [(video, video.scan())]
                    scan_result.extend(video_result)

            for video, detected_subtitles in scan_result:
                for s in detected_subtitles:
                    if s.language and s.path not in paths:
                        detected_languages[s.path] = [s.language]
        except:
            log.debug('Failed parsing subtitle languages for %s: %s', (paths, traceback.format_exc()))

        # IDX
        for extra in group['files']['subtitle_extra']:
            try:
                if os.path.isfile(extra):
                    output = open(extra, 'r')
                    txt = output.read()
                    output.close()

                    idx_langs = re.findall('\nid: (\w+)', txt)

                    sub_file = '%s.sub' % os.path.splitext(extra)[0]
                    if len(idx_langs) > 0 and os.path.isfile(sub_file):
                        detected_languages[sub_file] = idx_langs
            except:
                log.error('Failed parsing subtitle idx for %s: %s', (extra, traceback.format_exc()))

        return detected_languages

    def determineMedia(self, group, release_download = None):

        # Get imdb id from downloader
        imdb_id = release_download and release_download.get('imdb_id')
        if imdb_id:
            log.debug('Found movie via imdb id from it\'s download id: %s', release_download.get('imdb_id'))

        files = group['files']

        # Check for CP(imdb_id) string in the file paths
        if not imdb_id:
            for cur_file in files['movie']:
                imdb_id = self.getCPImdb(cur_file)
                if imdb_id:
                    log.debug('Found movie via CP tag: %s', cur_file)
                    break

        # Check and see if nfo contains the imdb-id
        nfo_file = None
        if not imdb_id:
            try:
                for nf in files['nfo']:
                    imdb_id = getImdb(nf, check_inside = True)
                    if imdb_id:
                        log.debug('Found movie via nfo file: %s', nf)
                        nfo_file = nf
                        break
            except:
                pass

        # Check and see if filenames contains the imdb-id
        if not imdb_id:
            try:
                for filetype in files:
                    for filetype_file in files[filetype]:
                        imdb_id = getImdb(filetype_file)
                        if imdb_id:
                            log.debug('Found movie via imdb in filename: %s', nfo_file)
                            break
            except:
                pass

        # Search based on identifiers
        if not imdb_id:
            for identifier in group['identifiers']:

                if len(identifier) > 2:
                    try: filename = list(group['files'].get('movie'))[0]
                    except: filename = None

                    name_year = self.getReleaseNameYear(identifier, file_name = filename if not group['is_dvd'] else None)
                    if name_year.get('name') and name_year.get('year'):
                        movie = fireEvent('movie.search', q = '%(name)s %(year)s' % name_year, merge = True, limit = 1)

                        if len(movie) > 0:
                            imdb_id = movie[0].get('imdb')
                            log.debug('Found movie via search: %s', cur_file)
                            if imdb_id: break
                else:
                    log.debug('Identifier to short to use for search: %s', identifier)

        if imdb_id:
            try:
                db = get_db()
                return db.get('media', imdb_id, with_doc = True)['doc']
            except:
                log.debug('Movie "%s" not in library, just getting info', imdb_id)
                return {
                    'identifier': imdb_id,
                    'info': fireEvent('movie.info', identifier = imdb_id, merge = True, extended = False)
                }

        log.error('No imdb_id found for %s. Add a NFO file with IMDB id or add the year to the filename.', group['identifiers'])
        return {}

    def getCPImdb(self, string):

        try:
            m = re.search(self.cp_imdb, string.lower())
            id = m.group('id')
            if id: return id
        except AttributeError:
            pass

        return False

    def removeCPTag(self, name):
        try:
            return re.sub(self.cp_imdb, '', name)
        except:
            pass
        return name

    def getSamples(self, files):
        return set(filter(lambda s: self.isSampleFile(s), files))

    def getMediaFiles(self, files):

        def test(s):
            return self.filesizeBetween(s, self.file_sizes['movie']) and getExt(s.lower()) in self.extensions['movie'] and not self.isSampleFile(s)

        return set(filter(test, files))

    def getMovieExtras(self, files):
        return set(filter(lambda s: getExt(s.lower()) in self.extensions['movie_extra'], files))

    def getDVDFiles(self, files):
        def test(s):
            return self.isDVDFile(s)

        return set(filter(test, files))

    def getSubtitles(self, files):
        return set(filter(lambda s: getExt(s.lower()) in self.extensions['subtitle'], files))

    def getSubtitlesExtras(self, files):
        return set(filter(lambda s: getExt(s.lower()) in self.extensions['subtitle_extra'], files))

    def getNfo(self, files):
        return set(filter(lambda s: getExt(s.lower()) in self.extensions['nfo'], files))

    def getTrailers(self, files):

        def test(s):
            return re.search('(^|[\W_])trailer\d*[\W_]', s.lower()) and self.filesizeBetween(s, self.file_sizes['trailer'])

        return set(filter(test, files))

    def getImages(self, files):

        def test(s):
            return getExt(s.lower()) in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tbn']
        files = set(filter(test, files))

        images = {
            'backdrop': set(filter(lambda s: re.search('(^|[\W_])fanart|backdrop\d*[\W_]', s.lower()) and self.filesizeBetween(s, self.file_sizes['backdrop']), files))
        }

        # Rest
        images['rest'] = files - images['backdrop']

        return images


    def isDVDFile(self, file_name):

        if list(set(file_name.lower().split(os.path.sep)) & set(['video_ts', 'audio_ts'])):
            return True

        for needle in ['vts_', 'video_ts', 'audio_ts', 'bdmv', 'certificate']:
            if needle in file_name.lower():
                return True

        return False

    def keepFile(self, filename):

        # ignoredpaths
        for i in self.ignored_in_path:
            if i in filename.lower():
                log.debug('Ignored "%s" contains "%s".', (filename, i))
                return False

        # All is OK
        return True

    def isSampleFile(self, filename):
        is_sample = re.search('(^|[\W_])sample\d*[\W_]', filename.lower())
        if is_sample: log.debug('Is sample file: %s', filename)
        return is_sample

    def filesizeBetween(self, file, file_size = None):
        if not file_size: file_size = []

        try:
            return (file_size.get('min', 0) * 1048576) < os.path.getsize(file) < (file_size.get('max', 100000) * 1048576)
        except:
            log.error('Couldn\'t get filesize of %s.', file)

        return False

    def createStringIdentifier(self, file_path, folder = '', exclude_filename = False):

        year = self.findYear(file_path)

        identifier = file_path.replace(folder, '').lstrip(os.path.sep) # root folder
        identifier = os.path.splitext(identifier)[0] # ext

        try:
            path_split = splitString(identifier, os.path.sep)
            identifier = path_split[-2] if len(path_split) > 1 and len(path_split[-2]) > len(path_split[-1]) else path_split[-1] # Only get filename
        except: pass

        if exclude_filename:
            identifier = identifier[:len(identifier) - len(os.path.split(identifier)[-1])]

        # multipart
        identifier = self.removeMultipart(identifier)

        # remove cptag
        identifier = self.removeCPTag(identifier)

        # groups, release tags, scenename cleaner, regex isn't correct
        identifier = re.sub(self.clean, '::', simplifyString(identifier)).strip(':')

        # Year
        if year and identifier[:4] != year:
            split_by = ':::' if ':::' in identifier else year
            identifier = '%s %s' % (identifier.split(split_by)[0].strip(), year)
        else:
            identifier = identifier.split('::')[0]

        # Remove duplicates
        out = []
        for word in identifier.split():
            if not word in out:
                out.append(word)

        identifier = ' '.join(out)

        return simplifyString(identifier)


    def removeMultipart(self, name):
        for regex in self.multipart_regex:
            try:
                found = re.sub(regex, '', name)
                if found != name:
                    name = found
            except:
                pass
        return name

    def getPartNumber(self, name):
        for regex in self.multipart_regex:
            try:
                found = re.search(regex, name)
                if found:
                    return found.group(1)
                return 1
            except:
                pass
        return 1

    def getCodec(self, filename, codecs):
        codecs = map(re.escape, codecs)
        try:
            codec = re.search('[^A-Z0-9](?P<codec>' + '|'.join(codecs) + ')[^A-Z0-9]', filename, re.I)
            return (codec and codec.group('codec')) or ''
        except:
            return ''

    def getGroup(self, file):
        try:
            match = re.findall('\-([A-Z0-9]+)[\.\/]', file, re.I)
            return match[-1] or ''
        except:
            return ''

    def getSourceMedia(self, file):
        for media in self.source_media:
            for alias in self.source_media[media]:
                if alias in file.lower():
                    return media

        return None

    def findYear(self, text):

        # Search year inside () or [] first
        matches = re.findall('(\(|\[)(?P<year>19[0-9]{2}|20[0-9]{2})(\]|\))', text)
        if matches:
            return matches[-1][1]

        # Search normal
        matches = re.findall('(?P<year>19[0-9]{2}|20[0-9]{2})', text)
        if matches:
            return matches[-1]

        return ''

    def getReleaseNameYear(self, release_name, file_name = None):

        release_name = release_name.strip(' .-_')

        # Use guessit first
        guess = {}
        if file_name:
            try:
                guessit = guess_movie_info(toUnicode(file_name))
                if guessit.get('title') and guessit.get('year'):
                    guess = {
                        'name': guessit.get('title'),
                        'year': guessit.get('year'),
                    }
            except:
                log.debug('Could not detect via guessit "%s": %s', (file_name, traceback.format_exc()))

        # Backup to simple
        cleaned = ' '.join(re.split('\W+', simplifyString(release_name)))
        cleaned = re.sub(self.clean, ' ', cleaned)

        for year_str in [file_name, release_name, cleaned]:
            if not year_str: continue
            year = self.findYear(year_str)
            if year:
                break

        cp_guess = {}

        if year: # Split name on year
            try:
                movie_name = cleaned.rsplit(year, 1).pop(0).strip()
                if movie_name:
                    cp_guess = {
                        'name': movie_name,
                        'year': int(year),
                    }
            except:
                pass

        if not cp_guess:  # Split name on multiple spaces
            try:
                movie_name = cleaned.split('  ').pop(0).strip()
                cp_guess = {
                    'name': movie_name,
                    'year': int(year) if movie_name[:4] != year else 0,
                }
            except:
                pass

        if cp_guess.get('year') == guess.get('year') and len(cp_guess.get('name', '')) > len(guess.get('name', '')):
            return cp_guess
        elif guess == {}:
            return cp_guess

        return guess