23 changed files with 1122 additions and 374 deletions
@@ -0,0 +1,6 @@
from .main import Matcher

def start():
    return Matcher()

config = []
@@ -0,0 +1,109 @@
from caper import Caper
from couchpotato import CPLog, tryInt
from couchpotato.core.event import addEvent, fireEvent
from couchpotato.core.helpers.encoding import simplifyString
from couchpotato.core.helpers.variable import possibleTitles, dictIsSubset
from couchpotato.core.plugins.base import Plugin

log = CPLog(__name__)


class Matcher(Plugin):
    def __init__(self):
        self.caper = Caper()

        addEvent('matcher.parse', self.parse)
        addEvent('matcher.best', self.best)

        addEvent('matcher.correct_title', self.correctTitle)
        addEvent('matcher.correct_identifier', self.correctIdentifier)
        addEvent('matcher.correct_quality', self.correctQuality)

    def parse(self, release):
        return self.caper.parse(release['name'])

    def best(self, release, media, quality):
        rel_info = fireEvent('matcher.parse', release, single = True)

        if len(rel_info.chains) < 1:
            log.info2('Wrong: %s, unable to parse release name (no chains)', release['name'])
            return False

        for chain in rel_info.chains:
            if fireEvent('searcher.correct_match', chain, release, media, quality, single = True):
                return chain

        return None

    def chainMatch(self, chain, group, tags):
        found_tags = []

        for match in chain.info[group]:
            for ck, cv in match.items():
                if ck in tags and simplifyString(cv) in tags[ck]:
                    found_tags.append(ck)

        if set(tags.keys()) == set(found_tags):
            return True

        return set([key for key, value in tags.items() if None not in value]) == set(found_tags)

    def correctIdentifier(self, chain, media):
        required_id = fireEvent('searcher.get_media_identifier', media['library'], single = True)

        if 'identifier' not in chain.info:
            return False

        # TODO could be handled better?
        if len(chain.info['identifier']) != 1:
            return False
        identifier = chain.info['identifier'][0]

        # TODO air by date episodes

        # TODO this should support identifiers with characters 'a', 'b', etc..
        for k, v in identifier.items():
            identifier[k] = tryInt(v, None)

        if not dictIsSubset(required_id, identifier):
            log.info2('Wrong: required identifier %s does not match release identifier %s', (str(required_id), str(identifier)))
            return False

        return True

    def correctTitle(self, chain, media):
        root_library = fireEvent('searcher.get_media_root', media['library'], single = True)

        if 'show_name' not in chain.info or not len(chain.info['show_name']):
            log.info('Wrong: missing show name in parsed result')
            return False

        chain_words = [x.lower() for x in chain.info['show_name']]

        # Check show titles match
        # TODO check xem names
        for title in root_library['info']['titles']:
            for valid_words in [x.split(' ') for x in possibleTitles(title)]:

                if valid_words == chain_words:
                    return True

        return False

    def correctQuality(self, chain, quality, quality_map):
        if quality['identifier'] not in quality_map:
            log.info2('Wrong: unknown preferred quality %s', quality['identifier'])
            return False

        if 'video' not in chain.info:
            log.info2('Wrong: no video tags found')
            return False

        video_tags = quality_map[quality['identifier']]

        if not self.chainMatch(chain, 'video', video_tags):
            log.info2('Wrong: %s tags not in chain', video_tags)
            return False

        return True
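
Sketch (not part of the diff): inside a running CouchPotato instance the plugin above is reached through the event bus; the release name below is made up purely for illustration.

    from couchpotato.core.event import fireEvent

    # Hypothetical release dict; matcher.parse only needs 'name'
    release = {'name': 'Show.Name.S01E02.720p.HDTV.x264-GROUP'}
    rel_info = fireEvent('matcher.parse', release, single = True)
    print(len(rel_info.chains))  # number of parse chains Caper produced
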
@@ -0,0 +1,42 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from qcond.transformers.merge import MergeTransformer
from qcond.transformers.slice import SliceTransformer
from qcond.transformers.strip_common import StripCommonTransformer


__version_info__ = ('0', '1', '0')
__version_branch__ = 'master'

__version__ = "%s%s" % (
    '.'.join(__version_info__),
    '-' + __version_branch__ if __version_branch__ else ''
)


class QueryCondenser(object):
    def __init__(self):
        self.transformers = [
            MergeTransformer(),
            SliceTransformer(),
            StripCommonTransformer()
        ]

    def distinct(self, titles):
        for transformer in self.transformers:
            titles = transformer.run(titles)

        return titles
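
Sketch (not part of the diff, assuming the qcond package and its logr dependency are importable): QueryCondenser simply chains the three transformers to reduce alternative titles to distinct search queries. The titles here are made up.

    from qcond import QueryCondenser

    qc = QueryCondenser()
    # Condense alternative titles for the same show into distinct queries
    print(qc.distinct(['The Office', 'The Office (US)', 'The Office (UK)']))
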
@@ -0,0 +1,23 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import sys

PY3 = sys.version_info[0] == 3

if PY3:
    xrange = range
else:
    xrange = xrange
@@ -0,0 +1,84 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from difflib import SequenceMatcher
import re
import sys
from logr import Logr
from qcond.compat import xrange


PY3 = sys.version_info[0] == 3


def simplify(s):
    s = s.lower()
    s = re.sub(r"(\w)'(\w)", r"\1\2", s)
    return s


def strip(s):
    return re.sub(r"^(\W*)(.*?)(\W*)$", r"\2", s)


def create_matcher(a, b, swap_longest = True, case_sensitive = False):
    # Ensure longest string is a
    if swap_longest and len(b) > len(a):
        a_ = a
        a = b
        b = a_

    if not case_sensitive:
        a = a.upper()
        b = b.upper()

    return SequenceMatcher(None, a, b)


def first(function_or_none, sequence):
    if PY3:
        for item in filter(function_or_none, sequence):
            return item
    else:
        result = filter(function_or_none, sequence)
        if len(result):
            return result[0]

    return None


def sorted_append(sequence, item, func):
    if not len(sequence):
        sequence.insert(0, item)
        return

    x = 0
    for x in xrange(len(sequence)):
        if func(sequence[x]):
            sequence.insert(x, item)
            return

    sequence.append(item)


def itemsMatch(L1, L2):
    return len(L1) == len(L2) and sorted(L1) == sorted(L2)


def distinct(sequence):
    result = []

    for item in sequence:
        if item not in result:
            result.append(item)

    return result
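
Sketch (not part of the diff) of how these helpers behave, assuming qcond and its logr dependency are importable; the values are illustrative.

    from qcond.helpers import simplify, distinct, first, sorted_append

    print(simplify("Don't Trust the B"))           # "dont trust the b"
    print(distinct(['a', 'b', 'a']))               # ['a', 'b']
    print(first(lambda x: x > 2, [1, 2, 3, 4]))    # 3

    items = [5, 3, 1]
    # Insert 4 before the first element the predicate accepts (first value < 4)
    sorted_append(items, 4, lambda existing: existing < 4)
    print(items)                                   # [5, 4, 3, 1]
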
@@ -0,0 +1,21 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class Transformer(object):
    def __init__(self):
        pass

    def run(self, titles):
        raise NotImplementedError()
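
Sketch (not part of the diff): a new pass only needs to subclass Transformer and implement run(); the lowercasing transformer below is purely illustrative.

    from qcond.transformers.base import Transformer

    class LowercaseTransformer(Transformer):
        def run(self, titles):
            # Return a new list of titles rather than mutating the input
            return [title.lower() for title in titles]

    print(LowercaseTransformer().run(['The Office', 'Parks and Recreation']))
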
@@ -0,0 +1,238 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from operator import itemgetter
from logr import Logr
from qcond.helpers import simplify, strip, first, sorted_append, distinct
from qcond.transformers.base import Transformer
from qcond.compat import xrange


class MergeTransformer(Transformer):
    def __init__(self):
        super(MergeTransformer, self).__init__()

    def run(self, titles):
        titles = distinct([simplify(title) for title in titles])

        Logr.info(str(titles))

        Logr.debug("------------------------------------------------------------")

        root, tails = self.parse(titles)

        Logr.debug("--------------------------PARSE-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------MERGE-----------------------------")

        self.merge(root)

        Logr.debug("--------------------------FINAL-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------RESULT-----------------------------")

        scores = {}
        results = []

        for tail in tails:
            score, value, original_value = tail.full_value()

            if value in scores:
                scores[value] += score
            else:
                results.append((value, original_value))
                scores[value] = score

            Logr.debug("%s %s %s", score, value, original_value)

        sorted_results = sorted(results, key=lambda item: (scores[item[0]], item[1]), reverse = True)

        return [result[0] for result in sorted_results]

    def parse(self, titles):
        root = []
        tails = []

        for title in titles:
            Logr.debug(title)

            cur = None
            words = title.split(' ')

            for wx in xrange(len(words)):
                word = strip(words[wx])

                if cur is None:
                    cur = find_node(root, word)

                    if cur is None:
                        cur = DNode(word, None, num_children=len(words) - wx, original_value=title)
                        root.append(cur)
                else:
                    parent = cur
                    parent.weight += 1

                    cur = find_node(parent.right, word)

                    if cur is None:
                        Logr.debug("%s %d", word, len(words) - wx)
                        cur = DNode(word, parent, num_children=len(words) - wx)
                        sorted_append(parent.right, cur, lambda a: a.num_children < cur.num_children)
                    else:
                        cur.weight += 1

            tails.append(cur)

        return root, tails

    def merge(self, root):
        for x in range(len(root)):
            Logr.debug(root[x])
            root[x].right = self._merge(root[x].right)
            Logr.debug('=================================================================')

        return root

    def get_nodes_right(self, value):
        if type(value) is not list:
            value = [value]

        nodes = []

        for node in value:
            nodes.append(node)

            for child in self.get_nodes_right(node.right):
                nodes.append(child)

        return nodes

    def destroy_nodes_right(self, value):
        nodes = self.get_nodes_right(value)

        for node in nodes:
            node.value = None
            node.dead = True

    def _merge(self, nodes, depth = 0):
        Logr.debug(str('\t' * depth) + str(nodes))

        top = nodes[0]

        # Merge into top
        for x in range(len(nodes)):
            # Merge extra results into top
            if x > 0:
                top.value = None
                top.weight += nodes[x].weight
                self.destroy_nodes_right(top.right)

                if len(nodes[x].right):
                    top.join_right(nodes[x].right)

                Logr.debug("= %s joined %s", nodes[x], top)

                nodes[x].dead = True

        nodes = [n for n in nodes if not n.dead]

        # Traverse further
        for node in nodes:
            if len(node.right):
                node.right = self._merge(node.right, depth + 1)

        return nodes


def print_tree(node, depth = 0):
    Logr.debug(str('\t' * depth) + str(node))

    if len(node.right):
        for child in node.right:
            print_tree(child, depth + 1)
    else:
        Logr.debug(node.full_value()[1])


def find_node(node_list, value):
    # Try find adjacent node match
    for node in node_list:
        if node.value == value:
            return node

    return None


class DNode(object):
    def __init__(self, value, parent, right=None, weight=1, num_children=None, original_value=None):
        self.value = value

        self.parent = parent

        if right is None:
            right = []
        self.right = right

        self.weight = weight

        self.original_value = original_value
        self.num_children = num_children

        self.dead = False

    def join_right(self, nodes):
        for node in nodes:
            duplicate = first(lambda x: x.value == node.value, self.right)

            if duplicate:
                duplicate.weight += node.weight
                duplicate.join_right(node.right)
            else:
                node.parent = self
                self.right.append(node)

    def full_value(self):
        words = []
        total_score = 0

        cur = self
        root = None

        while cur is not None:
            if cur.value and not cur.dead:
                words.insert(0, cur.value)
                total_score += cur.weight

            if cur.parent is None:
                root = cur
            cur = cur.parent

        return float(total_score) / len(words), ' '.join(words), root.original_value if root else None

    def __repr__(self):
        return '<%s value:"%s", weight: %s, num_children: %s%s%s>' % (
            'DNode',
            self.value,
            self.weight,
            self.num_children,
            (', original_value: %s' % self.original_value) if self.original_value else '',
            ' REMOVING' if self.dead else ''
        )
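
Sketch (not part of the diff): the merge pass can be run on its own; it builds the word tree above, merges sibling branches into the heaviest node and scores the tails by weight. The titles are made up and the result ordering depends on those weights.

    from qcond.transformers.merge import MergeTransformer

    print(MergeTransformer().run(['The Office', 'The Office US', 'The Office UK']))
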
@@ -0,0 +1,280 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from logr import Logr
from qcond.helpers import create_matcher
from qcond.transformers.base import Transformer


class SliceTransformer(Transformer):
    def __init__(self):
        super(SliceTransformer, self).__init__()

    def run(self, titles):
        nodes = []

        # Create a node for each title
        for title in titles:
            nodes.append(SimNode(title))

        # Calculate similarities between nodes
        for node in nodes:
            calculate_sim_links(node, [n for n in nodes if n != node])

        kill_nodes_above(nodes, 0.90)

        Logr.debug('---------------------------------------------------------------------')

        print_link_tree(nodes)
        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        Logr.debug('---------------------------------------------------------------------')

        kill_trailing_nodes(nodes)

        Logr.debug('---------------------------------------------------------------------')

        # Sort remaining nodes by 'num_merges'
        nodes = sorted(nodes, key=lambda n: n.num_merges, reverse=True)

        print_link_tree(nodes)

        Logr.debug('---------------------------------------------------------------------')

        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        return [n.value for n in nodes]


class SimLink(object):
    def __init__(self, similarity, opcodes, stats):
        self.similarity = similarity
        self.opcodes = opcodes
        self.stats = stats


class SimNode(object):
    def __init__(self, value):
        self.value = value

        self.dead = False
        self.num_merges = 0

        self.links = {}  # {<other SimNode>: <SimLink>}


def kill_nodes(nodes, killed_nodes):
    # Remove killed nodes from root list
    for node in killed_nodes:
        if node in nodes:
            nodes.remove(node)

    # Remove killed nodes from links
    for killed_node in killed_nodes:
        for node in nodes:
            if killed_node in node.links:
                node.links.pop(killed_node)


def kill_nodes_above(nodes, above_sim):
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

            if link.similarity >= above_sim:
                if len(link_node.value) > len(node.value):
                    Logr.debug('\t\tvery similar, killed this node')
                    link_node.dead = True
                    node.num_merges += 1
                    killed_nodes.append(link_node)
                else:
                    Logr.debug('\t\tvery similar, killed owner')
                    node.dead = True
                    link_node.num_merges += 1
                    killed_nodes.append(node)

    kill_nodes(nodes, killed_nodes)


def print_link_tree(nodes):
    for node in nodes:
        Logr.debug(node.value)
        Logr.debug('\tnum_merges: %s', node.num_merges)

        if len(node.links):
            Logr.debug('\t========== LINKS ==========')
            for link_node, link in node.links.items():
                Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

            Logr.debug('\t---------------------------')


def kill_trailing_nodes(nodes):
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            is_valid = link.stats.get('valid', False)

            has_deletions = False
            has_insertions = False
            has_replacements = False

            for opcode in link.opcodes:
                if opcode[0] == 'delete':
                    has_deletions = True
                if opcode[0] == 'insert':
                    has_insertions = True
                if opcode[0] == 'replace':
                    has_replacements = True

            equal_perc = link.stats.get('equal', 0) / float(len(node.value))
            insert_perc = link.stats.get('insert', 0) / float(len(node.value))

            Logr.debug('\t({0:<24}) [{1:02d}:{2:02d} = {3:02d} {4:3.0f}% {5:3.0f}%] -- {6:<45}'.format(
                'd:%s, i:%s, r:%s' % (has_deletions, has_insertions, has_replacements),
                len(node.value), len(link_node.value), link.stats.get('equal', 0),
                equal_perc * 100, insert_perc * 100,
                '"{0}"'.format(link_node.value)
            ))

            Logr.debug('\t\t%s', link.stats)

            kill = all([
                is_valid,
                equal_perc >= 0.5,
                insert_perc < 2,
                has_insertions,
                not has_deletions,
                not has_replacements
            ])

            if kill:
                Logr.debug('\t\tkilled this node')

                link_node.dead = True
                node.num_merges += 1
                killed_nodes.append(link_node)

    kill_nodes(nodes, killed_nodes)


stats_print_format = "\t{0:<8} ({1:2d}:{2:2d}) ({3:2d}:{4:2d})"


def get_index_values(iterable, a, b):
    return (
        iterable[a] if a else None,
        iterable[b] if b else None
    )


def get_indices(iterable, a, b):
    return (
        a if 0 < a < len(iterable) else None,
        b if 0 < b < len(iterable) else None
    )


def get_opcode_stats(for_node, node, opcodes):
    stats = {}

    for tag, i1, i2, j1, j2 in opcodes:
        Logr.debug(stats_print_format.format(
            tag, i1, i2, j1, j2
        ))

        if tag in ['insert', 'delete']:
            ax = None, None
            bx = None, None

            if tag == 'insert':
                ax = get_indices(for_node.value, i1 - 1, i1)
                bx = get_indices(node.value, j1, j2 - 1)

            if tag == 'delete':
                ax = get_indices(for_node.value, j1 - 1, j1)
                bx = get_indices(node.value, i1, i2 - 1)

            av = get_index_values(for_node.value, *ax)
            bv = get_index_values(node.value, *bx)

            Logr.debug(
                '\t\t%s %s [%s><%s] <---> %s %s [%s><%s]',
                ax, av, av[0], av[1],
                bx, bv, bv[0], bv[1]
            )

            head_valid = av[0] in [None, ' '] or bv[0] in [None, ' ']
            tail_valid = av[1] in [None, ' '] or bv[1] in [None, ' ']
            valid = head_valid and tail_valid

            if 'valid' not in stats or (stats['valid'] and not valid):
                stats['valid'] = valid

            Logr.debug('\t\t' + ('VALID' if valid else 'INVALID'))

        if tag not in stats:
            stats[tag] = 0

        stats[tag] += (i2 - i1) or (j2 - j1)

    return stats


def calculate_sim_links(for_node, other_nodes):
    for node in other_nodes:
        if node in for_node.links:
            continue

        Logr.debug('calculating similarity between "%s" and "%s"', for_node.value, node.value)

        # Get similarity
        similarity_matcher = create_matcher(for_node.value, node.value)
        similarity = similarity_matcher.quick_ratio()

        # Get for_node -> node opcodes
        a_opcodes_matcher = create_matcher(for_node.value, node.value, swap_longest = False)
        a_opcodes = a_opcodes_matcher.get_opcodes()
        a_stats = get_opcode_stats(for_node, node, a_opcodes)

        Logr.debug('-' * 100)

        # Get node -> for_node opcodes
        b_opcodes_matcher = create_matcher(node.value, for_node.value, swap_longest = False)
        b_opcodes = b_opcodes_matcher.get_opcodes()
        b_stats = get_opcode_stats(for_node, node, b_opcodes)

        for_node.links[node] = SimLink(similarity, a_opcodes, a_stats)
        node.links[for_node] = SimLink(similarity, b_opcodes, b_stats)

        #raw_input('Press ENTER to continue')
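
Sketch (not part of the diff): the slice pass can also be run standalone; it links every pair of titles with SequenceMatcher similarity and opcode stats, then drops near-duplicates according to the thresholds above. Titles are made up and the exact output depends on those thresholds.

    from qcond.transformers.slice import SliceTransformer

    print(SliceTransformer().run(['The Office', 'The Office (US)', 'The Office (UK)']))
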
@@ -0,0 +1,26 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from qcond.transformers.base import Transformer


COMMON_WORDS = [
    'the'
]


class StripCommonTransformer(Transformer):
    def run(self, titles):
        return [title for title in titles if title.lower() not in COMMON_WORDS]
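
Sketch (not part of the diff, assuming qcond is importable): the strip-common pass only removes titles that consist entirely of a common word.

    from qcond.transformers.strip_common import StripCommonTransformer

    print(StripCommonTransformer().run(['The', 'The Office']))  # ['The Office']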