Browse Source

Merge pull request #2356 from fuzeman/tv_searcher

[TV][Searcher] Release Matching and Snatching
pull/2515/head
Joel Kåberg 12 years ago
parent
commit
611c159373
  1. 3
      couchpotato/core/helpers/variable.py
  2. 125
      couchpotato/core/media/_base/searcher/main.py
  3. 29
      couchpotato/core/media/movie/searcher/main.py
  4. 281
      couchpotato/core/media/show/searcher/main.py
  5. 6
      couchpotato/core/plugins/matcher/__init__.py
  6. 109
      couchpotato/core/plugins/matcher/main.py
  7. 56
      couchpotato/core/plugins/release/main.py
  8. 20
      couchpotato/core/providers/base.py
  9. 12
      couchpotato/core/providers/torrent/iptorrents/main.py
  10. 2
      libs/caper/__init__.py
  11. 2
      libs/caper/constraint.py
  12. 7
      libs/caper/group.py
  13. 61
      libs/caper/matcher.py
  14. 57
      libs/caper/parsers/base.py
  15. 12
      libs/caper/step.py
  16. 42
      libs/qcond/__init__.py
  17. 23
      libs/qcond/compat.py
  18. 84
      libs/qcond/helpers.py
  19. 0
      libs/qcond/transformers/__init__.py
  20. 21
      libs/qcond/transformers/base.py
  21. 238
      libs/qcond/transformers/merge.py
  22. 280
      libs/qcond/transformers/slice.py
  23. 26
      libs/qcond/transformers/strip_common.py

3
couchpotato/core/helpers/variable.py

@ -211,3 +211,6 @@ def randomString(size = 8, chars = string.ascii_uppercase + string.digits):
def splitString(str, split_on = ',', clean = True):
list = [x.strip() for x in str.split(split_on)] if str else []
return filter(None, list) if clean else list
def dictIsSubset(a, b):
return all([k in b and b[k] == v for k, v in a.items()])

125
couchpotato/core/media/_base/searcher/main.py

@ -7,7 +7,6 @@ from couchpotato.core.logger import CPLog
from couchpotato.core.media._base.searcher.base import SearcherBase
from couchpotato.core.settings.model import Media, Release, ReleaseInfo
from couchpotato.environment import Env
from sqlalchemy.exc import InterfaceError
from inspect import ismethod, isfunction
import datetime
import re
@ -25,9 +24,9 @@ class Searcher(SearcherBase):
addEvent('searcher.correct_year', self.correctYear)
addEvent('searcher.correct_name', self.correctName)
addEvent('searcher.correct_words', self.correctWords)
addEvent('searcher.try_download_result', self.tryDownloadResult)
addEvent('searcher.download', self.download)
addEvent('searcher.search', self.search)
addEvent('searcher.create_releases', self.createReleases)
addApiView('searcher.full_search', self.searchAllView, docs = {
'desc': 'Starts a full search for all media',
@ -53,27 +52,51 @@ class Searcher(SearcherBase):
progress = fireEvent('searcher.progress', merge = True)
return progress
def download(self, data, movie, manual = False):
def tryDownloadResult(self, results, media, quality_type, manual = False):
ignored_status, failed_status = fireEvent('status.get', ['ignored', 'failed'], single = True)
if not data.get('protocol'):
data['protocol'] = data['type']
data['type'] = 'movie'
for rel in results:
if not quality_type.get('finish', False) and quality_type.get('wait_for', 0) > 0 and rel.get('age') <= quality_type.get('wait_for', 0):
log.info('Ignored, waiting %s days: %s', (quality_type.get('wait_for'), rel['name']))
continue
if rel['status_id'] in [ignored_status.get('id'), failed_status.get('id')]:
log.info('Ignored: %s', rel['name'])
continue
if rel['score'] <= 0:
log.info('Ignored, score too low: %s', rel['name'])
continue
downloaded = fireEvent('searcher.download', data = rel, media = media, manual = manual, single = True)
if downloaded is True:
return True
elif downloaded != 'try_next':
break
return False
def download(self, data, media, manual = False):
# TODO what is this for?
#if not data.get('protocol'):
# data['protocol'] = data['type']
# data['type'] = 'movie'
# Test to see if any downloaders are enabled for this type
downloader_enabled = fireEvent('download.enabled', manual, data, single = True)
if downloader_enabled:
snatched_status, active_status, done_status = fireEvent('status.get', ['snatched', 'active', 'done'], single = True)
snatched_status = fireEvent('status.get', 'snatched', single = True)
# Download movie to temp
# Download release to temp
filedata = None
if data.get('download') and (ismethod(data.get('download')) or isfunction(data.get('download'))):
filedata = data.get('download')(url = data.get('url'), nzb_id = data.get('id'))
if filedata == 'try_next':
return filedata
download_result = fireEvent('download', data = data, movie = movie, manual = manual, filedata = filedata, single = True)
download_result = fireEvent('download', data = data, movie = media, manual = manual, filedata = filedata, single = True)
log.debug('Downloader result: %s', download_result)
if download_result:
@ -84,7 +107,6 @@ class Searcher(SearcherBase):
if rls:
renamer_enabled = Env.setting('enabled', 'renamer')
done_status = fireEvent('status.get', 'done', single = True)
rls.status_id = done_status.get('id') if not renamer_enabled else snatched_status.get('id')
# Save download-id info if returned
@ -97,36 +119,34 @@ class Searcher(SearcherBase):
rls.info.append(rls_info)
db.commit()
log_movie = '%s (%s) in %s' % (getTitle(movie['library']), movie['library']['year'], rls.quality.label)
log_movie = '%s (%s) in %s' % (getTitle(media['library']), media['library']['year'], rls.quality.label)
snatch_message = 'Snatched "%s": %s' % (data.get('name'), log_movie)
log.info(snatch_message)
fireEvent('movie.snatched', message = snatch_message, data = rls.to_dict())
fireEvent('%s.snatched' % data['type'], message = snatch_message, data = rls.to_dict())
# If renamer isn't used, mark movie done
# If renamer isn't used, mark media done
if not renamer_enabled:
active_status = fireEvent('status.get', 'active', single = True)
done_status = fireEvent('status.get', 'done', single = True)
try:
if movie['status_id'] == active_status.get('id'):
for profile_type in movie['profile']['types']:
if media['status_id'] == active_status.get('id'):
for profile_type in media['profile']['types']:
if profile_type['quality_id'] == rls.quality.id and profile_type['finish']:
log.info('Renamer disabled, marking movie as finished: %s', log_movie)
log.info('Renamer disabled, marking media as finished: %s', log_movie)
# Mark release done
rls.status_id = done_status.get('id')
rls.last_edit = int(time.time())
db.commit()
# Mark movie done
mvie = db.query(Media).filter_by(id = movie['id']).first()
mvie.status_id = done_status.get('id')
mvie.last_edit = int(time.time())
# Mark media done
mdia = db.query(Media).filter_by(id = media['id']).first()
mdia.status_id = done_status.get('id')
mdia.last_edit = int(time.time())
db.commit()
except:
log.error('Failed marking movie finished, renamer disabled: %s', traceback.format_exc())
log.error('Failed marking media finished, renamer disabled: %s', traceback.format_exc())
except:
log.error('Failed marking movie finished: %s', traceback.format_exc())
log.error('Failed marking media finished: %s', traceback.format_exc())
return True
@ -137,14 +157,11 @@ class Searcher(SearcherBase):
def search(self, protocols, media, quality):
results = []
search_type = None
if media['type'] == 'movie':
search_type = 'movie'
elif media['type'] in ['show', 'season', 'episode']:
search_type = 'show'
# TODO could this be handled better? (removing the need for 'searcher.get_media_searcher_id')
searcher_id = fireEvent('searcher.get_media_searcher_id', media['type'], single = True)
for search_protocol in protocols:
protocol_results = fireEvent('provider.search.%s.%s' % (search_protocol, search_type), media, quality, merge = True)
protocol_results = fireEvent('provider.search.%s.%s' % (search_protocol, searcher_id), media, quality, merge = True)
if protocol_results:
results += protocol_results
@ -156,52 +173,6 @@ class Searcher(SearcherBase):
return sorted_results
def createReleases(self, search_results, media, quality_type):
available_status, ignored_status, failed_status = fireEvent('status.get', ['available', 'ignored', 'failed'], single = True)
db = get_session()
found_releases = []
for rel in search_results:
nzb_identifier = md5(rel['url'])
found_releases.append(nzb_identifier)
rls = db.query(Release).filter_by(identifier = nzb_identifier).first()
if not rls:
rls = Release(
identifier = nzb_identifier,
media_id = media.get('id'),
quality_id = quality_type.get('quality_id'),
status_id = available_status.get('id')
)
db.add(rls)
else:
[db.delete(old_info) for old_info in rls.info]
rls.last_edit = int(time.time())
db.commit()
for info in rel:
try:
if not isinstance(rel[info], (str, unicode, int, long, float)):
continue
rls_info = ReleaseInfo(
identifier = info,
value = toUnicode(rel[info])
)
rls.info.append(rls_info)
except InterfaceError:
log.debug('Couldn\'t add %s to ReleaseInfo: %s', (info, traceback.format_exc()))
db.commit()
rel['status_id'] = rls.status_id
return found_releases
def getSearchProtocols(self):
download_protocols = fireEvent('download.enabled_protocols', merge = True)

29
couchpotato/core/media/movie/searcher/main.py

@ -31,6 +31,7 @@ class MovieSearcher(SearcherBase, MovieTypeBase):
addEvent('movie.searcher.could_be_released', self.couldBeReleased)
addEvent('searcher.correct_release', self.correctRelease)
addEvent('searcher.get_search_title', self.getSearchTitle)
addEvent('searcher.get_media_searcher_id', self.getMediaSearcherId)
addApiView('movie.searcher.try_next', self.tryNextReleaseView, docs = {
'desc': 'Marks the snatched results as ignored and try the next best release',
@ -175,27 +176,11 @@ class MovieSearcher(SearcherBase, MovieTypeBase):
break
# Add them to this movie releases list
found_releases += fireEvent('searcher.create_releases', results, movie, quality_type, single = True)
found_releases += fireEvent('release.create_from_search', results, movie, quality_type, single = True)
for nzb in results:
if not quality_type.get('finish', False) and quality_type.get('wait_for', 0) > 0 and nzb.get('age') <= quality_type.get('wait_for', 0):
log.info('Ignored, waiting %s days: %s', (quality_type.get('wait_for'), nzb['name']))
continue
if nzb['status_id'] in [ignored_status.get('id'), failed_status.get('id')]:
log.info('Ignored: %s', nzb['name'])
continue
if nzb['score'] <= 0:
log.info('Ignored, score too low: %s', nzb['name'])
continue
downloaded = fireEvent('searcher.download', data = nzb, movie = movie, manual = manual, single = True)
if downloaded is True:
ret = True
break
elif downloaded != 'try_next':
break
# Try find a valid result and download it
if fireEvent('searcher.try_download_result', results, movie, quality_type, manual, single = True):
ret = True
# Remove releases that aren't found anymore
for release in movie.get('releases', []):
@ -359,5 +344,9 @@ class MovieSearcher(SearcherBase, MovieTypeBase):
if media['type'] == 'movie':
return getTitle(media['library'])
def getMediaSearcherId(self, media_type):
if media_type == 'movie':
return 'movie'
class SearchSetupError(Exception):
pass

281
couchpotato/core/media/show/searcher/main.py

@ -1,14 +1,12 @@
import pprint
import re
from couchpotato import get_session, Env
from couchpotato.core.event import addEvent, fireEvent
from couchpotato.core.helpers.encoding import simplifyString
from couchpotato.core.helpers.variable import getTitle, tryInt, possibleTitles
from couchpotato.core.helpers.variable import getTitle, tryInt
from couchpotato.core.logger import CPLog
from couchpotato.core.media._base.searcher.main import SearchSetupError
from couchpotato.core.plugins.base import Plugin
from couchpotato.core.settings.model import Media, Library
from caper import Caper
from qcond import QueryCondenser
from qcond.helpers import simplify
log = CPLog(__name__)
@ -29,38 +27,19 @@ class ShowSearcher(Plugin):
def __init__(self):
super(ShowSearcher, self).__init__()
self.query_condenser = QueryCondenser()
addEvent('show.searcher.single', self.single)
addEvent('searcher.correct_release', self.correctRelease)
addEvent('searcher.get_search_title', self.getSearchTitle)
self.caper = Caper()
def _lookupMedia(self, media):
db = get_session()
media_library = db.query(Library).filter_by(id = media['library_id']).first()
show = None
season = None
episode = None
if media['type'] == 'episode':
show = media_library.parent.parent
season = media_library.parent
episode = media_library
if media['type'] == 'season':
show = media_library.parent
season = media_library
if media['type'] == 'show':
show = media_library
return show, season, episode
addEvent('searcher.correct_match', self.correctMatch)
addEvent('searcher.correct_release', self.correctRelease)
def single(self, media, search_protocols = None):
pprint.pprint(media)
addEvent('searcher.get_media_identifier', self.getMediaIdentifier)
addEvent('searcher.get_media_root', self.getMediaRoot)
addEvent('searcher.get_media_searcher_id', self.getMediaSearcherId)
def single(self, media, search_protocols = None, manual = False):
if media['type'] == 'show':
# TODO handle show searches (scan all seasons)
return
@ -72,7 +51,7 @@ class ShowSearcher(Plugin):
except SearchSetupError:
return
done_status = fireEvent('status.get', 'done', single = True)
done_status, available_status, ignored_status, failed_status = fireEvent('status.get', ['done', 'available', 'ignored', 'failed'], single = True)
if not media['profile'] or media['status_id'] == done_status.get('id'):
log.debug('Episode doesn\'t have a profile or already done, assuming in manage tab.')
@ -80,19 +59,18 @@ class ShowSearcher(Plugin):
db = get_session()
pre_releases = fireEvent('quality.pre_releases', single = True)
available_status, ignored_status, failed_status = fireEvent('status.get', ['available', 'ignored', 'failed'], single = True)
#pre_releases = fireEvent('quality.pre_releases', single = True)
found_releases = []
too_early_to_search = []
default_title = self.getSearchTitle(media['library'])
default_title = self.getSearchTitle(media)
if not default_title:
log.error('No proper info found for episode, removing it from the library to prevent it from causing more issues.')
#fireEvent('episode.delete', episode['id'], single = True)
return
show, season, episode = self._lookupMedia(media)
show, season, episode = self.getMedia(media)
if show is None or season is None:
log.error('Unable to find show or season library in database, missing required data for searching')
return
@ -128,9 +106,81 @@ class ShowSearcher(Plugin):
break
# Add them to this movie releases list
found_releases += fireEvent('searcher.create_releases', results, media, quality_type, single = True)
found_releases += fireEvent('release.create_from_search', results, media, quality_type, single = True)
# Try find a valid result and download it
if fireEvent('searcher.try_download_result', results, media, quality_type, manual, single = True):
ret = True
# Remove releases that aren't found anymore
for release in media.get('releases', []):
if release.get('status_id') == available_status.get('id') and release.get('identifier') not in found_releases:
fireEvent('release.delete', release.get('id'), single = True)
else:
log.info('Better quality (%s) already available or snatched for %s', (quality_type['quality']['label'], default_title))
fireEvent('movie.restatus', media['id'])
break
# Break if CP wants to shut down
if self.shuttingDown() or ret:
break
if len(too_early_to_search) > 0:
log.info2('Too early to search for %s, %s', (too_early_to_search, default_title))
fireEvent('notify.frontend', type = 'show.searcher.ended.%s' % media['id'], data = True)
return ret
def getSearchTitle(self, media):
if media['type'] not in ['show', 'season', 'episode']:
return
show, season, episode = self.getMedia(media)
if show is None:
return None
titles = []
# Add season map_names if they exist
if season is not None and 'map_names' in show.info:
season_names = show.info['map_names'].get(str(season.season_number), {})
# Add titles from all locations
# TODO only add name maps from a specific location
for location, names in season_names.items():
titles += [name for name in names if name not in titles]
# Add show titles
titles += [title.title for title in show.titles if title.title not in titles]
# Use QueryCondenser to build a list of optimal search titles
condensed_titles = self.query_condenser.distinct(titles)
title = None
# TODO try other titles if searching doesn't return results
if len(condensed_titles):
# Return the first condensed title if one exists
title = condensed_titles[0]
elif len(titles):
# Fallback to first raw title
title = simplify(titles[0])
else:
return None
# Add the identifier to search title
# TODO supporting other identifier formats
identifier = fireEvent('searcher.get_media_identifier', media['library'], single = True)
if identifier['season']:
title += ' S%02d' % identifier['season']
log.info('%d results found' % len(results))
if identifier['episode']:
title += 'E%02d' % identifier['episode']
return title
def correctRelease(self, release = None, media = None, quality = None, **kwargs):
@ -146,128 +196,97 @@ class ShowSearcher(Plugin):
if not fireEvent('searcher.correct_words', release['name'], media, single = True):
return False
show, season, episode = self._lookupMedia(media)
show, season, episode = self.getMedia(media)
if show is None or season is None:
log.error('Unable to find show or season library in database, missing required data for searching')
return
release_info = self.caper.parse(release['name'])
if len(release_info.chains) < 1:
log.info2('Wrong: %s, unable to parse release name (no chains)', release['name'])
return False
match = fireEvent('matcher.best', release, media, quality, single = True)
if match:
return match.weight
return False
# TODO look at all chains
chain = release_info.chains[0]
def correctMatch(self, chain, release, media, quality):
log.info("Checking if '%s' is valid", release['name'])
if not self.correctQuality(chain, quality['identifier']):
if not fireEvent('matcher.correct_quality', chain, quality, self.quality_map, single = True):
log.info('Wrong: %s, quality does not match', release['name'])
return False
if not self.correctIdentifier(chain, media):
if not fireEvent('matcher.correct_identifier', chain, media):
log.info('Wrong: %s, identifier does not match', release['name'])
return False
if 'show_name' not in chain.info or not len(chain.info['show_name']):
log.info('Wrong: %s, missing show name in parsed result', release['name'])
return False
chain_words = [x.lower() for x in chain.info['show_name']]
chain_title = ' '.join(chain_words)
library_title = None
# Check show titles match
for raw_title in show.titles:
for valid_words in [x.split(' ') for x in possibleTitles(raw_title.title)]:
if not library_title:
library_title = ' '.join(valid_words)
if valid_words == chain_words:
return True
log.info("Wrong: title '%s', undetermined show naming. Looking for '%s (%s)'", (chain_title, library_title, media['library']['year']))
return False
def correctQuality(self, chain, quality_identifier):
if quality_identifier not in self.quality_map:
log.info2('Wrong: unknown preferred quality %s for TV searching', quality_identifier)
return False
if 'video' not in chain.info:
log.info2('Wrong: no video tags found')
return False
video_tags = self.quality_map[quality_identifier]
if not self.chainMatches(chain, 'video', video_tags):
log.info2('Wrong: %s tags not in chain', video_tags)
if not fireEvent('matcher.correct_title', chain, media):
log.info("Wrong: '%s', undetermined naming.", (' '.join(chain.info['show_name'])))
return False
return True
def correctIdentifier(self, chain, media):
required_id = self.getIdentifier(media['library'], 'season_number', 'episode_number')
if 'identifier' not in chain.info:
return False
# TODO could be handled better?
if len(chain.info['identifier']) != 1:
return False
identifier = chain.info['identifier'][0]
def getMediaIdentifier(self, media_library):
if media_library['type'] not in ['show', 'season', 'episode']:
return None
# TODO air by date episodes
release_id = self.getIdentifier(identifier, 'season', 'episode')
identifier = {
'season': None,
'episode': None
}
if required_id != release_id:
log.info2('Wrong: required identifier %s does not match release identifier %s', (str(required_id), str(release_id)))
return False
if media_library['type'] == 'episode':
map_episode = media_library['info'].get('map_episode')
return True
if map_episode and 'scene' in map_episode:
identifier['season'] = map_episode['scene'].get('season')
identifier['episode'] = map_episode['scene'].get('episode')
else:
# TODO xem mapping?
identifier['season'] = media_library.get('season_number')
identifier['episode'] = media_library.get('episode_number')
def getIdentifier(self, d, episode_key, season_key):
return (
tryInt(d.get(season_key), None) if season_key in d else None,
tryInt(d.get(episode_key), None) if episode_key in d else None
)
if media_library['type'] == 'season':
identifier['season'] = media_library.get('season_number')
def chainMatches(self, chain, group, tags):
found_tags = []
# Try cast identifier values to integers
identifier['season'] = tryInt(identifier['season'], None)
identifier['episode'] = tryInt(identifier['episode'], None)
for match in chain.info[group]:
for ck, cv in match.items():
if ck in tags and self.cleanMatchValue(cv) in tags[ck]:
found_tags.append(ck)
return identifier
def getMediaRoot(self, media):
if media['type'] not in ['show', 'season', 'episode']:
return None
if set(tags.keys()) == set(found_tags):
return True
show, season, episode = self.getMedia(media)
if show is None or season is None:
log.error('Unable to find show or season library in database, missing required data for searching')
return
return set([key for key, value in tags.items() if value]) == set(found_tags)
return show.to_dict()
def cleanMatchValue(self, value):
value = value.lower()
value = value.strip()
def getMediaSearcherId(self, media_type):
if media_type in ['show', 'season', 'episode']:
return 'show'
for ch in [' ', '-', '.']:
value = value.replace(ch, '')
def getMedia(self, media):
db = get_session()
return value
media_library = db.query(Library).filter_by(id = media['library_id']).first()
def getSearchTitle(self, media):
show, season, episode = self._lookupMedia(media)
if show is None:
return None
show = None
season = None
episode = None
name = ''
if season is not None:
name = ' S%02d' % season.season_number
if media['type'] == 'episode':
show = media_library.parent.parent
season = media_library.parent
episode = media_library
if episode is not None:
name += 'E%02d' % episode.episode_number
if media['type'] == 'season':
show = media_library.parent
season = media_library
show_title = getTitle(show)
if not show_title:
return None
if media['type'] == 'show':
show = media_library
return show_title + name
return show, season, episode

6
couchpotato/core/plugins/matcher/__init__.py

@ -0,0 +1,6 @@
from .main import Matcher
def start():
return Matcher()
config = []

109
couchpotato/core/plugins/matcher/main.py

@ -0,0 +1,109 @@
from caper import Caper
from couchpotato import CPLog, tryInt
from couchpotato.core.event import addEvent, fireEvent
from couchpotato.core.helpers.encoding import simplifyString
from couchpotato.core.helpers.variable import possibleTitles, dictIsSubset
from couchpotato.core.plugins.base import Plugin
log = CPLog(__name__)
class Matcher(Plugin):
def __init__(self):
self.caper = Caper()
addEvent('matcher.parse', self.parse)
addEvent('matcher.best', self.best)
addEvent('matcher.correct_title', self.correctTitle)
addEvent('matcher.correct_identifier', self.correctIdentifier)
addEvent('matcher.correct_quality', self.correctQuality)
def parse(self, release):
return self.caper.parse(release['name'])
def best(self, release, media, quality):
rel_info = fireEvent('matcher.parse', release, single = True)
if len(rel_info.chains) < 1:
log.info2('Wrong: %s, unable to parse release name (no chains)', release['name'])
return False
for chain in rel_info.chains:
if fireEvent('searcher.correct_match', chain, release, media, quality, single = True):
return chain
return None
def chainMatch(self, chain, group, tags):
found_tags = []
for match in chain.info[group]:
for ck, cv in match.items():
if ck in tags and simplifyString(cv) in tags[ck]:
found_tags.append(ck)
if set(tags.keys()) == set(found_tags):
return True
return set([key for key, value in tags.items() if None not in value]) == set(found_tags)
def correctIdentifier(self, chain, media):
required_id = fireEvent('searcher.get_media_identifier', media['library'], single = True)
if 'identifier' not in chain.info:
return False
# TODO could be handled better?
if len(chain.info['identifier']) != 1:
return False
identifier = chain.info['identifier'][0]
# TODO air by date episodes
# TODO this should support identifiers with characters 'a', 'b', etc..
for k, v in identifier.items():
identifier[k] = tryInt(v, None)
if not dictIsSubset(required_id, identifier):
log.info2('Wrong: required identifier %s does not match release identifier %s', (str(required_id), str(identifier)))
return False
return True
def correctTitle(self, chain, media):
root_library = fireEvent('searcher.get_media_root', media['library'], single = True)
if 'show_name' not in chain.info or not len(chain.info['show_name']):
log.info('Wrong: missing show name in parsed result')
return False
chain_words = [x.lower() for x in chain.info['show_name']]
# Check show titles match
# TODO check xem names
for title in root_library['info']['titles']:
for valid_words in [x.split(' ') for x in possibleTitles(title)]:
if valid_words == chain_words:
return True
return False
def correctQuality(self, chain, quality, quality_map):
if quality['identifier'] not in quality_map:
log.info2('Wrong: unknown preferred quality %s', quality['identifier'])
return False
if 'video' not in chain.info:
log.info2('Wrong: no video tags found')
return False
video_tags = quality_map[quality['identifier']]
if not self.chainMatch(chain, 'video', video_tags):
log.info2('Wrong: %s tags not in chain', video_tags)
return False
return True

56
couchpotato/core/plugins/release/main.py

@ -1,11 +1,12 @@
from couchpotato import get_session
from couchpotato import get_session, md5
from couchpotato.api import addApiView
from couchpotato.core.event import fireEvent, addEvent
from couchpotato.core.helpers.encoding import ss
from couchpotato.core.helpers.encoding import ss, toUnicode
from couchpotato.core.logger import CPLog
from couchpotato.core.plugins.base import Plugin
from couchpotato.core.plugins.scanner.main import Scanner
from couchpotato.core.settings.model import File, Release as Relea, Media
from couchpotato.core.settings.model import File, Release as Relea, Media, ReleaseInfo
from sqlalchemy.exc import InterfaceError
from sqlalchemy.orm import joinedload_all
from sqlalchemy.sql.expression import and_, or_
import os
@ -45,6 +46,7 @@ class Release(Plugin):
}
})
addEvent('release.create_from_search', self.createFromSearch)
addEvent('release.for_movie', self.forMovie)
addEvent('release.delete', self.delete)
addEvent('release.clean', self.clean)
@ -191,7 +193,7 @@ class Release(Plugin):
if item.get('protocol') != 'torrent_magnet':
item['download'] = provider.loginDownload if provider.urls.get('login') else provider.download
success = fireEvent('searcher.download', data = item, movie = rel.media.to_dict({
success = fireEvent('searcher.download', data = item, media = rel.media.to_dict({
'profile': {'types': {'quality': {}}},
'releases': {'status': {}, 'quality': {}},
'library': {'titles': {}, 'files':{}},
@ -213,6 +215,52 @@ class Release(Plugin):
'success': False
}
def createFromSearch(self, search_results, media, quality_type):
available_status, ignored_status, failed_status = fireEvent('status.get', ['available', 'ignored', 'failed'], single = True)
db = get_session()
found_releases = []
for rel in search_results:
rel_identifier = md5(rel['url'])
found_releases.append(rel_identifier)
rls = db.query(Relea).filter_by(identifier = rel_identifier).first()
if not rls:
rls = Relea(
identifier = rel_identifier,
media_id = media.get('id'),
quality_id = quality_type.get('quality_id'),
status_id = available_status.get('id')
)
db.add(rls)
else:
[db.delete(old_info) for old_info in rls.info]
rls.last_edit = int(time.time())
db.commit()
for info in rel:
try:
if not isinstance(rel[info], (str, unicode, int, long, float)):
continue
rls_info = ReleaseInfo(
identifier = info,
value = toUnicode(rel[info])
)
rls.info.append(rls_info)
except InterfaceError:
log.debug('Couldn\'t add %s to ReleaseInfo: %s', (info, traceback.format_exc()))
db.commit()
rel['status_id'] = rls.status_id
return found_releases
def forMovie(self, id = None):
db = get_session()

20
couchpotato/core/providers/base.py

@ -274,7 +274,10 @@ class YarrProvider(Provider):
if identifier in qualities:
return ids
return [self.cat_backup_id]
if self.cat_backup_id:
return [self.cat_backup_id]
return []
class ResultList(list):
@ -302,12 +305,23 @@ class ResultList(list):
new_result = self.fillResult(result)
is_correct_movie = fireEvent('searcher.correct_release', new_result, self.movie, self.quality,
is_correct = fireEvent('searcher.correct_release', new_result, self.movie, self.quality,
imdb_results = self.kwargs.get('imdb_results', False), single = True)
if is_correct_movie and new_result['id'] not in self.result_ids:
if is_correct and new_result['id'] not in self.result_ids:
is_correct_weight = float(is_correct)
new_result['score'] += fireEvent('score.calculate', new_result, self.movie, single = True)
old_score = new_result['score']
new_result['score'] = int(old_score * is_correct_weight)
log.info('Found correct release with weight %.02f, old_score(%d) now scaled to score(%d)', (
is_correct_weight,
old_score,
new_result['score']
))
self.found(new_result)
self.result_ids.append(result['id'])

12
couchpotato/core/providers/torrent/iptorrents/main.py

@ -23,7 +23,7 @@ class Base(TorrentProvider):
'base_url' : 'http://www.iptorrents.com',
'login' : 'http://www.iptorrents.com/torrents/',
'login_check': 'http://www.iptorrents.com/inbox.php',
'search' : 'http://www.iptorrents.com/torrents/?l%d=1%%s&q=%s&qf=ti&p=%%d',
'search' : 'http://www.iptorrents.com/torrents/?%s%%s&q=%s&qf=ti&p=%%d',
}
http_time_between_calls = 1 #seconds
@ -31,12 +31,13 @@ class Base(TorrentProvider):
def _buildUrl(self, query, quality_identifier, cat_ids_group = None):
cat_id = self.getCatId(quality_identifier, cat_ids_group)[0]
if not cat_id:
cat_ids = self.getCatId(quality_identifier, cat_ids_group)
if not cat_ids or not len(cat_ids):
log.warning('Unable to find category for quality %s', quality_identifier)
return
return self.urls['search'] % (cat_id, tryUrlencode(query).replace('%', '%%'))
return self.urls['search'] % ("&".join(("l%d=" % x) for x in cat_ids), tryUrlencode(query).replace('%', '%%'))
def _searchOnTitle(self, title, media, quality, results):
@ -140,8 +141,7 @@ class Show(ShowProvider, Base):
]),
('episode', [
([5], ['hdtv_720p', 'webdl_720p', 'webdl_1080p']),
([78], ['hdtv_sd']),
([4, 79], ['hdtv_sd'])
([4, 78, 79], ['hdtv_sd'])
])
]

2
libs/caper/__init__.py

@ -19,7 +19,7 @@ from caper.parsers.anime import AnimeParser
from caper.parsers.scene import SceneParser
__version_info__ = ('0', '2', '0')
__version_info__ = ('0', '2', '2')
__version_branch__ = 'master'
__version__ = "%s%s" % (

2
libs/caper/constraint.py

@ -38,7 +38,7 @@ class CaptureConstraint(object):
def _compare_eq(self, fragment, name, expected):
if not hasattr(fragment, name):
return None
return 1.0, False
return 1.0, getattr(fragment, name) == expected

7
libs/caper/group.py

@ -14,8 +14,9 @@
from logr import Logr
from caper import CaperClosure
from caper.helpers import clean_dict
from caper.result import CaperFragmentNode
from caper.result import CaperFragmentNode, CaperClosureNode
from caper.step import CaptureStep
from caper.constraint import CaptureConstraint
@ -70,7 +71,9 @@ class CaptureGroup(object):
def parse_subject(self, parent_head, subject):
parent_node = parent_head[0] if type(parent_head) is list else parent_head
# TODO - if subject is a closure?
# TODO just jumping into closures for now, will be fixed later
if type(subject) is CaperClosure:
return [CaperClosureNode(subject, parent_head)]
nodes = []

61
libs/caper/matcher.py

@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import pprint
import re
from logr import Logr
from caper.helpers import is_list_type, clean_dict
from caper.helpers import is_list_type
class FragmentMatcher(object):
@ -57,8 +56,6 @@ class FragmentMatcher(object):
self.regex[group_name].append((weight, weight_patterns))
pprint.pprint(self.regex)
def find_group(self, name):
for group_name, weight_groups in self.regex.items():
if group_name and group_name == name:
@ -66,62 +63,6 @@ class FragmentMatcher(object):
return None
def parser_match(self, parser, group_name, single=True):
"""
:type parser: caper.parsers.base.Parser
"""
result = None
for group, weight_groups in self.regex.items():
if group_name and group != group_name:
continue
# TODO handle multiple weights
weight, patterns = weight_groups[0]
for pattern in patterns:
fragments = []
pattern_matched = True
pattern_result = {}
for fragment_pattern in pattern:
if not parser.fragment_available():
pattern_matched = False
break
fragment = parser.next_fragment()
fragments.append(fragment)
Logr.debug('[r"%s"].match("%s")', fragment_pattern.pattern, fragment.value)
match = fragment_pattern.match(fragment.value)
if match:
Logr.debug('Pattern "%s" matched', fragment_pattern.pattern)
else:
pattern_matched = False
break
pattern_result.update(clean_dict(match.groupdict()))
if pattern_matched:
if result is None:
result = {}
if group not in result:
result[group] = {}
Logr.debug('Matched on <%s>', ' '.join([f.value for f in fragments]))
result[group].update(pattern_result)
parser.commit()
if single:
return result
else:
parser.rewind()
return result
def value_match(self, value, group_name=None, single=True):
result = None

57
libs/caper/parsers/base.py

@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from logr import Logr
from caper import FragmentMatcher
from caper.group import CaptureGroup
from caper.result import CaperResult, CaperClosureNode
@ -60,62 +59,6 @@ class Parser(object):
raise NotImplementedError()
#
# Closure Methods
#
def next_closure(self):
self._closure_pos += 1
closure = self.closures[self._closure_pos]
self._history.append(('fragment', -1 - self._fragment_pos))
self._fragment_pos = -1
if self._closure_pos != 0:
self._history.append(('closure', 1))
Logr.debug('(next_closure) closure.value: "%s"', closure.value)
return closure
def closure_available(self):
return self._closure_pos + 1 < len(self.closures)
#
# Fragment Methods
#
def next_fragment(self):
closure = self.closures[self._closure_pos]
self._fragment_pos += 1
fragment = closure.fragments[self._fragment_pos]
self._history.append(('fragment', 1))
Logr.debug('(next_fragment) closure.value "%s" - fragment.value: "%s"', closure.value, fragment.value)
return fragment
def fragment_available(self):
if not self.closure_available():
return False
return self._fragment_pos + 1 < len(self.closures[self._closure_pos].fragments)
def rewind(self):
for source, delta in reversed(self._history):
Logr.debug('(rewind) Rewinding step: %s', (source, delta))
if source == 'fragment':
self._fragment_pos -= delta
elif source == 'closure':
self._closure_pos -= delta
else:
raise NotImplementedError()
self.commit()
def commit(self):
Logr.debug('(commit)')
self._history = []
#
# Capture Methods
#

12
libs/caper/step.py

@ -33,18 +33,6 @@ class CaptureStep(object):
#: @type: bool
self.single = single
def _get_next_subject(self, parser):
if self.source == 'fragment':
if not parser.fragment_available():
return None
return parser.next_fragment()
elif self.source == 'closure':
if not parser.closure_available():
return None
return parser.next_closure()
raise NotImplementedError()
def execute(self, fragment):
if self.regex:
weight, match, num_fragments = self.capture_group.parser.matcher.fragment_match(fragment, self.regex)

42
libs/qcond/__init__.py

@ -0,0 +1,42 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from qcond.transformers.merge import MergeTransformer
from qcond.transformers.slice import SliceTransformer
from qcond.transformers.strip_common import StripCommonTransformer
# Package version, exposed both as a tuple and as a dotted string
# (with an optional "-branch" suffix).
__version_info__ = ('0', '1', '0')
__version_branch__ = 'master'

__version__ = "%s%s" % (
    '.'.join(__version_info__),
    '-' + __version_branch__ if __version_branch__ else ''
)
class QueryCondenser(object):
    """Condenses a list of title variants down to a distinct set.

    Titles are pushed through a fixed pipeline of transformers
    (merge -> slice -> strip-common); each stage refines the output
    of the previous one.
    """

    def __init__(self):
        # Pipeline order matters: merge first, then slice, then strip.
        self.transformers = [
            MergeTransformer(),
            SliceTransformer(),
            StripCommonTransformer(),
        ]

    def distinct(self, titles):
        """Run *titles* through every transformer and return the result."""
        result = titles
        for step in self.transformers:
            result = step.run(result)
        return result

23
libs/qcond/compat.py

@ -0,0 +1,23 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

# True when running under Python 3.
PY3 = sys.version_info[0] == 3

if PY3:
    # Python 3 removed xrange; the built-in range is already lazy.
    xrange = range
else:
    # Re-export the Python 2 built-in so callers can import it from here.
    xrange = xrange

84
libs/qcond/helpers.py

@ -0,0 +1,84 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from difflib import SequenceMatcher
import re
import sys
from logr import Logr
from qcond.compat import xrange
# True when running under Python 3 (mirrors qcond.compat.PY3).
PY3 = sys.version_info[0] == 3
def simplify(s):
    """Normalize *s*: lower-case it and drop apostrophes inside words."""
    lowered = s.lower()
    return re.sub(r"(\w)'(\w)", r"\1\2", lowered)
def strip(s):
    """Strip leading and trailing non-word characters from *s*."""
    pattern = r"^(\W*)(.*?)(\W*)$"
    return re.sub(pattern, r"\2", s)
def create_matcher(a, b, swap_longest = True, case_sensitive = False):
    """Build a SequenceMatcher over *a* and *b*.

    By default the longer string is placed first and both strings are
    upper-cased so the comparison ignores case.
    """
    if swap_longest and len(b) > len(a):
        a, b = b, a

    if not case_sensitive:
        a, b = a.upper(), b.upper()

    return SequenceMatcher(None, a, b)
def first(function_or_none, sequence):
    """Return the first item of *sequence* matching the predicate, else None.

    *function_or_none* mirrors filter(): when None, the first truthy item
    is returned. Iterates lazily on both Python 2 and 3 — the previous
    implementation materialized the entire sequence on Python 2 and
    depended on the module-level PY3 flag.
    """
    predicate = function_or_none if function_or_none is not None else bool
    for item in sequence:
        if predicate(item):
            return item
    return None
def sorted_append(sequence, item, func):
    """Insert *item* before the first element for which func(element) is truthy.

    Falls back to appending when the sequence is empty or no element
    satisfies *func*. Replaces the previous index loop over the
    py2-compat ``xrange`` (and its dead ``x = 0`` initializer) with
    idiomatic enumerate(); behavior is unchanged.
    """
    for index, existing in enumerate(sequence):
        if func(existing):
            sequence.insert(index, item)
            return

    sequence.append(item)
def itemsMatch(L1, L2):
    """Return True when both sequences contain exactly the same items."""
    if len(L1) != len(L2):
        return False
    return sorted(L1) == sorted(L2)
def distinct(sequence):
    """Return unique items from *sequence*, keeping first-seen order.

    Uses list membership (not a set) so unhashable items are supported.
    """
    seen = []
    for candidate in sequence:
        if candidate in seen:
            continue
        seen.append(candidate)
    return seen

0
libs/qcond/transformers/__init__.py

21
libs/qcond/transformers/base.py

@ -0,0 +1,21 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class Transformer(object):
    """Abstract base class for qcond title-list transformers."""

    def __init__(self):
        pass

    def run(self, titles):
        # Subclasses must return a transformed list of titles.
        raise NotImplementedError()

238
libs/qcond/transformers/merge.py

@ -0,0 +1,238 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from operator import itemgetter
from logr import Logr
from qcond.helpers import simplify, strip, first, sorted_append, distinct
from qcond.transformers.base import Transformer
from qcond.compat import xrange
class MergeTransformer(Transformer):
    """Builds a prefix tree of the titles' words and merges similar branches.

    Produces condensed title candidates ordered by how strongly the input
    titles share each word sequence.
    """

    def __init__(self):
        super(MergeTransformer, self).__init__()

    def run(self, titles):
        """Condense *titles* and return candidate values, best-scoring first."""
        # Normalize (lower-case, strip in-word apostrophes) and de-duplicate.
        titles = distinct([simplify(title) for title in titles])

        Logr.info(str(titles))

        Logr.debug("------------------------------------------------------------")

        # Build the word tree; 'tails' holds the final node for each title.
        root, tails = self.parse(titles)

        Logr.debug("--------------------------PARSE-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------MERGE-----------------------------")

        # Collapse sibling branches in place.
        self.merge(root)

        Logr.debug("--------------------------FINAL-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------RESULT-----------------------------")

        scores = {}
        results = []

        for tail in tails:
            score, value, original_value = tail.full_value()

            # Identical merged values accumulate score instead of duplicating.
            if value in scores:
                scores[value] += score
            else:
                results.append((value, original_value))
                scores[value] = score

            Logr.debug("%s %s %s", score, value, original_value)

        # Highest accumulated score first; ties broken by the original title.
        sorted_results = sorted(results, key=lambda item: (scores[item[0]], item[1]), reverse = True)

        return [result[0] for result in sorted_results]

    def parse(self, titles):
        """Build a DNode word tree from *titles*.

        Returns (root, tails): the list of top-level nodes and the last
        node reached while inserting each title.
        """
        root = []
        tails = []

        for title in titles:
            Logr.debug(title)

            cur = None
            words = title.split(' ')

            for wx in xrange(len(words)):
                word = strip(words[wx])

                if cur is None:
                    # First word: attach to (or create) a top-level node.
                    cur = find_node(root, word)

                    if cur is None:
                        cur = DNode(word, None, num_children=len(words) - wx, original_value=title)
                        root.append(cur)
                else:
                    parent = cur
                    parent.weight += 1

                    # Subsequent word: descend into the parent's children.
                    cur = find_node(parent.right, word)

                    if cur is None:
                        Logr.debug("%s %d", word, len(words) - wx)
                        cur = DNode(word, parent, num_children=len(words) - wx)
                        # Keep children ordered by descending num_children.
                        sorted_append(parent.right, cur, lambda a: a.num_children < cur.num_children)
                    else:
                        cur.weight += 1

            tails.append(cur)

        return root, tails

    def merge(self, root):
        """Merge the children of every top-level node, in place."""
        for x in range(len(root)):
            Logr.debug(root[x])
            root[x].right = self._merge(root[x].right)
            Logr.debug('=================================================================')

        return root

    def get_nodes_right(self, value):
        """Return *value* (node or list of nodes) plus all right-descendants."""
        if type(value) is not list:
            value = [value]

        nodes = []

        for node in value:
            nodes.append(node)

            for child in self.get_nodes_right(node.right):
                nodes.append(child)

        return nodes

    def destroy_nodes_right(self, value):
        """Mark *value* and every right-descendant as dead (value cleared)."""
        nodes = self.get_nodes_right(value)

        for node in nodes:
            node.value = None
            node.dead = True

    def _merge(self, nodes, depth = 0):
        """Fold all sibling *nodes* into the first one, then recurse right."""
        Logr.debug(str('\t' * depth) + str(nodes))

        top = nodes[0]

        # Merge into top
        for x in range(len(nodes)):
            # Merge extra results into top
            if x > 0:
                top.value = None
                top.weight += nodes[x].weight
                self.destroy_nodes_right(top.right)

                if len(nodes[x].right):
                    top.join_right(nodes[x].right)
                    Logr.debug("= %s joined %s", nodes[x], top)

                nodes[x].dead = True

        nodes = [n for n in nodes if not n.dead]

        # Traverse further
        for node in nodes:
            if len(node.right):
                node.right = self._merge(node.right, depth + 1)

        return nodes
def print_tree(node, depth = 0):
    """Log the subtree rooted at *node*, one line per node, indented by depth."""
    Logr.debug('%s%s' % ('\t' * depth, node))

    children = node.right
    if not len(children):
        # Leaf node: also log the phrase accumulated along this branch.
        Logr.debug(node.full_value()[1])
        return

    for child in children:
        print_tree(child, depth + 1)
def find_node(node_list, value):
    """Return the first node in *node_list* whose value equals *value*, else None."""
    matches = (node for node in node_list if node.value == value)
    return next(matches, None)
class DNode(object):
    """A node in the word tree built by MergeTransformer.parse."""

    def __init__(self, value, parent, right=None, weight=1, num_children=None, original_value=None):
        # The word this node represents (cleared to None once merged away).
        self.value = value

        self.parent = parent

        if right is None:
            right = []
        # Child nodes: the words that followed this one in some title.
        self.right = right

        # How many titles passed through this node.
        self.weight = weight

        # The full source title (passed only for root nodes; see parse()).
        self.original_value = original_value
        # Number of words remaining in the title when this node was created.
        self.num_children = num_children

        self.dead = False

    def join_right(self, nodes):
        """Adopt *nodes* as children, merging any that duplicate an existing value."""
        for node in nodes:
            duplicate = first(lambda x: x.value == node.value, self.right)

            if duplicate:
                duplicate.weight += node.weight
                duplicate.join_right(node.right)
            else:
                node.parent = self
                self.right.append(node)

    def full_value(self):
        """Walk up to the root collecting live words.

        Returns (average weight per word, joined phrase, root original title).
        NOTE(review): assumes at least one live word on the path — an
        all-dead chain would raise ZeroDivisionError; confirm with callers.
        """
        words = []
        total_score = 0

        cur = self
        root = None

        while cur is not None:
            if cur.value and not cur.dead:
                words.insert(0, cur.value)
                total_score += cur.weight

            if cur.parent is None:
                root = cur

            cur = cur.parent

        return float(total_score) / len(words), ' '.join(words), root.original_value if root else None

    def __repr__(self):
        return '<%s value:"%s", weight: %s, num_children: %s%s%s>' % (
            'DNode',
            self.value,
            self.weight,
            self.num_children,
            (', original_value: %s' % self.original_value) if self.original_value else '',
            ' REMOVING' if self.dead else ''
        )

280
libs/qcond/transformers/slice.py

@ -0,0 +1,280 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from logr import Logr
from qcond.helpers import create_matcher
from qcond.transformers.base import Transformer
class SliceTransformer(Transformer):
    """Removes near-duplicate titles using pairwise SequenceMatcher similarity."""

    def __init__(self):
        super(SliceTransformer, self).__init__()

    def run(self, titles):
        """Return *titles* with highly-similar and insertion-only variants removed."""
        nodes = []

        # Create a node for each title
        for title in titles:
            nodes.append(SimNode(title))

        # Calculate similarities between nodes
        for node in nodes:
            calculate_sim_links(node, [n for n in nodes if n != node])

        # Drop one of any pair that is >= 90% similar.
        kill_nodes_above(nodes, 0.90)

        Logr.debug('---------------------------------------------------------------------')

        print_link_tree(nodes)
        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        Logr.debug('---------------------------------------------------------------------')

        kill_trailing_nodes(nodes)

        Logr.debug('---------------------------------------------------------------------')

        # Sort remaining nodes by 'num_merges'
        nodes = sorted(nodes, key=lambda n: n.num_merges, reverse=True)

        print_link_tree(nodes)

        Logr.debug('---------------------------------------------------------------------')

        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        return [n.value for n in nodes]
class SimLink(object):
    """Similarity data for one directed pair of SimNodes."""

    def __init__(self, similarity, opcodes, stats):
        # SequenceMatcher.quick_ratio() between the two node values.
        self.similarity = similarity
        # SequenceMatcher opcodes transforming one value into the other.
        self.opcodes = opcodes
        # Aggregated opcode statistics (see get_opcode_stats).
        self.stats = stats
class SimNode(object):
    """A candidate title participating in similarity comparison."""

    def __init__(self, value):
        # The title text this node represents.
        self.value = value
        # Marked True once this node has been merged into another.
        self.dead = False
        # How many other nodes were merged into this one.
        self.num_merges = 0

        self.links = {}  # {<other SimNode>: <SimLink>}
def kill_nodes(nodes, killed_nodes):
# Remove killed nodes from root list
for node in killed_nodes:
if node in nodes:
nodes.remove(node)
# Remove killed nodes from links
for killed_node in killed_nodes:
for node in nodes:
if killed_node in node.links:
node.links.pop(killed_node)
def kill_nodes_above(nodes, above_sim):
    """Collapse pairs of nodes whose similarity is >= *above_sim*.

    For each such pair the node with the longer value (or the owning node
    on a tie) is killed, the survivor's num_merges counter is incremented,
    and killed nodes are removed from *nodes* via kill_nodes().
    """
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

            if link.similarity >= above_sim:
                if len(link_node.value) > len(node.value):
                    Logr.debug('\t\tvery similar, killed this node')

                    # The linked node is longer: keep the current node.
                    link_node.dead = True
                    node.num_merges += 1
                    killed_nodes.append(link_node)
                else:
                    Logr.debug('\t\tvery similar, killed owner')

                    # Current node is longer (or equal): keep the linked node.
                    node.dead = True
                    link_node.num_merges += 1
                    killed_nodes.append(node)

    kill_nodes(nodes, killed_nodes)
def print_link_tree(nodes):
    """Log every node's value, merge count and similarity links."""
    for node in nodes:
        Logr.debug(node.value)
        Logr.debug('\tnum_merges: %s', node.num_merges)

        if not len(node.links):
            continue

        Logr.debug('\t========== LINKS ==========')
        for link_node, link in node.links.items():
            Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

        Logr.debug('\t---------------------------')
def kill_trailing_nodes(nodes):
    """Kill linked nodes that only add inserted text to another node.

    A linked node dies when the diff between the pair is insertion-only
    (no deletions or replacements), at least half the owner's characters
    match, and every insert/delete sits on a word boundary
    (link.stats['valid']).
    """
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            # 'valid' means all insert/delete edits fell on word boundaries.
            is_valid = link.stats.get('valid', False)

            has_deletions = False
            has_insertions = False
            has_replacements = False

            for opcode in link.opcodes:
                if opcode[0] == 'delete':
                    has_deletions = True

                if opcode[0] == 'insert':
                    has_insertions = True

                if opcode[0] == 'replace':
                    has_replacements = True

            # Fractions are relative to the owning node's length.
            equal_perc = link.stats.get('equal', 0) / float(len(node.value))
            insert_perc = link.stats.get('insert', 0) / float(len(node.value))

            Logr.debug('\t({0:<24}) [{1:02d}:{2:02d} = {3:02d} {4:3.0f}% {5:3.0f}%] -- {6:<45}'.format(
                'd:%s, i:%s, r:%s' % (has_deletions, has_insertions, has_replacements),
                len(node.value), len(link_node.value), link.stats.get('equal', 0),
                equal_perc * 100, insert_perc * 100,
                '"{0}"'.format(link_node.value)
            ))

            Logr.debug('\t\t%s', link.stats)

            # All conditions must hold for the linked node to be dropped.
            kill = all([
                is_valid,
                equal_perc >= 0.5,
                insert_perc < 2,
                has_insertions,
                not has_deletions,
                not has_replacements
            ])

            if kill:
                Logr.debug('\t\tkilled this node')

                link_node.dead = True
                node.num_merges += 1
                killed_nodes.append(link_node)

    kill_nodes(nodes, killed_nodes)
# Logging template for one opcode row: tag, (i1:i2), (j1:j2).
stats_print_format = "\t{0:<8} ({1:2d}:{2:2d}) ({3:2d}:{4:2d})"
def get_index_values(iterable, a, b):
    """Look up *a* and *b* in *iterable*, treating falsy indices as None.

    Index 0 deliberately yields None, matching get_indices' treatment of
    string boundaries.
    """
    first_value = iterable[a] if a else None
    second_value = iterable[b] if b else None
    return first_value, second_value
def get_indices(iterable, a, b):
    """Clamp *a* and *b* to valid interior indices of *iterable* (else None).

    Index 0 and the end bound are treated as out of range, matching
    get_index_values' handling of string boundaries.
    """
    limit = len(iterable)
    first_index = a if 0 < a < limit else None
    second_index = b if 0 < b < limit else None
    return first_index, second_index
def get_opcode_stats(for_node, node, opcodes):
    """Aggregate SequenceMatcher *opcodes* into a stats dict.

    The result maps each opcode tag ('equal', 'insert', 'delete',
    'replace') to a character count, plus a 'valid' flag that is True
    only when every insert/delete edit sits on a word boundary (the
    characters around the edit are spaces or the string ends).
    """
    stats = {}

    for tag, i1, i2, j1, j2 in opcodes:
        Logr.debug(stats_print_format.format(
            tag, i1, i2, j1, j2
        ))

        if tag in ['insert', 'delete']:
            ax = None, None
            bx = None, None

            # Indices of the characters immediately surrounding the edit.
            if tag == 'insert':
                ax = get_indices(for_node.value, i1 - 1, i1)
                bx = get_indices(node.value, j1, j2 - 1)

            if tag == 'delete':
                ax = get_indices(for_node.value, j1 - 1, j1)
                bx = get_indices(node.value, i1, i2 - 1)

            av = get_index_values(for_node.value, *ax)
            bv = get_index_values(node.value, *bx)

            Logr.debug(
                '\t\t%s %s [%s><%s] <---> %s %s [%s><%s]',
                ax, av, av[0], av[1],
                bx, bv, bv[0], bv[1]
            )

            # Boundary characters must be a space or the string edge (None).
            head_valid = av[0] in [None, ' '] or bv[0] in [None, ' ']
            tail_valid = av[1] in [None, ' '] or bv[1] in [None, ' ']
            valid = head_valid and tail_valid

            # Once any edit is invalid, 'valid' stays False.
            if 'valid' not in stats or (stats['valid'] and not valid):
                stats['valid'] = valid

            Logr.debug('\t\t' + ('VALID' if valid else 'INVALID'))

        # Count edited characters for every tag, including 'equal'.
        if tag not in stats:
            stats[tag] = 0

        stats[tag] += (i2 - i1) or (j2 - j1)

    return stats
def calculate_sim_links(for_node, other_nodes):
    """Populate similarity links between *for_node* and each of *other_nodes*.

    Stores a SimLink in both nodes' link maps; pairs that already have a
    link are skipped, so each pair is computed only once.
    """
    for node in other_nodes:
        if node in for_node.links:
            continue

        Logr.debug('calculating similarity between "%s" and "%s"', for_node.value, node.value)

        # Get similarity
        similarity_matcher = create_matcher(for_node.value, node.value)
        similarity = similarity_matcher.quick_ratio()

        # Get for_node -> node opcodes
        a_opcodes_matcher = create_matcher(for_node.value, node.value, swap_longest = False)
        a_opcodes = a_opcodes_matcher.get_opcodes()
        a_stats = get_opcode_stats(for_node, node, a_opcodes)

        Logr.debug('-' * 100)

        # Get node -> for_node opcodes
        b_opcodes_matcher = create_matcher(node.value, for_node.value, swap_longest = False)
        b_opcodes = b_opcodes_matcher.get_opcodes()
        b_stats = get_opcode_stats(for_node, node, b_opcodes)

        for_node.links[node] = SimLink(similarity, a_opcodes, a_stats)
        node.links[for_node] = SimLink(similarity, b_opcodes, b_stats)

        #raw_input('Press ENTER to continue')

26
libs/qcond/transformers/strip_common.py

@ -0,0 +1,26 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from qcond.transformers.base import Transformer
# Words too generic to be useful as a standalone search title.
COMMON_WORDS = [
    'the'
]
class StripCommonTransformer(Transformer):
    """Drops titles that consist entirely of a single common word (e.g. 'the')."""

    def run(self, titles):
        """Return *titles* without entries whose lower-cased form is a common word."""
        kept = []
        for title in titles:
            if title.lower() in COMMON_WORDS:
                continue
            kept.append(title)
        return kept
Loading…
Cancel
Save