23 changed files with 1122 additions and 374 deletions
@ -0,0 +1,6 @@ |
|||
from .main import Matcher |
|||
|
|||
def start():
    """Build and return the Matcher plugin instance for this package."""
    matcher = Matcher()
    return matcher


# Empty list: this package declares no configuration entries.
config = []
@ -0,0 +1,109 @@ |
|||
from caper import Caper |
|||
from couchpotato import CPLog, tryInt |
|||
from couchpotato.core.event import addEvent, fireEvent |
|||
from couchpotato.core.helpers.encoding import simplifyString |
|||
from couchpotato.core.helpers.variable import possibleTitles, dictIsSubset |
|||
from couchpotato.core.plugins.base import Plugin |
|||
|
|||
log = CPLog(__name__) |
|||
|
|||
|
|||
class Matcher(Plugin):
    """Plugin that parses release names with Caper and validates the result.

    The constructor registers the ``matcher.*`` events handled by the methods
    of this class.
    """

    def __init__(self):
        self.caper = Caper()

        addEvent('matcher.parse', self.parse)
        addEvent('matcher.best', self.best)

        addEvent('matcher.correct_title', self.correctTitle)
        addEvent('matcher.correct_identifier', self.correctIdentifier)
        addEvent('matcher.correct_quality', self.correctQuality)

    def parse(self, release):
        """Return the Caper parse result for ``release['name']``."""
        return self.caper.parse(release['name'])

    def best(self, release, media, quality):
        """Return the first parsed chain accepted by 'searcher.correct_match'.

        Returns ``False`` when the release name yields no chains and ``None``
        when chains exist but none of them is accepted.
        """
        rel_info = fireEvent('matcher.parse', release, single = True)

        # NOTE(review): assumes fireEvent may hand back None when the parse
        # handler produced nothing - treat that the same as "no chains".
        if rel_info is None or len(rel_info.chains) < 1:
            log.info2('Wrong: %s, unable to parse release name (no chains)', release['name'])
            return False

        for chain in rel_info.chains:
            if fireEvent('searcher.correct_match', chain, release, media, quality, single = True):
                return chain

        return None

    def chainMatch(self, chain, group, tags):
        """Tell whether the ``group`` section of ``chain`` satisfies ``tags``.

        ``tags`` maps a tag name to the accepted (simplified) values. A tag
        whose accepted values contain ``None`` is optional; every other tag
        must be found in the chain for the match to succeed.
        """
        found_tags = set()

        for match in chain.info[group]:
            for ck, cv in match.items():
                if ck in tags and simplifyString(cv) in tags[ck]:
                    found_tags.add(ck)

        required_tags = set(key for key, value in tags.items() if None not in value)

        # Subset rather than equality: finding some (but not all) optional
        # tags must not turn an otherwise valid release into a mismatch.
        return required_tags.issubset(found_tags)

    def correctIdentifier(self, chain, media):
        """Check that the chain's single identifier covers the required one."""
        required_id = fireEvent('searcher.get_media_identifier', media['library'], single = True)

        if 'identifier' not in chain.info:
            return False

        # TODO could be handled better?
        if len(chain.info['identifier']) != 1:
            return False

        # TODO air by date episodes

        # TODO this should support identifiers with characters 'a', 'b', etc..
        # Convert into a fresh dict so the chain's parsed info is left intact.
        identifier = dict(
            (k, tryInt(v, None)) for k, v in chain.info['identifier'][0].items()
        )

        if not dictIsSubset(required_id, identifier):
            log.info2('Wrong: required identifier %s does not match release identifier %s', (str(required_id), str(identifier)))
            return False

        return True

    def correctTitle(self, chain, media):
        """Check that the parsed show name equals one of the known titles."""
        root_library = fireEvent('searcher.get_media_root', media['library'], single = True)

        if 'show_name' not in chain.info or not len(chain.info['show_name']):
            log.info('Wrong: missing show name in parsed result')
            return False

        chain_words = [x.lower() for x in chain.info['show_name']]

        # Check show titles match
        # TODO check xem names
        for title in root_library['info']['titles']:
            for valid_words in [x.split(' ') for x in possibleTitles(title)]:
                if valid_words == chain_words:
                    return True

        return False

    def correctQuality(self, chain, quality, quality_map):
        """Check the chain's video tags against the preferred quality."""
        if quality['identifier'] not in quality_map:
            log.info2('Wrong: unknown preferred quality %s', quality['identifier'])
            return False

        if 'video' not in chain.info:
            log.info2('Wrong: no video tags found')
            return False

        video_tags = quality_map[quality['identifier']]

        if not self.chainMatch(chain, 'video', video_tags):
            log.info2('Wrong: %s tags not in chain', video_tags)
            return False

        return True
@ -0,0 +1,42 @@ |
|||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com> |
|||
# |
|||
# Licensed under the Apache License, Version 2.0 (the "License"); |
|||
# you may not use this file except in compliance with the License. |
|||
# You may obtain a copy of the License at |
|||
# |
|||
# http://www.apache.org/licenses/LICENSE-2.0 |
|||
# |
|||
# Unless required by applicable law or agreed to in writing, software |
|||
# distributed under the License is distributed on an "AS IS" BASIS, |
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
# See the License for the specific language governing permissions and |
|||
# limitations under the License. |
|||
|
|||
|
|||
from qcond.transformers.merge import MergeTransformer |
|||
from qcond.transformers.slice import SliceTransformer |
|||
from qcond.transformers.strip_common import StripCommonTransformer |
|||
|
|||
|
|||
__version_info__ = ('0', '1', '0')
__version_branch__ = 'master'

# Dotted version number, followed by "-<branch>" when a branch name is set.
__version__ = '.'.join(__version_info__) + (
    ('-' + __version_branch__) if __version_branch__ else ''
)
|||
|
|||
|
|||
class QueryCondenser(object):
    """Runs a fixed sequence of transformers over a list of titles."""

    def __init__(self):
        # Order matters: each transformer receives the previous one's output.
        self.transformers = [
            MergeTransformer(),
            SliceTransformer(),
            StripCommonTransformer(),
        ]

    def distinct(self, titles):
        """Feed ``titles`` through every transformer and return the result."""
        condensed = titles

        for step in self.transformers:
            condensed = step.run(condensed)

        return condensed
@ -0,0 +1,23 @@ |
|||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com> |
|||
# |
|||
# Licensed under the Apache License, Version 2.0 (the "License"); |
|||
# you may not use this file except in compliance with the License. |
|||
# You may obtain a copy of the License at |
|||
# |
|||
# http://www.apache.org/licenses/LICENSE-2.0 |
|||
# |
|||
# Unless required by applicable law or agreed to in writing, software |
|||
# distributed under the License is distributed on an "AS IS" BASIS, |
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
# See the License for the specific language governing permissions and |
|||
# limitations under the License. |
|||
|
|||
|
|||
import sys |
|||
|
|||
PY3 = sys.version_info[0] == 3

# Bind ``xrange`` on both major versions: it aliases ``range`` on Python 3
# and is simply re-exported from the builtins otherwise.
xrange = range if PY3 else xrange
@ -0,0 +1,84 @@ |
|||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com> |
|||
# |
|||
# Licensed under the Apache License, Version 2.0 (the "License"); |
|||
# you may not use this file except in compliance with the License. |
|||
# You may obtain a copy of the License at |
|||
# |
|||
# http://www.apache.org/licenses/LICENSE-2.0 |
|||
# |
|||
# Unless required by applicable law or agreed to in writing, software |
|||
# distributed under the License is distributed on an "AS IS" BASIS, |
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
# See the License for the specific language governing permissions and |
|||
# limitations under the License. |
|||
|
|||
|
|||
from difflib import SequenceMatcher |
|||
import re |
|||
import sys |
|||
from logr import Logr |
|||
from qcond.compat import xrange |
|||
|
|||
|
|||
PY3 = sys.version_info[0] == 3 |
|||
|
|||
|
|||
def simplify(s):
    """Lower-case ``s`` and drop apostrophes enclosed by word characters.

    Lookarounds are used instead of consuming the neighbouring characters so
    that consecutive apostrophes sharing a letter ("rock'n'roll") are all
    removed; a consuming pattern would skip every second one.
    """
    return re.sub(r"(?<=\w)'(?=\w)", "", s.lower())
|||
|
|||
|
|||
def strip(s):
    """Return ``s`` without its leading and trailing non-word characters."""
    # Group 2 is the lazily matched middle part between the two \W* runs.
    edge_pattern = r"^(\W*)(.*?)(\W*)$"
    return re.sub(edge_pattern, r"\2", s)
|||
|
|||
|
|||
def create_matcher(a, b, swap_longest = True, case_sensitive = False):
    """Build a ``SequenceMatcher`` for the two strings.

    With ``swap_longest`` the longer string always becomes the first
    sequence. Unless ``case_sensitive`` is set, both strings are upper-cased
    before being handed to the matcher.
    """
    if swap_longest and len(a) < len(b):
        a, b = b, a

    if case_sensitive:
        return SequenceMatcher(None, a, b)

    return SequenceMatcher(None, a.upper(), b.upper())
|||
|
|||
|
|||
def first(function_or_none, sequence):
    """Return the first item of ``sequence`` kept by ``filter``, else None.

    ``function_or_none`` has the same meaning as the first argument of the
    builtin ``filter``: a predicate, or ``None`` to keep truthy items.
    Iterating the filter result works for both the list returned on Python 2
    and the lazy iterator returned on Python 3, so no version check is needed.
    """
    for item in filter(function_or_none, sequence):
        return item

    return None
|||
|
|||
def sorted_append(sequence, item, func):
    """Insert ``item`` before the first element for which ``func`` is true.

    ``sequence`` is modified in place. When no element satisfies ``func``
    (which includes the empty sequence) the item is appended at the end.
    """
    for index, existing in enumerate(sequence):
        if func(existing):
            sequence.insert(index, item)
            return

    sequence.append(item)
|||
|
|||
def itemsMatch(L1, L2):
    """Return True when both collections hold the same items, in any order."""
    if len(L1) != len(L2):
        return False

    return sorted(L1) == sorted(L2)
|||
|
|||
def distinct(sequence):
    """Return the items of ``sequence`` without repeats, first-seen order."""
    unique = []

    for candidate in sequence:
        if candidate in unique:
            continue

        unique.append(candidate)

    return unique
@ -0,0 +1,21 @@ |
|||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com> |
|||
# |
|||
# Licensed under the Apache License, Version 2.0 (the "License"); |
|||
# you may not use this file except in compliance with the License. |
|||
# You may obtain a copy of the License at |
|||
# |
|||
# http://www.apache.org/licenses/LICENSE-2.0 |
|||
# |
|||
# Unless required by applicable law or agreed to in writing, software |
|||
# distributed under the License is distributed on an "AS IS" BASIS, |
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
# See the License for the specific language governing permissions and |
|||
# limitations under the License. |
|||
|
|||
|
|||
class Transformer(object):
    """Base class for title transformers; ``run`` must be overridden."""

    def __init__(self):
        """Nothing to initialise on the base class."""

    def run(self, titles):
        """Transform ``titles``; the base implementation always raises."""
        raise NotImplementedError()
@ -0,0 +1,238 @@ |
|||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com> |
|||
# |
|||
# Licensed under the Apache License, Version 2.0 (the "License"); |
|||
# you may not use this file except in compliance with the License. |
|||
# You may obtain a copy of the License at |
|||
# |
|||
# http://www.apache.org/licenses/LICENSE-2.0 |
|||
# |
|||
# Unless required by applicable law or agreed to in writing, software |
|||
# distributed under the License is distributed on an "AS IS" BASIS, |
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
# See the License for the specific language governing permissions and |
|||
# limitations under the License. |
|||
|
|||
|
|||
from operator import itemgetter |
|||
from logr import Logr |
|||
from qcond.helpers import simplify, strip, first, sorted_append, distinct |
|||
from qcond.transformers.base import Transformer |
|||
from qcond.compat import xrange |
|||
|
|||
|
|||
class MergeTransformer(Transformer):
    """Merge titles that share leading words into scored condensed titles.

    Titles are split into words and stored in a tree of ``DNode`` objects,
    sibling branches are merged, and the surviving values are returned
    ordered by their accumulated score.
    """

    def __init__(self):
        super(MergeTransformer, self).__init__()

    def run(self, titles):
        """Return the merged titles, highest score first."""
        titles = distinct([simplify(title) for title in titles])

        Logr.info(str(titles))

        Logr.debug("------------------------------------------------------------")

        root, tails = self.parse(titles)

        Logr.debug("--------------------------PARSE-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------MERGE-----------------------------")

        self.merge(root)

        Logr.debug("--------------------------FINAL-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------RESULT-----------------------------")

        scores = {}
        results = []

        for tail in tails:
            score, value, original_value = tail.full_value()

            # Several tails can collapse to the same value after merging;
            # keep one result entry and add the scores together.
            if value in scores:
                scores[value] += score
            else:
                results.append((value, original_value))
                scores[value] = score

            Logr.debug("%s %s %s", score, value, original_value)

        sorted_results = sorted(results, key=lambda item: (scores[item[0]], item[1]), reverse = True)

        return [result[0] for result in sorted_results]

    def parse(self, titles):
        """Build the word tree for ``titles``.

        Returns ``(root, tails)``: the list of first-word nodes and, for each
        title, the node holding its last word.
        """
        root = []
        tails = []

        for title in titles:
            Logr.debug(title)

            cur = None
            words = title.split(' ')

            for wx, raw_word in enumerate(words):
                word = strip(raw_word)
                remaining = len(words) - wx

                if cur is None:
                    cur = find_node(root, word)

                    if cur is None:
                        cur = DNode(word, None, num_children=remaining, original_value=title)
                        root.append(cur)
                else:
                    parent = cur
                    parent.weight += 1

                    cur = find_node(parent.right, word)

                    if cur is None:
                        Logr.debug("%s %d", word, remaining)
                        cur = DNode(word, parent, num_children=remaining)
                        sorted_append(parent.right, cur, lambda a: a.num_children < cur.num_children)
                    else:
                        cur.weight += 1

            tails.append(cur)

        return root, tails

    def merge(self, root):
        """Merge the children of every root node in place; returns ``root``."""
        for node in root:
            Logr.debug(node)
            node.right = self._merge(node.right)
            Logr.debug('=================================================================')

        return root

    def get_nodes_right(self, value):
        """Return ``value`` (a node or list of nodes) plus all descendants."""
        if not isinstance(value, list):
            value = [value]

        nodes = []

        for node in value:
            nodes.append(node)
            nodes.extend(self.get_nodes_right(node.right))

        return nodes

    def destroy_nodes_right(self, value):
        """Clear the value of, and mark dead, ``value`` and its descendants."""
        for node in self.get_nodes_right(value):
            node.value = None
            node.dead = True

    def _merge(self, nodes, depth = 0):
        """Merge sibling ``nodes`` into the first one and recurse downwards.

        Returns the list of nodes that are still alive afterwards.
        """
        Logr.debug(str('\t' * depth) + str(nodes))

        # A node without children (e.g. a single-word title) has nothing to
        # merge; indexing the empty list below would raise IndexError.
        if not nodes:
            return []

        top = nodes[0]

        # Merge extra results into top
        for extra in nodes[1:]:
            top.value = None
            top.weight += extra.weight
            self.destroy_nodes_right(top.right)

            if len(extra.right):
                top.join_right(extra.right)

                Logr.debug("= %s joined %s", extra, top)

            extra.dead = True

        nodes = [n for n in nodes if not n.dead]

        # Traverse further
        for node in nodes:
            if len(node.right):
                node.right = self._merge(node.right, depth + 1)

        return nodes
|||
|
|||
|
|||
def print_tree(node, depth = 0):
    """Debug-log ``node`` and its descendants, one tab of indent per level."""
    indent = str('\t' * depth)
    Logr.debug(indent + str(node))

    if not len(node.right):
        # Leaf: also log the value assembled along the path to the root.
        Logr.debug(node.full_value()[1])
        return

    for child in node.right:
        print_tree(child, depth + 1)
|||
|
|||
|
|||
def find_node(node_list, value):
    """Return the first node in ``node_list`` whose value equals ``value``.

    Returns ``None`` when no node matches.
    """
    matches = (candidate for candidate in node_list if candidate.value == value)
    return next(matches, None)
|||
|
|||
|
|||
class DNode(object):
    """One word in the title tree, linked to its parent and its children."""

    def __init__(self, value, parent, right=None, weight=1, num_children=None, original_value=None):
        self.value = value

        self.parent = parent

        # A fresh list per instance; a shared default would leak children.
        if right is None:
            right = []
        self.right = right

        self.weight = weight

        self.original_value = original_value
        self.num_children = num_children

        self.dead = False

    def join_right(self, nodes):
        """Attach ``nodes`` as children, merging those with an equal value.

        A node whose value already exists among the children adds its weight
        to the existing child and has its own children joined recursively;
        any other node is re-parented to this node and appended.
        """
        for node in nodes:
            duplicate = next(
                (child for child in self.right if child.value == node.value),
                None
            )

            if duplicate is not None:
                duplicate.weight += node.weight
                duplicate.join_right(node.right)
            else:
                node.parent = self
                self.right.append(node)

    def full_value(self):
        """Walk up to the root and return ``(score, value, original_value)``.

        ``value`` joins the words of all live nodes on the path with spaces,
        ``score`` is the mean weight of those nodes (0.0 when there are none)
        and ``original_value`` is taken from the root node.
        """
        words = []
        total_score = 0

        cur = self
        root = None

        while cur is not None:
            if cur.value and not cur.dead:
                words.insert(0, cur.value)
                total_score += cur.weight

            if cur.parent is None:
                root = cur
            cur = cur.parent

        # Every node on the path may be dead or empty; avoid dividing by zero.
        score = float(total_score) / len(words) if words else 0.0

        return score, ' '.join(words), root.original_value if root else None

    def __repr__(self):
        return '<%s value:"%s", weight: %s, num_children: %s%s%s>' % (
            'DNode',
            self.value,
            self.weight,
            self.num_children,
            (', original_value: %s' % self.original_value) if self.original_value else '',
            ' REMOVING' if self.dead else ''
        )
@ -0,0 +1,280 @@ |
|||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com> |
|||
# |
|||
# Licensed under the Apache License, Version 2.0 (the "License"); |
|||
# you may not use this file except in compliance with the License. |
|||
# You may obtain a copy of the License at |
|||
# |
|||
# http://www.apache.org/licenses/LICENSE-2.0 |
|||
# |
|||
# Unless required by applicable law or agreed to in writing, software |
|||
# distributed under the License is distributed on an "AS IS" BASIS, |
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
# See the License for the specific language governing permissions and |
|||
# limitations under the License. |
|||
|
|||
|
|||
from logr import Logr |
|||
from qcond.helpers import create_matcher |
|||
from qcond.transformers.base import Transformer |
|||
|
|||
|
|||
class SliceTransformer(Transformer):
    """Drop titles that are near-duplicates or extensions of other titles."""

    def __init__(self):
        super(SliceTransformer, self).__init__()

    def run(self, titles):
        """Return the surviving titles, most-merged first."""
        # Create a node for each title
        nodes = [SimNode(title) for title in titles]

        # Calculate similarities between nodes
        for current in nodes:
            calculate_sim_links(current, [other for other in nodes if other != current])

        kill_nodes_above(nodes, 0.90)

        Logr.debug('---------------------------------------------------------------------')

        print_link_tree(nodes)
        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        Logr.debug('---------------------------------------------------------------------')

        kill_trailing_nodes(nodes)

        Logr.debug('---------------------------------------------------------------------')

        # Sort remaining nodes by 'num_merges'
        nodes = sorted(nodes, key=lambda n: n.num_merges, reverse=True)

        print_link_tree(nodes)

        Logr.debug('---------------------------------------------------------------------')

        remaining_values = [n.value for n in nodes]
        Logr.debug('%s %s', len(nodes), remaining_values)

        return [n.value for n in nodes]
|||
|
|||
|
|||
class SimLink(object):
    """Holds the similarity ratio, opcodes and opcode stats of one link."""

    def __init__(self, similarity, opcodes, stats):
        self.stats = stats
        self.opcodes = opcodes
        self.similarity = similarity
|||
|
|||
|
|||
class SimNode(object):
    """A title together with its links to the other titles."""

    def __init__(self, value):
        self.value = value

        # {<other SimNode>: <SimLink>}
        self.links = {}

        self.num_merges = 0
        self.dead = False
|||
|
|||
|
|||
def kill_nodes(nodes, killed_nodes):
    """Remove ``killed_nodes`` from ``nodes`` and from the survivors' links.

    ``nodes`` and the ``links`` dicts of the remaining nodes are modified in
    place.
    """
    # Remove killed nodes from root list
    for victim in killed_nodes:
        if victim in nodes:
            nodes.remove(victim)

    # Remove killed nodes from links
    for victim in killed_nodes:
        for survivor in nodes:
            survivor.links.pop(victim, None)
|||
|
|||
|
|||
def kill_nodes_above(nodes, above_sim):
    """Kill one node of every linked pair whose similarity >= ``above_sim``.

    Of the two nodes, the linked node is killed when its value is longer than
    the owner's; otherwise the owner is killed. The surviving node's
    ``num_merges`` is incremented, and all killed nodes are finally removed
    from ``nodes`` through ``kill_nodes``.
    """
    doomed = []

    for owner in nodes:
        if owner.dead:
            continue

        Logr.debug(owner.value)

        for neighbour, link in owner.links.items():
            if neighbour.dead:
                continue

            Logr.debug('\t%0.2f -- %s', link.similarity, neighbour.value)

            if link.similarity < above_sim:
                continue

            if len(neighbour.value) > len(owner.value):
                Logr.debug('\t\tvery similar, killed this node')
                victim, survivor = neighbour, owner
            else:
                Logr.debug('\t\tvery similar, killed owner')
                victim, survivor = owner, neighbour

            victim.dead = True
            survivor.num_merges += 1
            doomed.append(victim)

    kill_nodes(nodes, doomed)
|||
|
|||
|
|||
def print_link_tree(nodes):
    """Debug-log every node with its merge count and similarity links."""
    for node in nodes:
        Logr.debug(node.value)
        Logr.debug('\tnum_merges: %s', node.num_merges)

        if not len(node.links):
            continue

        Logr.debug('\t========== LINKS ==========')

        for neighbour, link in node.links.items():
            Logr.debug('\t%0.2f -- %s', link.similarity, neighbour.value)

        Logr.debug('\t---------------------------')
|||
|
|||
|
|||
def kill_trailing_nodes(nodes):
    """Kill linked nodes that only add text to another node's value.

    A linked node is killed when its link stats are marked valid, at least
    half of the owner's characters are equal, the inserted share is below 2,
    and the opcodes contain insertions but no deletions or replacements.
    Killed nodes are removed from ``nodes`` through ``kill_nodes``.
    """
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        node_length = len(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            is_valid = link.stats.get('valid', False)

            opcode_tags = set(opcode[0] for opcode in link.opcodes)
            has_deletions = 'delete' in opcode_tags
            has_insertions = 'insert' in opcode_tags
            has_replacements = 'replace' in opcode_tags

            if node_length:
                equal_perc = link.stats.get('equal', 0) / float(node_length)
                insert_perc = link.stats.get('insert', 0) / float(node_length)
            else:
                # An empty value shares nothing with its neighbour; zero
                # ratios keep it from killing anything instead of raising
                # ZeroDivisionError.
                equal_perc = 0.0
                insert_perc = 0.0

            Logr.debug('\t({0:<24}) [{1:02d}:{2:02d} = {3:02d} {4:3.0f}% {5:3.0f}%] -- {6:<45}'.format(
                'd:%s, i:%s, r:%s' % (has_deletions, has_insertions, has_replacements),
                len(node.value), len(link_node.value), link.stats.get('equal', 0),
                equal_perc * 100, insert_perc * 100,
                '"{0}"'.format(link_node.value)
            ))

            Logr.debug('\t\t%s', link.stats)

            kill = all([
                is_valid,
                equal_perc >= 0.5,
                insert_perc < 2,
                has_insertions,
                not has_deletions,
                not has_replacements
            ])

            if kill:
                Logr.debug('\t\tkilled this node')

                link_node.dead = True
                node.num_merges += 1
                killed_nodes.append(link_node)

    kill_nodes(nodes, killed_nodes)
|||
|
|||
# Debug line layout for one opcode: tag, then the (i1:i2) and (j1:j2) ranges.
stats_print_format = "\t{0:<8} ({1:2d}:{2:2d}) ({3:2d}:{4:2d})"


def get_index_values(iterable, a, b):
    """Return ``(iterable[a], iterable[b])``, using None for falsy indices.

    Note that index ``0`` is falsy and therefore also yields ``None``.
    """
    value_a = iterable[a] if a else None
    value_b = iterable[b] if b else None

    return value_a, value_b
|||
|
|||
|
|||
def get_indices(iterable, a, b):
    """Return ``(a, b)`` with each index outside ``0 < i < len`` set to None.

    Index ``0`` is excluded by the strict lower bound.
    """
    size = len(iterable)

    def _checked(index):
        return index if 0 < index < size else None

    return _checked(a), _checked(b)
|||
|
|||
|
|||
def get_opcode_stats(for_node, node, opcodes):
    """Summarise ``opcodes`` into a stats dict.

    For each opcode tag the dict holds the accumulated span length
    (``i2 - i1``, or ``j2 - j1`` when that is zero). When insert or delete
    opcodes occur, ``'valid'`` records whether every one of them had a
    boundary character that is missing or a space on both its head and tail
    side; once it is False it stays False.
    """
    stats = {}

    for tag, i1, i2, j1, j2 in opcodes:
        Logr.debug(stats_print_format.format(
            tag, i1, i2, j1, j2
        ))

        if tag in ('insert', 'delete'):
            if tag == 'insert':
                ax = get_indices(for_node.value, i1 - 1, i1)
                bx = get_indices(node.value, j1, j2 - 1)
            else:
                ax = get_indices(for_node.value, j1 - 1, j1)
                bx = get_indices(node.value, i1, i2 - 1)

            av = get_index_values(for_node.value, *ax)
            bv = get_index_values(node.value, *bx)

            Logr.debug(
                '\t\t%s %s [%s><%s] <---> %s %s [%s><%s]',
                ax, av, av[0], av[1],
                bx, bv, bv[0], bv[1]
            )

            boundary = [None, ' ']
            head_valid = av[0] in boundary or bv[0] in boundary
            tail_valid = av[1] in boundary or bv[1] in boundary
            valid = head_valid and tail_valid

            # Set on first sight; afterwards only a True may turn into False.
            if 'valid' not in stats or (stats['valid'] and not valid):
                stats['valid'] = valid

            Logr.debug('\t\t' + ('VALID' if valid else 'INVALID'))

        stats[tag] = stats.get(tag, 0) + ((i2 - i1) or (j2 - j1))

    return stats
|||
|
|||
|
|||
def calculate_sim_links(for_node, other_nodes):
    """Create the links between ``for_node`` and each node not yet linked.

    Both directions are stored: ``for_node.links[node]`` with the
    for_node -> node opcodes and ``node.links[for_node]`` with the
    node -> for_node opcodes. The two links share one similarity ratio.
    """
    for node in other_nodes:
        if node in for_node.links:
            continue

        Logr.debug('calculating similarity between "%s" and "%s"', for_node.value, node.value)

        # Get similarity
        similarity = create_matcher(for_node.value, node.value).quick_ratio()

        # Get for_node -> node opcodes
        forward = create_matcher(for_node.value, node.value, swap_longest = False)
        a_opcodes = forward.get_opcodes()
        a_stats = get_opcode_stats(for_node, node, a_opcodes)

        Logr.debug('-' * 100)

        # Get node -> for_node opcodes
        backward = create_matcher(node.value, for_node.value, swap_longest = False)
        b_opcodes = backward.get_opcodes()
        b_stats = get_opcode_stats(for_node, node, b_opcodes)

        for_node.links[node] = SimLink(similarity, a_opcodes, a_stats)
        node.links[for_node] = SimLink(similarity, b_opcodes, b_stats)
@ -0,0 +1,26 @@ |
|||
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com> |
|||
# |
|||
# Licensed under the Apache License, Version 2.0 (the "License"); |
|||
# you may not use this file except in compliance with the License. |
|||
# You may obtain a copy of the License at |
|||
# |
|||
# http://www.apache.org/licenses/LICENSE-2.0 |
|||
# |
|||
# Unless required by applicable law or agreed to in writing, software |
|||
# distributed under the License is distributed on an "AS IS" BASIS, |
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|||
# See the License for the specific language governing permissions and |
|||
# limitations under the License. |
|||
|
|||
|
|||
from qcond.transformers.base import Transformer |
|||
|
|||
|
|||
# Titles equal to one of these words (compared in lower case) are dropped.
COMMON_WORDS = [
    'the',
]


class StripCommonTransformer(Transformer):
    """Remove titles that consist solely of a common word."""

    def run(self, titles):
        """Return ``titles`` without entries listed in ``COMMON_WORDS``."""
        kept = []

        for title in titles:
            if title.lower() in COMMON_WORDS:
                continue

            kept.append(title)

        return kept
Loading…
Reference in new issue