23 changed files with 1122 additions and 374 deletions
@@ -0,0 +1,6 @@
from .main import Matcher

def start():
    return Matcher()

config = []
@@ -0,0 +1,109 @@
from caper import Caper
from couchpotato import CPLog, tryInt
from couchpotato.core.event import addEvent, fireEvent
from couchpotato.core.helpers.encoding import simplifyString
from couchpotato.core.helpers.variable import possibleTitles, dictIsSubset
from couchpotato.core.plugins.base import Plugin

log = CPLog(__name__)


class Matcher(Plugin):
    def __init__(self):
        self.caper = Caper()

        addEvent('matcher.parse', self.parse)
        addEvent('matcher.best', self.best)

        addEvent('matcher.correct_title', self.correctTitle)
        addEvent('matcher.correct_identifier', self.correctIdentifier)
        addEvent('matcher.correct_quality', self.correctQuality)

    def parse(self, release):
        return self.caper.parse(release['name'])

    def best(self, release, media, quality):
        rel_info = fireEvent('matcher.parse', release, single = True)

        if len(rel_info.chains) < 1:
            log.info2('Wrong: %s, unable to parse release name (no chains)', release['name'])
            return False

        for chain in rel_info.chains:
            if fireEvent('searcher.correct_match', chain, release, media, quality, single = True):
                return chain

        return None

    def chainMatch(self, chain, group, tags):
        found_tags = []

        for match in chain.info[group]:
            for ck, cv in match.items():
                if ck in tags and simplifyString(cv) in tags[ck]:
                    found_tags.append(ck)

        if set(tags.keys()) == set(found_tags):
            return True

        return set([key for key, value in tags.items() if None not in value]) == set(found_tags)

    def correctIdentifier(self, chain, media):
        required_id = fireEvent('searcher.get_media_identifier', media['library'], single = True)

        if 'identifier' not in chain.info:
            return False

        # TODO could be handled better?
        if len(chain.info['identifier']) != 1:
            return False
        identifier = chain.info['identifier'][0]

        # TODO air by date episodes

        # TODO this should support identifiers with characters 'a', 'b', etc..
        for k, v in identifier.items():
            identifier[k] = tryInt(v, None)

        if not dictIsSubset(required_id, identifier):
            log.info2('Wrong: required identifier %s does not match release identifier %s', (str(required_id), str(identifier)))
            return False

        return True

    def correctTitle(self, chain, media):
        root_library = fireEvent('searcher.get_media_root', media['library'], single = True)

        if 'show_name' not in chain.info or not len(chain.info['show_name']):
            log.info('Wrong: missing show name in parsed result')
            return False

        chain_words = [x.lower() for x in chain.info['show_name']]

        # Check show titles match
        # TODO check xem names
        for title in root_library['info']['titles']:
            for valid_words in [x.split(' ') for x in possibleTitles(title)]:

                if valid_words == chain_words:
                    return True

        return False

    def correctQuality(self, chain, quality, quality_map):
        if quality['identifier'] not in quality_map:
            log.info2('Wrong: unknown preferred quality %s', quality['identifier'])
            return False

        if 'video' not in chain.info:
            log.info2('Wrong: no video tags found')
            return False

        video_tags = quality_map[quality['identifier']]

        if not self.chainMatch(chain, 'video', video_tags):
            log.info2('Wrong: %s tags not in chain', video_tags)
            return False

        return True
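
Sketch (not part of the diff): inside a running CouchPotato instance the plugin above is reached through the event bus; the release name below is made up purely for illustration.

    from couchpotato.core.event import fireEvent

    # Hypothetical release dict; matcher.parse only needs 'name'
    release = {'name': 'Show.Name.S01E02.720p.HDTV.x264-GROUP'}
    rel_info = fireEvent('matcher.parse', release, single = True)
    print(len(rel_info.chains))  # number of parse chains Caper produced
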
@@ -0,0 +1,42 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from qcond.transformers.merge import MergeTransformer
from qcond.transformers.slice import SliceTransformer
from qcond.transformers.strip_common import StripCommonTransformer


__version_info__ = ('0', '1', '0')
__version_branch__ = 'master'

__version__ = "%s%s" % (
    '.'.join(__version_info__),
    '-' + __version_branch__ if __version_branch__ else ''
)


class QueryCondenser(object):
    def __init__(self):
        self.transformers = [
            MergeTransformer(),
            SliceTransformer(),
            StripCommonTransformer()
        ]

    def distinct(self, titles):
        for transformer in self.transformers:
            titles = transformer.run(titles)

        return titles
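
Sketch (not part of the diff, assuming the qcond package and its logr dependency are importable): QueryCondenser simply chains the three transformers to reduce alternative titles to distinct search queries. The titles here are made up.

    from qcond import QueryCondenser

    qc = QueryCondenser()
    # Condense alternative titles for the same show into distinct queries
    print(qc.distinct(['The Office', 'The Office (US)', 'The Office (UK)']))
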
@@ -0,0 +1,23 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import sys

PY3 = sys.version_info[0] == 3

if PY3:
    xrange = range
else:
    xrange = xrange
@@ -0,0 +1,84 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from difflib import SequenceMatcher
import re
import sys
from logr import Logr
from qcond.compat import xrange


PY3 = sys.version_info[0] == 3


def simplify(s):
    s = s.lower()
    s = re.sub(r"(\w)'(\w)", r"\1\2", s)
    return s


def strip(s):
    return re.sub(r"^(\W*)(.*?)(\W*)$", r"\2", s)


def create_matcher(a, b, swap_longest = True, case_sensitive = False):
    # Ensure longest string is a
    if swap_longest and len(b) > len(a):
        a_ = a
        a = b
        b = a_

    if not case_sensitive:
        a = a.upper()
        b = b.upper()

    return SequenceMatcher(None, a, b)


def first(function_or_none, sequence):
    if PY3:
        for item in filter(function_or_none, sequence):
            return item
    else:
        result = filter(function_or_none, sequence)
        if len(result):
            return result[0]

    return None


def sorted_append(sequence, item, func):
    if not len(sequence):
        sequence.insert(0, item)
        return

    x = 0
    for x in xrange(len(sequence)):
        if func(sequence[x]):
            sequence.insert(x, item)
            return

    sequence.append(item)


def itemsMatch(L1, L2):
    return len(L1) == len(L2) and sorted(L1) == sorted(L2)


def distinct(sequence):
    result = []

    for item in sequence:
        if item not in result:
            result.append(item)

    return result
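
Sketch (not part of the diff) of how these helpers behave, assuming qcond and its logr dependency are importable; the values are illustrative.

    from qcond.helpers import simplify, distinct, first, sorted_append

    print(simplify("Don't Trust the B"))           # "dont trust the b"
    print(distinct(['a', 'b', 'a']))               # ['a', 'b']
    print(first(lambda x: x > 2, [1, 2, 3, 4]))    # 3

    items = [5, 3, 1]
    # Insert 4 before the first element the predicate accepts (first value < 4)
    sorted_append(items, 4, lambda existing: existing < 4)
    print(items)                                   # [5, 4, 3, 1]
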
@@ -0,0 +1,21 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


class Transformer(object):
    def __init__(self):
        pass

    def run(self, titles):
        raise NotImplementedError()
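
Sketch (not part of the diff): a new pass only needs to subclass Transformer and implement run(); the lowercasing transformer below is purely illustrative.

    from qcond.transformers.base import Transformer

    class LowercaseTransformer(Transformer):
        def run(self, titles):
            # Return a new list of titles rather than mutating the input
            return [title.lower() for title in titles]

    print(LowercaseTransformer().run(['The Office', 'Parks and Recreation']))
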
@@ -0,0 +1,238 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from operator import itemgetter
from logr import Logr
from qcond.helpers import simplify, strip, first, sorted_append, distinct
from qcond.transformers.base import Transformer
from qcond.compat import xrange


class MergeTransformer(Transformer):
    def __init__(self):
        super(MergeTransformer, self).__init__()

    def run(self, titles):
        titles = distinct([simplify(title) for title in titles])

        Logr.info(str(titles))

        Logr.debug("------------------------------------------------------------")

        root, tails = self.parse(titles)

        Logr.debug("--------------------------PARSE-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------MERGE-----------------------------")

        self.merge(root)

        Logr.debug("--------------------------FINAL-----------------------------")

        for node in root:
            print_tree(node)

        Logr.debug("--------------------------RESULT-----------------------------")

        scores = {}
        results = []

        for tail in tails:
            score, value, original_value = tail.full_value()

            if value in scores:
                scores[value] += score
            else:
                results.append((value, original_value))
                scores[value] = score

            Logr.debug("%s %s %s", score, value, original_value)

        sorted_results = sorted(results, key=lambda item: (scores[item[0]], item[1]), reverse = True)

        return [result[0] for result in sorted_results]

    def parse(self, titles):
        root = []
        tails = []

        for title in titles:
            Logr.debug(title)

            cur = None
            words = title.split(' ')

            for wx in xrange(len(words)):
                word = strip(words[wx])

                if cur is None:
                    cur = find_node(root, word)

                    if cur is None:
                        cur = DNode(word, None, num_children=len(words) - wx, original_value=title)
                        root.append(cur)
                else:
                    parent = cur
                    parent.weight += 1

                    cur = find_node(parent.right, word)

                    if cur is None:
                        Logr.debug("%s %d", word, len(words) - wx)
                        cur = DNode(word, parent, num_children=len(words) - wx)
                        sorted_append(parent.right, cur, lambda a: a.num_children < cur.num_children)
                    else:
                        cur.weight += 1

            tails.append(cur)

        return root, tails

    def merge(self, root):
        for x in range(len(root)):
            Logr.debug(root[x])
            root[x].right = self._merge(root[x].right)
            Logr.debug('=================================================================')

        return root

    def get_nodes_right(self, value):
        if type(value) is not list:
            value = [value]

        nodes = []

        for node in value:
            nodes.append(node)

            for child in self.get_nodes_right(node.right):
                nodes.append(child)

        return nodes

    def destroy_nodes_right(self, value):
        nodes = self.get_nodes_right(value)

        for node in nodes:
            node.value = None
            node.dead = True

    def _merge(self, nodes, depth = 0):
        Logr.debug(str('\t' * depth) + str(nodes))

        top = nodes[0]

        # Merge into top
        for x in range(len(nodes)):
            # Merge extra results into top
            if x > 0:
                top.value = None
                top.weight += nodes[x].weight
                self.destroy_nodes_right(top.right)

                if len(nodes[x].right):
                    top.join_right(nodes[x].right)

                Logr.debug("= %s joined %s", nodes[x], top)

                nodes[x].dead = True

        nodes = [n for n in nodes if not n.dead]

        # Traverse further
        for node in nodes:
            if len(node.right):
                node.right = self._merge(node.right, depth + 1)

        return nodes


def print_tree(node, depth = 0):
    Logr.debug(str('\t' * depth) + str(node))

    if len(node.right):
        for child in node.right:
            print_tree(child, depth + 1)
    else:
        Logr.debug(node.full_value()[1])


def find_node(node_list, value):
    # Try find adjacent node match
    for node in node_list:
        if node.value == value:
            return node

    return None


class DNode(object):
    def __init__(self, value, parent, right=None, weight=1, num_children=None, original_value=None):
        self.value = value

        self.parent = parent

        if right is None:
            right = []
        self.right = right

        self.weight = weight

        self.original_value = original_value
        self.num_children = num_children

        self.dead = False

    def join_right(self, nodes):
        for node in nodes:
            duplicate = first(lambda x: x.value == node.value, self.right)

            if duplicate:
                duplicate.weight += node.weight
                duplicate.join_right(node.right)
            else:
                node.parent = self
                self.right.append(node)

    def full_value(self):
        words = []
        total_score = 0

        cur = self
        root = None

        while cur is not None:
            if cur.value and not cur.dead:
                words.insert(0, cur.value)
                total_score += cur.weight

            if cur.parent is None:
                root = cur
            cur = cur.parent

        return float(total_score) / len(words), ' '.join(words), root.original_value if root else None

    def __repr__(self):
        return '<%s value:"%s", weight: %s, num_children: %s%s%s>' % (
            'DNode',
            self.value,
            self.weight,
            self.num_children,
            (', original_value: %s' % self.original_value) if self.original_value else '',
            ' REMOVING' if self.dead else ''
        )
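
Sketch (not part of the diff): the merge pass can be run on its own; it builds the word tree above, merges sibling branches into the heaviest node and scores the tails by weight. The titles are made up and the result ordering depends on those weights.

    from qcond.transformers.merge import MergeTransformer

    print(MergeTransformer().run(['The Office', 'The Office US', 'The Office UK']))
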
@@ -0,0 +1,280 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from logr import Logr
from qcond.helpers import create_matcher
from qcond.transformers.base import Transformer


class SliceTransformer(Transformer):
    def __init__(self):
        super(SliceTransformer, self).__init__()

    def run(self, titles):
        nodes = []

        # Create a node for each title
        for title in titles:
            nodes.append(SimNode(title))

        # Calculate similarities between nodes
        for node in nodes:
            calculate_sim_links(node, [n for n in nodes if n != node])

        kill_nodes_above(nodes, 0.90)

        Logr.debug('---------------------------------------------------------------------')

        print_link_tree(nodes)
        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        Logr.debug('---------------------------------------------------------------------')

        kill_trailing_nodes(nodes)

        Logr.debug('---------------------------------------------------------------------')

        # Sort remaining nodes by 'num_merges'
        nodes = sorted(nodes, key=lambda n: n.num_merges, reverse=True)

        print_link_tree(nodes)

        Logr.debug('---------------------------------------------------------------------')

        Logr.debug('%s %s', len(nodes), [n.value for n in nodes])

        return [n.value for n in nodes]


class SimLink(object):
    def __init__(self, similarity, opcodes, stats):
        self.similarity = similarity
        self.opcodes = opcodes
        self.stats = stats


class SimNode(object):
    def __init__(self, value):
        self.value = value

        self.dead = False
        self.num_merges = 0

        self.links = {}  # {<other SimNode>: <SimLink>}


def kill_nodes(nodes, killed_nodes):
    # Remove killed nodes from root list
    for node in killed_nodes:
        if node in nodes:
            nodes.remove(node)

    # Remove killed nodes from links
    for killed_node in killed_nodes:
        for node in nodes:
            if killed_node in node.links:
                node.links.pop(killed_node)


def kill_nodes_above(nodes, above_sim):
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

            if link.similarity >= above_sim:
                if len(link_node.value) > len(node.value):
                    Logr.debug('\t\tvery similar, killed this node')
                    link_node.dead = True
                    node.num_merges += 1
                    killed_nodes.append(link_node)
                else:
                    Logr.debug('\t\tvery similar, killed owner')
                    node.dead = True
                    link_node.num_merges += 1
                    killed_nodes.append(node)

    kill_nodes(nodes, killed_nodes)


def print_link_tree(nodes):
    for node in nodes:
        Logr.debug(node.value)
        Logr.debug('\tnum_merges: %s', node.num_merges)

        if len(node.links):
            Logr.debug('\t========== LINKS ==========')
            for link_node, link in node.links.items():
                Logr.debug('\t%0.2f -- %s', link.similarity, link_node.value)

            Logr.debug('\t---------------------------')


def kill_trailing_nodes(nodes):
    killed_nodes = []

    for node in nodes:
        if node.dead:
            continue

        Logr.debug(node.value)

        for link_node, link in node.links.items():
            if link_node.dead:
                continue

            is_valid = link.stats.get('valid', False)

            has_deletions = False
            has_insertions = False
            has_replacements = False

            for opcode in link.opcodes:
                if opcode[0] == 'delete':
                    has_deletions = True
                if opcode[0] == 'insert':
                    has_insertions = True
                if opcode[0] == 'replace':
                    has_replacements = True

            equal_perc = link.stats.get('equal', 0) / float(len(node.value))
            insert_perc = link.stats.get('insert', 0) / float(len(node.value))

            Logr.debug('\t({0:<24}) [{1:02d}:{2:02d} = {3:02d} {4:3.0f}% {5:3.0f}%] -- {6:<45}'.format(
                'd:%s, i:%s, r:%s' % (has_deletions, has_insertions, has_replacements),
                len(node.value), len(link_node.value), link.stats.get('equal', 0),
                equal_perc * 100, insert_perc * 100,
                '"{0}"'.format(link_node.value)
            ))

            Logr.debug('\t\t%s', link.stats)

            kill = all([
                is_valid,
                equal_perc >= 0.5,
                insert_perc < 2,
                has_insertions,
                not has_deletions,
                not has_replacements
            ])

            if kill:
                Logr.debug('\t\tkilled this node')

                link_node.dead = True
                node.num_merges += 1
                killed_nodes.append(link_node)

    kill_nodes(nodes, killed_nodes)


stats_print_format = "\t{0:<8} ({1:2d}:{2:2d}) ({3:2d}:{4:2d})"


def get_index_values(iterable, a, b):
    return (
        iterable[a] if a else None,
        iterable[b] if b else None
    )


def get_indices(iterable, a, b):
    return (
        a if 0 < a < len(iterable) else None,
        b if 0 < b < len(iterable) else None
    )


def get_opcode_stats(for_node, node, opcodes):
    stats = {}

    for tag, i1, i2, j1, j2 in opcodes:
        Logr.debug(stats_print_format.format(
            tag, i1, i2, j1, j2
        ))

        if tag in ['insert', 'delete']:
            ax = None, None
            bx = None, None

            if tag == 'insert':
                ax = get_indices(for_node.value, i1 - 1, i1)
                bx = get_indices(node.value, j1, j2 - 1)

            if tag == 'delete':
                ax = get_indices(for_node.value, j1 - 1, j1)
                bx = get_indices(node.value, i1, i2 - 1)

            av = get_index_values(for_node.value, *ax)
            bv = get_index_values(node.value, *bx)

            Logr.debug(
                '\t\t%s %s [%s><%s] <---> %s %s [%s><%s]',
                ax, av, av[0], av[1],
                bx, bv, bv[0], bv[1]
            )

            head_valid = av[0] in [None, ' '] or bv[0] in [None, ' ']
            tail_valid = av[1] in [None, ' '] or bv[1] in [None, ' ']
            valid = head_valid and tail_valid

            if 'valid' not in stats or (stats['valid'] and not valid):
                stats['valid'] = valid

            Logr.debug('\t\t' + ('VALID' if valid else 'INVALID'))

        if tag not in stats:
            stats[tag] = 0

        stats[tag] += (i2 - i1) or (j2 - j1)

    return stats


def calculate_sim_links(for_node, other_nodes):
    for node in other_nodes:
        if node in for_node.links:
            continue

        Logr.debug('calculating similarity between "%s" and "%s"', for_node.value, node.value)

        # Get similarity
        similarity_matcher = create_matcher(for_node.value, node.value)
        similarity = similarity_matcher.quick_ratio()

        # Get for_node -> node opcodes
        a_opcodes_matcher = create_matcher(for_node.value, node.value, swap_longest = False)
        a_opcodes = a_opcodes_matcher.get_opcodes()
        a_stats = get_opcode_stats(for_node, node, a_opcodes)

        Logr.debug('-' * 100)

        # Get node -> for_node opcodes
        b_opcodes_matcher = create_matcher(node.value, for_node.value, swap_longest = False)
        b_opcodes = b_opcodes_matcher.get_opcodes()
        b_stats = get_opcode_stats(for_node, node, b_opcodes)

        for_node.links[node] = SimLink(similarity, a_opcodes, a_stats)
        node.links[for_node] = SimLink(similarity, b_opcodes, b_stats)

        #raw_input('Press ENTER to continue')
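
Sketch (not part of the diff): the slice pass can also be run standalone; it links every pair of titles with SequenceMatcher similarity and opcode stats, then drops near-duplicates according to the thresholds above. Titles are made up and the exact output depends on those thresholds.

    from qcond.transformers.slice import SliceTransformer

    print(SliceTransformer().run(['The Office', 'The Office (US)', 'The Office (UK)']))
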
@@ -0,0 +1,26 @@
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from qcond.transformers.base import Transformer


COMMON_WORDS = [
    'the'
]


class StripCommonTransformer(Transformer):
    def run(self, titles):
        return [title for title in titles if title.lower() not in COMMON_WORDS]
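
Sketch (not part of the diff, assuming qcond is importable): the strip-common pass only removes titles that consist entirely of a common word.

    from qcond.transformers.strip_common import StripCommonTransformer

    print(StripCommonTransformer().run(['The', 'The Office']))  # ['The Office']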