You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
241 lines
6.7 KiB
241 lines
6.7 KiB
# Copyright 2013 Dean Gardiner <gardiner91@gmail.com>
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
|
|
from operator import itemgetter
|
|
from logr import Logr
|
|
from qcond.helpers import simplify, strip, first, sorted_append, distinct
|
|
from qcond.transformers.base import Transformer
|
|
from qcond.compat import xrange
|
|
|
|
|
|
class MergeTransformer(Transformer):
|
|
def __init__(self):
|
|
super(MergeTransformer, self).__init__()
|
|
|
|
def run(self, titles):
|
|
titles = distinct([simplify(title) for title in titles])
|
|
|
|
Logr.info(str(titles))
|
|
|
|
Logr.debug("------------------------------------------------------------")
|
|
|
|
root, tails = self.parse(titles)
|
|
|
|
Logr.debug("--------------------------PARSE-----------------------------")
|
|
|
|
for node in root:
|
|
print_tree(node)
|
|
|
|
Logr.debug("--------------------------MERGE-----------------------------")
|
|
|
|
self.merge(root)
|
|
|
|
Logr.debug("--------------------------FINAL-----------------------------")
|
|
|
|
for node in root:
|
|
print_tree(node)
|
|
|
|
Logr.debug("--------------------------RESULT-----------------------------")
|
|
|
|
scores = {}
|
|
results = []
|
|
|
|
for tail in tails:
|
|
score, value, original_value = tail.full_value()
|
|
|
|
if value in scores:
|
|
scores[value] += score
|
|
else:
|
|
results.append((value, original_value))
|
|
scores[value] = score
|
|
|
|
Logr.debug("%s %s %s", score, value, original_value)
|
|
|
|
sorted_results = sorted(results, key=lambda item: (scores[item[0]], item[1]), reverse = True)
|
|
|
|
return [result[0] for result in sorted_results]
|
|
|
|
def parse(self, titles):
|
|
root = []
|
|
tails = []
|
|
|
|
for title in titles:
|
|
Logr.debug(title)
|
|
|
|
cur = None
|
|
words = title.split(' ')
|
|
|
|
for wx in xrange(len(words)):
|
|
word = strip(words[wx])
|
|
|
|
if cur is None:
|
|
cur = find_node(root, word)
|
|
|
|
if cur is None:
|
|
cur = DNode(word, None, num_children=len(words) - wx, original_value=title)
|
|
root.append(cur)
|
|
else:
|
|
parent = cur
|
|
parent.weight += 1
|
|
|
|
cur = find_node(parent.right, word)
|
|
|
|
if cur is None:
|
|
Logr.debug("%s %d", word, len(words) - wx)
|
|
cur = DNode(word, parent, num_children=len(words) - wx)
|
|
sorted_append(parent.right, cur, lambda a: a.num_children < cur.num_children)
|
|
else:
|
|
cur.weight += 1
|
|
|
|
tails.append(cur)
|
|
|
|
return root, tails
|
|
|
|
def merge(self, root):
|
|
for x in range(len(root)):
|
|
Logr.debug(root[x])
|
|
root[x].right = self._merge(root[x].right)
|
|
Logr.debug('=================================================================')
|
|
|
|
return root
|
|
|
|
def get_nodes_right(self, value):
|
|
if type(value) is not list:
|
|
value = [value]
|
|
|
|
nodes = []
|
|
|
|
for node in value:
|
|
nodes.append(node)
|
|
|
|
for child in self.get_nodes_right(node.right):
|
|
nodes.append(child)
|
|
|
|
return nodes
|
|
|
|
def destroy_nodes_right(self, value):
|
|
nodes = self.get_nodes_right(value)
|
|
|
|
for node in nodes:
|
|
node.value = None
|
|
node.dead = True
|
|
|
|
def _merge(self, nodes, depth = 0):
|
|
Logr.debug(str('\t' * depth) + str(nodes))
|
|
|
|
if not len(nodes):
|
|
return []
|
|
|
|
top = nodes[0]
|
|
|
|
# Merge into top
|
|
for x in range(len(nodes)):
|
|
# Merge extra results into top
|
|
if x > 0:
|
|
top.value = None
|
|
top.weight += nodes[x].weight
|
|
self.destroy_nodes_right(top.right)
|
|
|
|
if len(nodes[x].right):
|
|
top.join_right(nodes[x].right)
|
|
|
|
Logr.debug("= %s joined %s", nodes[x], top)
|
|
|
|
nodes[x].dead = True
|
|
|
|
nodes = [n for n in nodes if not n.dead]
|
|
|
|
# Traverse further
|
|
for node in nodes:
|
|
if len(node.right):
|
|
node.right = self._merge(node.right, depth + 1)
|
|
|
|
return nodes
|
|
|
|
|
|
def print_tree(node, depth = 0):
|
|
Logr.debug(str('\t' * depth) + str(node))
|
|
|
|
if len(node.right):
|
|
for child in node.right:
|
|
print_tree(child, depth + 1)
|
|
else:
|
|
Logr.debug(node.full_value()[1])
|
|
|
|
|
|
def find_node(node_list, value):
|
|
# Try find adjacent node match
|
|
for node in node_list:
|
|
if node.value == value:
|
|
return node
|
|
|
|
return None
|
|
|
|
|
|
class DNode(object):
|
|
def __init__(self, value, parent, right=None, weight=1, num_children=None, original_value=None):
|
|
self.value = value
|
|
|
|
self.parent = parent
|
|
|
|
if right is None:
|
|
right = []
|
|
self.right = right
|
|
|
|
self.weight = weight
|
|
|
|
self.original_value = original_value
|
|
self.num_children = num_children
|
|
|
|
self.dead = False
|
|
|
|
def join_right(self, nodes):
|
|
for node in nodes:
|
|
duplicate = first(lambda x: x.value == node.value, self.right)
|
|
|
|
if duplicate:
|
|
duplicate.weight += node.weight
|
|
duplicate.join_right(node.right)
|
|
else:
|
|
node.parent = self
|
|
self.right.append(node)
|
|
|
|
def full_value(self):
|
|
words = []
|
|
total_score = 0
|
|
|
|
cur = self
|
|
root = None
|
|
|
|
while cur is not None:
|
|
if cur.value and not cur.dead:
|
|
words.insert(0, cur.value)
|
|
total_score += cur.weight
|
|
|
|
if cur.parent is None:
|
|
root = cur
|
|
cur = cur.parent
|
|
|
|
return float(total_score) / len(words), ' '.join(words), root.original_value if root else None
|
|
|
|
def __repr__(self):
|
|
return '<%s value:"%s", weight: %s, num_children: %s%s%s>' % (
|
|
'DNode',
|
|
self.value,
|
|
self.weight,
|
|
self.num_children,
|
|
(', original_value: %s' % self.original_value) if self.original_value else '',
|
|
' REMOVING' if self.dead else ''
|
|
)
|
|
|