#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2011-2013 Codernity (http://codernity.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from index import Index, IndexException, DocIdNotFound, ElemNotFound
import struct
import marshal
import os
import io
import shutil
from storage import IU_Storage
# from ipdb import set_trace

from CodernityDB.env import cdb_environment
from CodernityDB.index import TryReindexException

if cdb_environment.get('rlock_obj'):
    from CodernityDB import patch
    patch.patch_cache_rr(cdb_environment['rlock_obj'])

from CodernityDB.rr_cache import cache1lvl, cache2lvl

tree_buffer_size = io.DEFAULT_BUFFER_SIZE

cdb_environment['tree_buffer_size'] = tree_buffer_size


MODE_FIRST = 0
MODE_LAST = 1

MOVE_BUFFER_PREV = 0
MOVE_BUFFER_NEXT = 1
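
# MODE_FIRST / MODE_LAST select whether a search should land on the first or
# the last occurrence of a key; MOVE_BUFFER_PREV / MOVE_BUFFER_NEXT tell the
# binary-search helpers to slide their assumed read-buffer window backwards or
# forwards when the next probed record falls outside of it. The window size
# mirrors io.DEFAULT_BUFFER_SIZE, presumably so that probes staying inside one
# buffered block avoid another read from disk.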


class NodeCapacityException(IndexException):
    pass


class IU_TreeBasedIndex(Index):

    custom_header = 'from CodernityDB.tree_index import TreeBasedIndex'

    def __init__(self, db_path, name, key_format='32s', pointer_format='I',
                 meta_format='32sIIc', node_capacity=10, storage_class=None):
        if node_capacity < 3:
            raise NodeCapacityException
        super(IU_TreeBasedIndex, self).__init__(db_path, name)
        self.data_start = self._start_ind + 1
        self.node_capacity = node_capacity
        self.flag_format = 'c'
        self.elements_counter_format = 'h'
        self.pointer_format = pointer_format
        self.key_format = key_format
        self.meta_format = meta_format
        self._count_props()
        if not storage_class:
            storage_class = IU_Storage
        if storage_class and not isinstance(storage_class, basestring):
            storage_class = storage_class.__name__
        self.storage_class = storage_class
        self.storage = None
        cache = cache1lvl(100)
        twolvl_cache = cache2lvl(150)
        self._find_key = cache(self._find_key)
        self._match_doc_id = cache(self._match_doc_id)
        # self._read_single_leaf_record = \
        #     twolvl_cache(self._read_single_leaf_record)
        self._find_key_in_leaf = twolvl_cache(self._find_key_in_leaf)
        self._read_single_node_key = twolvl_cache(self._read_single_node_key)
        self._find_first_key_occurence_in_node = twolvl_cache(
            self._find_first_key_occurence_in_node)
        self._find_last_key_occurence_in_node = twolvl_cache(
            self._find_last_key_occurence_in_node)
        self._read_leaf_nr_of_elements = cache(self._read_leaf_nr_of_elements)
        self._read_leaf_neighbours = cache(self._read_leaf_neighbours)
        self._read_leaf_nr_of_elements_and_neighbours = cache(
            self._read_leaf_nr_of_elements_and_neighbours)
        self._read_node_nr_of_elements_and_children_flag = cache(
            self._read_node_nr_of_elements_and_children_flag)
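
    # The hot read paths above are memoized through the rr_cache wrappers
    # (cache1lvl / cache2lvl); every write path below invalidates the affected
    # entries via the wrappers' .delete() hooks, so cached reads never go
    # stale after an update.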

    def _count_props(self):
        """
        Computes dynamic properties of the tree, such as all compound
        struct formats and their sizes.
        """
        self.single_leaf_record_format = self.key_format + self.meta_format
        self.single_node_record_format = self.pointer_format + \
            self.key_format + self.pointer_format
        self.node_format = self.elements_counter_format + self.flag_format\
            + self.pointer_format + (self.key_format +
                                     self.pointer_format) * self.node_capacity
        self.leaf_format = self.elements_counter_format + self.pointer_format * 2\
            + self.single_leaf_record_format * self.node_capacity
        self.leaf_heading_format = self.elements_counter_format + \
            self.pointer_format * 2
        self.node_heading_format = self.elements_counter_format + \
            self.flag_format
        self.key_size = struct.calcsize('<' + self.key_format)
        self.meta_size = struct.calcsize('<' + self.meta_format)
        self.single_leaf_record_size = struct.calcsize(
            '<' + self.single_leaf_record_format)
        self.single_node_record_size = struct.calcsize(
            '<' + self.single_node_record_format)
        self.node_size = struct.calcsize('<' + self.node_format)
        self.leaf_size = struct.calcsize('<' + self.leaf_format)
        self.flag_size = struct.calcsize('<' + self.flag_format)
        self.elements_counter_size = struct.calcsize(
            '<' + self.elements_counter_format)
        self.pointer_size = struct.calcsize('<' + self.pointer_format)
        self.leaf_heading_size = struct.calcsize(
            '<' + self.leaf_heading_format)
        self.node_heading_size = struct.calcsize(
            '<' + self.node_heading_format)
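
    # Worked example with the default formats (the '<' prefix makes struct use
    # little-endian with no padding): key '32s' is 32 bytes and meta '32sIIc'
    # is 41, so a single leaf record is 73 bytes; the leaf heading 'hII' is 10
    # bytes, giving a 10 + 10 * 73 = 740 byte leaf at node_capacity=10. A node
    # heading 'hc' is 3 bytes and a node is 3 + 4 + 10 * (32 + 4) = 367 bytes.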

    def create_index(self):
        if os.path.isfile(os.path.join(self.db_path, self.name + '_buck')):
            raise IndexException('Already exists')
        with io.open(os.path.join(self.db_path, self.name + "_buck"), 'w+b') as f:
            props = dict(name=self.name,
                         flag_format=self.flag_format,
                         pointer_format=self.pointer_format,
                         elements_counter_format=self.elements_counter_format,
                         node_capacity=self.node_capacity,
                         key_format=self.key_format,
                         meta_format=self.meta_format,
                         version=self.__version__,
                         storage_class=self.storage_class)
            f.write(marshal.dumps(props))
        self.buckets = io.open(os.path.join(self.db_path, self.name +
                                            "_buck"), 'r+b', buffering=0)
        self._create_storage()
        self.buckets.seek(self._start_ind)
        self.buckets.write(struct.pack('<c', 'l'))
        self._insert_empty_root()
        self.root_flag = 'l'

    def destroy(self):
        super(IU_TreeBasedIndex, self).destroy()
        self._clear_cache()

    def open_index(self):
        if not os.path.isfile(os.path.join(self.db_path, self.name + '_buck')):
            raise IndexException("Doesn't exist")
        self.buckets = io.open(
            os.path.join(self.db_path, self.name + "_buck"), 'r+b', buffering=0)
        self.buckets.seek(self._start_ind)
        self.root_flag = struct.unpack('<c', self.buckets.read(1))[0]
        self._fix_params()
        self._open_storage()

    def _insert_empty_root(self):
        self.buckets.seek(self.data_start)
        root = struct.pack('<' + self.leaf_heading_format,
                           0,
                           0,
                           0)
        root += self.single_leaf_record_size * self.node_capacity * '\x00'
        self.buckets.write(root)
        self.flush()
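
    # The empty root is a leaf whose element counter and both neighbour
    # pointers are 0, followed by node_capacity zeroed record slots. A
    # prev/next pointer of 0 doubles as the "no neighbour" marker in this
    # file, which works because no leaf can ever start at file offset 0.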

    def insert(self, doc_id, key, start, size, status='o'):
        nodes_stack, indexes = self._find_leaf_to_insert(key)
        self._insert_new_record_into_leaf(nodes_stack.pop(),
                                          key,
                                          doc_id,
                                          start,
                                          size,
                                          status,
                                          nodes_stack,
                                          indexes)
        self._match_doc_id.delete(doc_id)
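
    # A hypothetical call, assuming the default '32s' key format and a record
    # already written to storage at byte offset `start` with length `size`:
    #     index.insert(doc_id, key.ljust(32, '\x00'), start, size)
    # The status byte defaults to 'o' (live); deletes flip it to 'd' in place
    # rather than physically removing the record.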

    def _read_leaf_nr_of_elements_and_neighbours(self, leaf_start):
        self.buckets.seek(leaf_start)
        data = self.buckets.read(
            self.elements_counter_size + 2 * self.pointer_size)
        nr_of_elements, prev_l, next_l = struct.unpack(
            '<' + self.elements_counter_format + 2 * self.pointer_format,
            data)
        return nr_of_elements, prev_l, next_l

    def _read_node_nr_of_elements_and_children_flag(self, start):
        self.buckets.seek(start)
        data = self.buckets.read(self.elements_counter_size + self.flag_size)
        nr_of_elements, children_flag = struct.unpack(
            '<' + self.elements_counter_format + self.flag_format,
            data)
        return nr_of_elements, children_flag

    def _read_leaf_nr_of_elements(self, start):
        self.buckets.seek(start)
        data = self.buckets.read(self.elements_counter_size)
        nr_of_elements = struct.unpack(
            '<' + self.elements_counter_format, data)
        return nr_of_elements[0]

    def _read_single_node_key(self, node_start, key_index):
        self.buckets.seek(self._calculate_key_position(
            node_start, key_index, 'n'))
        data = self.buckets.read(self.single_node_record_size)
        flag_left, key, pointer_right = struct.unpack(
            '<' + self.single_node_record_format, data)
        return flag_left, key, pointer_right

    def _read_single_leaf_record(self, leaf_start, key_index):
        self.buckets.seek(self._calculate_key_position(
            leaf_start, key_index, 'l'))
        data = self.buckets.read(self.single_leaf_record_size)
        key, doc_id, start, size, status = struct.unpack(
            '<' + self.single_leaf_record_format, data)
        return key, doc_id, start, size, status
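
    # A leaf record unpacks to (key, doc_id, start, size, status): the indexed
    # key, the owning document id, the record's offset and length (its
    # location in the storage file), and a one-byte status flag
    # ('o' = live, 'd' = deleted).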

    def _calculate_key_position(self, start, key_index, flag):
        """
        Calculates the position of a key in the buckets file.
        """
        if flag == 'l':
            return start + self.leaf_heading_size + key_index * self.single_leaf_record_size
        elif flag == 'n':
            # returns the start position of the flag before key[key_index]
            return start + self.node_heading_size + key_index * (self.pointer_size + self.key_size)
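
    # Worked example with the default formats: record i of a leaf starts at
    # leaf_start + 10 + i * 73, while (pointer, key) pair i of a node starts
    # at node_start + 3 + i * 36. The node variant deliberately points at the
    # left child pointer preceding key[key_index], not at the key itself.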

    def _match_doc_id(self, doc_id, key, element_index, leaf_start, nr_of_elements):
        curr_key_index = element_index + 1
        curr_leaf_start = leaf_start
        next_leaf = self._read_leaf_neighbours(leaf_start)[1]
        while True:
            if curr_key_index < nr_of_elements:
                curr_key, curr_doc_id, curr_start, curr_size,\
                    curr_status = self._read_single_leaf_record(
                        curr_leaf_start, curr_key_index)
                if key != curr_key:
                    # shouldn't happen, it crashes earlier on the id index
                    raise DocIdNotFound
                elif doc_id == curr_doc_id and curr_status != 'd':
                    return curr_leaf_start, nr_of_elements, curr_key_index
                else:
                    curr_key_index += 1
            else:  # no more elements in the current leaf, jump to the next one
                if not next_leaf:  # end of the leaf linked list
                    # shouldn't happen, it crashes earlier on the id index
                    raise DocIdNotFound
                else:
                    curr_leaf_start = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                    curr_key_index = 0

    def _find_existing(self, key, element_index, leaf_start, nr_of_elements):
        curr_key_index = element_index + 1
        curr_leaf_start = leaf_start
        next_leaf = self._read_leaf_neighbours(leaf_start)[1]
        while True:
            if curr_key_index < nr_of_elements:
                curr_key, curr_doc_id, curr_start, curr_size,\
                    curr_status = self._read_single_leaf_record(
                        curr_leaf_start, curr_key_index)
                if key != curr_key:
                    raise ElemNotFound
                elif curr_status != 'd':
                    return curr_leaf_start, nr_of_elements, curr_key_index
                else:
                    curr_key_index += 1
            else:  # no more elements in the current leaf, jump to the next one
                if not next_leaf:  # end of the leaf linked list
                    raise ElemNotFound
                else:
                    curr_leaf_start = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                    curr_key_index = 0

    def _update_element(self, leaf_start, key_index, new_data):
        self.buckets.seek(self._calculate_key_position(leaf_start, key_index, 'l')
                          + self.key_size)
        self.buckets.write(struct.pack('<' + self.meta_format, *new_data))

        # self._read_single_leaf_record.delete(leaf_start_position, key_index)

    def _delete_element(self, leaf_start, key_index):
        self.buckets.seek(self._calculate_key_position(leaf_start, key_index, 'l')
                          + self.single_leaf_record_size - 1)
        self.buckets.write(struct.pack('<c', 'd'))

        # self._read_single_leaf_record.delete(leaf_start_position, key_index)

    def _leaf_linear_key_search(self, key, start, start_index, end_index):
        self.buckets.seek(start)
        data = self.buckets.read(
            (end_index - start_index + 1) * self.single_leaf_record_size)
        curr_key = struct.unpack(
            '<' + self.key_format, data[:self.key_size])[0]
        data = data[self.single_leaf_record_size:]
        curr_index = 0
        while curr_key != key:
            curr_index += 1
            curr_key = struct.unpack(
                '<' + self.key_format, data[:self.key_size])[0]
            data = data[self.single_leaf_record_size:]
        return start_index + curr_index

    def _node_linear_key_search(self, key, start, start_index, end_index):
        self.buckets.seek(start + self.pointer_size)
        data = self.buckets.read((end_index - start_index + 1) * (
            self.key_size + self.pointer_size))
        curr_key = struct.unpack(
            '<' + self.key_format, data[:self.key_size])[0]
        data = data[self.key_size + self.pointer_size:]
        curr_index = 0
        while curr_key != key:
            curr_index += 1
            curr_key = struct.unpack(
                '<' + self.key_format, data[:self.key_size])[0]
            data = data[self.key_size + self.pointer_size:]
        return start_index + curr_index

    def _next_buffer(self, buffer_start, buffer_end):
        return buffer_end, buffer_end + tree_buffer_size

    def _prev_buffer(self, buffer_start, buffer_end):
        return buffer_start - tree_buffer_size, buffer_start

    def _choose_next_candidate_index_in_leaf(self, leaf_start, candidate_start, buffer_start, buffer_end, imin, imax):
        if buffer_start > candidate_start:
            move_buffer = MOVE_BUFFER_PREV
        elif buffer_end < candidate_start + self.single_leaf_record_size:
            move_buffer = MOVE_BUFFER_NEXT
        else:
            move_buffer = None
        return self._calculate_key_position(leaf_start, (imin + imax) / 2, 'l'), (imin + imax) / 2, move_buffer

    def _choose_next_candidate_index_in_node(self, node_start, candidate_start, buffer_start, buffer_end, imin, imax):
        if buffer_start > candidate_start:
            move_buffer = MOVE_BUFFER_PREV
        elif buffer_end < candidate_start + self.single_node_record_size:
            move_buffer = MOVE_BUFFER_NEXT
        else:
            move_buffer = None
        return self._calculate_key_position(node_start, (imin + imax) / 2, 'n'), (imin + imax) / 2, move_buffer

    def _find_key_in_leaf(self, leaf_start, key, nr_of_elements):
        if nr_of_elements == 1:
            return self._find_key_in_leaf_with_one_element(key, leaf_start)[-5:]
        else:
            return self._find_key_in_leaf_using_binary_search(key, leaf_start, nr_of_elements)[-5:]

    def _find_key_in_leaf_for_update(self, key, doc_id, leaf_start, nr_of_elements):
        if nr_of_elements == 1:
            return self._find_key_in_leaf_with_one_element(key, leaf_start, doc_id=doc_id)
        else:
            return self._find_key_in_leaf_using_binary_search(key, leaf_start, nr_of_elements, mode=MODE_FIRST, doc_id=doc_id)

    def _find_index_of_first_key_equal_or_smaller_key(self, key, leaf_start, nr_of_elements):
        if nr_of_elements == 1:
            return self._find_key_in_leaf_with_one_element(key, leaf_start, mode=MODE_FIRST, return_closest=True)[:2]
        else:
            return self._find_key_in_leaf_using_binary_search(key, leaf_start, nr_of_elements, mode=MODE_FIRST, return_closest=True)[:2]

    def _find_index_of_last_key_equal_or_smaller_key(self, key, leaf_start, nr_of_elements):
        if nr_of_elements == 1:
            return self._find_key_in_leaf_with_one_element(key, leaf_start, mode=MODE_LAST, return_closest=True)[:2]
        else:
            return self._find_key_in_leaf_using_binary_search(key, leaf_start, nr_of_elements, mode=MODE_LAST, return_closest=True)[:2]

    def _find_index_of_first_key_equal(self, key, leaf_start, nr_of_elements):
        if nr_of_elements == 1:
            return self._find_key_in_leaf_with_one_element(key, leaf_start, mode=MODE_FIRST)[:2]
        else:
            return self._find_key_in_leaf_using_binary_search(key, leaf_start, nr_of_elements, mode=MODE_FIRST)[:2]

    def _find_key_in_leaf_with_one_element(self, key, leaf_start, doc_id=None, mode=None, return_closest=False):
        curr_key, curr_doc_id, curr_start, curr_size,\
            curr_status = self._read_single_leaf_record(leaf_start, 0)
        if key != curr_key:
            if return_closest and curr_status != 'd':
                return leaf_start, 0
            else:
                raise ElemNotFound
        else:
            if curr_status == 'd':
                raise ElemNotFound
            elif doc_id is not None and doc_id != curr_doc_id:
                # shouldn't happen, it crashes earlier on the id index
                raise DocIdNotFound
            else:
                return leaf_start, 0, curr_doc_id, curr_key, curr_start, curr_size, curr_status

    def _find_key_in_leaf_using_binary_search(self, key, leaf_start, nr_of_elements, doc_id=None, mode=None, return_closest=False):
        """
        Binary search implementation used by all get functions.
        """
        imin, imax = 0, nr_of_elements - 1
        buffer_start, buffer_end = self._set_buffer_limits()
        candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_leaf(
            leaf_start,
            self._calculate_key_position(leaf_start, (imin + imax) / 2, 'l'),
            buffer_start,
            buffer_end,
            imin, imax)
        while imax > imin:
            curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(
                leaf_start, candidate_index)
            candidate_start = self._calculate_key_position(
                leaf_start, candidate_index, 'l')
            if key < curr_key:
                if move_buffer == MOVE_BUFFER_PREV:
                    buffer_start, buffer_end = self._prev_buffer(
                        buffer_start, buffer_end)
                else:  # next chosen element is in the current buffer, don't move it
                    move_buffer = None
                imax = candidate_index - 1
                candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_leaf(
                    leaf_start, candidate_start, buffer_start, buffer_end, imin, imax)
            elif key == curr_key:
                if mode == MODE_LAST:
                    if move_buffer == MOVE_BUFFER_NEXT:
                        buffer_start, buffer_end = self._next_buffer(
                            buffer_start, buffer_end)
                    else:
                        move_buffer = None
                    imin = candidate_index + 1
                    candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_leaf(
                        leaf_start, candidate_start, buffer_start, buffer_end, imin, imax)
                else:
                    if curr_status == 'o':
                        break
                    else:
                        if move_buffer == MOVE_BUFFER_PREV:
                            buffer_start, buffer_end = self._prev_buffer(
                                buffer_start, buffer_end)
                        else:
                            move_buffer = None
                        imax = candidate_index
                        candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_leaf(
                            leaf_start, candidate_start, buffer_start, buffer_end, imin, imax)
            else:
                if move_buffer == MOVE_BUFFER_NEXT:
                    buffer_start, buffer_end = self._next_buffer(
                        buffer_start, buffer_end)
                else:
                    move_buffer = None
                imin = candidate_index + 1
                candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_leaf(
                    leaf_start, candidate_start, buffer_start, buffer_end, imin, imax)

        if imax > imin:
            chosen_key_position = candidate_index
        else:
            chosen_key_position = imax
        curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(
            leaf_start, chosen_key_position)
        if key != curr_key:
            if return_closest:  # useful for the find-all-bigger/smaller methods
                return leaf_start, chosen_key_position
            else:
                raise ElemNotFound
        if doc_id and doc_id == curr_doc_id and curr_status == 'o':
            return leaf_start, chosen_key_position, curr_doc_id, curr_key, curr_start, curr_size, curr_status
        else:
            if mode == MODE_FIRST and imin < chosen_key_position:  # check that no element with an equal key comes before the chosen one
                matching_record_index = self._leaf_linear_key_search(
                    key,
                    self._calculate_key_position(leaf_start, imin, 'l'),
                    imin,
                    chosen_key_position)
            else:
                matching_record_index = chosen_key_position
            curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(
                leaf_start, matching_record_index)
            if curr_status == 'd' and not return_closest:
                leaf_start, nr_of_elements, matching_record_index = self._find_existing(
                    key, matching_record_index, leaf_start, nr_of_elements)
                curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(
                    leaf_start, matching_record_index)
            if doc_id is not None and doc_id != curr_doc_id:
                leaf_start, nr_of_elements, matching_record_index = self._match_doc_id(
                    doc_id, key, matching_record_index, leaf_start, nr_of_elements)
                curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(
                    leaf_start, matching_record_index)
            return leaf_start, matching_record_index, curr_doc_id, curr_key, curr_start, curr_size, curr_status

    def _find_place_in_leaf(self, key, leaf_start, nr_of_elements):
        if nr_of_elements == 1:
            return self._find_place_in_leaf_with_one_element(key, leaf_start)
        else:
            return self._find_place_in_leaf_using_binary_search(key, leaf_start, nr_of_elements)

    def _find_place_in_leaf_with_one_element(self, key, leaf_start):
        curr_key, curr_doc_id, curr_start, curr_size,\
            curr_status = self._read_single_leaf_record(leaf_start, 0)
        if curr_status == 'd':
            # returns: leaf start, index for the new key, nr of records to
            # rewrite, full_leaf flag, on_deleted flag
            return leaf_start, 0, 0, False, True
        else:
            if key < curr_key:
                return leaf_start, 0, 1, False, False
            else:
                return leaf_start, 1, 0, False, False

    def _find_place_in_leaf_using_binary_search(self, key, leaf_start, nr_of_elements):
        """
        Binary search implementation used by the insert function.
        """
        imin, imax = 0, nr_of_elements - 1
        buffer_start, buffer_end = self._set_buffer_limits()
        candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_leaf(
            leaf_start,
            self._calculate_key_position(leaf_start, (imin + imax) / 2, 'l'),
            buffer_start,
            buffer_end,
            imin, imax)
        while imax > imin:
            curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(
                leaf_start, candidate_index)
            candidate_start = self._calculate_key_position(
                leaf_start, candidate_index, 'l')
            if key < curr_key:
                if move_buffer == MOVE_BUFFER_PREV:
                    buffer_start, buffer_end = self._prev_buffer(
                        buffer_start, buffer_end)
                else:  # next chosen element is in the current buffer, don't move it
                    move_buffer = None
                imax = candidate_index - 1
            else:
                if move_buffer == MOVE_BUFFER_NEXT:
                    buffer_start, buffer_end = self._next_buffer(
                        buffer_start, buffer_end)
                else:
                    move_buffer = None
                imin = candidate_index + 1
            candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_leaf(
                leaf_start, candidate_start, buffer_start, buffer_end, imin, imax)
        if imax < imin and imin < nr_of_elements:
            chosen_key_position = imin
        else:
            chosen_key_position = imax
        curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(
            leaf_start, chosen_key_position)
        if curr_status == 'd':
            return leaf_start, chosen_key_position, 0, False, True
        elif key < curr_key:
            if chosen_key_position > 0:
                curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(
                    leaf_start, chosen_key_position - 1)
                if curr_status == 'd':
                    return leaf_start, chosen_key_position - 1, 0, False, True
            return leaf_start, chosen_key_position, nr_of_elements - chosen_key_position, (nr_of_elements == self.node_capacity), False
        else:
            if chosen_key_position < nr_of_elements - 1:
                curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(
                    leaf_start, chosen_key_position + 1)
                if curr_status == 'd':
                    return leaf_start, chosen_key_position + 1, 0, False, True
            return leaf_start, chosen_key_position + 1, nr_of_elements - chosen_key_position - 1, (nr_of_elements == self.node_capacity), False

    def _set_buffer_limits(self):
        pos = self.buckets.tell()
        buffer_start = pos - (pos % tree_buffer_size)
        return buffer_start, buffer_start + tree_buffer_size
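
    # Worked example: with tree_buffer_size = 8192 and the file positioned at
    # byte 12345, the window is [8192, 16384), since 12345 - (12345 % 8192) = 8192.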

    def _find_first_key_occurence_in_node(self, node_start, key, nr_of_elements):
        if nr_of_elements == 1:
            return self._find_key_in_node_with_one_element(key, node_start, mode=MODE_FIRST)
        else:
            return self._find_key_in_node_using_binary_search(key, node_start, nr_of_elements, mode=MODE_FIRST)

    def _find_last_key_occurence_in_node(self, node_start, key, nr_of_elements):
        if nr_of_elements == 1:
            return self._find_key_in_node_with_one_element(key, node_start, mode=MODE_LAST)
        else:
            return self._find_key_in_node_using_binary_search(key, node_start, nr_of_elements, mode=MODE_LAST)

    def _find_key_in_node_with_one_element(self, key, node_start, mode=None):
        l_pointer, curr_key, r_pointer = self._read_single_node_key(
            node_start, 0)
        if key < curr_key:
            return 0, l_pointer
        elif key > curr_key:
            return 0, r_pointer
        else:
            if mode == MODE_FIRST:
                return 0, l_pointer
            elif mode == MODE_LAST:
                return 0, r_pointer
            else:
                raise Exception('Invalid mode declared: set first/last')

    def _find_key_in_node_using_binary_search(self, key, node_start, nr_of_elements, mode=None):
        imin, imax = 0, nr_of_elements - 1
        buffer_start, buffer_end = self._set_buffer_limits()
        candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_node(
            node_start,
            self._calculate_key_position(node_start, (imin + imax) / 2, 'n'),
            buffer_start,
            buffer_end,
            imin, imax)
        while imax > imin:
            l_pointer, curr_key, r_pointer = self._read_single_node_key(
                node_start, candidate_index)
            candidate_start = self._calculate_key_position(
                node_start, candidate_index, 'n')
            if key < curr_key:
                if move_buffer == MOVE_BUFFER_PREV:
                    buffer_start, buffer_end = self._prev_buffer(
                        buffer_start, buffer_end)
                else:  # next chosen element is in the current buffer, don't move it
                    move_buffer = None
                imax = candidate_index - 1
                candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_node(
                    node_start, candidate_start, buffer_start, buffer_end, imin, imax)
            elif key == curr_key:
                if mode == MODE_LAST:
                    if move_buffer == MOVE_BUFFER_NEXT:
                        buffer_start, buffer_end = self._next_buffer(
                            buffer_start, buffer_end)
                    else:
                        move_buffer = None
                    imin = candidate_index + 1
                    candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_node(
                        node_start, candidate_start, buffer_start, buffer_end, imin, imax)
                else:
                    break
            else:
                if move_buffer == MOVE_BUFFER_NEXT:
                    buffer_start, buffer_end = self._next_buffer(
                        buffer_start, buffer_end)
                else:
                    move_buffer = None
                imin = candidate_index + 1
                candidate_start, candidate_index, move_buffer = self._choose_next_candidate_index_in_node(
                    node_start, candidate_start, buffer_start, buffer_end, imin, imax)

        if imax > imin:
            chosen_key_position = candidate_index
        elif imax < imin and imin < nr_of_elements:
            chosen_key_position = imin
        else:
            chosen_key_position = imax
        l_pointer, curr_key, r_pointer = self._read_single_node_key(
            node_start, chosen_key_position)
        if mode == MODE_FIRST and imin < chosen_key_position:  # check that no element with an equal key comes before the chosen one
            matching_record_index = self._node_linear_key_search(
                key,
                self._calculate_key_position(node_start, imin, 'n'),
                imin,
                chosen_key_position)
        else:
            matching_record_index = chosen_key_position
        l_pointer, curr_key, r_pointer = self._read_single_node_key(
            node_start, matching_record_index)
        if key < curr_key:
            return matching_record_index, l_pointer
        elif key > curr_key:
            return matching_record_index, r_pointer
        else:
            if mode == MODE_FIRST:
                return matching_record_index, l_pointer
            elif mode == MODE_LAST:
                return matching_record_index, r_pointer
            else:
                raise Exception('Invalid mode declared: first/last')

    def _update_leaf_ready_data(self, leaf_start, start_index, new_nr_of_elements, records_to_rewrite):
        self.buckets.seek(leaf_start)
        self.buckets.write(struct.pack('<h', new_nr_of_elements))
        start_position = self._calculate_key_position(
            leaf_start, start_index, 'l')
        self.buckets.seek(start_position)
        self.buckets.write(
            struct.pack(
                '<' + (new_nr_of_elements - start_index) *
                self.single_leaf_record_format,
                *records_to_rewrite))

        # self._read_single_leaf_record.delete(leaf_start)
        self._read_leaf_nr_of_elements.delete(leaf_start)
        self._read_leaf_nr_of_elements_and_neighbours.delete(leaf_start)

    def _update_leaf(self, leaf_start, new_record_position, nr_of_elements,
                     nr_of_records_to_rewrite, on_deleted, new_key,
                     new_doc_id, new_start, new_size, new_status):
        if nr_of_records_to_rewrite == 0:  # just write at the chosen position
            self.buckets.seek(self._calculate_key_position(
                leaf_start, new_record_position, 'l'))
            self.buckets.write(
                struct.pack('<' + self.single_leaf_record_format,
                            new_key,
                            new_doc_id,
                            new_start,
                            new_size,
                            new_status))
            self.flush()
        else:  # read all records after the new one and rewrite them behind it
            start = self._calculate_key_position(
                leaf_start, new_record_position, 'l')
            self.buckets.seek(start)
            data = self.buckets.read(nr_of_records_to_rewrite *
                                     self.single_leaf_record_size)
            records_to_rewrite = struct.unpack('<' + nr_of_records_to_rewrite *
                                               self.single_leaf_record_format, data)
            curr_index = 0
            records_to_rewrite = list(records_to_rewrite)
            # don't write deleted records back, drop them from the list
            for status in records_to_rewrite[4::5]:
                if status != 'o':
                    del records_to_rewrite[curr_index * 5:curr_index * 5 + 5]
                    nr_of_records_to_rewrite -= 1
                    nr_of_elements -= 1
                else:
                    curr_index += 1

            self.buckets.seek(start)
            self.buckets.write(
                struct.pack(
                    '<' + (nr_of_records_to_rewrite +
                           1) * self.single_leaf_record_format,
                    new_key,
                    new_doc_id,
                    new_start,
                    new_size,
                    new_status,
                    *tuple(records_to_rewrite)))
            self.flush()
            self.buckets.seek(leaf_start)
            # when the new record replaces a deleted one, the number of leaf
            # elements stays the same
            if not on_deleted:
                self.buckets.write(struct.pack('<h', nr_of_elements + 1))

        self._read_leaf_nr_of_elements.delete(leaf_start)
        self._read_leaf_nr_of_elements_and_neighbours.delete(leaf_start)
        self._find_key_in_leaf.delete(leaf_start)
        # self._read_single_leaf_record.delete(leaf_start)

    def _read_leaf_neighbours(self, leaf_start):
        self.buckets.seek(leaf_start + self.elements_counter_size)
        neighbours_data = self.buckets.read(2 * self.pointer_size)
        prev_l, next_l = struct.unpack(
            '<' + 2 * self.pointer_format, neighbours_data)
        return prev_l, next_l

    def _update_leaf_size_and_pointers(self, leaf_start, new_size, new_prev, new_next):
        self.buckets.seek(leaf_start)
        self.buckets.write(
            struct.pack(
                '<' + self.elements_counter_format + 2 * self.pointer_format,
                new_size,
                new_prev,
                new_next))

        self._read_leaf_nr_of_elements.delete(leaf_start)
        self._read_leaf_neighbours.delete(leaf_start)
        self._read_leaf_nr_of_elements_and_neighbours.delete(leaf_start)

    def _update_leaf_prev_pointer(self, leaf_start, pointer):
        self.buckets.seek(leaf_start + self.elements_counter_size)
        self.buckets.write(struct.pack('<' + self.pointer_format, pointer))

        self._read_leaf_neighbours.delete(leaf_start)
        self._read_leaf_nr_of_elements_and_neighbours.delete(leaf_start)

    def _update_size(self, start, new_size):
        self.buckets.seek(start)
        self.buckets.write(struct.pack('<' + self.elements_counter_format,
                                       new_size))

        self._read_leaf_nr_of_elements.delete(start)
        self._read_leaf_nr_of_elements_and_neighbours.delete(start)

    def _create_new_root_from_leaf(self, leaf_start, nr_of_records_to_rewrite, new_leaf_size, old_leaf_size, half_size, new_data):
        blanks = (self.node_capacity - new_leaf_size) * \
            self.single_leaf_record_size * '\x00'
        left_leaf_start_position = self.data_start + self.node_size
        right_leaf_start_position = self.data_start + \
            self.node_size + self.leaf_size
        self.buckets.seek(self.data_start + self.leaf_heading_size)
        # read the old root
        data = self.buckets.read(
            self.single_leaf_record_size * self.node_capacity)
        leaf_data = struct.unpack(
            '<' + self.single_leaf_record_format * self.node_capacity, data)
        # remove deleted records; if that succeeded, abort the split
        if self._update_if_has_deleted(self.data_start, leaf_data, 0, new_data):
            return None
        # find the key which goes to the parent node
        if nr_of_records_to_rewrite > new_leaf_size - 1:
            key_moved_to_parent_node = leaf_data[(old_leaf_size - 1) * 5]
        elif nr_of_records_to_rewrite == new_leaf_size - 1:
            key_moved_to_parent_node = new_data[0]
        else:
            key_moved_to_parent_node = leaf_data[old_leaf_size * 5]
        data_to_write = self._prepare_new_root_data(key_moved_to_parent_node,
                                                    left_leaf_start_position,
                                                    right_leaf_start_position,
                                                    'l')
        if nr_of_records_to_rewrite > half_size:
            # the new key goes into the first half; prepare the left leaf data
            left_leaf_data = struct.pack(
                '<' + self.leaf_heading_format + self.single_leaf_record_format
                * (self.node_capacity - nr_of_records_to_rewrite),
                old_leaf_size,
                0,
                right_leaf_start_position,
                *leaf_data[:-nr_of_records_to_rewrite * 5])
            left_leaf_data += struct.pack(
                '<' + self.single_leaf_record_format * (
                    nr_of_records_to_rewrite - new_leaf_size + 1),
                new_data[0],
                new_data[1],
                new_data[2],
                new_data[3],
                new_data[4],
                *leaf_data[-nr_of_records_to_rewrite * 5:(old_leaf_size - 1) * 5])
            # prepare the right leaf data
            right_leaf_data = struct.pack(
                '<' + self.elements_counter_format + 2 * self.pointer_format +
                self.single_leaf_record_format * new_leaf_size,
                new_leaf_size,
                left_leaf_start_position,
                0,
                *leaf_data[-new_leaf_size * 5:])
        else:
            # the new key goes into the second half
            if nr_of_records_to_rewrite:
                records_before = leaf_data[old_leaf_size *
                                           5:-nr_of_records_to_rewrite * 5]
                records_after = leaf_data[-nr_of_records_to_rewrite * 5:]
            else:
                records_before = leaf_data[old_leaf_size * 5:]
                records_after = []

            left_leaf_data = struct.pack(
                '<' + self.leaf_heading_format +
                self.single_leaf_record_format * old_leaf_size,
                old_leaf_size,
                0,
                right_leaf_start_position,
                *leaf_data[:old_leaf_size * 5])
            # prepare the right leaf data
            right_leaf_data = struct.pack(
                '<' + self.elements_counter_format + 2 * self.pointer_format +
                self.single_leaf_record_format * (new_leaf_size -
                                                  nr_of_records_to_rewrite - 1),
                new_leaf_size,
                left_leaf_start_position,
                0,
                *records_before)
            right_leaf_data += struct.pack(
                '<' + self.single_leaf_record_format * (
                    nr_of_records_to_rewrite + 1),
                new_data[0],
                new_data[1],
                new_data[2],
                new_data[3],
                new_data[4],
                *records_after)
        left_leaf_data += (self.node_capacity -
                           old_leaf_size) * self.single_leaf_record_size * '\x00'
        right_leaf_data += blanks
        data_to_write += left_leaf_data
        data_to_write += right_leaf_data
        self.buckets.seek(self._start_ind)
        self.buckets.write(struct.pack('<c', 'n') + data_to_write)
        self.root_flag = 'n'

        # self._read_single_leaf_record.delete(leaf_start)
        self._find_key_in_leaf.delete(leaf_start)
        self._read_leaf_nr_of_elements.delete(leaf_start)
        self._read_leaf_nr_of_elements_and_neighbours.delete(leaf_start)
        self._read_leaf_neighbours.delete(leaf_start)
        return None

    def _split_leaf(self, leaf_start, nr_of_records_to_rewrite, new_key,
                    new_doc_id, new_start, new_size, new_status,
                    create_new_root=False):
        """
        Splits a full leaf in two: the first half of the records stays at the
        old position, the second half is written as a new leaf at the end of
        the file.
        """
        half_size = self.node_capacity / 2
        if self.node_capacity % 2 == 0:
            old_leaf_size = half_size
            new_leaf_size = half_size + 1
        else:
            old_leaf_size = new_leaf_size = half_size + 1
        if create_new_root:  # the leaf is the root
            new_data = [new_key, new_doc_id, new_start, new_size, new_status]
            self._create_new_root_from_leaf(leaf_start, nr_of_records_to_rewrite, new_leaf_size, old_leaf_size, half_size, new_data)
        else:
            blanks = (self.node_capacity - new_leaf_size) * \
                self.single_leaf_record_size * '\x00'
            prev_l, next_l = self._read_leaf_neighbours(leaf_start)
            if nr_of_records_to_rewrite > half_size:  # insert the key into the first half of the leaf
                self.buckets.seek(self._calculate_key_position(leaf_start,
                                                               self.node_capacity - nr_of_records_to_rewrite,
                                                               'l'))
                # read all records with key > new_key
                data = self.buckets.read(
                    nr_of_records_to_rewrite * self.single_leaf_record_size)
                records_to_rewrite = struct.unpack(
                    '<' + nr_of_records_to_rewrite * self.single_leaf_record_format, data)
                # remove deleted records; if that succeeded, abort the split
                if self._update_if_has_deleted(leaf_start,
                                               records_to_rewrite,
                                               self.node_capacity -
                                               nr_of_records_to_rewrite,
                                               [new_key, new_doc_id, new_start, new_size, new_status]):
                    return None
                key_moved_to_parent_node = records_to_rewrite[
                    -new_leaf_size * 5]
                # write the new leaf at the end of the file
                self.buckets.seek(0, 2)
                new_leaf_start = self.buckets.tell()
                # prepare the new leaf data
                new_leaf = struct.pack(
                    '<' + self.elements_counter_format + 2 * self.pointer_format +
                    self.single_leaf_record_format * new_leaf_size,
                    new_leaf_size,
                    leaf_start,
                    next_l,
                    *records_to_rewrite[-new_leaf_size * 5:])
                new_leaf += blanks
                # write the new leaf
                self.buckets.write(new_leaf)
                # update the old leaf heading
                self._update_leaf_size_and_pointers(leaf_start,
                                                    old_leaf_size,
                                                    prev_l,
                                                    new_leaf_start)
                # seek the position of the new key in the first half
                self.buckets.seek(self._calculate_key_position(leaf_start,
                                                               self.node_capacity - nr_of_records_to_rewrite,
                                                               'l'))
                # write the new key and the keys after it
                self.buckets.write(
                    struct.pack(
                        '<' + self.single_leaf_record_format *
                        (nr_of_records_to_rewrite - new_leaf_size + 1),
                        new_key,
                        new_doc_id,
                        new_start,
                        new_size,
                        'o',
                        *records_to_rewrite[:-new_leaf_size * 5]))

                # when next_l is 0 there is no next leaf to update; this also
                # avoids writing data at position 0 of the file
                if next_l:
                    self._update_leaf_prev_pointer(
                        next_l, new_leaf_start)

                # self._read_single_leaf_record.delete(leaf_start)
                self._find_key_in_leaf.delete(leaf_start)

                return new_leaf_start, key_moved_to_parent_node
            else:  # the key goes into the second half of the leaf
                # seek to the middle of the leaf
                self.buckets.seek(self._calculate_key_position(
                    leaf_start, old_leaf_size, 'l'))
                data = self.buckets.read(
                    self.single_leaf_record_size * (new_leaf_size - 1))
                records_to_rewrite = struct.unpack('<' + (new_leaf_size - 1) *
                                                   self.single_leaf_record_format, data)
                # remove deleted records; if that succeeded, abort the split
                if self._update_if_has_deleted(leaf_start,
                                               records_to_rewrite,
                                               old_leaf_size,
                                               [new_key, new_doc_id, new_start, new_size, new_status]):
                    return None
                key_moved_to_parent_node = records_to_rewrite[
                    -(new_leaf_size - 1) * 5]
                if key_moved_to_parent_node > new_key:
                    key_moved_to_parent_node = new_key
                self.buckets.seek(0, 2)  # end of file
                new_leaf_start = self.buckets.tell()
                # prepare the new leaf data
                index_of_records_split = nr_of_records_to_rewrite * 5
                if index_of_records_split:
                    records_before = records_to_rewrite[
                        :-index_of_records_split]
                    records_after = records_to_rewrite[
                        -index_of_records_split:]
                else:
                    records_before = records_to_rewrite
                    records_after = []
                new_leaf = struct.pack(
                    '<' + self.elements_counter_format + 2 * self.pointer_format
                    + self.single_leaf_record_format * (new_leaf_size -
                                                        nr_of_records_to_rewrite - 1),
                    new_leaf_size,
                    leaf_start,
                    next_l,
                    *records_before)
                new_leaf += struct.pack(
                    '<' + self.single_leaf_record_format *
                    (nr_of_records_to_rewrite + 1),
                    new_key,
                    new_doc_id,
                    new_start,
                    new_size,
                    'o',
                    *records_after)
                new_leaf += blanks
                self.buckets.write(new_leaf)
                self._update_leaf_size_and_pointers(leaf_start,
                                                    old_leaf_size,
                                                    prev_l,
                                                    new_leaf_start)
                # when next_l is 0 there is no next leaf to update; this also
                # avoids writing data at position 0 of the file
                if next_l:
                    self._update_leaf_prev_pointer(
                        next_l, new_leaf_start)

                # self._read_single_leaf_record.delete(leaf_start)
                self._find_key_in_leaf.delete(leaf_start)

                return new_leaf_start, key_moved_to_parent_node
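
    # Worked example: with node_capacity = 10 a full leaf splits into an old
    # leaf of 5 records and a new one of 6 (the even-capacity branch above);
    # with capacity 11 both halves hold 6. The first key of the new right
    # leaf is handed to the parent node as the separator.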

    def _update_if_has_deleted(self, leaf_start, records_to_rewrite, start_position, new_record_data):
        """
        Checks whether there are any deleted elements in the data to rewrite
        and prevents them from being written back.
        """
        curr_index = 0
        nr_of_elements = self.node_capacity
        records_to_rewrite = list(records_to_rewrite)
        for status in records_to_rewrite[4::5]:  # remove deleted from the list
            if status != 'o':
                del records_to_rewrite[curr_index * 5:curr_index * 5 + 5]
                nr_of_elements -= 1
            else:
                curr_index += 1
        # if any records were deleted there is no need to split, just update the leaf
        if nr_of_elements < self.node_capacity:
            data_split_index = 0
            for key in records_to_rewrite[0::5]:
                if key > new_record_data[0]:
                    break
                else:
                    data_split_index += 1
            records_to_rewrite = records_to_rewrite[:data_split_index * 5]\
                + new_record_data\
                + records_to_rewrite[data_split_index * 5:]
            self._update_leaf_ready_data(leaf_start,
                                         start_position,
                                         nr_of_elements + 1,
                                         records_to_rewrite)
            return True
        else:  # did not find any deleted records in the leaf
            return False
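
    # Returns True when the leaf could be compacted in place (so the caller
    # must abort its split), False when no deleted record was found and the
    # split has to proceed.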

    def _prepare_new_root_data(self, root_key, left_pointer, right_pointer, children_flag='n'):
        new_root = struct.pack(
            '<' + self.node_heading_format + self.single_node_record_format,
            1,
            children_flag,
            left_pointer,
            root_key,
            right_pointer)
        new_root += (self.key_size + self.pointer_size) * (
            self.node_capacity - 1) * '\x00'
        return new_root
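
    # The packed root node is: element counter 1, a children flag ('l' when
    # the children are leaves, 'n' when they are inner nodes), one
    # (left pointer, key, right pointer) triple, and zeroed slots for the
    # remaining node_capacity - 1 keys.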

    def _create_new_root_from_node(self, node_start, children_flag, nr_of_keys_to_rewrite, new_node_size, old_node_size, new_key, new_pointer):
        # read the whole old root node
        self.buckets.seek(self.data_start + self.node_heading_size)
        data = self.buckets.read(
            self.pointer_size + self.node_capacity * (self.key_size + self.pointer_size))
        old_node_data = struct.unpack(
            '<' + self.pointer_format + self.node_capacity *
            (self.key_format + self.pointer_format), data)
        self.buckets.seek(0, 2)  # end of file
        new_node_start = self.buckets.tell()
        if nr_of_keys_to_rewrite == new_node_size:
            key_moved_to_root = new_key
            # prepare the new nodes' data
            left_node = struct.pack(
                '<' + self.node_heading_format + self.pointer_format +
                old_node_size * (self.key_format + self.pointer_format),
                old_node_size,
                children_flag,
                *old_node_data[:old_node_size * 2 + 1])

            right_node = struct.pack(
                '<' + self.node_heading_format + self.pointer_format +
                new_node_size * (self.key_format + self.pointer_format),
                new_node_size,
                children_flag,
                new_pointer,
                *old_node_data[old_node_size * 2 + 1:])
        elif nr_of_keys_to_rewrite > new_node_size:
            key_moved_to_root = old_node_data[old_node_size * 2 - 1]
            # prepare the new nodes' data
            if nr_of_keys_to_rewrite == self.node_capacity:
                keys_before = old_node_data[:1]
                keys_after = old_node_data[1:old_node_size * 2 - 1]
            else:
                keys_before = old_node_data[:-nr_of_keys_to_rewrite * 2]
                keys_after = old_node_data[-(
                    nr_of_keys_to_rewrite) * 2:old_node_size * 2 - 1]
            left_node = struct.pack(
                '<' + self.node_heading_format + self.pointer_format +
                (self.node_capacity - nr_of_keys_to_rewrite) *
                (self.key_format + self.pointer_format),
                old_node_size,
                children_flag,
                *keys_before)
            left_node += struct.pack(
                '<' + (self.key_format + self.pointer_format) *
                (nr_of_keys_to_rewrite - new_node_size),
                new_key,
                new_pointer,
                *keys_after)

            right_node = struct.pack(
                '<' + self.node_heading_format + self.pointer_format +
                new_node_size * (self.key_format + self.pointer_format),
                new_node_size,
                children_flag,
                *old_node_data[old_node_size * 2:])
        else:
            # insert the key into the second half of the node and create a new root
            key_moved_to_root = old_node_data[old_node_size * 2 + 1]
            # prepare the new nodes' data
            left_node = struct.pack(
                '<' + self.node_heading_format + self.pointer_format +
                old_node_size * (self.key_format + self.pointer_format),
                old_node_size,
                children_flag,
                *old_node_data[:old_node_size * 2 + 1])
            if nr_of_keys_to_rewrite:
                keys_before = old_node_data[(old_node_size +
                                             1) * 2:-nr_of_keys_to_rewrite * 2]
                keys_after = old_node_data[-nr_of_keys_to_rewrite * 2:]
            else:
                keys_before = old_node_data[(old_node_size + 1) * 2:]
                keys_after = []
            right_node = struct.pack(
                '<' + self.node_heading_format + self.pointer_format +
                (new_node_size - nr_of_keys_to_rewrite - 1) *
                (self.key_format + self.pointer_format),
                new_node_size,
                children_flag,
                *keys_before)
            right_node += struct.pack(
                '<' + (nr_of_keys_to_rewrite + 1) *
                (self.key_format + self.pointer_format),
                new_key,
                new_pointer,
                *keys_after)
        new_root = self._prepare_new_root_data(key_moved_to_root,
                                               new_node_start,
                                               new_node_start + self.node_size)
        # pad both nodes with blanks up to full capacity
        left_node += (self.node_capacity - old_node_size) * \
            (self.key_size + self.pointer_size) * '\x00'
        right_node += (self.node_capacity - new_node_size) * \
            (self.key_size + self.pointer_size) * '\x00'
        self.buckets.seek(0, 2)
        self.buckets.write(left_node + right_node)
        self.buckets.seek(self.data_start)
        self.buckets.write(new_root)

        self._read_single_node_key.delete(node_start)
        self._read_node_nr_of_elements_and_children_flag.delete(node_start)
        return None

    def _split_node(self, node_start, nr_of_keys_to_rewrite, new_key, new_pointer, children_flag, create_new_root=False):
        """
        Splits a full node in two: the first half of the keys stays at the
        old position, the second half is written as a new node at the end of
        the file.
        """
        half_size = self.node_capacity / 2
        if self.node_capacity % 2 == 0:
            old_node_size = new_node_size = half_size
        else:
            old_node_size = half_size
            new_node_size = half_size + 1
        if create_new_root:
            self._create_new_root_from_node(node_start, children_flag, nr_of_keys_to_rewrite, new_node_size, old_node_size, new_key, new_pointer)
        else:
            blanks = (self.node_capacity - new_node_size) * (
                self.key_size + self.pointer_size) * '\x00'
            if nr_of_keys_to_rewrite == new_node_size:  # the new key becomes the separator
                # read the second half of the node
                self.buckets.seek(self._calculate_key_position(node_start,
                                                               old_node_size,
                                                               'n') + self.pointer_size)
                # read all keys with key > new_key
                data = self.buckets.read(nr_of_keys_to_rewrite *
                                         (self.key_size + self.pointer_size))
                old_node_data = struct.unpack('<' + nr_of_keys_to_rewrite *
                                              (self.key_format + self.pointer_format), data)
                # write the new node at the end of the file
                self.buckets.seek(0, 2)
                new_node_start = self.buckets.tell()
                # prepare the new node data
                new_node = struct.pack(
                    '<' + self.node_heading_format + self.pointer_format +
                    (self.key_format + self.pointer_format) * new_node_size,
                    new_node_size,
                    children_flag,
                    new_pointer,
                    *old_node_data)
                new_node += blanks
                # write the new node
                self.buckets.write(new_node)
                # update the old node data
                self._update_size(
                    node_start, old_node_size)

                self._read_single_node_key.delete(node_start)
                self._read_node_nr_of_elements_and_children_flag.delete(
                    node_start)

                return new_node_start, new_key
            elif nr_of_keys_to_rewrite > half_size:  # insert the key into the first half of the node
                # seek the first key to rewrite
                self.buckets.seek(self._calculate_key_position(node_start, self.node_capacity - nr_of_keys_to_rewrite, 'n')
                                  + self.pointer_size)
                # read all keys with key > new_key
                data = self.buckets.read(
                    nr_of_keys_to_rewrite * (self.key_size + self.pointer_size))
                old_node_data = struct.unpack(
                    '<' + nr_of_keys_to_rewrite * (self.key_format + self.pointer_format), data)
                key_moved_to_parent_node = old_node_data[-(
                    new_node_size + 1) * 2]
                self.buckets.seek(0, 2)
                new_node_start = self.buckets.tell()
                # prepare the new node data
                new_node = struct.pack(
                    '<' + self.node_heading_format + self.pointer_format +
                    (self.key_format + self.pointer_format) * new_node_size,
                    new_node_size,
                    children_flag,
                    old_node_data[-new_node_size * 2 - 1],
                    *old_node_data[-new_node_size * 2:])
                new_node += blanks
                # write the new node
                self.buckets.write(new_node)
                self._update_size(
                    node_start, old_node_size)
                # seek the position of the new key in the first half
                self.buckets.seek(self._calculate_key_position(node_start, self.node_capacity - nr_of_keys_to_rewrite, 'n')
                                  + self.pointer_size)
                # write the new key and the keys after it
                self.buckets.write(
                    struct.pack(
                        '<' + (self.key_format + self.pointer_format) *
                        (nr_of_keys_to_rewrite - new_node_size),
                        new_key,
                        new_pointer,
                        *old_node_data[:-(new_node_size + 1) * 2]))

                self._read_single_node_key.delete(node_start)
                self._read_node_nr_of_elements_and_children_flag.delete(
                    node_start)

                return new_node_start, key_moved_to_parent_node
            else:  # the key goes into the second half
                # read the second half of the node
                self.buckets.seek(self._calculate_key_position(node_start,
                                                               old_node_size,
                                                               'n')
                                  + self.pointer_size)
                data = self.buckets.read(
                    new_node_size * (self.key_size + self.pointer_size))
                old_node_data = struct.unpack('<' + new_node_size *
                                              (self.key_format + self.pointer_format), data)
                # find the key which goes to the parent node
                key_moved_to_parent_node = old_node_data[0]
                self.buckets.seek(0, 2)  # end of file
                new_node_start = self.buckets.tell()
                index_of_records_split = nr_of_keys_to_rewrite * 2
                # prepare the new node data
                first_leaf_pointer = old_node_data[1]
                old_node_data = old_node_data[2:]
                if index_of_records_split:
                    keys_before = old_node_data[:-index_of_records_split]
                    keys_after = old_node_data[-index_of_records_split:]
                else:
                    keys_before = old_node_data
                    keys_after = []
                new_node = struct.pack(
                    '<' + self.node_heading_format + self.pointer_format +
                    (self.key_format + self.pointer_format) *
                    (new_node_size -
                     nr_of_keys_to_rewrite - 1),
                    new_node_size,
                    children_flag,
                    first_leaf_pointer,
                    *keys_before)
                new_node += struct.pack(
                    '<' + (self.key_format + self.pointer_format) *
                    (nr_of_keys_to_rewrite + 1),
                    new_key,
                    new_pointer,
                    *keys_after)
                new_node += blanks
                # write the new node
                self.buckets.write(new_node)
                self._update_size(node_start, old_node_size)

                self._read_single_node_key.delete(node_start)
                self._read_node_nr_of_elements_and_children_flag.delete(
                    node_start)

                return new_node_start, key_moved_to_parent_node
|
|
|
|
|
|
|
|
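    # A worked illustration of the split arithmetic used above (a sketch only; the
    # old_node_size/new_node_size derivation happens earlier in _split_node and is
    # assumed here). With node_capacity = 5, a full node holds 5 keys; the split
    # keeps roughly half in place and appends the rest as a brand-new node:
    #
    #     node_capacity = 5
    #     half_size = node_capacity / 2    # 2 (Python 2 integer division)
    #     # [k1 k2 k3 k4 k5] + new_key  ->  [k1 k2], k3 -> parent, [k4 k5 new_key]
    #
    # Which of the three branches runs depends on where new_key lands relative to
    # the middle slot, i.e. on nr_of_keys_to_rewrite vs. half_size/new_node_size.
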
    def insert_first_record_into_leaf(self, leaf_start, key, doc_id, start, size, status):
        self.buckets.seek(leaf_start)
        # the leaf now holds exactly one element
        self.buckets.write(struct.pack('<' + self.elements_counter_format, 1))
        self.buckets.seek(leaf_start + self.leaf_heading_size)
        self.buckets.write(struct.pack('<' + self.single_leaf_record_format,
                                       key,
                                       doc_id,
                                       start,
                                       size,
                                       status))

        # self._read_single_leaf_record.delete(leaf_start)
        self._find_key_in_leaf.delete(leaf_start)
        self._read_leaf_nr_of_elements.delete(leaf_start)
        self._read_leaf_nr_of_elements_and_neighbours.delete(leaf_start)

    def _insert_new_record_into_leaf(self, leaf_start, key, doc_id, start, size, status, nodes_stack, indexes):
        nr_of_elements = self._read_leaf_nr_of_elements(leaf_start)
        if nr_of_elements == 0:
            self.insert_first_record_into_leaf(
                leaf_start, key, doc_id, start, size, status)
            return
        leaf_start, new_record_position, nr_of_records_to_rewrite, full_leaf, on_deleted \
            = self._find_place_in_leaf(key, leaf_start, nr_of_elements)
        if full_leaf:
            try:  # check if the leaf has a parent node
                leaf_parent_pointer = nodes_stack.pop()
            except IndexError:  # leaf is a root
                leaf_parent_pointer = 0
            split_data = self._split_leaf(leaf_start,
                                          nr_of_records_to_rewrite,
                                          key,
                                          doc_id,
                                          start,
                                          size,
                                          status,
                                          create_new_root=not leaf_parent_pointer)
            if split_data is not None:  # None means the split created a new root, or was replaced with an update of a deleted record
                new_leaf_start_position, key_moved_to_parent_node = split_data
                self._insert_new_key_into_node(leaf_parent_pointer,
                                               key_moved_to_parent_node,
                                               leaf_start,
                                               new_leaf_start_position,
                                               nodes_stack,
                                               indexes)
        else:  # there is room for the record in the leaf
            self.buckets.seek(leaf_start)
            self._update_leaf(
                leaf_start, new_record_position, nr_of_elements, nr_of_records_to_rewrite,
                on_deleted, key, doc_id, start, size, status)

    def _update_node(self, new_key_position, nr_of_keys_to_rewrite, new_key, new_pointer):
        if nr_of_keys_to_rewrite == 0:
            # the new key lands at the end of the node, nothing has to be shifted
            self.buckets.seek(new_key_position)
            self.buckets.write(
                struct.pack('<' + self.key_format + self.pointer_format,
                            new_key,
                            new_pointer))
            self.flush()
        else:
            # read the keys that follow the insert position...
            self.buckets.seek(new_key_position)
            data = self.buckets.read(
                nr_of_keys_to_rewrite * (self.key_size + self.pointer_size))
            keys_to_rewrite = struct.unpack(
                '<' + nr_of_keys_to_rewrite * (self.key_format + self.pointer_format), data)
            # ...and write them back shifted one slot right, with the new key in front
            self.buckets.seek(new_key_position)
            self.buckets.write(
                struct.pack(
                    '<' + (nr_of_keys_to_rewrite + 1) *
                    (self.key_format + self.pointer_format),
                    new_key,
                    new_pointer,
                    *keys_to_rewrite))
            self.flush()

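    # _update_node above is a plain "read the tail, write it back shifted by one
    # slot" insert. Illustrative sketch with hypothetical values (not tied to any
    # real index file):
    #
    #     # keys on disk after new_key_position:      (k3, p3), (k4, p4)
    #     # after inserting (new_key, new_pointer):   (new_key, new_pointer), (k3, p3), (k4, p4)
    #
    # The tail is unpacked and repacked with the new key/pointer pair in front, so
    # a single seek + write shifts everything one slot to the right.
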
    def _insert_new_key_into_node(self, node_start, new_key, old_half_start, new_half_start, nodes_stack, indexes):
        parent_key_index = indexes.pop()
        nr_of_elements, children_flag = self._read_node_nr_of_elements_and_children_flag(node_start)
        parent_prev_pointer = self._read_single_node_key(
            node_start, parent_key_index)[0]
        if parent_prev_pointer == old_half_start:  # the split child was on the left side of its parent key, so the new key goes before it
            new_key_position = self.pointer_size + self._calculate_key_position(node_start, parent_key_index, 'n')
            nr_of_keys_to_rewrite = nr_of_elements - parent_key_index
        else:  # the split child was on the right side of its parent key, so the new key goes after it
            new_key_position = self.pointer_size + self._calculate_key_position(node_start, parent_key_index + 1, 'n')
            nr_of_keys_to_rewrite = nr_of_elements - (parent_key_index + 1)
        if nr_of_elements == self.node_capacity:
            try:  # check if the node has a parent
                node_parent_pointer = nodes_stack.pop()
            except IndexError:  # node is a root
                node_parent_pointer = 0
            new_data = self._split_node(node_start,
                                        nr_of_keys_to_rewrite,
                                        new_key,
                                        new_half_start,
                                        children_flag,
                                        create_new_root=not node_parent_pointer)
            if new_data:  # a None result means a new root has been created instead
                new_node_start_position, key_moved_to_parent_node = new_data
                self._insert_new_key_into_node(node_parent_pointer,
                                               key_moved_to_parent_node,
                                               node_start,
                                               new_node_start_position,
                                               nodes_stack,
                                               indexes)

            self._find_first_key_occurence_in_node.delete(node_start)
            self._find_last_key_occurence_in_node.delete(node_start)
        else:  # there is an empty slot for the new key in the node
            self._update_size(node_start, nr_of_elements + 1)
            self._update_node(new_key_position,
                              nr_of_keys_to_rewrite,
                              new_key,
                              new_half_start)

        self._find_first_key_occurence_in_node.delete(node_start)
        self._find_last_key_occurence_in_node.delete(node_start)
        self._read_single_node_key.delete(node_start)
        self._read_node_nr_of_elements_and_children_flag.delete(node_start)

    def _find_leaf_to_insert(self, key):
        """
        Traverses the tree in search of the leaf to insert into, remembering the
        parent nodes on the path; looks for the last occurrence of the key if it
        is already in the tree.
        """
        nodes_stack = [self.data_start]
        if self.root_flag == 'l':
            return nodes_stack, []
        else:
            nr_of_elements, curr_child_flag = self._read_node_nr_of_elements_and_children_flag(self.data_start)
            curr_index, curr_pointer = self._find_last_key_occurence_in_node(
                self.data_start, key, nr_of_elements)
            nodes_stack.append(curr_pointer)
            indexes = [curr_index]
            while curr_child_flag == 'n':
                nr_of_elements, curr_child_flag = self._read_node_nr_of_elements_and_children_flag(curr_pointer)
                curr_index, curr_pointer = self._find_last_key_occurence_in_node(curr_pointer, key, nr_of_elements)
                nodes_stack.append(curr_pointer)
                indexes.append(curr_index)
            # nodes_stack contains the start addresses of the nodes directly above
            # the leaf with the key; indexes match the keys adjacent to the
            # nodes_stack values (as pointers), which is required when inserting
            # new keys into upper tree levels
            return nodes_stack, indexes

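    # Sketch of the nodes_stack/indexes contract returned above (the addresses are
    # hypothetical): for a three-level tree root -> node -> leaf, a call such as
    # _find_leaf_to_insert(key) might return
    #
    #     nodes_stack = [root_start, node_start, leaf_start]
    #     indexes     = [1, 0]   # key slot followed in root and in node, respectively
    #
    # nodes_stack[-1] is the target leaf; while splitting, popping the stack walks
    # the insertion back up the tree, one parent per _insert_new_key_into_node call.
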
    def _find_leaf_with_last_key_occurence(self, key):
        if self.root_flag == 'l':
            return self.data_start
        else:
            nr_of_elements, curr_child_flag = self._read_node_nr_of_elements_and_children_flag(self.data_start)
            curr_position = self._find_last_key_occurence_in_node(
                self.data_start, key, nr_of_elements)[1]
            while curr_child_flag == 'n':
                nr_of_elements, curr_child_flag = self._read_node_nr_of_elements_and_children_flag(curr_position)
                curr_position = self._find_last_key_occurence_in_node(
                    curr_position, key, nr_of_elements)[1]
            return curr_position

    def _find_leaf_with_first_key_occurence(self, key):
        if self.root_flag == 'l':
            return self.data_start
        else:
            nr_of_elements, curr_child_flag = self._read_node_nr_of_elements_and_children_flag(self.data_start)
            curr_position = self._find_first_key_occurence_in_node(
                self.data_start, key, nr_of_elements)[1]
            while curr_child_flag == 'n':
                nr_of_elements, curr_child_flag = self._read_node_nr_of_elements_and_children_flag(curr_position)
                curr_position = self._find_first_key_occurence_in_node(
                    curr_position, key, nr_of_elements)[1]
            return curr_position

    def _find_key(self, key):
        containing_leaf_start = self._find_leaf_with_first_key_occurence(key)
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(containing_leaf_start)
        try:
            doc_id, l_key, start, size, status = self._find_key_in_leaf(
                containing_leaf_start, key, nr_of_elements)
        except ElemNotFound:
            # equal keys may continue in the next leaf, so retry there once
            if next_leaf:
                nr_of_elements = self._read_leaf_nr_of_elements(next_leaf)
            else:
                raise ElemNotFound
            doc_id, l_key, start, size, status = self._find_key_in_leaf(
                next_leaf, key, nr_of_elements)
        return doc_id, l_key, start, size, status

    def _find_key_to_update(self, key, doc_id):
        """
        Searches the tree for an element that matches not only the given key but
        also the doc_id.
        """
        containing_leaf_start = self._find_leaf_with_first_key_occurence(key)
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(containing_leaf_start)
        try:
            leaf_start, record_index, doc_id, l_key, start, size, status = self._find_key_in_leaf_for_update(
                key, doc_id, containing_leaf_start, nr_of_elements)
        except ElemNotFound:
            if next_leaf:
                nr_of_elements = self._read_leaf_nr_of_elements(next_leaf)
            else:
                raise TryReindexException()
            try:
                leaf_start, record_index, doc_id, l_key, start, size, status = self._find_key_in_leaf_for_update(
                    key, doc_id, next_leaf, nr_of_elements)
            except ElemNotFound:
                raise TryReindexException()
        return leaf_start, record_index, doc_id, l_key, start, size, status

    def update(self, doc_id, key, u_start=0, u_size=0, u_status='o'):
        containing_leaf_start, element_index, old_doc_id, old_key, old_start, old_size, old_status = self._find_key_to_update(key, doc_id)
        if u_start:
            old_start = u_start
        if u_size:
            old_size = u_size
        if u_status:
            old_status = u_status
        new_data = (old_doc_id, old_start, old_size, old_status)
        self._update_element(containing_leaf_start, element_index, new_data)

        self._find_key.delete(key)
        self._match_doc_id.delete(doc_id)
        self._find_key_in_leaf.delete(containing_leaf_start, key)
        return True

    def delete(self, doc_id, key, start=0, size=0):
        containing_leaf_start, element_index = self._find_key_to_update(
            key, doc_id)[:2]
        self._delete_element(containing_leaf_start, element_index)

        self._find_key.delete(key)
        self._match_doc_id.delete(doc_id)
        self._find_key_in_leaf.delete(containing_leaf_start, key)
        return True

    def _find_key_many(self, key, limit=1, offset=0):
        leaf_with_key = self._find_leaf_with_first_key_occurence(key)
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        try:
            leaf_with_key, key_index = self._find_index_of_first_key_equal(
                key, leaf_with_key, nr_of_elements)
            nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        except ElemNotFound:
            leaf_with_key = next_leaf
            key_index = 0
            nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        # skip `offset` matching, non-deleted records first...
        while offset:
            if key_index < nr_of_elements:
                curr_key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if key == curr_key:
                    if status != 'd':
                        offset -= 1
                    key_index += 1
                else:
                    return
            else:
                key_index = 0
                if next_leaf:
                    leaf_with_key = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return
        # ...then yield up to `limit` records, following the leaf chain
        while limit:
            if key_index < nr_of_elements:
                curr_key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if key == curr_key:
                    if status != 'd':
                        yield doc_id, start, size, status
                        limit -= 1
                    key_index += 1
                else:
                    return
            else:
                key_index = 0
                if next_leaf:
                    leaf_with_key = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return

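    # Usage sketch for the generator above, via the public get_many() wrapper
    # further down (assumes a concrete subclass implementing make_key, here a
    # hypothetical `ind`):
    #
    #     for doc_id, start, size, status in ind.get_many('some_key', limit=5, offset=2):
    #         print doc_id
    #
    # Records with status 'd' (deleted) are skipped and do not count towards
    # limit/offset; the scan follows next_leaf pointers, so duplicate keys that
    # span a leaf boundary are still found.
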
    def _find_key_smaller(self, key, limit=1, offset=0):
        leaf_with_key = self._find_leaf_with_first_key_occurence(key)
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        leaf_with_key, key_index = self._find_index_of_first_key_equal_or_smaller_key(key, leaf_with_key, nr_of_elements)
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        curr_key = self._read_single_leaf_record(leaf_with_key, key_index)[0]
        if curr_key >= key:
            key_index -= 1
        while offset:
            if key_index >= 0:
                key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if status != 'd':
                    offset -= 1
                key_index -= 1
            else:
                if prev_leaf:
                    leaf_with_key = prev_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(prev_leaf)
                    key_index = nr_of_elements - 1
                else:
                    return
        while limit:
            if key_index >= 0:
                key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if status != 'd':
                    yield doc_id, key, start, size, status
                    limit -= 1
                key_index -= 1
            else:
                if prev_leaf:
                    leaf_with_key = prev_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(prev_leaf)
                    key_index = nr_of_elements - 1
                else:
                    return

    def _find_key_equal_and_smaller(self, key, limit=1, offset=0):
        leaf_with_key = self._find_leaf_with_last_key_occurence(key)
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        try:
            leaf_with_key, key_index = self._find_index_of_last_key_equal_or_smaller_key(key, leaf_with_key, nr_of_elements)
            nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        except ElemNotFound:
            leaf_with_key = prev_leaf
            key_index = self._read_leaf_nr_of_elements_and_neighbours(
                leaf_with_key)[0]
        curr_key = self._read_single_leaf_record(leaf_with_key, key_index)[0]
        if curr_key > key:
            key_index -= 1
        while offset:
            if key_index >= 0:
                key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if status != 'd':
                    offset -= 1
                key_index -= 1
            else:
                if prev_leaf:
                    leaf_with_key = prev_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(prev_leaf)
                    key_index = nr_of_elements - 1
                else:
                    return
        while limit:
            if key_index >= 0:
                key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if status != 'd':
                    yield doc_id, key, start, size, status
                    limit -= 1
                key_index -= 1
            else:
                if prev_leaf:
                    leaf_with_key = prev_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(prev_leaf)
                    key_index = nr_of_elements - 1
                else:
                    return

    def _find_key_bigger(self, key, limit=1, offset=0):
        leaf_with_key = self._find_leaf_with_last_key_occurence(key)
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        try:
            leaf_with_key, key_index = self._find_index_of_last_key_equal_or_smaller_key(key, leaf_with_key, nr_of_elements)
            nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        except ElemNotFound:
            key_index = 0
        curr_key = self._read_single_leaf_record(leaf_with_key, key_index)[0]
        if curr_key <= key:
            key_index += 1
        while offset:
            if key_index < nr_of_elements:
                curr_key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if status != 'd':
                    offset -= 1
                key_index += 1
            else:
                key_index = 0
                if next_leaf:
                    leaf_with_key = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return
        while limit:
            if key_index < nr_of_elements:
                curr_key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if status != 'd':
                    yield doc_id, curr_key, start, size, status
                    limit -= 1
                key_index += 1
            else:
                key_index = 0
                if next_leaf:
                    leaf_with_key = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return

    def _find_key_equal_and_bigger(self, key, limit=1, offset=0):
        leaf_with_key = self._find_leaf_with_first_key_occurence(key)
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        leaf_with_key, key_index = self._find_index_of_first_key_equal_or_smaller_key(key, leaf_with_key, nr_of_elements)
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
        curr_key = self._read_single_leaf_record(leaf_with_key, key_index)[0]
        if curr_key < key:
            key_index += 1
        while offset:
            if key_index < nr_of_elements:
                curr_key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if status != 'd':
                    offset -= 1
                key_index += 1
            else:
                key_index = 0
                if next_leaf:
                    leaf_with_key = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return
        while limit:
            if key_index < nr_of_elements:
                curr_key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_with_key, key_index)
                if status != 'd':
                    yield doc_id, curr_key, start, size, status
                    limit -= 1
                key_index += 1
            else:
                key_index = 0
                if next_leaf:
                    leaf_with_key = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return

    def _find_key_between(self, start, end, limit, offset, inclusive_start, inclusive_end):
        """
        Returns a generator over all keys within the given interval.
        """
        if inclusive_start:
            leaf_with_key = self._find_leaf_with_first_key_occurence(start)
            nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
            leaf_with_key, key_index = self._find_index_of_first_key_equal_or_smaller_key(start, leaf_with_key, nr_of_elements)
            nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
            curr_key = self._read_single_leaf_record(
                leaf_with_key, key_index)[0]
            if curr_key < start:
                key_index += 1
        else:
            leaf_with_key = self._find_leaf_with_last_key_occurence(start)
            nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_with_key)
            leaf_with_key, key_index = self._find_index_of_last_key_equal_or_smaller_key(start, leaf_with_key, nr_of_elements)
            curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(leaf_with_key, key_index)
            if curr_key <= start:
                key_index += 1
        while offset:
            if key_index < nr_of_elements:
                curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(leaf_with_key, key_index)
                if curr_status != 'd':
                    offset -= 1
                key_index += 1
            else:
                key_index = 0
                if next_leaf:
                    leaf_with_key = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return
        while limit:
            if key_index < nr_of_elements:
                curr_key, curr_doc_id, curr_start, curr_size, curr_status = self._read_single_leaf_record(leaf_with_key, key_index)
                if curr_key > end or (curr_key == end and not inclusive_end):
                    return
                elif curr_status != 'd':
                    yield curr_doc_id, curr_key, curr_start, curr_size, curr_status
                    limit -= 1
                key_index += 1
            else:
                key_index = 0
                if next_leaf:
                    leaf_with_key = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return

    def get(self, key):
        return self._find_key(self.make_key(key))

    def get_many(self, key, limit=1, offset=0):
        return self._find_key_many(self.make_key(key), limit, offset)

    def get_between(self, start, end, limit=1, offset=0, inclusive_start=True, inclusive_end=True):
        if start is None:
            end = self.make_key(end)
            if inclusive_end:
                return self._find_key_equal_and_smaller(end, limit, offset)
            else:
                return self._find_key_smaller(end, limit, offset)
        elif end is None:
            start = self.make_key(start)
            if inclusive_start:
                return self._find_key_equal_and_bigger(start, limit, offset)
            else:
                return self._find_key_bigger(start, limit, offset)
        else:
            start = self.make_key(start)
            end = self.make_key(end)
            return self._find_key_between(start, end, limit, offset, inclusive_start, inclusive_end)

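    # Usage sketch (hypothetical index `ind`): get_between() above dispatches on
    # which bound is None, so each of these is a valid range scan:
    #
    #     ind.get_between('a', 'k', limit=10)                # closed interval
    #     ind.get_between(None, 'k', inclusive_end=False)    # everything < 'k'
    #     ind.get_between('a', None, offset=5)               # everything >= 'a'
    #
    # Every call returns a lazy generator of (doc_id, key, start, size, status).
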
    def all(self, limit=-1, offset=0):
        """
        Traverses the linked list of tree leaves and returns a generator over all
        elements stored in the index.
        """
        if self.root_flag == 'n':
            leaf_start = self.data_start + self.node_size
        else:
            leaf_start = self.data_start
        nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(leaf_start)
        key_index = 0
        while offset:
            if key_index < nr_of_elements:
                curr_key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_start, key_index)
                if status != 'd':
                    offset -= 1
                key_index += 1
            else:
                key_index = 0
                if next_leaf:
                    leaf_start = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return
        while limit:
            if key_index < nr_of_elements:
                curr_key, doc_id, start, size, status = self._read_single_leaf_record(
                    leaf_start, key_index)
                if status != 'd':
                    yield doc_id, curr_key, start, size, status
                    limit -= 1
                key_index += 1
            else:
                key_index = 0
                if next_leaf:
                    leaf_start = next_leaf
                    nr_of_elements, prev_leaf, next_leaf = self._read_leaf_nr_of_elements_and_neighbours(next_leaf)
                else:
                    return

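    # Usage sketch (hypothetical index `ind`): all() starts at the left-most leaf
    # and follows next_leaf pointers, so elements come back in key order:
    #
    #     for doc_id, key, start, size, status in ind.all():
    #         pass  # with the default limit=-1 the counter never hits 0,
    #               # so the whole index is yielded
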
    def make_key(self, key):
        raise NotImplementedError()

    def make_key_value(self, data):
        raise NotImplementedError()

    def _open_storage(self):
        s = globals()[self.storage_class]
        if not self.storage:
            self.storage = s(self.db_path, self.name)
        self.storage.open()

    def _create_storage(self):
        s = globals()[self.storage_class]
        if not self.storage:
            self.storage = s(self.db_path, self.name)
        self.storage.create()

    def compact(self, node_capacity=0):
        if not node_capacity:
            node_capacity = self.node_capacity

        compact_ind = self.__class__(
            self.db_path, self.name + '_compact', node_capacity=node_capacity)
        compact_ind.create_index()

        # copy every live record (and its stored value) into the fresh index
        gen = self.all()
        while True:
            try:
                doc_id, key, start, size, status = gen.next()
            except StopIteration:
                break
            self.storage._f.seek(start)
            value = self.storage._f.read(size)
            start_ = compact_ind.storage._f.tell()
            compact_ind.storage._f.write(value)
            compact_ind.insert(doc_id, key, start_, size, status)

        compact_ind.close_index()
        original_name = self.name
        # os.unlink(os.path.join(self.db_path, self.name + "_buck"))
        self.close_index()
        shutil.move(os.path.join(compact_ind.db_path, compact_ind.name + "_buck"),
                    os.path.join(self.db_path, self.name + "_buck"))
        shutil.move(os.path.join(compact_ind.db_path, compact_ind.name + "_stor"),
                    os.path.join(self.db_path, self.name + "_stor"))
        # self.name = original_name
        self.open_index()  # reload the index from the compacted files
        self.name = original_name
        self._save_params(dict(name=original_name))
        self._fix_params()
        self._clear_cache()
        return True

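    # Usage sketch: compact() rebuilds both the bucket file and the storage file
    # through a temporary '<name>_compact' index and then moves them over the
    # originals:
    #
    #     ind.compact()                  # keep the current node_capacity
    #     ind.compact(node_capacity=50)  # rebuild with bigger nodes
    #
    # Deleted records are dropped along the way, because all() skips status 'd'.
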
    def _fix_params(self):
        super(IU_TreeBasedIndex, self)._fix_params()
        self._count_props()

    def _clear_cache(self):
        self._find_key.clear()
        self._match_doc_id.clear()
        # self._read_single_leaf_record.clear()
        self._find_key_in_leaf.clear()
        self._read_single_node_key.clear()
        self._find_first_key_occurence_in_node.clear()
        self._find_last_key_occurence_in_node.clear()
        self._read_leaf_nr_of_elements.clear()
        self._read_leaf_neighbours.clear()
        self._read_leaf_nr_of_elements_and_neighbours.clear()
        self._read_node_nr_of_elements_and_children_flag.clear()

    def close_index(self):
        super(IU_TreeBasedIndex, self).close_index()
        self._clear_cache()


class IU_MultiTreeBasedIndex(IU_TreeBasedIndex):
    """
    Class that allows indexing more than one key per database record.

    It performs very well on GET/INSERT. It is not optimized for
    UPDATE operations (it always re-adds every key).
    """

    def __init__(self, *args, **kwargs):
        super(IU_MultiTreeBasedIndex, self).__init__(*args, **kwargs)

    def insert(self, doc_id, key, start, size, status='o'):
        # normalize the key(s) to a set, then insert every key separately
        if isinstance(key, (list, tuple)):
            key = set(key)
        elif not isinstance(key, set):
            key = set([key])
        ins = super(IU_MultiTreeBasedIndex, self).insert
        for curr_key in key:
            ins(doc_id, curr_key, start, size, status)
        return True

    def update(self, doc_id, key, u_start, u_size, u_status='o'):
        if isinstance(key, (list, tuple)):
            key = set(key)
        elif not isinstance(key, set):
            key = set([key])
        upd = super(IU_MultiTreeBasedIndex, self).update
        for curr_key in key:
            upd(doc_id, curr_key, u_start, u_size, u_status)

    def delete(self, doc_id, key, start=0, size=0):
        if isinstance(key, (list, tuple)):
            key = set(key)
        elif not isinstance(key, set):
            key = set([key])
        delete = super(IU_MultiTreeBasedIndex, self).delete
        for curr_key in key:
            delete(doc_id, curr_key, start, size)

    def get(self, key):
        return super(IU_MultiTreeBasedIndex, self).get(key)

    def make_key_value(self, data):
        raise NotImplementedError()


# Classes for public use, defined this way because of the generation of
# static index files (the _index directory).


class TreeBasedIndex(IU_TreeBasedIndex):
    pass


class MultiTreeBasedIndex(IU_MultiTreeBasedIndex):
    """
    Allows indexing more than one key per record (e.g. prefix/infix/suffix
    search mechanisms).

    This class is designed to be used in custom indexes.
    """
    pass

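# A minimal custom-index sketch built on MultiTreeBasedIndex. The class name, the
# 8-byte key length and the suffix generation below are illustrative assumptions,
# not part of this module:
#
#     class SimpleSuffixIndex(MultiTreeBasedIndex):
#
#         def __init__(self, *args, **kwargs):
#             kwargs['key_format'] = '8s'
#             super(SimpleSuffixIndex, self).__init__(*args, **kwargs)
#
#         def make_key_value(self, data):
#             name = data['name']
#             # index every suffix of the name, padded/truncated to 8 bytes
#             return set(name[i:].ljust(8, '_')[:8] for i in xrange(len(name))), None
#
#         def make_key(self, key):
#             return key.ljust(8, '_')[:8]
#
# Because the key side of make_key_value is a set, insert() above writes one tree
# entry per suffix, which is what enables suffix lookups via get()/get_many().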