You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 

645 lines
25 KiB

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2011-2013 Codernity (http://codernity.com)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import tokenize
import token
import uuid
class IndexCreatorException(Exception):
def __init__(self, ex, line=None):
self.ex = ex
self.line = line
def __str__(self):
if self.line:
return repr(self.ex + "(in line: %d)" % self.line)
return repr(self.ex)
class IndexCreatorFunctionException(IndexCreatorException):
pass
class IndexCreatorValueException(IndexCreatorException):
pass
class Parser(object):
def __init__(self):
pass
def parse(self, data, name=None):
if not name:
self.name = "_" + uuid.uuid4().hex
else:
self.name = name
self.ind = 0
self.stage = 0
self.logic = ['and', 'or', 'in']
self.logic2 = ['&', '|']
self.allowed_props = {'TreeBasedIndex': ['type', 'name', 'key_format', 'node_capacity', 'pointer_format', 'meta_format'],
'HashIndex': ['type', 'name', 'key_format', 'hash_lim', 'entry_line_format'],
'MultiHashIndex': ['type', 'name', 'key_format', 'hash_lim', 'entry_line_format'],
'MultiTreeBasedIndex': ['type', 'name', 'key_format', 'node_capacity', 'pointer_format', 'meta_format']
}
self.funcs = {'md5': (['md5'], ['.digest()']),
'len': (['len'], []),
'str': (['str'], []),
'fix_r': (['self.fix_r'], []),
'prefix': (['self.prefix'], []),
'infix': (['self.infix'], []),
'suffix': (['self.suffix'], [])
}
self.handle_int_imports = {'infix': "from itertools import izip\n"}
self.funcs_with_body = {'fix_r':
(""" def fix_r(self,s,l):
e = len(s)
if e == l:
return s
elif e > l:
return s[:l]
else:
return s.rjust(l,'_')\n""", False),
'prefix':
(""" def prefix(self,s,m,l,f):
t = len(s)
if m < 1:
m = 1
o = set()
if t > l:
s = s[:l]
t = l
while m <= t:
o.add(s.rjust(f,'_'))
s = s[:-1]
t -= 1
return o\n""", False),
'suffix':
(""" def suffix(self,s,m,l,f):
t = len(s)
if m < 1:
m = 1
o = set()
if t > l:
s = s[t-l:]
t = len(s)
while m <= t:
o.add(s.rjust(f,'_'))
s = s[1:]
t -= 1
return o\n""", False),
'infix':
(""" def infix(self,s,m,l,f):
t = len(s)
o = set()
for x in xrange(m - 1, l):
t = (s, )
for y in xrange(0, x):
t += (s[y + 1:],)
o.update(set(''.join(x).rjust(f, '_').lower() for x in izip(*t)))
return o\n""", False)}
self.none = ['None', 'none', 'null']
self.props_assign = ['=', ':']
self.all_adj_num_comp = {token.NUMBER: (
token.NUMBER, token.NAME, '-', '('),
token.NAME: (token.NUMBER, token.NAME, '-', '('),
')': (token.NUMBER, token.NAME, '-', '(')
}
self.all_adj_num_op = {token.NUMBER: (token.NUMBER, token.NAME, '('),
token.NAME: (token.NUMBER, token.NAME, '('),
')': (token.NUMBER, token.NAME, '(')
}
self.allowed_adjacent = {
"<=": self.all_adj_num_comp,
">=": self.all_adj_num_comp,
">": self.all_adj_num_comp,
"<": self.all_adj_num_comp,
"==": {token.NUMBER: (token.NUMBER, token.NAME, '('),
token.NAME: (token.NUMBER, token.NAME, token.STRING, '('),
token.STRING: (token.NAME, token.STRING, '('),
')': (token.NUMBER, token.NAME, token.STRING, '('),
']': (token.NUMBER, token.NAME, token.STRING, '(')
},
"+": {token.NUMBER: (token.NUMBER, token.NAME, '('),
token.NAME: (token.NUMBER, token.NAME, token.STRING, '('),
token.STRING: (token.NAME, token.STRING, '('),
')': (token.NUMBER, token.NAME, token.STRING, '('),
']': (token.NUMBER, token.NAME, token.STRING, '(')
},
"-": {token.NUMBER: (token.NUMBER, token.NAME, '('),
token.NAME: (token.NUMBER, token.NAME, '('),
')': (token.NUMBER, token.NAME, '('),
'<': (token.NUMBER, token.NAME, '('),
'>': (token.NUMBER, token.NAME, '('),
'<=': (token.NUMBER, token.NAME, '('),
'>=': (token.NUMBER, token.NAME, '('),
'==': (token.NUMBER, token.NAME, '('),
']': (token.NUMBER, token.NAME, '(')
},
"*": self.all_adj_num_op,
"/": self.all_adj_num_op,
"%": self.all_adj_num_op,
",": {token.NUMBER: (token.NUMBER, token.NAME, token.STRING, '{', '[', '('),
token.NAME: (token.NUMBER, token.NAME, token.STRING, '(', '{', '['),
token.STRING: (token.NAME, token.STRING, token.NUMBER, '(', '{', '['),
')': (token.NUMBER, token.NAME, token.STRING, '(', '{', '['),
']': (token.NUMBER, token.NAME, token.STRING, '(', '{', '['),
'}': (token.NUMBER, token.NAME, token.STRING, '(', '{', '[')
}
}
def is_num(s):
m = re.search('[^0-9*()+\-\s/]+', s)
return not m
def is_string(s):
m = re.search('\s*(?P<a>[\'\"]+).*?(?P=a)\s*', s)
return m
data = re.split('make_key_value\:', data)
if len(data) < 2:
raise IndexCreatorFunctionException(
"Couldn't find a definition of make_key_value function!\n")
spl1 = re.split('make_key\:', data[0])
spl2 = re.split('make_key\:', data[1])
self.funcs_rev = False
if len(spl1) > 1:
data = [spl1[0]] + [data[1]] + [spl1[1]]
self.funcs_rev = True
elif len(spl2) > 1:
data = [data[0]] + spl2
else:
data.append("key")
if data[1] == re.search('\s*', data[1], re.S | re.M).group(0):
raise IndexCreatorFunctionException("Empty function body ",
len(re.split('\n', data[0])) + (len(re.split('\n', data[2])) if self.funcs_rev else 1) - 1)
if data[2] == re.search('\s*', data[2], re.S | re.M).group(0):
raise IndexCreatorFunctionException("Empty function body ",
len(re.split('\n', data[0])) + (1 if self.funcs_rev else len(re.split('\n', data[1]))) - 1)
if data[0] == re.search('\s*', data[0], re.S | re.M).group(0):
raise IndexCreatorValueException("You didn't set any properity or you set them not at the begining of the code\n")
data = [re.split(
'\n', data[0]), re.split('\n', data[1]), re.split('\n', data[2])]
self.cnt_lines = (len(data[0]), len(data[1]), len(data[2]))
ind = 0
self.predata = data
self.data = [[], [], []]
for i, v in enumerate(self.predata[0]):
for k, w in enumerate(self.predata[0][i]):
if self.predata[0][i][k] in self.props_assign:
if not is_num(self.predata[0][i][k + 1:]) and self.predata[0][i].strip()[:4] != 'type' and self.predata[0][i].strip()[:4] != 'name':
s = self.predata[0][i][k + 1:]
self.predata[0][i] = self.predata[0][i][:k + 1]
m = re.search('\s+', s.strip())
if not is_string(s) and not m:
s = "'" + s.strip() + "'"
self.predata[0][i] += s
break
for n, i in enumerate(self.predata):
for k in i:
k = k.strip()
if k:
self.data[ind].append(k)
self.check_enclosures(k, n)
ind += 1
return self.parse_ex()
def readline(self, stage):
def foo():
if len(self.data[stage]) <= self.ind:
self.ind = 0
return ""
else:
self.ind += 1
return self.data[stage][self.ind - 1]
return foo
def add(self, l, i):
def add_aux(*args):
# print args,self.ind
if len(l[i]) < self.ind:
l[i].append([])
l[i][self.ind - 1].append(args)
return add_aux
def parse_ex(self):
self.index_name = ""
self.index_type = ""
self.curLine = -1
self.con = -1
self.brackets = -1
self.curFunc = None
self.colons = 0
self.line_cons = ([], [], [])
self.pre_tokens = ([], [], [])
self.known_dicts_in_mkv = []
self.prop_name = True
self.prop_assign = False
self.is_one_arg_enough = False
self.funcs_stack = []
self.last_line = [-1, -1, -1]
self.props_set = []
self.custom_header = set()
self.tokens = []
self.tokens_head = ['# %s\n' % self.name, 'class %s(' % self.name, '):\n', ' def __init__(self, *args, **kwargs): ']
for i in range(3):
tokenize.tokenize(self.readline(i), self.add(self.pre_tokens, i))
# tokenize treats some keyword not in the right way, thats why we
# have to change some of them
for nk, k in enumerate(self.pre_tokens[i]):
for na, a in enumerate(k):
if a[0] == token.NAME and a[1] in self.logic:
self.pre_tokens[i][nk][
na] = (token.OP, a[1], a[2], a[3], a[4])
for i in self.pre_tokens[1]:
self.line_cons[1].append(self.check_colons(i, 1))
self.check_adjacents(i, 1)
if self.check_for_2nd_arg(i) == -1 and not self.is_one_arg_enough:
raise IndexCreatorValueException("No 2nd value to return (did u forget about ',None'?", self.cnt_line_nr(i[0][4], 1))
self.is_one_arg_enough = False
for i in self.pre_tokens[2]:
self.line_cons[2].append(self.check_colons(i, 2))
self.check_adjacents(i, 2)
for i in self.pre_tokens[0]:
self.handle_prop_line(i)
self.cur_brackets = 0
self.tokens += ['\n super(%s, self).__init__(*args, **kwargs)\n def make_key_value(self, data): ' % self.name]
for i in self.pre_tokens[1]:
for k in i:
self.handle_make_value(*k)
self.curLine = -1
self.con = -1
self.cur_brackets = 0
self.tokens += ['\n def make_key(self, key):']
for i in self.pre_tokens[2]:
for k in i:
self.handle_make_key(*k)
if self.index_type == "":
raise IndexCreatorValueException("Missing index type definition\n")
if self.index_name == "":
raise IndexCreatorValueException("Missing index name\n")
self.tokens_head[0] = "# " + self.index_name + "\n" + \
self.tokens_head[0]
for i in self.funcs_with_body:
if self.funcs_with_body[i][1]:
self.tokens_head.insert(4, self.funcs_with_body[i][0])
if None in self.custom_header:
self.custom_header.remove(None)
if self.custom_header:
s = ' custom_header = """'
for i in self.custom_header:
s += i
s += '"""\n'
self.tokens_head.insert(4, s)
if self.index_type in self.allowed_props:
for i in self.props_set:
if i not in self.allowed_props[self.index_type]:
raise IndexCreatorValueException("Properity %s is not allowed for index type: %s" % (i, self.index_type))
# print "".join(self.tokens_head)
# print "----------"
# print (" ".join(self.tokens))
return "".join(self.custom_header), "".join(self.tokens_head) + (" ".join(self.tokens))
# has to be run BEFORE tokenize
def check_enclosures(self, d, st):
encs = []
contr = {'(': ')', '{': '}', '[': ']', "'": "'", '"': '"'}
ends = [')', '}', ']', "'", '"']
for i in d:
if len(encs) > 0 and encs[-1] in ['"', "'"]:
if encs[-1] == i:
del encs[-1]
elif i in contr:
encs += [i]
elif i in ends:
if len(encs) < 1 or contr[encs[-1]] != i:
raise IndexCreatorValueException("Missing opening enclosure for \'%s\'" % i, self.cnt_line_nr(d, st))
del encs[-1]
if len(encs) > 0:
raise IndexCreatorValueException("Missing closing enclosure for \'%s\'" % encs[0], self.cnt_line_nr(d, st))
def check_adjacents(self, d, st):
def std_check(d, n):
if n == 0:
prev = -1
else:
prev = d[n - 1][1] if d[n - 1][0] == token.OP else d[n - 1][0]
cur = d[n][1] if d[n][0] == token.OP else d[n][0]
# there always is an endmarker at the end, but this is a precaution
if n + 2 > len(d):
nex = -1
else:
nex = d[n + 1][1] if d[n + 1][0] == token.OP else d[n + 1][0]
if prev not in self.allowed_adjacent[cur]:
raise IndexCreatorValueException("Wrong left value of the %s" % cur, self.cnt_line_nr(line, st))
# there is an assumption that whole data always ends with 0 marker, the idea prolly needs a rewritting to allow more whitespaces
# between tokens, so it will be handled anyway
elif nex not in self.allowed_adjacent[cur][prev]:
raise IndexCreatorValueException("Wrong right value of the %s" % cur, self.cnt_line_nr(line, st))
for n, (t, i, _, _, line) in enumerate(d):
if t == token.NAME or t == token.STRING:
if n + 1 < len(d) and d[n + 1][0] in [token.NAME, token.STRING]:
raise IndexCreatorValueException("Did you forget about an operator in between?", self.cnt_line_nr(line, st))
elif i in self.allowed_adjacent:
std_check(d, n)
def check_colons(self, d, st):
cnt = 0
br = 0
def check_ret_args_nr(a, s):
c_b_cnt = 0
s_b_cnt = 0
n_b_cnt = 0
comas_cnt = 0
for _, i, _, _, line in a:
if c_b_cnt == n_b_cnt == s_b_cnt == 0:
if i == ',':
comas_cnt += 1
if (s == 1 and comas_cnt > 1) or (s == 2 and comas_cnt > 0):
raise IndexCreatorFunctionException("Too much arguments to return", self.cnt_line_nr(line, st))
if s == 0 and comas_cnt > 0:
raise IndexCreatorValueException("A coma here doesn't make any sense", self.cnt_line_nr(line, st))
elif i == ':':
if s == 0:
raise IndexCreatorValueException("A colon here doesn't make any sense", self.cnt_line_nr(line, st))
raise IndexCreatorFunctionException("Two colons don't make any sense", self.cnt_line_nr(line, st))
if i == '{':
c_b_cnt += 1
elif i == '}':
c_b_cnt -= 1
elif i == '(':
n_b_cnt += 1
elif i == ')':
n_b_cnt -= 1
elif i == '[':
s_b_cnt += 1
elif i == ']':
s_b_cnt -= 1
def check_if_empty(a):
for i in a:
if i not in [token.NEWLINE, token.INDENT, token.ENDMARKER]:
return False
return True
if st == 0:
check_ret_args_nr(d, st)
return
for n, i in enumerate(d):
if i[1] == ':':
if br == 0:
if len(d) < n or check_if_empty(d[n + 1:]):
raise IndexCreatorValueException(
"Empty return value", self.cnt_line_nr(i[4], st))
elif len(d) >= n:
check_ret_args_nr(d[n + 1:], st)
return cnt
else:
cnt += 1
elif i[1] == '{':
br += 1
elif i[1] == '}':
br -= 1
check_ret_args_nr(d, st)
return -1
def check_for_2nd_arg(self, d):
c_b_cnt = 0 # curly brackets counter '{}'
s_b_cnt = 0 # square brackets counter '[]'
n_b_cnt = 0 # normal brackets counter '()'
def check_2nd_arg(d, ind):
d = d[ind[0]:]
for t, i, (n, r), _, line in d:
if i == '{' or i is None:
return 0
elif t == token.NAME:
self.known_dicts_in_mkv.append((i, (n, r)))
return 0
elif t == token.STRING or t == token.NUMBER:
raise IndexCreatorValueException("Second return value of make_key_value function has to be a dictionary!", self.cnt_line_nr(line, 1))
for ind in enumerate(d):
t, i, _, _, _ = ind[1]
if s_b_cnt == n_b_cnt == c_b_cnt == 0:
if i == ',':
return check_2nd_arg(d, ind)
elif (t == token.NAME and i not in self.funcs) or i == '{':
self.is_one_arg_enough = True
if i == '{':
c_b_cnt += 1
self.is_one_arg_enough = True
elif i == '}':
c_b_cnt -= 1
elif i == '(':
n_b_cnt += 1
elif i == ')':
n_b_cnt -= 1
elif i == '[':
s_b_cnt += 1
elif i == ']':
s_b_cnt -= 1
return -1
def cnt_line_nr(self, l, stage):
nr = -1
for n, i in enumerate(self.predata[stage]):
# print i,"|||",i.strip(),"|||",l
if l == i.strip():
nr = n
if nr == -1:
return -1
if stage == 0:
return nr + 1
elif stage == 1:
return nr + self.cnt_lines[0] + (self.cnt_lines[2] - 1 if self.funcs_rev else 0)
elif stage == 2:
return nr + self.cnt_lines[0] + (self.cnt_lines[1] - 1 if not self.funcs_rev else 0)
return -1
def handle_prop_line(self, d):
d_len = len(d)
if d[d_len - 1][0] == token.ENDMARKER:
d_len -= 1
if d_len < 3:
raise IndexCreatorValueException("Can't handle properity assingment ", self.cnt_line_nr(d[0][4], 0))
if not d[1][1] in self.props_assign:
raise IndexCreatorValueException(
"Did you forget : or =?", self.cnt_line_nr(d[0][4], 0))
if d[0][0] == token.NAME or d[0][0] == token.STRING:
if d[0][1] in self.props_set:
raise IndexCreatorValueException("Properity %s is set more than once" % d[0][1], self.cnt_line_nr(d[0][4], 0))
self.props_set += [d[0][1]]
if d[0][1] == "type" or d[0][1] == "name":
t, tk, _, _, line = d[2]
if d_len > 3:
raise IndexCreatorValueException(
"Wrong value to assign", self.cnt_line_nr(line, 0))
if t == token.STRING:
m = re.search('\s*(?P<a>[\'\"]+)(.*?)(?P=a)\s*', tk)
if m:
tk = m.groups()[1]
elif t != token.NAME:
raise IndexCreatorValueException(
"Wrong value to assign", self.cnt_line_nr(line, 0))
if d[0][1] == "type":
if d[2][1] == "TreeBasedIndex":
self.custom_header.add("from CodernityDB3.tree_index import TreeBasedIndex\n")
elif d[2][1] == "MultiTreeBasedIndex":
self.custom_header.add("from CodernityDB3.tree_index import MultiTreeBasedIndex\n")
elif d[2][1] == "MultiHashIndex":
self.custom_header.add("from CodernityDB3.hash_index import MultiHashIndex\n")
self.tokens_head.insert(2, tk)
self.index_type = tk
else:
self.index_name = tk
return
else:
self.tokens += ['\n kwargs["' + d[0][1] + '"]']
else:
raise IndexCreatorValueException("Can't handle properity assingment ", self.cnt_line_nr(d[0][4], 0))
self.tokens += ['=']
self.check_adjacents(d[2:], 0)
self.check_colons(d[2:], 0)
for i in d[2:]:
self.tokens += [i[1]]
def generate_func(self, t, tk, pos_start, pos_end, line, hdata, stage):
if self.last_line[stage] != -1 and pos_start[0] > self.last_line[stage] and line != '':
raise IndexCreatorFunctionException("This line will never be executed!", self.cnt_line_nr(line, stage))
if t == 0:
return
if pos_start[1] == 0:
if self.line_cons[stage][pos_start[0] - 1] == -1:
self.tokens += ['\n return']
self.last_line[stage] = pos_start[0]
else:
self.tokens += ['\n if']
elif tk == ':' and self.line_cons[stage][pos_start[0] - 1] > -1:
if self.line_cons[stage][pos_start[0] - 1] == 0:
self.tokens += [':\n return']
return
self.line_cons[stage][pos_start[0] - 1] -= 1
if tk in self.logic2:
# print tk
if line[pos_start[1] - 1] != tk and line[pos_start[1] + 1] != tk:
self.tokens += [tk]
if line[pos_start[1] - 1] != tk and line[pos_start[1] + 1] == tk:
if tk == '&':
self.tokens += ['and']
else:
self.tokens += ['or']
return
if self.brackets != 0:
def search_through_known_dicts(a):
for i, (n, r) in self.known_dicts_in_mkv:
if i == tk and r > pos_start[1] and n == pos_start[0] and hdata == 'data':
return True
return False
if t == token.NAME and len(self.funcs_stack) > 0 and self.funcs_stack[-1][0] == 'md5' and search_through_known_dicts(tk):
raise IndexCreatorValueException("Second value returned by make_key_value for sure isn't a dictionary ", self.cnt_line_nr(line, 1))
if tk == ')':
self.cur_brackets -= 1
if len(self.funcs_stack) > 0 and self.cur_brackets == self.funcs_stack[-1][1]:
self.tokens += [tk]
self.tokens += self.funcs[self.funcs_stack[-1][0]][1]
del self.funcs_stack[-1]
return
if tk == '(':
self.cur_brackets += 1
if tk in self.none:
self.tokens += ['None']
return
if t == token.NAME and tk not in self.logic and tk != hdata:
if tk not in self.funcs:
self.tokens += [hdata + '["' + tk + '"]']
else:
self.tokens += self.funcs[tk][0]
if tk in self.funcs_with_body:
self.funcs_with_body[tk] = (
self.funcs_with_body[tk][0], True)
self.custom_header.add(self.handle_int_imports.get(tk))
self.funcs_stack += [(tk, self.cur_brackets)]
else:
self.tokens += [tk]
def handle_make_value(self, t, tk, pos_start, pos_end, line):
self.generate_func(t, tk, pos_start, pos_end, line, 'data', 1)
def handle_make_key(self, t, tk, pos_start, pos_end, line):
self.generate_func(t, tk, pos_start, pos_end, line, 'key', 2)