schema: support encoding=None connections

Totktonada · Totktonada · commit e3b7be1dd0b2 · 2020-08-28T18:01:16.000+03:00
Several different problems are fixed here, but all have the same root. When a connection encoding is None (it is the default on Python 2 and may be set explicitly on Python 3), all mp_str values are decoded into bytes, not Unicode strings (note that bytes is an alias for str in Python 2). But the database schema parsing code assumes that _vspace / _vindex values are Unicode strings. The resolved problems are the following: 1. The default encoding in the bytes#decode() method is 'ascii', however names in tarantool can contain symbols beyond the ASCII symbol table. Set 'utf-8' for decoding names. 2. Convert all binary values into Unicode strings before parsing or storing them. This allows correct subsequent accesses to the local schema representation. 3. Convert binary parameters like a space, index or field name into Unicode strings when the schema is accessed, so as not to trigger redundant schema refetching. Those problems are briefly mentioned in [1]. Tested manually with Python 2 and Python 3: my testing tarantool instance has a space with the name '©' and after the changes I'm able to connect to it when the connection encoding is set to None. I also verified that the schema is not fetched each time I do <connection>.select('©') in Python 2 (where such a string literal is str / bytes, not a Unicode string). Relevant test cases are added in the next commits. [1]: #105 (comment)
diff --git a/tarantool/schema.py b/tarantool/schema.py
@@ -16,26 +16,63 @@
 import tarantool.const as const
 
 
+def to_unicode(s):  # bytes -> Unicode string; other values pass through
+    if isinstance(s, bytes):
+        return s.decode(encoding='utf-8')  # names may be non-ASCII
+    return s
+
+
+def to_unicode_recursive(x, max_depth):
+    """Same as to_unicode(), but traverses over dictionaries,
+       lists and tuples recursively.
+
+       x: value to convert
+
+       max_depth: 1 accepts a scalar, 2 accepts a list of scalars,
+       etc.
+    """
+    assert max_depth > 0  # trips when nesting goes deeper than allowed
+
+    if isinstance(x, dict):
+        res = dict()  # a new dict is built; x itself is not modified
+        for key, val in x.items():
+            key = to_unicode_recursive(key, max_depth - 1)  # keys too
+            val = to_unicode_recursive(val, max_depth - 1)
+            res[key] = val
+        return res
+
+    if isinstance(x, list) or isinstance(x, tuple):
+        res = []
+        for val in x:
+            val = to_unicode_recursive(val, max_depth - 1)
+            res.append(val)
+        if isinstance(x, tuple):  # keep the container type of the input
+            return tuple(res)
+        return res
+
+    return to_unicode(x)  # not a dict/list/tuple: convert as a scalar
+
+
 class SchemaIndex(object):
     def __init__(self, index_row, space):
         self.iid = index_row[1]
         self.name = index_row[2]
-        if isinstance(self.name, bytes):
-            self.name = self.name.decode()
+        self.name = to_unicode(index_row[2])
         self.index = index_row[3]
         self.unique = index_row[4]
         self.parts = []
-        if isinstance(index_row[5], (list, tuple)):
-            for val in index_row[5]:
+        parts_raw = to_unicode_recursive(index_row[5], 3)
+        if isinstance(parts_raw, (list, tuple)):
+            for val in parts_raw:
                 if isinstance(val, dict):
                     self.parts.append((val['field'], val['type']))
                 else:
                     self.parts.append((val[0], val[1]))
         else:
-            for i in range(index_row[5]):
+            for i in range(parts_raw):
                 self.parts.append((
-                    index_row[5 + 1 + i * 2],
-                    index_row[5 + 2 + i * 2]
+                    to_unicode(index_row[5 + 1 + i * 2]),
+                    to_unicode(index_row[5 + 2 + i * 2])
                 ))
         self.space = space
         self.space.indexes[self.iid] = self
@@ -52,16 +89,15 @@ class SchemaSpace(object):
     def __init__(self, space_row, schema):
         self.sid = space_row[0]
         self.arity = space_row[1]
-        self.name = space_row[2]
-        if isinstance(self.name, bytes):
-            self.name = self.name.decode()
+        self.name = to_unicode(space_row[2])
         self.indexes = {}
         self.schema = schema
         self.schema[self.sid] = self
         if self.name:
             self.schema[self.name] = self
         self.format = dict()
-        for part_id, part in enumerate(space_row[6]):
+        format_raw = to_unicode_recursive(space_row[6], 3)
+        for part_id, part in enumerate(format_raw):
             part['id'] = part_id
             self.format[part['name']] = part
             self.format[part_id     ] = part
@@ -78,6 +114,8 @@ def __init__(self, con):
         self.con = con
 
     def get_space(self, space):
+        space = to_unicode(space)
+
         try:
             return self.schema[space]
         except KeyError:
@@ -135,6 +173,9 @@ def fetch_space_all(self):
             SchemaSpace(row, self.schema)
 
     def get_index(self, space, index):
+        space = to_unicode(space)
+        index = to_unicode(index)
+
         _space = self.get_space(space)
         try:
             return _space.indexes[index]
@@ -203,6 +244,9 @@ def fetch_index_from(self, space, index):
         return index_row
 
     def get_field(self, space, field):
+        space = to_unicode(space)
+        field = to_unicode(field)
+
         _space = self.get_space(space)
         try:
             return _space.format[field]