First crack at serialization with backrefs.
richardkiss committed May 20, 2022
1 parent fc73cd9 commit 744fd33
Showing 6 changed files with 395 additions and 20 deletions.
4 changes: 2 additions & 2 deletions clvm/SExp.py
@@ -167,9 +167,9 @@ def nullp(self):
     def as_int(self):
         return int_from_bytes(self.atom)
 
-    def as_bin(self):
+    def as_bin(self, /, allow_backrefs=False):
         f = io.BytesIO()
-        sexp_to_stream(self, f)
+        sexp_to_stream(self, f, allow_backrefs=allow_backrefs)
         return f.getvalue()
 
     @classmethod
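The new keyword argument only changes how the tree is written out. A minimal usage sketch, assuming the library's existing `SExp.to` constructor; whether the backref form actually ends up smaller depends on how much repetition the tree contains:

from clvm import SExp

# a tree that contains the same subtree twice, so a back-reference can pay off
shared = SExp.to([1, 2, 3])
obj = SExp.to((shared, shared))

plain = obj.as_bin()                       # classic serialization
compact = obj.as_bin(allow_backrefs=True)  # may emit 0xfe back-references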
76 changes: 76 additions & 0 deletions clvm/object_cache.py
@@ -0,0 +1,76 @@
import hashlib


class ObjectCache:
    """
    `ObjectCache` provides a way to calculate and cache values for each node
    in a clvm object tree. It can be used to calculate the sha256 tree hash
    for an object and save the hash for all the child objects for building
    usage tables, for example.
    """
    def __init__(self, f):
        self.f = f
        self.lookup = dict()

    def get(self, obj):
        obj_id = id(obj)
        if obj_id not in self.lookup:
            obj_list = [obj]
            while obj_list:
                node = obj_list.pop()
                node_id = id(node)
                if node_id not in self.lookup:
                    v = self.f(self, node)
                    if v is None:
                        if node.pair is None:
                            raise ValueError("f returned None for atom", node)
                        obj_list.append(node)
                        obj_list.append(node.pair[0])
                        obj_list.append(node.pair[1])
                    else:
                        self.lookup[node_id] = (v, node)
        return self.lookup[obj_id][0]

    def contains(self, obj):
        return id(obj) in self.lookup


def treehash(cache, obj):
    """
    This function can be fed to `ObjectCache` to calculate the sha256 tree
    hash for all objects in a tree.
    """
    if obj.pair:
        left, right = obj.pair
        if cache.contains(left) and cache.contains(right):
            left_hash = cache.get(left)
            right_hash = cache.get(right)
            return hashlib.sha256(b"\2" + left_hash + right_hash).digest()
        return None
    return hashlib.sha256(b"\1" + obj.atom).digest()


def serialized_length(cache, obj):
    """
    This function can be fed to `ObjectCache` to calculate the serialized
    length for all objects in a tree.
    """
    if obj.pair:
        left, right = obj.pair
        if cache.contains(left) and cache.contains(right):
            left_length = cache.get(left)
            right_length = cache.get(right)
            return 1 + left_length + right_length
        return None
    lb = len(obj.atom)
    if lb == 0 or (lb == 1 and obj.atom[0] < 128):
        return 1
    if lb < 0x40:
        return 1 + lb
    if lb < 0x2000:
        return 2 + lb
    if lb < 0x100000:
        return 3 + lb
    if lb < 0x8000000:
        return 4 + lb
    return 5 + lb
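Both helpers above are meant to be passed to `ObjectCache` as the callback `f`. A short sketch of the intended usage; the `SExp.to` value here is just an illustrative clvm tree, and any node exposing `.atom` and `.pair` should work:

from clvm import SExp
from clvm.object_cache import ObjectCache, treehash, serialized_length

obj = SExp.to([1, [2, 3], [2, 3]])

hash_cache = ObjectCache(treehash)
length_cache = ObjectCache(serialized_length)

print(hash_cache.get(obj).hex())  # sha256 tree hash of the whole tree
print(length_cache.get(obj))      # byte count of the classic (no-backref) serialization

The callback returns `None` when a pair's children are not cached yet; `get` then pushes the children and retries, so the whole tree is processed iteratively rather than recursively.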
148 changes: 148 additions & 0 deletions clvm/read_cache_lookup.py
@@ -0,0 +1,148 @@
from collections import Counter

import hashlib


def hash_blobs(*blobs):
    s = hashlib.sha256()
    for blob in blobs:
        s.update(blob)
    return s.digest()


class ReadCacheLookup:
    """
    When deserializing a clvm object, a stack of deserialized child objects
    is created, which can be used with back-references. A `ReadCacheLookup` keeps
    track of the state of this stack and all child objects under each root
    node in the stack so that we can quickly determine if a relevant
    back-reference is available.

    In other words, if we've already serialized an object with tree hash T,
    and we encounter another object with that tree hash, we don't re-serialize
    it, but rather include a back-reference to it. This data structure lets
    us quickly determine which back-reference has the shortest path.

    Note that there is a counter. This is because the stack contains some
    child objects that are transient and no longer appear in the stack
    at later times in the parsing. We don't want to waste time looking for
    these objects that no longer exist, so we reference-count them.
    """

    def __init__(self):
        """
        Create a new `ReadCacheLookup` object with just the null terminator
        (i.e. an empty list of objects).
        """
        self.root_hash = hash_blobs(b"\1")
        self.read_stack = []
        self.count = Counter()
        self.parent_lookup = {}

    def push(self, obj_hash):
        """
        Note that an object with the given hash has just been pushed to
        the read stack, and update the lookups as appropriate.
        """
        # we add two new entries: the new root of the tree, and this object (by id)
        # new_root: (obj_hash, old_root)
        new_root_hash = hash_blobs(b"\2", obj_hash, self.root_hash)

        self.read_stack.append((obj_hash, self.root_hash))

        self.count.update([obj_hash, new_root_hash])

        new_parent_to_old_root = (new_root_hash, 0)
        self.parent_lookup.setdefault(obj_hash, list()).append(new_parent_to_old_root)

        new_parent_to_id = (new_root_hash, 1)
        self.parent_lookup.setdefault(self.root_hash, list()).append(new_parent_to_id)
        self.root_hash = new_root_hash

    def pop(self):
        """
        Note that the top object has just been popped from the read stack.
        Return the 2-tuple of the child hashes.
        """
        item = self.read_stack.pop()
        self.count[item[0]] -= 1
        self.count[self.root_hash] -= 1
        self.root_hash = item[1]
        return item

    def pop2_and_cons(self):
        """
        Note that a "pop-and-cons" operation has just happened. We remove
        two objects, cons them together, and push the cons, updating
        the internal look-ups as necessary.
        """
        # we remove two items: the right side of each left/right pair
        right = self.pop()
        left = self.pop()

        self.count.update([left[0], right[0]])

        new_root_hash = hash_blobs(b"\2", left[0], right[0])

        self.parent_lookup.setdefault(left[0], list()).append((new_root_hash, 0))
        self.parent_lookup.setdefault(right[0], list()).append((new_root_hash, 1))
        self.push(new_root_hash)

    def find_path(self, obj, serialized_length):
        """
        This function looks for a path from the root to a child node with a given hash
        by using the read cache.
        """
        if serialized_length < 3:
            return None

        seen_ids = set()

        # budget: 1 byte goes to the 0xfe marker, and we want at least 1 byte of savings
        max_bytes_for_path_encoding = serialized_length - 2

        max_path_length = max_bytes_for_path_encoding * 8 - 1
        seen_ids.add(obj)

        partial_paths = [(obj, [])]

        while partial_paths:
            new_partial_paths = []
            for (node, path) in partial_paths:
                if node == self.root_hash:
                    path.reverse()
                    return path_to_bytes(path)

                parents = self.parent_lookup.get(node)

                if parents:
                    for (parent, direction) in parents:
                        if self.count[parent] > 0 and parent not in seen_ids:
                            new_path = list(path)
                            new_path.append(direction)
                            if len(new_path) > max_path_length:
                                return None
                            new_partial_paths.append((parent, new_path))
                        seen_ids.add(parent)
            partial_paths = new_partial_paths
        return None


def path_to_bytes(path):
    """
    Convert a list of 0/1 values to a path expected by clvm.
    """
    byte_count = (len(path) + 1 + 7) >> 3
    v = bytearray(byte_count)
    index = byte_count - 1
    mask = 1
    for p in path:
        if p:
            v[index] |= mask
        if mask == 0x80:
            index -= 1
            mask = 1
        else:
            mask <<= 1
    v[index] |= mask
    return v
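To make these pieces concrete, here is a small worked sketch. The push calls mimic what a deserializer would do after reading the same atom twice (pushing each object's sha256 tree hash), and the path bytes follow clvm's environment-path convention: 0 means left/first, 1 means right/rest, and a leading 1 bit marks where the path starts.

from clvm.read_cache_lookup import ReadCacheLookup, hash_blobs, path_to_bytes

# path encoding: the list is ordered root-to-target
assert path_to_bytes([]) == bytes([0b1])        # the root itself (path 1)
assert path_to_bytes([0]) == bytes([0b10])      # left child (path 2)
assert path_to_bytes([0, 1]) == bytes([0b110])  # left, then right (path 6)

atom = b"a-longish-atom-worth-referencing"
atom_hash = hash_blobs(b"\1", atom)  # sha256 tree hash of the atom

rcl = ReadCacheLookup()
rcl.push(atom_hash)  # first copy of the atom is read
rcl.push(atom_hash)  # a second copy appears on the read stack

# the nearer copy sits one step left of the stack root, so the path is 2;
# serialized_length is 1 size byte plus the atom body (atoms shorter than 0x40 bytes)
path = rcl.find_path(atom_hash, serialized_length=1 + len(atom))
assert path == bytes([0b10])

The serializer can then emit the 0xfe marker followed by these path bytes instead of re-serializing the repeated subtree.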