Draft: Feat/glom like key searching #44

Open · wants to merge 13 commits into base: main
19 changes: 19 additions & 0 deletions dictdatabase/dataclasses.py
@@ -0,0 +1,19 @@
import dataclasses


@dataclasses.dataclass(frozen=True)
class SearchResult:
start_byte: int
end_byte: int
found: bool


@dataclasses.dataclass(frozen=True)
class Index:
key: str
key_start: int
key_end: int
indent_level: int
indent_with: str
value_hash: str
old_value_end: int
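
Worth noting for the rest of the diff: the found flag on SearchResult replaces the previous `key_end == -1` sentinel convention at the call sites in io_unsafe.py. A minimal sketch of the intended use (offsets invented for illustration):

from dictdatabase.dataclasses import SearchResult

miss = SearchResult(start_byte=-1, end_byte=-1, found=False)
hit = SearchResult(start_byte=42, end_byte=57, found=True)

for result in (miss, hit):
	if not result.found:  # previously: `if key_end == -1:`
		continue
	value_span = (result.start_byte, result.end_byte)  # bytes to parse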
27 changes: 27 additions & 0 deletions dictdatabase/index_manager.py
@@ -0,0 +1,27 @@
import hashlib

from dictdatabase import utils
from dictdatabase.dataclasses import Index


def create_index(all_file_bytes: bytes, key: str, start: int, end: int) -> Index:
	"""
	Take the raw bytes of a JSON file plus a key and the byte span of its
	value, and build an Index describing that key.

	Args:
		all_file_bytes (bytes): The entire file as a byte string.
		key (str): The key of the value we're indexing.
		start (int): Byte position where the value starts in the file.
		end (int): Byte position where the value ends in the file.

	Returns:
		An Index carrying the key, the value's byte span, indentation info,
		the value's SHA-256 hash, and the old value end (here equal to end).
	"""
key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
indent_level, indent_with = utils.detect_indentation_in_json_bytes(
all_file_bytes, key_start
)
value_bytes = all_file_bytes[start:end]
value_hash = hashlib.sha256(value_bytes).hexdigest()
return Index(key, start, end, indent_level, indent_with, value_hash, end)
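
A small usage sketch of create_index (assuming the package from this branch is importable; the document and span below are invented for illustration):

import json

from dictdatabase.index_manager import create_index

raw = json.dumps({"users": {"a": 1}, "posts": [1, 2]}).encode()

# Locate the value span of "users" by hand for this toy document;
# the real callers get these offsets from the searching module.
value = b'{"a": 1}'
start = raw.find(value)
end = start + len(value)

idx = create_index(raw, "users", start, end)
print(idx.key, idx.key_start, idx.key_end, idx.value_hash[:8])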
126 changes: 67 additions & 59 deletions dictdatabase/indexing.py
@@ -1,6 +1,10 @@
import orjson
import os

import orjson

from . import config
from .dataclasses import Index


# Problem: Multiple read processes will concurrently read and write the same file
# In some cases this will result in an empty read error, that's why the try-except exists
@@ -22,61 +26,65 @@


class Indexer:
"""
The Indexer takes the name of a database file, and tries to load the .index file
of the corresponding database file.

The name of the index file is the name of the database file, with the extension
.index and all "/" replaced with "___"

The content of the index file is a json object, where the keys are keys inside
the database json file, and the values are lists of 5 elements:
- start_index: The index of the first byte of the value of the key in the database file
- end_index: The index of the last byte of the value of the key in the database file
- indent_level: The indent level of the key in the database file
- indent_with: The indent string used.
- value_hash: The hash of the value bytes
"""

__slots__ = ("data", "path")

def __init__(self, db_name: str):
# Make path of index file
db_name = db_name.replace("/", "___")
self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")

os.makedirs(os.path.dirname(self.path), exist_ok=True)
if not os.path.exists(self.path):
self.data = {}
return

try:
with open(self.path, "rb") as f:
self.data = orjson.loads(f.read())
except orjson.JSONDecodeError:
self.data = {}


def get(self, key):
"""
Returns a list of 5 elements for a key if it exists, otherwise None
Elements:[start_index, end_index, indent_level, indent_with, value_hash]
"""
return self.data.get(key, None)


def write(self, key, start_index, end_index, indent_level, indent_with, value_hash, old_value_end):
"""
Write index information for a key to the index file
"""

if self.data.get(key, None) is not None:
delta = end_index - old_value_end
for entry in self.data.values():
if entry[0] > old_value_end:
entry[0] += delta
entry[1] += delta

self.data[key] = [start_index, end_index, indent_level, indent_with, value_hash]
with open(self.path, "wb") as f:
f.write(orjson.dumps(self.data))
Comment on lines -25 to -82

Owner Author:
@UmbrellaMalware let's stick with tabs so the diff looks clean.

Collaborator:
PEP 8 says spaces are preferred. This can become a problem if someone else wants to propose changes to the project.

Owner Author:
Yes, I know. I personally prefer tabs and think they are faster to work with. If this project grows, I agree it should adhere to most Python community standards, but I'd like to keep this merge clean; the indentation and formatting changes should be part of a separate issue/MR.
"""
The Indexer takes the name of a database file, and tries to load the .index file
of the corresponding database file.

The name of the index file is the name of the database file, with the extension
.index and all "/" replaced with "___"

The content of the index file is a json object, where the keys are keys inside
the database json file, and the values are lists of 5 elements:
- start_index: The index of the first byte of the value of the key in the database file
- end_index: The index of the last byte of the value of the key in the database file
- indent_level: The indent level of the key in the database file
- indent_with: The indent string used.
- value_hash: The hash of the value bytes
"""

__slots__ = ("data", "path")

def __init__(self, db_name: str):
# Make path of index file
db_name = db_name.replace("/", "___")
self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")

os.makedirs(os.path.dirname(self.path), exist_ok=True)
if not os.path.exists(self.path):
self.data = {}
return

try:
with open(self.path, "rb") as f:
self.data = orjson.loads(f.read())
except orjson.JSONDecodeError:
self.data = {}

def get(self, key):
"""
Returns a list of 5 elements for a key if it exists, otherwise None
Elements: [start_index, end_index, indent_level, indent_with, value_hash]
"""
return self.data.get(key, None)

def write(self, index: Index):
"""
Write index information for a key to the index file
"""

if self.data.get(index.key, None) is not None:
delta = index.key_end - index.old_value_end
for entry in self.data.values():
if entry[0] > index.old_value_end:
entry[0] += delta
entry[1] += delta

self.data[index.key] = [
index.key_start,
index.key_end,
index.indent_level,
index.indent_with,
index.value_hash,
]
with open(self.path, "wb") as f:
f.write(orjson.dumps(self.data))
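
The offset-shifting branch of write is the subtle part: when a value grows or shrinks, every index entry that starts after the old value end must slide by the size difference, or its stored byte range goes stale. A standalone sketch of just that logic (not the class itself), with an invented three-entry index:

def shift_entries(data: dict, old_value_end: int, delta: int) -> None:
	# Entries are [start, end, indent_level, indent_with, value_hash].
	# Anything positioned after the resized value moves by delta bytes.
	for entry in data.values():
		if entry[0] > old_value_end:
			entry[0] += delta
			entry[1] += delta


index = {
	"a": [10, 20, 1, "\t", "aa..."],
	"b": [25, 40, 1, "\t", "bb..."],
	"c": [45, 60, 1, "\t", "cc..."],
}
# "a" was rewritten and grew by 5 bytes (old end 20, new end 25):
shift_entries(index, old_value_end=20, delta=5)
assert index["b"][:2] == [30, 45] and index["c"][:2] == [50, 65]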
93 changes: 60 additions & 33 deletions dictdatabase/io_unsafe.py
@@ -1,10 +1,20 @@
from __future__ import annotations
from typing import Tuple

import hashlib
import json
from dataclasses import dataclass
from typing import Tuple

import orjson
import json
import hashlib
from . import config, utils, byte_codes, indexing, io_bytes

from . import byte_codes
from . import config
from . import indexing
from . import io_bytes
from . import searching
from . import utils
from .dataclasses import Index
from .index_manager import create_index


@dataclass(frozen=True) # slots=True not supported by python 3.8 and 3.9
@@ -46,7 +56,9 @@ def read(db_name: str) -> dict:
########################################################################################


def try_read_bytes_using_indexer(indexer: indexing.Indexer, db_name: str, key: str) -> bytes | None:
def try_read_bytes_using_indexer(
indexer: indexing.Indexer, db_name: str, key: str
) -> bytes | None:
"""
Check if the key info is saved in the file's index file.
If it is and the value has not changed, return the value bytes.
@@ -79,21 +91,12 @@ def partial_read_only(db_name: str, key: str) -> dict | None:

# Not found in index file, search for key in the entire file
all_file_bytes = io_bytes.read(db_name)
key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)

if key_end == -1:
start, end, found = searching.search_value_position_in_db(all_file_bytes, key)
if not found:
return None

# Key found, now determine the bounding byte indices of the value
start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
end = utils.seek_index_through_value_bytes(all_file_bytes, start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
value_bytes = all_file_bytes[start:end]
value_hash = hashlib.sha256(value_bytes).hexdigest()

# Write key info to index file
indexer.write(key, start, end, indent_level, indent_with, value_hash, end)
indexer.write(create_index(all_file_bytes, key, start, end))
return orjson.loads(value_bytes)
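
The new call sites above lean on a searching module that this diff does not display in full; from usage, its surface looks roughly like this (signatures inferred from the callers, so treat them as assumptions rather than the module's actual API):

from typing import Tuple

from dictdatabase.dataclasses import SearchResult


def search_value_position_in_db(all_file_bytes: bytes, key: str) -> Tuple[int, int, bool]:
	"""Locate the value bytes of `key`; returns (start, end, found)."""
	...


def search_key_position_in_db(all_file_bytes: bytes, key: str) -> SearchResult:
	"""Locate `key` itself; a SearchResult with found=False signals a miss."""
	...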


@@ -130,7 +133,9 @@ def write(db_name: str, data: dict):
################################################################################


def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key) -> Tuple[PartialFileHandle | None, bytes | None]:
def try_get_parial_file_handle_by_index(
indexer: indexing.Indexer, db_name, key
) -> Tuple[PartialFileHandle | None, bytes | None]:
"""
Try to get a partial file handle by using the key entry in the index file.

@@ -151,7 +156,9 @@ def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key)
if value_hash != hashlib.sha256(value_bytes).hexdigest():
return None, all_file_bytes
value_data = orjson.loads(value_bytes)
partial_dict = PartialDict(all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:])
partial_dict = PartialDict(
all_file_bytes[:start], key, value_data, start, end, all_file_bytes[end:]
)

# If compression is disabled, only the value and suffix have to be read
else:
@@ -163,9 +170,14 @@ def try_get_parial_file_handle_by_index(indexer: indexing.Indexer, db_name, key)
prefix_bytes = io_bytes.read(db_name, end=start)
return None, prefix_bytes + value_and_suffix_bytes
value_data = orjson.loads(value_bytes)
partial_dict = PartialDict(None, key, value_data, start, end, value_and_suffix_bytes[value_length:])
partial_dict = PartialDict(
None, key, value_data, start, end, value_and_suffix_bytes[value_length:]
)

return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer), None
return (
PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer),
None,
)


def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:
Expand All @@ -180,25 +192,33 @@ def get_partial_file_handle(db_name: str, key: str) -> PartialFileHandle:

# Search for key in the index file
indexer = indexing.Indexer(db_name)
partial_handle, all_file_bytes = try_get_parial_file_handle_by_index(indexer, db_name, key)
partial_handle, all_file_bytes = try_get_parial_file_handle_by_index(
indexer, db_name, key
)
if partial_handle is not None:
return partial_handle

# Not found in index file, search for key in the entire file
key_start, key_end = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
position = searching.search_key_position_in_db(all_file_bytes, key)

if key_end == -1:
raise KeyError(f"Key \"{key}\" not found in db \"{db_name}\"")
if not position.found:
raise KeyError(f'Key "{key}" not found in db "{db_name}"')

# Key found, now determine the bounding byte indices of the value
start = key_end + (1 if all_file_bytes[key_end] == byte_codes.SPACE else 0)
start = position.end_byte + (
1 if all_file_bytes[position.end_byte] == byte_codes.SPACE else 0
)
end = utils.seek_index_through_value_bytes(all_file_bytes, start)

indent_level, indent_with = utils.detect_indentation_in_json_bytes(all_file_bytes, key_start)
indent_level, indent_with = utils.detect_indentation_in_json_bytes(
all_file_bytes, position.start_byte
)

partial_value = orjson.loads(all_file_bytes[start:end])
prefix_bytes = all_file_bytes[:start] if config.use_compression else None
partial_dict = PartialDict(prefix_bytes, key, partial_value, start, end, all_file_bytes[end:])
partial_dict = PartialDict(
prefix_bytes, key, partial_value, start, end, all_file_bytes[end:]
)
return PartialFileHandle(db_name, partial_dict, indent_level, indent_with, indexer)


@@ -216,19 +236,26 @@ def partial_write(pf: PartialFileHandle):
partial_bytes = partial_bytes.replace(replace_this, replace_with)

# Write key info to index file
pf.indexer.write(
index = Index(
key=pf.partial_dict.key,
start_index=pf.partial_dict.value_start,
end_index=pf.partial_dict.value_start + len(partial_bytes),
key_start=pf.partial_dict.value_start,
key_end=pf.partial_dict.value_start + len(partial_bytes),
indent_level=pf.indent_level,
indent_with=pf.indent_with,
value_hash=hashlib.sha256(partial_bytes).hexdigest(),
old_value_end=pf.partial_dict.value_end,
)
pf.indexer.write(index)

if pf.partial_dict.prefix is None:
# Prefix could not be determined due to compression, so write the entire file
io_bytes.write(pf.db_name, partial_bytes + pf.partial_dict.suffix, start=pf.partial_dict.value_start)
io_bytes.write(
pf.db_name,
partial_bytes + pf.partial_dict.suffix,
start=pf.partial_dict.value_start,
)
else:
# Prefix was determined, so only write the changed part and the suffix
io_bytes.write(pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix)
io_bytes.write(
pf.db_name, pf.partial_dict.prefix + partial_bytes + pf.partial_dict.suffix
)
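
Taken together, the indexed read path only trusts an entry if the hash of the bytes it points at still matches; otherwise the caller falls back to a full search. A condensed, self-contained sketch mirroring try_get_parial_file_handle_by_index's hash check (entry layout as stored by indexing.py; the toy file and offsets are invented):

import hashlib

import orjson


def read_via_index(all_file_bytes: bytes, entry: list):
	# entry = [start, end, indent_level, indent_with, value_hash]
	start, end, _, _, value_hash = entry
	value_bytes = all_file_bytes[start:end]
	if hashlib.sha256(value_bytes).hexdigest() != value_hash:
		return None  # stale entry: the file changed under the index
	return orjson.loads(value_bytes)


raw = b'{"users": {"a": 1}}'
entry = [10, 18, 1, "\t", hashlib.sha256(b'{"a": 1}').hexdigest()]
print(read_via_index(raw, entry))  # {'a': 1}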