Skip to content

Commit

Permalink
Use injector to initialize storage adaptor type (#916)
Browse files Browse the repository at this point in the history
1. Extract the `AppConfig` object that can parse os.environ on app
bootup, consolidating our various environment variable usages.
2. `get_storage_adaptor_class` is a now a `module.provider` that
receives the `AppConfig` object via `injected` and selects its
implementation based on that.
3.  Tests simplify a bit as a result of this.


I will be making more refactorings that integrate `AppConfig` in other
places to simplify our configuration and bootup sequence.
  • Loading branch information
corps committed Jul 17, 2024
1 parent 5fd276d commit cc9ad01
Show file tree
Hide file tree
Showing 11 changed files with 227 additions and 107 deletions.
6 changes: 0 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,6 @@

Autofix creates and uses a ChromaDB for every separate codebase. This can either be stored in a Google Cloud Storage (GCS) bucket, in your local filesystem, or you can make your own storage adapter.

##### Local Workspace

Autofix will need a local folder to use as its workspace, by default it uses `data/chroma/workspaces`.

You can set the location in filesystem where Autofix will use as a workspace with `CODEBASE_WORKSPACE_DIR`.

##### Google Cloud Storage

To use GCS, you need to set the following environment variables:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
"unidiff",
"tree_sitter_languages",
"tree_sitter",
"google-cloud-storage",
"google.cloud.*",
"langfuse.*",
"langfuse",
"sklearn.dummy",
Expand Down
33 changes: 19 additions & 14 deletions src/seer/automation/codebase/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
EmbeddedDocumentChunk,
RepositoryInfo,
)
from seer.automation.codebase.storage_adapters import StorageAdapter, get_storage_adapter_class
from seer.automation.codebase.storage_adapters import StorageAdapter
from seer.automation.models import RepoDefinition
from seer.db import DbCodebaseNamespace, DbCodebaseNamespaceMutex, DbRepositoryInfo, Session
from seer.dependency_injection import inject, injected


class CodebaseNamespaceManager:
Expand Down Expand Up @@ -159,7 +160,13 @@ def get_namespace(
return CodebaseNamespace.from_db(db_namespace)

@classmethod
def load_workspace(cls, namespace_id: int, skip_copy: bool = False):
@inject
def load_workspace(
cls,
namespace_id: int,
skip_copy: bool = False,
storage_type: type[StorageAdapter] = injected,
):
with Session() as session:
db_namespace = session.get(DbCodebaseNamespace, namespace_id)

Expand All @@ -174,9 +181,7 @@ def load_workspace(cls, namespace_id: int, skip_copy: bool = False):
repo_info = RepositoryInfo.from_db(db_repo_info)
namespace = CodebaseNamespace.from_db(db_namespace)

storage_adapter = get_storage_adapter_class()(
repo_id=repo_info.id, namespace_slug=namespace.slug
)
storage_adapter = storage_type(repo_id=repo_info.id, namespace_slug=namespace.slug)

did_copy = False
if not skip_copy:
Expand All @@ -192,13 +197,15 @@ def load_workspace(cls, namespace_id: int, skip_copy: bool = False):
return None

@classmethod
@inject
def load_workspace_for_repo_definition(
cls,
organization: int,
project: int,
repo: RepoDefinition,
sha: str | None = None,
tracking_branch: str | None = None,
storage_type: type[StorageAdapter] = injected,
) -> Self | None:
with Session() as session:
db_repo_info = (
Expand Down Expand Up @@ -252,9 +259,7 @@ def load_workspace_for_repo_definition(
repo_info = RepositoryInfo.from_db(db_repo_info)
namespace = CodebaseNamespace.from_db(db_namespace)

storage_adapter = get_storage_adapter_class()(
repo_id=repo_info.id, namespace_slug=namespace.slug
)
storage_adapter = storage_type(repo_id=repo_info.id, namespace_slug=namespace.slug)

cls._wait_for_mutex_clear(namespace.id)
cls._set_mutex(namespace.id)
Expand All @@ -268,6 +273,7 @@ def load_workspace_for_repo_definition(
return None

@classmethod
@inject
def create_repo(
cls,
organization: int,
Expand All @@ -276,6 +282,7 @@ def create_repo(
head_sha: str,
tracking_branch: str | None = None,
should_set_as_default: bool = False,
storage_type: type[StorageAdapter] = injected,
):
autofix_logger.info(
f"Creating new repo for {organization}/{project}/{repo.external_id} (repo: {repo.full_name})"
Expand Down Expand Up @@ -309,9 +316,7 @@ def create_repo(
repo_info = RepositoryInfo.from_db(db_repo_info)
namespace = CodebaseNamespace.from_db(db_namespace)

storage_adapter = get_storage_adapter_class()(
repo_id=repo_info.id, namespace_slug=namespace.slug
)
storage_adapter = storage_type(repo_id=repo_info.id, namespace_slug=namespace.slug)

return cls(repo_info, namespace, storage_adapter)

Expand Down Expand Up @@ -354,12 +359,14 @@ def create_namespace_with_new_or_existing_repo(
)

@classmethod
@inject
def create_or_get_namespace_for_repo(
cls,
repo_id: int,
sha: str,
tracking_branch: str | None = None,
should_set_as_default: bool = False,
storage_type: type[StorageAdapter] = injected,
):
with Session() as session:
existing_namespace = None
Expand Down Expand Up @@ -412,9 +419,7 @@ def create_or_get_namespace_for_repo(

namespace = CodebaseNamespace.from_db(db_namespace)

storage_adapter = get_storage_adapter_class()(
repo_id=repo_info.id, namespace_slug=namespace.slug
)
storage_adapter = storage_type(repo_id=repo_info.id, namespace_slug=namespace.slug)

return cls(repo_info, namespace, storage_adapter)

Expand Down
60 changes: 30 additions & 30 deletions src/seer/automation/codebase/storage_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,21 +4,27 @@
import shutil
import tempfile

# Why is this all good on pylance but mypy is complaining?
from google.cloud import storage # type: ignore
from google.cloud import storage
from google.cloud.storage import Bucket

from seer.automation.autofix.utils import autofix_logger
from seer.automation.codebase.utils import cleanup_dir
from seer.bootup import module
from seer.configuration import AppConfig, CodebaseStorageType
from seer.dependency_injection import inject, injected


class StorageAdapter(abc.ABC):
repo_id: int
namespace_slug: str
app_config: AppConfig

def __init__(self, repo_id: int, namespace_slug: str):
@inject
def __init__(self, repo_id: int, namespace_slug: str, app_config: AppConfig = injected):
self.repo_id = repo_id
self.namespace_slug = namespace_slug
self.tmpdir = tempfile.mkdtemp()
self.app_config = app_config

def __del__(self):
self.clear_workspace()
Expand Down Expand Up @@ -50,23 +56,20 @@ class FilesystemStorageAdapter(StorageAdapter):
data storage and retrieval from a local directory structure.
"""

@staticmethod
def get_storage_dir():
storage_dir = os.getenv("CODEBASE_STORAGE_DIR", "data/chroma/storage")
return os.path.abspath(storage_dir)
def get_storage_dir(self):
return self.app_config.CODEBASE_STORAGE_DIR

@staticmethod
def get_storage_location(repo_id: int, namespace_slug: str):
storage_dir = FilesystemStorageAdapter.get_storage_dir()
return os.path.join(storage_dir, f"{repo_id}/{namespace_slug}")
def get_storage_location(self):
storage_dir = self.get_storage_dir()
return os.path.join(storage_dir, f"{self.repo_id}/{self.namespace_slug}")

@staticmethod
def clear_all_storage():
storage_dir = FilesystemStorageAdapter.get_storage_dir()
shutil.rmtree(storage_dir, ignore_errors=True)
@inject
def clear_all_storage(app_config: AppConfig = injected):
shutil.rmtree(app_config.CODEBASE_STORAGE_DIR, ignore_errors=True)

def copy_to_workspace(self):
storage_path = self.get_storage_location(self.repo_id, self.namespace_slug)
storage_path = self.get_storage_location()

if os.path.exists(storage_path):
try:
Expand All @@ -78,7 +81,7 @@ def copy_to_workspace(self):
return True

def save_to_storage(self):
storage_path = self.get_storage_location(self.repo_id, self.namespace_slug)
storage_path = self.get_storage_location()

if os.path.exists(storage_path):
try:
Expand All @@ -92,7 +95,7 @@ def save_to_storage(self):
return True

def delete_from_storage(self):
storage_path = self.get_storage_location(self.repo_id, self.namespace_slug)
storage_path = self.get_storage_location()

if os.path.exists(storage_path):
try:
Expand All @@ -109,13 +112,11 @@ class GcsStorageAdapter(StorageAdapter):
A storage adapter designed to store database files in Google Cloud Storage.
"""

@staticmethod
def get_bucket():
return storage.Client().bucket(os.getenv("CODEBASE_GCS_STORAGE_BUCKET", "sentry-ml"))
def get_bucket(self) -> Bucket:
return storage.Client().bucket(self.app_config.CODEBASE_GCS_STORAGE_BUCKET)

@staticmethod
def get_storage_prefix(repo_id: int, namespace_slug: str):
storage_dir = os.getenv("CODEBASE_GCS_STORAGE_DIR", "tmp_jenn/dev/chroma/storage")
def get_storage_prefix(self, repo_id: int, namespace_slug: str):
storage_dir = self.app_config.CODEBASE_GCS_STORAGE_DIR
return os.path.join(storage_dir, f"{repo_id}/{namespace_slug}")

def copy_to_workspace(self) -> bool:
Expand Down Expand Up @@ -195,14 +196,13 @@ def delete_from_storage(self) -> bool:
return True


def get_storage_adapter_class() -> type[StorageAdapter]:
storage_type = os.getenv("CODEBASE_STORAGE_TYPE", "filesystem")

autofix_logger.debug(f"Using storage type: {storage_type}")
@module.provider
def get_storage_adapter_class(config: AppConfig = injected) -> type[StorageAdapter]:
autofix_logger.debug(f"Using storage type: {config.CODEBASE_STORAGE_TYPE}")

if storage_type == "filesystem":
if config.CODEBASE_STORAGE_TYPE == CodebaseStorageType.FILESYSTEM:
return FilesystemStorageAdapter
elif storage_type == "gcs":
elif config.CODEBASE_STORAGE_TYPE == CodebaseStorageType.GCS:
return GcsStorageAdapter
else:
raise ValueError(f"Unknown storage type: {storage_type}")
raise ValueError(f"Unknown storage type: {config.CODEBASE_STORAGE_TYPE}")
95 changes: 95 additions & 0 deletions src/seer/configuration.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import os.path
from enum import Enum
from typing import Annotated, Any

from pydantic import BaseModel, BeforeValidator, Field

from seer.bootup import module, stub_module


class CodebaseStorageType(str, Enum):
FILESYSTEM = "filesystem"
GCS = "gcs"


def parse_int_from_env(data: str) -> int:
return int(data)


def parse_list_from_env(data: str) -> list[str]:
return data.split()


def parse_bool_from_env(data: str) -> bool:
if not data.lower() in ("yes", "true", "t", "y", "1", "on"):
return False
return True


def as_absolute_path(path: str) -> str:
return os.path.abspath(path)


ParseInt = Annotated[int, BeforeValidator(parse_int_from_env)]
ParseList = Annotated[list[str], BeforeValidator(parse_list_from_env)]
ParseBool = Annotated[bool, BeforeValidator(parse_bool_from_env)]
ParsePath = Annotated[str, BeforeValidator(as_absolute_path)]


class AppConfig(BaseModel):
CODEBASE_STORAGE_TYPE: CodebaseStorageType = CodebaseStorageType.FILESYSTEM
CODEBASE_STORAGE_DIR: ParsePath = os.path.abspath("data/chroma/storage")

DATABASE_URL: str
CELERY_BROKER_URL: str
GITHUB_TOKEN: str = ""
GITHUB_APP_ID: str = ""
GITHUB_PRIVATE_KEY: str = ""

CODEBASE_GCS_STORAGE_BUCKET: str = "sentry-ml"
CODEBASE_GCS_STORAGE_DIR: str = "tmp_jenn/dev/chroma/storage"

JSON_API_SHARED_SECRETS: ParseList = Field(default_factory=list)
TORCH_NUM_THREADS: ParseInt = 1
NO_SENTRY_INTEGRATION: ParseBool = False
DEV: ParseBool = False

@property
def is_production(self) -> bool:
return not self.DEV

@property
def has_sentry_integration(self) -> bool:
return not self.NO_SENTRY_INTEGRATION

def model_post_init(self, context: Any):
if self.is_production:
assert self.has_sentry_integration, "Sentry integration required for production mode."

if self.has_sentry_integration:
assert (
self.JSON_API_SHARED_SECRETS
), "JSON_API_SHARED_SECRETS required for sentry integration."


@module.provider
def load_from_environment(environ: dict[str, str] | None = None) -> AppConfig:
return AppConfig.model_validate(environ or os.environ)


@stub_module.provider
def provide_test_defaults() -> AppConfig:
"""
Load defaults into the base app config useful for tests
"""

base = load_from_environment(
{
**os.environ,
"NO_SENTRY_INTEGRATION": "true",
}
)
base.CODEBASE_STORAGE_DIR = os.path.abspath("data/tests/chroma/storage")
base.CODEBASE_GCS_STORAGE_DIR = os.path.abspath("chroma-test/data/storage")

return base
17 changes: 15 additions & 2 deletions src/seer/dependency_injection.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class Config:
class FactoryAnnotation:
concrete_type: type
is_collection: bool
is_type: bool
label: str

@classmethod
Expand All @@ -76,11 +77,23 @@ def from_annotation(cls, source: Any) -> "FactoryAnnotation":
not inner.is_collection
), f"Cannot get_factory {source}: collections must be of concrete types, not other lists"
return dataclasses.replace(inner, is_collection=True)
elif annotation.origin is type:
assert (
len(annotation.args) == 1
), f"Cannot get_factory {source}: type requires at least one argument"
inner = FactoryAnnotation.from_annotation(annotation.args[0])
assert not inner.label, f"Cannot get_factory {source}: type has embedded Labeled"
assert (
not inner.is_collection and not inner.is_type
), f"Cannot get_factory {source}: type factories must be of concrete types, not lists or other types"
return dataclasses.replace(inner, is_type=True)

assert (
annotation.origin is None
), f"Cannot get_factory {source}, only concrete types or lists of concrete types are supported"
return FactoryAnnotation(concrete_type=annotation.source, is_collection=False, label="")
), f"Cannot get_factory {source}, only concrete types, type annotations, or lists of concrete types are supported"
return FactoryAnnotation(
concrete_type=annotation.source, is_collection=False, is_type=False, label=""
)

@classmethod
def from_factory(cls, c: Callable) -> "FactoryAnnotation":
Expand Down
Loading

0 comments on commit cc9ad01

Please sign in to comment.