From f2db650b9791eaa0fc344f16522035e9610637bc Mon Sep 17 00:00:00 2001 From: Zachary Collins Date: Tue, 16 Jul 2024 16:42:54 -0700 Subject: [PATCH] Use injector to initialize storage adaptor type --- README.md | 6 -- src/seer/automation/codebase/namespace.py | 33 ++++--- .../automation/codebase/storage_adapters.py | 49 +++++----- src/seer/configuration.py | 95 +++++++++++++++++++ src/seer/dependency_injection.py | 17 +++- .../codebase/test_codebase_index.py | 2 - tests/automation/codebase/test_namespace.py | 1 - .../codebase/test_storage_adapters.py | 36 +++---- tests/conftest.py | 12 +-- 9 files changed, 173 insertions(+), 78 deletions(-) create mode 100644 src/seer/configuration.py diff --git a/README.md b/README.md index f445af2f2..a328f8d34 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,6 @@ Autofix creates and uses a ChromaDB for every separate codebase. This can either be stored in a Google Cloud Storage (GCS) bucket, in your local filesystem, or you can make your own storage adapter. -##### Local Workspace - -Autofix will need a local folder to use as its workspace, by default it uses `data/chroma/workspaces`. - -You can set the location in filesystem where Autofix will use as a workspace with `CODEBASE_WORKSPACE_DIR`. - ##### Google Cloud Storage To use GCS, you need to set the following environment variables: diff --git a/src/seer/automation/codebase/namespace.py b/src/seer/automation/codebase/namespace.py index e4f054fad..8380ae37c 100644 --- a/src/seer/automation/codebase/namespace.py +++ b/src/seer/automation/codebase/namespace.py @@ -17,9 +17,10 @@ EmbeddedDocumentChunk, RepositoryInfo, ) -from seer.automation.codebase.storage_adapters import StorageAdapter, get_storage_adapter_class +from seer.automation.codebase.storage_adapters import StorageAdapter from seer.automation.models import RepoDefinition from seer.db import DbCodebaseNamespace, DbCodebaseNamespaceMutex, DbRepositoryInfo, Session +from seer.dependency_injection import inject, injected class CodebaseNamespaceManager: @@ -159,7 +160,13 @@ def get_namespace( return CodebaseNamespace.from_db(db_namespace) @classmethod - def load_workspace(cls, namespace_id: int, skip_copy: bool = False): + @inject + def load_workspace( + cls, + namespace_id: int, + skip_copy: bool = False, + storage_type: type[StorageAdapter] = injected, + ): with Session() as session: db_namespace = session.get(DbCodebaseNamespace, namespace_id) @@ -174,9 +181,7 @@ def load_workspace(cls, namespace_id: int, skip_copy: bool = False): repo_info = RepositoryInfo.from_db(db_repo_info) namespace = CodebaseNamespace.from_db(db_namespace) - storage_adapter = get_storage_adapter_class()( - repo_id=repo_info.id, namespace_slug=namespace.slug - ) + storage_adapter = storage_type(repo_id=repo_info.id, namespace_slug=namespace.slug) did_copy = False if not skip_copy: @@ -192,6 +197,7 @@ def load_workspace(cls, namespace_id: int, skip_copy: bool = False): return None @classmethod + @inject def load_workspace_for_repo_definition( cls, organization: int, @@ -199,6 +205,7 @@ def load_workspace_for_repo_definition( repo: RepoDefinition, sha: str | None = None, tracking_branch: str | None = None, + storage_type: type[StorageAdapter] = injected, ) -> Self | None: with Session() as session: db_repo_info = ( @@ -252,9 +259,7 @@ def load_workspace_for_repo_definition( repo_info = RepositoryInfo.from_db(db_repo_info) namespace = CodebaseNamespace.from_db(db_namespace) - storage_adapter = get_storage_adapter_class()( - repo_id=repo_info.id, namespace_slug=namespace.slug - ) + storage_adapter = storage_type(repo_id=repo_info.id, namespace_slug=namespace.slug) cls._wait_for_mutex_clear(namespace.id) cls._set_mutex(namespace.id) @@ -268,6 +273,7 @@ def load_workspace_for_repo_definition( return None @classmethod + @inject def create_repo( cls, organization: int, @@ -276,6 +282,7 @@ def create_repo( head_sha: str, tracking_branch: str | None = None, should_set_as_default: bool = False, + storage_type: type[StorageAdapter] = injected, ): autofix_logger.info( f"Creating new repo for {organization}/{project}/{repo.external_id} (repo: {repo.full_name})" @@ -309,9 +316,7 @@ def create_repo( repo_info = RepositoryInfo.from_db(db_repo_info) namespace = CodebaseNamespace.from_db(db_namespace) - storage_adapter = get_storage_adapter_class()( - repo_id=repo_info.id, namespace_slug=namespace.slug - ) + storage_adapter = storage_type(repo_id=repo_info.id, namespace_slug=namespace.slug) return cls(repo_info, namespace, storage_adapter) @@ -354,12 +359,14 @@ def create_namespace_with_new_or_existing_repo( ) @classmethod + @inject def create_or_get_namespace_for_repo( cls, repo_id: int, sha: str, tracking_branch: str | None = None, should_set_as_default: bool = False, + storage_type=injected, ): with Session() as session: existing_namespace = None @@ -412,9 +419,7 @@ def create_or_get_namespace_for_repo( namespace = CodebaseNamespace.from_db(db_namespace) - storage_adapter = get_storage_adapter_class()( - repo_id=repo_info.id, namespace_slug=namespace.slug - ) + storage_adapter = storage_type(repo_id=repo_info.id, namespace_slug=namespace.slug) return cls(repo_info, namespace, storage_adapter) diff --git a/src/seer/automation/codebase/storage_adapters.py b/src/seer/automation/codebase/storage_adapters.py index d4f8313df..d23d2cc00 100644 --- a/src/seer/automation/codebase/storage_adapters.py +++ b/src/seer/automation/codebase/storage_adapters.py @@ -6,19 +6,26 @@ # Why is this all good on pylance but mypy is complaining? from google.cloud import storage # type: ignore +from google.cloud.storage import Bucket from seer.automation.autofix.utils import autofix_logger from seer.automation.codebase.utils import cleanup_dir +from seer.bootup import module +from seer.configuration import AppConfig, CodebaseStorageType +from seer.dependency_injection import inject, injected class StorageAdapter(abc.ABC): repo_id: int namespace_slug: str + app_config: AppConfig - def __init__(self, repo_id: int, namespace_slug: str): + @inject + def __init__(self, repo_id: int, namespace_slug: str, app_config: AppConfig = injected): self.repo_id = repo_id self.namespace_slug = namespace_slug self.tmpdir = tempfile.mkdtemp() + self.app_config = app_config def __del__(self): self.clear_workspace() @@ -50,20 +57,17 @@ class FilesystemStorageAdapter(StorageAdapter): data storage and retrieval from a local directory structure. """ - @staticmethod - def get_storage_dir(): - storage_dir = os.getenv("CODEBASE_STORAGE_DIR", "data/chroma/storage") - return os.path.abspath(storage_dir) + def get_storage_dir(self): + return self.app_config.CODEBASE_STORAGE_DIR - @staticmethod - def get_storage_location(repo_id: int, namespace_slug: str): - storage_dir = FilesystemStorageAdapter.get_storage_dir() + def get_storage_location(self, repo_id: int, namespace_slug: str): + storage_dir = self.get_storage_dir() return os.path.join(storage_dir, f"{repo_id}/{namespace_slug}") @staticmethod - def clear_all_storage(): - storage_dir = FilesystemStorageAdapter.get_storage_dir() - shutil.rmtree(storage_dir, ignore_errors=True) + @inject + def clear_all_storage(app_config: AppConfig = injected): + shutil.rmtree(app_config.CODEBASE_STORAGE_DIR, ignore_errors=True) def copy_to_workspace(self): storage_path = self.get_storage_location(self.repo_id, self.namespace_slug) @@ -109,13 +113,11 @@ class GcsStorageAdapter(StorageAdapter): A storage adapter designed to store database files in Google Cloud Storage. """ - @staticmethod - def get_bucket(): - return storage.Client().bucket(os.getenv("CODEBASE_GCS_STORAGE_BUCKET", "sentry-ml")) + def get_bucket(self) -> Bucket: + return storage.Client().bucket(self.app_config.CODEBASE_GCS_STORAGE_BUCKET) - @staticmethod - def get_storage_prefix(repo_id: int, namespace_slug: str): - storage_dir = os.getenv("CODEBASE_GCS_STORAGE_DIR", "tmp_jenn/dev/chroma/storage") + def get_storage_prefix(self, repo_id: int, namespace_slug: str): + storage_dir = self.app_config.CODEBASE_GCS_STORAGE_DIR return os.path.join(storage_dir, f"{repo_id}/{namespace_slug}") def copy_to_workspace(self) -> bool: @@ -195,14 +197,13 @@ def delete_from_storage(self) -> bool: return True -def get_storage_adapter_class() -> type[StorageAdapter]: - storage_type = os.getenv("CODEBASE_STORAGE_TYPE", "filesystem") - - autofix_logger.debug(f"Using storage type: {storage_type}") +@module.provider +def get_storage_adapter_class(config: AppConfig = injected) -> type[StorageAdapter]: + autofix_logger.debug(f"Using storage type: {config.CODEBASE_STORAGE_TYPE}") - if storage_type == "filesystem": + if config.CODEBASE_STORAGE_TYPE == CodebaseStorageType.FILESYSTEM: return FilesystemStorageAdapter - elif storage_type == "gcs": + elif config.CODEBASE_STORAGE_TYPE == CodebaseStorageType.GCS: return GcsStorageAdapter else: - raise ValueError(f"Unknown storage type: {storage_type}") + raise ValueError(f"Unknown storage type: {config.CODEBASE_STORAGE_TYPE}") diff --git a/src/seer/configuration.py b/src/seer/configuration.py new file mode 100644 index 000000000..c0d890f2f --- /dev/null +++ b/src/seer/configuration.py @@ -0,0 +1,95 @@ +import os.path +from enum import Enum +from typing import Annotated, Any + +from pydantic import BaseModel, BeforeValidator, Field + +from seer.bootup import module, stub_module + + +class CodebaseStorageType(str, Enum): + FILESYSTEM = "filesystem" + GCS = "gcs" + + +def parse_int_from_env(data: str) -> int: + return int(data) + + +def parse_list_from_env(data: str) -> list[str]: + return data.split() + + +def parse_bool_from_env(data: str) -> bool: + if not data.lower() in ("yes", "true", "t", "y", "1", "on"): + return False + return True + + +def as_absolute_path(path: str) -> str: + return os.path.abspath(path) + + +ParseInt = Annotated[int, BeforeValidator(parse_int_from_env)] +ParseList = Annotated[list[str], BeforeValidator(parse_list_from_env)] +ParseBool = Annotated[bool, BeforeValidator(parse_bool_from_env)] +ParsePath = Annotated[str, BeforeValidator(as_absolute_path)] + + +class AppConfig(BaseModel): + CODEBASE_STORAGE_TYPE: CodebaseStorageType = CodebaseStorageType.FILESYSTEM + CODEBASE_STORAGE_DIR: ParsePath = os.path.abspath("data/chroma/storage") + + DATABASE_URL: str + CELERY_BROKER_URL: str + GITHUB_TOKEN: str = "" + GITHUB_APP_ID: str = "" + GITHUB_PRIVATE_KEY: str = "" + + CODEBASE_GCS_STORAGE_BUCKET: str = "sentry-ml" + CODEBASE_GCS_STORAGE_DIR: str = "tmp_jenn/dev/chroma/storage" + + JSON_API_SHARED_SECRETS: ParseList = Field(default_factory=list) + TORCH_NUM_THREADS: ParseInt = 1 + NO_SENTRY_INTEGRATION: ParseBool = False + DEV: ParseBool = False + + @property + def is_production(self) -> bool: + return not self.DEV + + @property + def has_sentry_integration(self) -> bool: + return not self.NO_SENTRY_INTEGRATION + + def model_post_init(self, context: Any): + if self.is_production: + assert self.has_sentry_integration, "Sentry integration required for production mode." + + if self.has_sentry_integration: + assert ( + self.JSON_API_SHARED_SECRETS + ), "JSON_API_SHARED_SECRETS required for sentry integration." + + +@module.provider +def load_from_environment(environ: dict[str, str] = os.environ) -> AppConfig: + return AppConfig.model_validate(environ) + + +@stub_module.provider +def provide_test_defaults() -> AppConfig: + """ + Load defaults into the base app config useful for tests + """ + + base = load_from_environment( + { + **os.environ, + "NO_SENTRY_INTEGRATION": "true", + } + ) + base.CODEBASE_STORAGE_DIR = os.path.abspath("data/tests/chroma/storage") + base.CODEBASE_GCS_STORAGE_DIR = os.path.abspath("chroma-test/data/storage") + + return base diff --git a/src/seer/dependency_injection.py b/src/seer/dependency_injection.py index 8a806ec0a..10b2efcfd 100644 --- a/src/seer/dependency_injection.py +++ b/src/seer/dependency_injection.py @@ -56,6 +56,7 @@ class Config: class FactoryAnnotation: concrete_type: type is_collection: bool + is_type: bool label: str @classmethod @@ -76,11 +77,23 @@ def from_annotation(cls, source: Any) -> "FactoryAnnotation": not inner.is_collection ), f"Cannot get_factory {source}: collections must be of concrete types, not other lists" return dataclasses.replace(inner, is_collection=True) + elif annotation.origin is type: + assert ( + len(annotation.args) == 1 + ), f"Cannot get_factory {source}: type requires at least one argument" + inner = FactoryAnnotation.from_annotation(annotation.args[0]) + assert not inner.label, f"Cannot get_factory {source}: type has embedded Labeled" + assert ( + not inner.is_collection and not inner.is_type + ), f"Cannot get_factory {source}: type factories must be of concrete types, not lists or other types" + return dataclasses.replace(inner, is_type=True) assert ( annotation.origin is None - ), f"Cannot get_factory {source}, only concrete types or lists of concrete types are supported" - return FactoryAnnotation(concrete_type=annotation.source, is_collection=False, label="") + ), f"Cannot get_factory {source}, only concrete types, type annotations, or lists of concrete types are supported" + return FactoryAnnotation( + concrete_type=annotation.source, is_collection=False, is_type=False, label="" + ) @classmethod def from_factory(cls, c: Callable) -> "FactoryAnnotation": diff --git a/tests/automation/codebase/test_codebase_index.py b/tests/automation/codebase/test_codebase_index.py index 892a8d1b9..3868dc7ea 100644 --- a/tests/automation/codebase/test_codebase_index.py +++ b/tests/automation/codebase/test_codebase_index.py @@ -30,7 +30,6 @@ class TestCodebaseIndexCreateAndIndex(unittest.TestCase): def setUp(self): os.environ["CODEBASE_STORAGE_TYPE"] = "filesystem" os.environ["CODEBASE_STORAGE_DIR"] = "data/tests/chroma/storage" - os.environ["CODEBASE_WORKSPACE_DIR"] = "data/tests/chroma/workspaces" def tearDown(self) -> None: FilesystemStorageAdapter.clear_all_storage() @@ -215,7 +214,6 @@ class TestCodebaseIndexUpdate(unittest.TestCase): def setUp(self): os.environ["CODEBASE_STORAGE_TYPE"] = "filesystem" os.environ["CODEBASE_STORAGE_DIR"] = "data/tests/chroma/storage" - os.environ["CODEBASE_WORKSPACE_DIR"] = "data/tests/chroma/workspaces" self.embedding_model = MagicMock() self.embedding_model.encode.return_value = [np.ones((768))] diff --git a/tests/automation/codebase/test_namespace.py b/tests/automation/codebase/test_namespace.py index b04686a72..72163a9d5 100644 --- a/tests/automation/codebase/test_namespace.py +++ b/tests/automation/codebase/test_namespace.py @@ -21,7 +21,6 @@ class TestNamespaceManager(unittest.TestCase): def setUp(self): os.environ["CODEBASE_STORAGE_TYPE"] = "filesystem" os.environ["CODEBASE_STORAGE_DIR"] = "data/tests/chroma/storage" - os.environ["CODEBASE_WORKSPACE_DIR"] = "data/tests/chroma/workspaces" def tearDown(self) -> None: FilesystemStorageAdapter.clear_all_storage() diff --git a/tests/automation/codebase/test_storage_adapters.py b/tests/automation/codebase/test_storage_adapters.py index df68d85b0..13875f30f 100644 --- a/tests/automation/codebase/test_storage_adapters.py +++ b/tests/automation/codebase/test_storage_adapters.py @@ -7,16 +7,21 @@ GcsStorageAdapter, get_storage_adapter_class, ) +from seer.configuration import AppConfig, CodebaseStorageType +from seer.dependency_injection import resolve class TestStorageAdapter(unittest.TestCase): - def test_fs_workspace_clear(self): - os.environ["CODEBASE_STORAGE_TYPE"] = "filesystem" - os.environ["CODEBASE_STORAGE_DIR"] = "data/tests/chroma/storage" - os.environ["CODEBASE_WORKSPACE_DIR"] = "data/tests/chroma/workspaces" + def test_adaptor_selection(self): + existing_config = resolve(AppConfig) + existing_config.CODEBASE_STORAGE_TYPE = CodebaseStorageType.FILESYSTEM + assert get_storage_adapter_class(existing_config) is FilesystemStorageAdapter + + existing_config.CODEBASE_STORAGE_TYPE = CodebaseStorageType.FILESYSTEM + assert get_storage_adapter_class(existing_config) is FilesystemStorageAdapter - StorageAdapter = get_storage_adapter_class() - adapter = StorageAdapter(1, "test") + def test_fs_workspace_clear(self): + adapter = FilesystemStorageAdapter(1, "test") # Create a file in the workspace workspace_dir = adapter.tmpdir @@ -31,12 +36,7 @@ def test_fs_workspace_clear(self): self.assertFalse(os.path.exists(os.path.join(workspace_dir, "test.txt"))) def test_gcs_workspace_clear(self): - os.environ["CODEBASE_STORAGE_TYPE"] = "gcs" - os.environ["CODEBASE_GCS_BUCKET"] = "seer-chroma" - os.environ["CODEBASE_GCS_STORAGE_DIR"] = "chroma-test/data/storage" - - StorageAdapter = get_storage_adapter_class() - adapter = StorageAdapter(1, "test") + adapter = GcsStorageAdapter(1, "test") # Create a file in the workspace workspace_dir = adapter.tmpdir @@ -52,11 +52,6 @@ def test_gcs_workspace_clear(self): class TestFilesystemStorageAdapter(unittest.TestCase): - def setUp(self): - os.environ["CODEBASE_STORAGE_TYPE"] = "filesystem" - os.environ["CODEBASE_STORAGE_DIR"] = "data/tests/chroma/storage" - os.environ["CODEBASE_WORKSPACE_DIR"] = "data/tests/chroma/workspaces" - def tearDown(self) -> None: FilesystemStorageAdapter.clear_all_storage() return super().tearDown() @@ -127,16 +122,11 @@ def test_save_to_storage_overwrites_existing_files(self): class TestGcsStorageAdapter(unittest.TestCase): - def setUp(self) -> None: - os.environ["CODEBASE_STORAGE_TYPE"] = "gcs" - os.environ["CODEBASE_GCS_BUCKET"] = "seer-chroma" - os.environ["CODEBASE_GCS_STORAGE_DIR"] = "chroma-test/data/storage" - @patch("seer.automation.codebase.storage_adapters.storage.Client") def test_storage_prefix(self, mock_gcs_client): adapter = GcsStorageAdapter(1, "test") storage_prefix = adapter.get_storage_prefix(1, "test") - self.assertEqual(storage_prefix, "chroma-test/data/storage/1/test") + self.assertEqual(storage_prefix, os.path.abspath("chroma-test/data/storage/1/test")) @patch("seer.automation.codebase.storage_adapters.storage.Client") def test_copy_to_workspace(self, mock_gcs_client): diff --git a/tests/conftest.py b/tests/conftest.py index 79c4ace79..609898f50 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,12 +16,6 @@ def test_module() -> Module: return stub_module -@pytest.fixture(autouse=True) -def enable_test_injector(test_module: Module) -> None: - with test_module: - yield - - @pytest.fixture(autouse=True, scope="session") def configure_environment(): os.environ["DATABASE_URL"] = os.environ["DATABASE_URL"].replace("db", "test-db") @@ -46,6 +40,12 @@ def setup_app(): db.metadata.drop_all(bind=db.engine) +@pytest.fixture(autouse=True) +def enable_test_injector(test_module: Module, setup_app) -> None: + with test_module: + yield + + pytest_plugins = ( "pytest_asyncio", "celery.contrib.pytest",