From 5b18450cbc23e4955c52c41c2843d5f71b5d442e Mon Sep 17 00:00:00 2001 From: Arpad Borsos Date: Tue, 25 Jul 2023 15:27:44 +0200 Subject: [PATCH] Implement ArtifactBundle flat file indexing (#53505) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The high-level overview is this: - When uploading a new `ArtifactBundle`, we `get_or_create` the `ArtifactBundleFlatFileIndex`, and `update_or_create` the `FlatFileIndexState`, setting it to `NOT_INDEXED`. - Then, using a distributed lock, we read the existing `ArtifactBundleFlatFileIndex`, merging the `ArtifactBundle` into it and updating it, finally setting the `FlatFileIndexState` to `WAS_INDEXED`. As the final step is behind a distributed lock, we can be confident that we don’t have data races. If any error happens during this operation, the `FlatFileIndexState` indicates that indexing still needs to happen. A separate "repair-job" could in the future just query all the `FlatFileIndexState` entries older than X that were not yet indexed, and add them to that index at a later time. --------- Co-authored-by: Riccardo Busetti --- src/sentry/conf/server.py | 2 + .../debug_files/artifact_bundle_indexing.py | 285 ++++++++++ src/sentry/features/__init__.py | 1 + src/sentry/models/artifactbundle.py | 40 +- src/sentry/tasks/assemble.py | 38 +- .../test_artifact_bundle_indexing.py | 508 ++++++++++++++++++ tests/sentry/models/test_artifactbundle.py | 23 +- tests/sentry/tasks/test_assemble.py | 27 + 8 files changed, 883 insertions(+), 41 deletions(-) create mode 100644 src/sentry/debug_files/artifact_bundle_indexing.py create mode 100644 tests/sentry/debug_files/test_artifact_bundle_indexing.py diff --git a/src/sentry/conf/server.py b/src/sentry/conf/server.py index d7a26fed1a7891..a54d9084860c2e 100644 --- a/src/sentry/conf/server.py +++ b/src/sentry/conf/server.py @@ -1686,6 +1686,8 @@ def SOCIAL_AUTH_DEFAULT_USERNAME() -> str: "projects:alert-filters": True, # Enable functionality to specify custom inbound filters on events. "projects:custom-inbound-filters": False, + # Enable the new flat file indexing system for sourcemaps. + "organizations:sourcemaps-bundle-flat-file-indexing": False, # Enable data forwarding functionality for projects. "projects:data-forwarding": True, # Enable functionality to discard groups. diff --git a/src/sentry/debug_files/artifact_bundle_indexing.py b/src/sentry/debug_files/artifact_bundle_indexing.py new file mode 100644 index 00000000000000..d321f02d6fa25f --- /dev/null +++ b/src/sentry/debug_files/artifact_bundle_indexing.py @@ -0,0 +1,285 @@ +import hashlib +import logging +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypeVar + +import sentry_sdk +from django.db import router +from django.utils import timezone + +from sentry.locks import locks +from sentry.models.artifactbundle import ( + NULL_STRING, + ArtifactBundle, + ArtifactBundleArchive, + ArtifactBundleFlatFileIndex, + ArtifactBundleIndexingState, + FlatFileIndexState, +) +from sentry.utils import json, metrics +from sentry.utils.db import atomic_transaction +from sentry.utils.locking.lock import Lock +from sentry.utils.retries import TimedRetryPolicy + +logger = logging.getLogger(__name__) + + +# We want to keep the bundle as being indexed for 600 seconds = 10 minutes. We might need to revise this number and +# optimize it based on the time taken to perform the indexing (on average). +FLAT_FILE_INDEXING_CACHE_TIMEOUT = 600 + + +@sentry_sdk.tracing.trace +def mark_bundle_for_flat_file_indexing( + artifact_bundle: ArtifactBundle, + project_ids: List[int], + release: Optional[str], + dist: Optional[str], +): + identifiers = [] + + for project_id in project_ids: + if release: + identifiers.append( + FlatFileIdentifier(project_id, release=release, dist=dist or NULL_STRING) + ) + + identifiers.append(FlatFileIdentifier.for_debug_id(project_id)) + + # Create / Update the indexing state in the database + with atomic_transaction( + using=( + router.db_for_write(ArtifactBundleFlatFileIndex), + router.db_for_write(FlatFileIndexState), + ) + ): + for identifier in identifiers: + flat_file_index, _created = ArtifactBundleFlatFileIndex.objects.get_or_create( + project_id=identifier.project_id, + release_name=identifier.release, + dist_name=identifier.dist, + ) + FlatFileIndexState.objects.update_or_create( + flat_file_index=flat_file_index, + artifact_bundle=artifact_bundle, + defaults={ + "indexing_state": ArtifactBundleIndexingState.NOT_INDEXED.value, + "date_added": timezone.now(), + }, + ) + + return identifiers + + +class FlatFileIdentifier(NamedTuple): + project_id: int + release: str + dist: str + + @staticmethod + def for_debug_id(project_id: int) -> "FlatFileIdentifier": + return FlatFileIdentifier(project_id, release=NULL_STRING, dist=NULL_STRING) + + def is_indexing_by_release(self) -> bool: + # An identifier is indexing by release if release is set. + return bool(self.release) + + def _key_hash(self) -> str: + key = f"{self.project_id}|{self.release}|{self.dist}" + return hashlib.sha1(key.encode()).hexdigest() + + def get_lock(self) -> Lock: + key_hash = self._key_hash() + locking_key = f"bundle_index:write:{key_hash}" + return locks.get(locking_key, duration=60 * 10, name="bundle_index") + + +@sentry_sdk.tracing.trace +def update_artifact_bundle_index( + bundle_meta: "BundleMeta", bundle_archive: ArtifactBundleArchive, identifier: FlatFileIdentifier +): + """ + This will merge the `ArtifactBundle` given via `bundle_meta` and `bundle_archive` + into the index identified via `identifier`. + + If this function fails for any reason, it can be, and *has to be* retried at a later point. + """ + # TODO: maybe query `FlatFileIndexState` to avoid double-indexing? + + lock = identifier.get_lock() + with TimedRetryPolicy(60)(lock.acquire): + flat_file_index = ArtifactBundleFlatFileIndex.objects.select_related("flat_file_index").get( + project_id=identifier.project_id, + release_name=identifier.release, + dist_name=identifier.dist, + ) + + index = FlatFileIndex() + # Load the index from the file if it exists + if existing_index := flat_file_index.load_flat_file_index(): + index.from_json(existing_index) + + # Before merging new data into the index, we will clear any existing + # data from the index related to this bundle. + # This is related to an edge-case in which the same `bundle_id` could be + # re-used but with different file contents. + index.remove(bundle_meta.id) + + # We merge the index based on the identifier type. + if identifier.is_indexing_by_release(): + index.merge_urls(bundle_meta, bundle_archive) + else: + index.merge_debug_ids(bundle_meta, bundle_archive) + + # Store the updated index file + new_json_index = index.to_json() + flat_file_index.update_flat_file_index(new_json_index) + + # And then mark the bundle as indexed + was_updated = FlatFileIndexState.compare_state_and_set( + flat_file_index.id, + bundle_meta.id, + ArtifactBundleIndexingState.NOT_INDEXED, + ArtifactBundleIndexingState.WAS_INDEXED, + ) + if not was_updated: + metrics.incr("artifact_bundle_flat_file_indexing.duplicated_indexing") + logger.error("`ArtifactBundle` %r was already indexed into %r", bundle_meta, identifier) + + +@dataclass(frozen=True) +class BundleMeta: + id: int + timestamp: datetime + + +Bundles = List[BundleMeta] +FilesByUrl = Dict[str, List[int]] +FilesByDebugID = Dict[str, List[int]] + + +T = TypeVar("T") + + +class FlatFileIndex: + def __init__(self): + # By default, a flat file index is empty. + self._bundles: Bundles = [] + self._files_by_url: FilesByUrl = {} + self._files_by_debug_id: FilesByDebugID = {} + + def from_json(self, json_str: str) -> None: + json_idx = json.loads(json_str) + + bundles = json_idx.get("bundles", []) + self._bundles = [ + BundleMeta( + int(bundle["bundle_id"].split("/")[1]), + datetime.fromisoformat(bundle["timestamp"]), + ) + for bundle in bundles + ] + self._files_by_url = json_idx.get("files_by_url", {}) + self._files_by_debug_id = json_idx.get("files_by_debug_id", {}) + + def to_json(self) -> str: + bundles = [ + { + # NOTE: Symbolicator is using the `bundle_id` as the `?download=...` + # parameter it passes to the artifact-lookup API to download the + # linked bundle from, so this has to match whatever download_id + # the artifact-lookup API accepts. + "bundle_id": f"artifact_bundle/{bundle.id}", + "timestamp": datetime.isoformat(bundle.timestamp), + } + for bundle in self._bundles + ] + json_idx: Dict[str, Any] = { + "bundles": bundles, + "files_by_url": self._files_by_url, + "files_by_debug_id": self._files_by_debug_id, + } + + return json.dumps(json_idx) + + def merge_urls(self, bundle_meta: BundleMeta, bundle_archive: ArtifactBundleArchive): + bundle_index = self._add_or_update_bundle(bundle_meta) + if bundle_index is None: + return + + for url in bundle_archive.get_all_urls(): + self._add_sorted_entry(self._files_by_url, url, bundle_index) + + def merge_debug_ids(self, bundle_meta: BundleMeta, bundle_archive: ArtifactBundleArchive): + bundle_index = self._add_or_update_bundle(bundle_meta) + if bundle_index is None: + return + + for debug_id, _ in bundle_archive.get_all_debug_ids(): + self._add_sorted_entry(self._files_by_debug_id, debug_id, bundle_index) + + def _add_or_update_bundle(self, bundle_meta: BundleMeta) -> Optional[int]: + index_and_bundle_meta = self._index_and_bundle_meta_for_id(bundle_meta.id) + if index_and_bundle_meta is None: + self._bundles.append(bundle_meta) + return len(self._bundles) - 1 + + found_bundle_index, found_bundle_meta = index_and_bundle_meta + # In case the new bundle is exactly the same, we will not update, since it's unnecessary. + if found_bundle_meta == bundle_meta: + return None + else: + # TODO: it might be possible to optimize updating and re-sorting + # an existing bundle + self._bundles[found_bundle_index] = bundle_meta + return found_bundle_index + + def _add_sorted_entry(self, collection: Dict[T, List[int]], key: T, bundle_index: int): + entries = collection.get(key, []) + entries.append(bundle_index) + # Remove duplicates by doing a roundtrip through `set`. + entries = list(set(entries)) + # Symbolicator will consider the newest element the last element of the list. + entries.sort(key=lambda index: (self._bundles[index].timestamp, self._bundles[index].id)) + collection[key] = entries + + def remove(self, artifact_bundle_id: int) -> bool: + index_and_bundle_meta = self._index_and_bundle_meta_for_id(artifact_bundle_id) + if index_and_bundle_meta is None: + return False + + found_bundle_index, _ = index_and_bundle_meta + self._files_by_url = self._update_bundle_references(self._files_by_url, found_bundle_index) + self._files_by_debug_id = self._update_bundle_references( + self._files_by_debug_id, found_bundle_index + ) + self._bundles.pop(found_bundle_index) + + return True + + @staticmethod + def _update_bundle_references(collection: Dict[T, List[int]], removed_bundle_index: int): + updated_collection: Dict[T, List[int]] = {} + + for key, indexes in collection.items(): + updated_indexes = [ + index if index < removed_bundle_index else index - 1 + for index in indexes + if index != removed_bundle_index + ] + + # Only if we have some indexes we want to keep the key. + if len(updated_indexes) > 0: + updated_collection[key] = updated_indexes + + return updated_collection + + def _index_and_bundle_meta_for_id( + self, artifact_bundle_id: int + ) -> Optional[Tuple[int, BundleMeta]]: + for index, bundle in enumerate(self._bundles): + if bundle.id == artifact_bundle_id: + return index, bundle + + return None diff --git a/src/sentry/features/__init__.py b/src/sentry/features/__init__.py index 9a5ac4a9e2a76b..748da5a6a42b06 100644 --- a/src/sentry/features/__init__.py +++ b/src/sentry/features/__init__.py @@ -260,6 +260,7 @@ default_manager.add("organizations:ds-org-recalibration", OrganizationFeature, FeatureHandlerStrategy.INTERNAL) default_manager.add("organizations:slack-use-new-lookup", OrganizationFeature, FeatureHandlerStrategy.REMOTE) default_manager.add("organizations:disable-on-broken", OrganizationFeature, FeatureHandlerStrategy.REMOTE) +default_manager.add("organizations:sourcemaps-bundle-flat-file-indexing", OrganizationFeature, FeatureHandlerStrategy.REMOTE) # Project scoped features default_manager.add("projects:alert-filters", ProjectFeature, FeatureHandlerStrategy.INTERNAL) diff --git a/src/sentry/models/artifactbundle.py b/src/sentry/models/artifactbundle.py index f736c5e7681ec3..ec185de03fb78e 100644 --- a/src/sentry/models/artifactbundle.py +++ b/src/sentry/models/artifactbundle.py @@ -132,26 +132,6 @@ class Meta: index_together = (("project_id", "release_name", "dist_name"),) - @classmethod - def create_flat_file_index( - cls, project_id: int, release: str, dist: str, file_contents: Optional[str] - ) -> "ArtifactBundleFlatFileIndex": - from sentry.models import File - - with atomic_transaction( - using=(router.db_for_write(File), router.db_for_write(ArtifactBundleFlatFileIndex)) - ): - # By default, we can create a flat index file which has not `File` object bound to it. - file = None - if file_contents: - file = cls._create_flat_file_index_object(file_contents) - - index = ArtifactBundleFlatFileIndex.objects.create( - project_id=project_id, release_name=release, dist_name=dist, flat_file_index=file - ) - - return index - def update_flat_file_index(self, file_contents: str): from sentry.models import File @@ -212,8 +192,8 @@ def compare_state_and_set( updated_rows = FlatFileIndexState.objects.filter( flat_file_index_id=flat_file_index_id, artifact_bundle_id=artifact_bundle_id, - indexing_state=indexing_state, - ).update(indexing_state=new_indexing_state, date_added=timezone.now()) + indexing_state=indexing_state.value, + ).update(indexing_state=new_indexing_state.value, date_added=timezone.now()) # If we had one row being updated, it means that the cas operation succeeded. return updated_rows == 1 @@ -302,8 +282,12 @@ class ArtifactBundleArchive: def __init__(self, fileobj: IO, build_memory_map: bool = True): self._fileobj = fileobj self._zip_file = zipfile.ZipFile(self._fileobj) + self._entries_by_debug_id = {} + self._entries_by_url = {} + self.manifest = self._read_manifest() self.artifact_count = len(self.manifest.get("files", {})) + if build_memory_map: self._build_memory_maps() @@ -342,9 +326,6 @@ def normalize_debug_id(debug_id: Optional[str]) -> Optional[str]: return None def _build_memory_maps(self): - self._entries_by_debug_id = {} - self._entries_by_url = {} - files = self.manifest.get("files", {}) for file_path, info in files.items(): # Building the map for debug_id lookup. @@ -367,6 +348,15 @@ def _build_memory_maps(self): # Building the map for url lookup. self._entries_by_url[info.get("url")] = (file_path, info) + def get_all_urls(self): + return self._entries_by_url.keys() + + def get_all_debug_ids(self): + return self._entries_by_debug_id.keys() + + def has_debug_ids(self): + return len(self._entries_by_debug_id) > 0 + def extract_debug_ids_from_manifest( self, ) -> Set[Tuple[SourceFileType, str]]: diff --git a/src/sentry/tasks/assemble.py b/src/sentry/tasks/assemble.py index a60aea3f3bc971..95b467b991540a 100644 --- a/src/sentry/tasks/assemble.py +++ b/src/sentry/tasks/assemble.py @@ -13,9 +13,14 @@ from django.db.models import Q from django.utils import timezone -from sentry import analytics, options +from sentry import analytics, features, options from sentry.api.serializers import serialize from sentry.cache import default_cache +from sentry.debug_files.artifact_bundle_indexing import ( + BundleMeta, + mark_bundle_for_flat_file_indexing, + update_artifact_bundle_index, +) from sentry.debug_files.artifact_bundles import index_artifact_bundles_for_release from sentry.models import File, Organization, Release, ReleaseFile from sentry.models.artifactbundle import ( @@ -457,7 +462,7 @@ def __init__( self.organization = organization self.release = release self.dist = dist - self.projects_ids = project_ids + self.project_ids = project_ids def _validate_bundle(self): self.archive = ArtifactBundleArchive(self.assemble_result.bundle_temp_file) @@ -498,7 +503,7 @@ def _create_artifact_bundle(self) -> None: analytics.record( "artifactbundle.manifest_extracted", organization_id=self.organization.id, - project_ids=self.projects_ids, + project_ids=self.project_ids, has_debug_ids=len(debug_ids_with_types) > 0, ) @@ -544,7 +549,7 @@ def _create_artifact_bundle(self) -> None: defaults=new_date_added, ) - for project_id in self.projects_ids: + for project_id in self.project_ids: ProjectArtifactBundle.objects.create_or_update( organization_id=self.organization.id, project_id=project_id, @@ -594,6 +599,12 @@ def _create_artifact_bundle(self) -> None: date_snapshot=date_snapshot, ) + if features.has("organizations:sourcemaps-bundle-flat-file-indexing", self.organization): + try: + self._index_bundle_into_flat_file(artifact_bundle) + except Exception as e: + sentry_sdk.capture_exception(e) + @sentry_sdk.tracing.trace def _create_or_update_artifact_bundle( self, bundle_id: str, date_added: datetime @@ -734,6 +745,25 @@ def _index_bundle_if_needed(self, release: str, dist: str, date_snapshot: dateti metrics.incr("tasks.assemble.artifact_bundle.index_artifact_bundles_error") sentry_sdk.capture_exception(e) + @sentry_sdk.tracing.trace + def _index_bundle_into_flat_file(self, artifact_bundle: ArtifactBundle): + identifiers = mark_bundle_for_flat_file_indexing( + artifact_bundle, self.project_ids, self.release, self.dist + ) + + bundle_meta = BundleMeta( + id=artifact_bundle.id, + # We give priority to the date last modified for total ordering. + timestamp=(artifact_bundle.date_last_modified or artifact_bundle.date_uploaded), + ) + + for identifier in identifiers: + try: + update_artifact_bundle_index(bundle_meta, self.archive, identifier) + except Exception as e: + metrics.incr("artifact_bundle_flat_file_indexing.error_when_indexing") + sentry_sdk.capture_exception(e) + def prepare_post_assembler( assemble_result: AssembleResult, diff --git a/tests/sentry/debug_files/test_artifact_bundle_indexing.py b/tests/sentry/debug_files/test_artifact_bundle_indexing.py new file mode 100644 index 00000000000000..90b68b0f6a03db --- /dev/null +++ b/tests/sentry/debug_files/test_artifact_bundle_indexing.py @@ -0,0 +1,508 @@ +import zipfile +from datetime import timedelta +from io import BytesIO +from typing import Any, Dict +from unittest.mock import patch + +from django.core.files.base import ContentFile +from django.utils import timezone +from freezegun import freeze_time + +from sentry.debug_files.artifact_bundle_indexing import ( + BundleMeta, + FlatFileIndex, + mark_bundle_for_flat_file_indexing, + update_artifact_bundle_index, +) +from sentry.models import File +from sentry.models.artifactbundle import ( + ArtifactBundle, + ArtifactBundleArchive, + ArtifactBundleFlatFileIndex, +) +from sentry.testutils.cases import TestCase +from sentry.utils import json + + +def make_compressed_zip_file(files): + def remove_and_return(dictionary, key): + dictionary.pop(key) + return dictionary + + compressed = BytesIO() + with zipfile.ZipFile(compressed, mode="w") as zip_file: + for file_path, info in files.items(): + zip_file.writestr(file_path, bytes(info["content"])) + + zip_file.writestr( + "manifest.json", + json.dumps( + { + # We remove the "content" key in the original dict, thus no subsequent calls should be made. + "files": { + file_path: remove_and_return(info, "content") + for file_path, info in files.items() + } + } + ), + ) + compressed.seek(0) + + return compressed.getvalue() + + +class FlatFileTestCase(TestCase): + def mock_artifact_bundle(self, manifest): + file = make_compressed_zip_file(manifest) + + file_obj = File.objects.create(name="bundle.zip", type="artifact.bundle") + file_obj.putfile(ContentFile(file)) + + now = timezone.now() + + return ArtifactBundle.objects.create( + organization_id=self.organization.id, + bundle_id="5b14b2e6-141a-4584-a1bf-3b306af9d846", + file=file_obj, + artifact_count=10, + date_uploaded=now, + date_added=now, + date_last_modified=now, + ) + + def mock_artifact_bundle_flat_file_index(self, release, dist, file_contents): + index = ArtifactBundleFlatFileIndex.objects.create( + project_id=self.project.id, + release=release, + dist=dist, + ) + index.update_flat_file_index(file_contents) + return index + + @staticmethod + def mock_flat_file_index(): + return { + "bundles": [ + BundleMeta(id=1, timestamp=timezone.now() - timedelta(hours=1)), + BundleMeta(id=2, timestamp=timezone.now()), + ], + "files_by_url": {"~/app.js": [0], "~/main.js": [1, 0]}, + } + + +class FlatFileIndexingTest(FlatFileTestCase): + def mock_simple_artifact_bundle(self, with_debug_ids: bool = False): + manifest: Dict[str, Any] = { + "path/in/zip/foo": { + "url": "~/app.js", + "type": "minified_source", + "content": b"app_js", + }, + "path/in/zip/bar": { + "url": "~/main.js", + "content": b"main_js", + "type": "minified_source", + }, + } + + if with_debug_ids: + manifest["path/in/zip/foo"]["headers"] = { + "debug-id": "f206e0e7-3d0c-41cb-bccc-11b716728e27" + } + manifest["path/in/zip/bar"]["headers"] = { + "debug-id": "5c23c9a2-ffb8-49f4-8cc9-fbea9abe4493" + } + + return self.mock_artifact_bundle(manifest) + + def test_index_bundle_in_flat_file_with_only_release(self): + release = "1.0" + dist = "android" + + artifact_bundle = self.mock_simple_artifact_bundle() + + identifiers = mark_bundle_for_flat_file_indexing( + artifact_bundle, [self.project.id], release, dist + ) + bundle_meta = BundleMeta( + id=artifact_bundle.id, + timestamp=artifact_bundle.date_last_modified, + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as archive: + for identifier in identifiers: + update_artifact_bundle_index(bundle_meta, archive, identifier) + + assert ArtifactBundleFlatFileIndex.objects.get( + project_id=self.project.id, release_name=release, dist_name=dist + ) + + def test_index_bundle_in_flat_file_with_debug_ids(self): + artifact_bundle = self.mock_simple_artifact_bundle(with_debug_ids=True) + + identifiers = mark_bundle_for_flat_file_indexing( + artifact_bundle, [self.project.id], None, None + ) + bundle_meta = BundleMeta( + id=artifact_bundle.id, + timestamp=artifact_bundle.date_last_modified, + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as archive: + for identifier in identifiers: + update_artifact_bundle_index(bundle_meta, archive, identifier) + + assert ArtifactBundleFlatFileIndex.objects.get( + project_id=self.project.id, release_name="", dist_name="" + ) + + def test_index_bundle_in_flat_file_with_release_and_debug_ids(self): + release = "1.0" + dist = "android" + + artifact_bundle = self.mock_simple_artifact_bundle(with_debug_ids=True) + + identifiers = mark_bundle_for_flat_file_indexing( + artifact_bundle, [self.project.id], release, dist + ) + bundle_meta = BundleMeta( + id=artifact_bundle.id, + timestamp=artifact_bundle.date_last_modified, + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as archive: + for identifier in identifiers: + update_artifact_bundle_index(bundle_meta, archive, identifier) + + assert ArtifactBundleFlatFileIndex.objects.get( + project_id=self.project.id, release_name=release, dist_name=dist + ) + assert ArtifactBundleFlatFileIndex.objects.get( + project_id=self.project.id, release_name="", dist_name="" + ) + + @patch("sentry.debug_files.artifact_bundle_indexing.FlatFileIndex.to_json") + def test_index_bundle_in_flat_file_with_error(self, to_json): + to_json.side_effect = Exception + + artifact_bundle = self.mock_simple_artifact_bundle(with_debug_ids=True) + + identifiers = mark_bundle_for_flat_file_indexing( + artifact_bundle, [self.project.id], None, None + ) + bundle_meta = BundleMeta( + id=artifact_bundle.id, + timestamp=artifact_bundle.date_last_modified, + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as archive: + for identifier in identifiers: + try: + update_artifact_bundle_index(bundle_meta, archive, identifier) + except Exception: + pass + + index = ArtifactBundleFlatFileIndex.objects.get( + project_id=self.project.id, release_name="", dist_name="" + ) + assert index.load_flat_file_index() is None + + def test_index_bundle_in_flat_file_with_conflict(self): + release = "1.0" + dist = "android" + artifact_bundle = self.mock_simple_artifact_bundle(with_debug_ids=True) + + identifiers = mark_bundle_for_flat_file_indexing( + artifact_bundle, [self.project.id], release, dist + ) + bundle_meta = BundleMeta( + id=artifact_bundle.id, + timestamp=artifact_bundle.date_last_modified, + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as archive: + for identifier in identifiers: + update_artifact_bundle_index(bundle_meta, archive, identifier) + + # We run the indexing twice, it should still succeed + for identifier in identifiers: + update_artifact_bundle_index(bundle_meta, archive, identifier) + + assert ArtifactBundleFlatFileIndex.objects.filter( + project_id=self.project.id, release_name=release, dist_name=dist + ).exists() + assert ArtifactBundleFlatFileIndex.objects.filter( + project_id=self.project.id, release_name="", dist_name="" + ).exists() + + +@freeze_time("2023-07-13T10:00:00.000Z") +class FlatFileIndexTest(FlatFileTestCase): + def test_flat_file_index_with_no_index_stored_and_release(self): + artifact_bundle = self.mock_artifact_bundle( + { + "path/in/zip/foo": { + "url": "~/path/to/app.js", + "type": "minified_source", + "content": b"app_idx1", + }, + "path/in/zip/bar": { + "url": "~/path/to/other1.js", + "content": b"other1_idx1", + "type": "minified_source", + }, + } + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as bundle_archive: + flat_file_index = FlatFileIndex() + bundle_meta = BundleMeta( + id=artifact_bundle.id, timestamp=artifact_bundle.date_last_modified + ) + flat_file_index.merge_urls(bundle_meta, bundle_archive) + + assert json.loads(flat_file_index.to_json()) == { + "bundles": [ + { + "bundle_id": f"artifact_bundle/{artifact_bundle.id}", + "timestamp": "2023-07-13T10:00:00+00:00", + } + ], + "files_by_url": {"~/path/to/app.js": [0], "~/path/to/other1.js": [0]}, + "files_by_debug_id": {}, + } + + def test_flat_file_index_with_no_index_stored_and_debug_ids(self): + artifact_bundle = self.mock_artifact_bundle( + { + "path/in/zip/foo": { + "url": "~/path/to/app.js", + "type": "minified_source", + "content": b"app_idx1", + "headers": {"debug-id": "f206e0e7-3d0c-41cb-bccc-11b716728e27"}, + }, + "path/in/zip/bar": { + "url": "~/path/to/other1.js", + "content": b"other1_idx1", + "type": "minified_source", + "headers": {"debug-id": "016ac8b3-60cb-427f-829c-7f99c92a6a95"}, + }, + } + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as bundle_archive: + flat_file_index = FlatFileIndex() + bundle_meta = BundleMeta( + id=artifact_bundle.id, timestamp=artifact_bundle.date_last_modified + ) + flat_file_index.merge_debug_ids(bundle_meta, bundle_archive) + + assert json.loads(flat_file_index.to_json()) == { + "bundles": [ + { + "bundle_id": f"artifact_bundle/{artifact_bundle.id}", + "timestamp": "2023-07-13T10:00:00+00:00", + } + ], + "files_by_debug_id": { + "016ac8b3-60cb-427f-829c-7f99c92a6a95": [0], + "f206e0e7-3d0c-41cb-bccc-11b716728e27": [0], + }, + "files_by_url": {}, + } + + def test_flat_file_index_with_index_stored_and_release(self): + existing_bundle_id = 0 + existing_bundle_date = timezone.now() - timedelta(hours=1) + existing_json_index = { + "bundles": [ + { + "bundle_id": f"artifact_bundle/{existing_bundle_id}", + "timestamp": existing_bundle_date.isoformat(), + } + ], + "files_by_url": {"~/path/to/app.js": [0]}, + } + + artifact_bundle = self.mock_artifact_bundle( + { + "path/in/zip/foo": { + "url": "~/path/to/app.js", + "type": "minified_source", + "content": b"app_idx1", + }, + "path/in/zip/bar": { + "url": "~/path/to/other1.js", + "content": b"other1_idx1", + "type": "minified_source", + }, + } + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as bundle_archive: + flat_file_index = FlatFileIndex() + flat_file_index.from_json(json.dumps(existing_json_index)) + bundle_meta = BundleMeta( + id=artifact_bundle.id, timestamp=artifact_bundle.date_last_modified + ) + flat_file_index.merge_urls(bundle_meta, bundle_archive) + + assert json.loads(flat_file_index.to_json()) == { + "bundles": [ + { + "bundle_id": f"artifact_bundle/{existing_bundle_id}", + "timestamp": "2023-07-13T09:00:00+00:00", + }, + { + "bundle_id": f"artifact_bundle/{artifact_bundle.id}", + "timestamp": "2023-07-13T10:00:00+00:00", + }, + ], + "files_by_debug_id": {}, + "files_by_url": {"~/path/to/app.js": [0, 1], "~/path/to/other1.js": [1]}, + } + + def test_flat_file_index_with_index_stored_and_debug_ids(self): + existing_bundle_id = 0 + existing_bundle_date = timezone.now() - timedelta(hours=1) + existing_json_index = { + "bundles": [ + { + "bundle_id": f"artifact_bundle/{existing_bundle_id}", + "timestamp": existing_bundle_date.isoformat(), + } + ], + "files_by_debug_id": {"f206e0e7-3d0c-41cb-bccc-11b716728e27": [0]}, + } + + artifact_bundle = self.mock_artifact_bundle( + { + "path/in/zip/foo": { + "url": "~/path/to/app.js", + "type": "minified_source", + "content": b"app_idx1", + "headers": {"debug-id": "f206e0e7-3d0c-41cb-bccc-11b716728e27"}, + }, + "path/in/zip/bar": { + "url": "~/path/to/other1.js", + "content": b"other1_idx1", + "type": "minified_source", + "headers": {"debug-id": "016ac8b3-60cb-427f-829c-7f99c92a6a95"}, + }, + } + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as bundle_archive: + flat_file_index = FlatFileIndex() + flat_file_index.from_json(json.dumps(existing_json_index)) + bundle_meta = BundleMeta( + id=artifact_bundle.id, timestamp=artifact_bundle.date_last_modified + ) + flat_file_index.merge_debug_ids(bundle_meta, bundle_archive) + + assert json.loads(flat_file_index.to_json()) == { + "bundles": [ + { + "bundle_id": f"artifact_bundle/{existing_bundle_id}", + "timestamp": "2023-07-13T09:00:00+00:00", + }, + { + "bundle_id": f"artifact_bundle/{artifact_bundle.id}", + "timestamp": "2023-07-13T10:00:00+00:00", + }, + ], + "files_by_debug_id": { + "016ac8b3-60cb-427f-829c-7f99c92a6a95": [1], + "f206e0e7-3d0c-41cb-bccc-11b716728e27": [0, 1], + }, + "files_by_url": {}, + } + + def test_flat_file_index_remove_bundle(self): + now = timezone.now() + + existing_json_index = { + "bundles": [ + { + "bundle_id": f"artifact_bundle/{1234}", + "timestamp": (now - timedelta(hours=2)).isoformat(), + }, + { + "bundle_id": f"artifact_bundle/{5678}", + "timestamp": (now - timedelta(hours=1)).isoformat(), + }, + {"bundle_id": f"artifact_bundle/{9101112}", "timestamp": now.isoformat()}, + ], + "files_by_debug_id": { + "016ac8b3-60cb-427f-829c-7f99c92a6a95": [0], + "2a9e7ab2-50ba-43b5-a8fd-13f6ac1f5976": [0, 1], + "f206e0e7-3d0c-41cb-bccc-11b716728e27": [0, 1, 2], + "de02ba67-6820-423f-a1b2-00c7e7d3bc9c": [1, 2], + "c1e9ab1f-3745-44c8-be4b-aca3705c7c17": [2], + }, + } + + flat_file_index = FlatFileIndex() + flat_file_index.from_json(json.dumps(existing_json_index)) + flat_file_index.remove(5678) + + assert json.loads(flat_file_index.to_json()) == { + "bundles": [ + {"bundle_id": f"artifact_bundle/{1234}", "timestamp": "2023-07-13T08:00:00+00:00"}, + { + "bundle_id": f"artifact_bundle/{9101112}", + "timestamp": "2023-07-13T10:00:00+00:00", + }, + ], + "files_by_debug_id": { + "016ac8b3-60cb-427f-829c-7f99c92a6a95": [0], + "2a9e7ab2-50ba-43b5-a8fd-13f6ac1f5976": [0], + "f206e0e7-3d0c-41cb-bccc-11b716728e27": [0, 1], + "de02ba67-6820-423f-a1b2-00c7e7d3bc9c": [1], + "c1e9ab1f-3745-44c8-be4b-aca3705c7c17": [1], + }, + "files_by_url": {}, + } + + def test_flat_file_index_with_index_stored_and_duplicated_bundle(self): + existing_bundle_id = 0 + existing_bundle_date = timezone.now() - timedelta(hours=1) + existing_json_index = { + "bundles": [ + { + "bundle_id": f"artifact_bundle/{existing_bundle_id}", + "timestamp": existing_bundle_date.isoformat(), + } + ], + "files_by_url": {"~/path/to/app.js": [0]}, + } + + artifact_bundle = self.mock_artifact_bundle( + { + "path/in/zip/foo": { + "url": "~/path/to/app.js", + "type": "minified_source", + "content": b"app_idx1", + }, + } + ) + + with ArtifactBundleArchive(artifact_bundle.file.getfile()) as bundle_archive: + flat_file_index = FlatFileIndex() + flat_file_index.from_json(json.dumps(existing_json_index)) + # We use the id of the existing bundle. + bundle_meta = BundleMeta( + id=existing_bundle_id, timestamp=artifact_bundle.date_last_modified + ) + flat_file_index.merge_urls(bundle_meta, bundle_archive) + + assert json.loads(flat_file_index.to_json()) == { + "bundles": [ + { + "bundle_id": f"artifact_bundle/{existing_bundle_id}", + "timestamp": "2023-07-13T10:00:00+00:00", + } + ], + "files_by_url": {"~/path/to/app.js": [0]}, + "files_by_debug_id": {}, + } diff --git a/tests/sentry/models/test_artifactbundle.py b/tests/sentry/models/test_artifactbundle.py index 287c9fb6766a83..43866e6906ce37 100644 --- a/tests/sentry/models/test_artifactbundle.py +++ b/tests/sentry/models/test_artifactbundle.py @@ -30,12 +30,12 @@ def mock_flat_file_index_contents_with_debug_ids(): def test_artifact_bundle_flat_index_is_created(self): file_contents = self.mock_flat_file_index_contents_with_urls() - index = ArtifactBundleFlatFileIndex.create_flat_file_index( + index = ArtifactBundleFlatFileIndex.objects.create( project_id=self.project.id, - release="1.0", - dist="android", - file_contents=json.dumps(file_contents), + release_name="1.0", + dist_name="android", ) + index.update_flat_file_index(json.dumps(file_contents)) assert index.project_id == self.project.id assert index.release_name == "1.0" @@ -45,11 +45,10 @@ def test_artifact_bundle_flat_index_is_created(self): assert json.loads(flat_file_index) == file_contents def test_artifact_bundle_flat_index_is_created_with_empty_contents(self): - index = ArtifactBundleFlatFileIndex.create_flat_file_index( + index = ArtifactBundleFlatFileIndex.objects.create( project_id=self.project.id, - release="1.0", - dist="android", - file_contents=None, + release_name="1.0", + dist_name="android", ) assert index.project_id == self.project.id @@ -58,12 +57,12 @@ def test_artifact_bundle_flat_index_is_created_with_empty_contents(self): assert index.load_flat_file_index() is None def test_artifact_bundle_flat_index_is_updated(self): - index = ArtifactBundleFlatFileIndex.create_flat_file_index( + index = ArtifactBundleFlatFileIndex.objects.create( project_id=self.project.id, - release="1.0", - dist="android", - file_contents=json.dumps(self.mock_flat_file_index_contents_with_urls()), + release_name="1.0", + dist_name="android", ) + index.update_flat_file_index(json.dumps(self.mock_flat_file_index_contents_with_urls())) updated_file_contents = self.mock_flat_file_index_contents_with_debug_ids() index.update_flat_file_index(json.dumps(updated_file_contents)) diff --git a/tests/sentry/tasks/test_assemble.py b/tests/sentry/tasks/test_assemble.py index c9b78dfbc4fea0..7ed490c48bf03f 100644 --- a/tests/sentry/tasks/test_assemble.py +++ b/tests/sentry/tasks/test_assemble.py @@ -10,6 +10,7 @@ from sentry.models import File, FileBlob, FileBlobOwner, ReleaseFile from sentry.models.artifactbundle import ( ArtifactBundle, + ArtifactBundleFlatFileIndex, ArtifactBundleIndexingState, DebugIdArtifactBundle, ProjectArtifactBundle, @@ -666,6 +667,32 @@ def test_bundle_indexing_started_when_over_threshold(self, index_artifact_bundle dist=dist, ) + def test_bundle_flat_file_indexing(self): + release = "1.0" + dist = "android" + + bundle_file_1 = self.create_artifact_bundle_zip( + fixture_path="artifact_bundle_debug_ids", project=self.project.id + ) + blob1_1 = FileBlob.from_file(ContentFile(bundle_file_1)) + total_checksum_1 = sha1(bundle_file_1).hexdigest() + + with self.feature("organizations:sourcemaps-bundle-flat-file-indexing"): + assemble_artifacts( + org_id=self.organization.id, + project_ids=[self.project.id], + version=release, + dist=dist, + checksum=total_checksum_1, + chunks=[blob1_1.checksum], + upload_as_artifact_bundle=True, + ) + + flat_file_index = ArtifactBundleFlatFileIndex.objects.get( + project_id=self.project.id, release_name=release, dist_name=dist + ) + assert flat_file_index.load_flat_file_index() is not None + def test_artifacts_without_debug_ids(self): bundle_file = self.create_artifact_bundle_zip( org=self.organization.slug, release=self.release.version