From aa3698c4be4a9fbbc9e419669d17ed23fa63a7c2 Mon Sep 17 00:00:00 2001 From: Jodi Jang Date: Thu, 12 Sep 2024 13:37:11 -0700 Subject: [PATCH 1/2] chore(similarity): Add span to individual record insert --- src/seer/grouping/grouping.py | 78 ++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/src/seer/grouping/grouping.py b/src/seer/grouping/grouping.py index 56115c7d3..472262e3c 100644 --- a/src/seer/grouping/grouping.py +++ b/src/seer/grouping/grouping.py @@ -424,46 +424,50 @@ def insert_batch_grouping_records( ) with Session() as session: for i, entry in enumerate(data.data): - embedding = embeddings[i].astype("float32") - nearest_neighbor = self.query_nearest_k_neighbors( - session, - embedding, - entry.project_id, - entry.hash, - data.threshold, - data.k, - data.hnsw_candidates, - data.hnsw_distance, - data.use_reranking, - ) - - if nearest_neighbor: - neighbor, distance = nearest_neighbor[0][0], nearest_neighbor[0][1] - message_similarity_score = difflib.SequenceMatcher( - None, entry.message, neighbor.message - ).ratio() - response = GroupingResponse( - parent_hash=neighbor.hash, - stacktrace_distance=distance, - message_distance=1.0 - message_similarity_score, - should_group=True, - ) - groups_with_neighbor[str(entry.group_id)] = response - else: - insert_stmt = insert(DbGroupingRecord).values( - project_id=entry.project_id, - message=entry.message, - error_type=entry.exception_type, - hash=entry.hash, - stacktrace_embedding=embedding, + with sentry_sdk.start_span( + op="seer.grouping", description="insert single grouping record" + ) as span: + span.set_data("stacktrace_len", len(data.stacktrace_list[i])) + embedding = embeddings[i].astype("float32") + nearest_neighbor = self.query_nearest_k_neighbors( + session, + embedding, + entry.project_id, + entry.hash, + data.threshold, + data.k, + data.hnsw_candidates, + data.hnsw_distance, + data.use_reranking, ) - session.execute( - insert_stmt.on_conflict_do_nothing( - index_elements=(DbGroupingRecord.project_id, DbGroupingRecord.hash) + if nearest_neighbor: + neighbor, distance = nearest_neighbor[0][0], nearest_neighbor[0][1] + message_similarity_score = difflib.SequenceMatcher( + None, entry.message, neighbor.message + ).ratio() + response = GroupingResponse( + parent_hash=neighbor.hash, + stacktrace_distance=distance, + message_distance=1.0 - message_similarity_score, + should_group=True, ) - ) - session.commit() + groups_with_neighbor[str(entry.group_id)] = response + else: + insert_stmt = insert(DbGroupingRecord).values( + project_id=entry.project_id, + message=entry.message, + error_type=entry.exception_type, + hash=entry.hash, + stacktrace_embedding=embedding, + ) + + session.execute( + insert_stmt.on_conflict_do_nothing( + index_elements=(DbGroupingRecord.project_id, DbGroupingRecord.hash) + ) + ) + session.commit() return groups_with_neighbor From eb4af9740c78829a36a3d0290274715ab81aa127 Mon Sep 17 00:00:00 2001 From: Jodi Jang Date: Tue, 17 Sep 2024 10:06:20 -0700 Subject: [PATCH 2/2] chore(similarity): Make message column nullable in grouping records --- .../versions/d0d1d2df4540_migration.py | 31 +++++++++++++++++++ src/seer/db.py | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 src/migrations/versions/d0d1d2df4540_migration.py diff --git a/src/migrations/versions/d0d1d2df4540_migration.py b/src/migrations/versions/d0d1d2df4540_migration.py new file mode 100644 index 000000000..e156a30f2 --- /dev/null +++ b/src/migrations/versions/d0d1d2df4540_migration.py @@ -0,0 +1,31 @@ +"""Migration + +Revision ID: d0d1d2df4540 +Revises: da0a9c9f1bb4 +Create Date: 2024-09-17 17:03:02.000258 + +""" +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "d0d1d2df4540" +down_revision = "da0a9c9f1bb4" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("grouping_records", schema=None) as batch_op: + batch_op.alter_column("message", existing_type=sa.VARCHAR(), nullable=True) + + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("grouping_records", schema=None) as batch_op: + batch_op.alter_column("message", existing_type=sa.VARCHAR(), nullable=False) + + # ### end Alembic commands ### diff --git a/src/seer/db.py b/src/seer/db.py index a66ecddf8..0b6e63ef3 100644 --- a/src/seer/db.py +++ b/src/seer/db.py @@ -256,7 +256,7 @@ class DbGroupingRecord(Base): server_default=text("nextval('grouping_records_id_seq')"), ) project_id: Mapped[int] = mapped_column(BigInteger, primary_key=True, nullable=False) - message: Mapped[str] = mapped_column(String, nullable=False) + message: Mapped[Optional[str]] = mapped_column(String, nullable=True) error_type: Mapped[str] = mapped_column(String, nullable=True) stacktrace_embedding: Mapped[Vector] = mapped_column(Vector(768), nullable=False) hash: Mapped[str] = mapped_column(String(32), nullable=False)