Skip to content

Commit

Permalink
Merge branch 'mainline' into li/patch-parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
wanliAlex authored Sep 17, 2024
2 parents 9b630cb + 02d2010 commit f6f9f45
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 32 deletions.
7 changes: 2 additions & 5 deletions .github/workflows/cloud-integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ on:
with the prefix "test_index". Even those that were not created by this workflow.'
required: true
default: 'run_integration_tests'
pull_request:
branches:
- mainline
- 'releases/*'
# allows other workflows to reuse these unit tests:
workflow_call:

Expand All @@ -37,8 +33,9 @@ jobs:
steps:

- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v3
with:
ref: ${{ github.head_ref || github.ref }} # Checkout the current branch or PR head
fetch-depth: 0

- name: Set up Python 3.8
Expand Down
85 changes: 58 additions & 27 deletions tests/v2_tests/test_hybrid_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,9 +111,10 @@ def test_hybrid_search_searchable_attributes(self):
)
self.assertEqual(len(hybrid_res["hits"]),
3) # Only 3 documents have text field 2. Tensor retrieval will get them all.
self.assertEqual(hybrid_res["hits"][0]["_id"], "doc12")
self.assertEqual(hybrid_res["hits"][1]["_id"], "doc11")
self.assertEqual(hybrid_res["hits"][2]["_id"], "doc13")
# TODO: Put these checks back when lexical search with replicas is consistent.
# self.assertEqual(hybrid_res["hits"][0]["_id"], "doc12")
# self.assertEqual(hybrid_res["hits"][1]["_id"], "doc11")
# self.assertEqual(hybrid_res["hits"][2]["_id"], "doc13")

def test_hybrid_search_with_custom_vector_query(self):
"""
Expand Down Expand Up @@ -192,8 +193,9 @@ def test_hybrid_search_same_retrieval_and_ranking_matches_original_method(self):
)

self.assertEqual(len(hybrid_res["hits"]), len(base_res["hits"]))
for i in range(len(hybrid_res["hits"])):
self.assertEqual(hybrid_res["hits"][i]["_id"], base_res["hits"][i]["_id"])
# TODO: Put these checks back when lexical search with replicas is consistent.
#for i in range(len(hybrid_res["hits"])):
# self.assertEqual(hybrid_res["hits"][i]["_id"], base_res["hits"][i]["_id"])

def test_hybrid_search_with_filter(self):
"""
Expand Down Expand Up @@ -232,7 +234,7 @@ def test_hybrid_search_with_filter(self):
self.assertEqual(len(hybrid_res["hits"]), 1)
self.assertEqual(hybrid_res["hits"][0]["_id"], "doc8")

def test_hybrid_search_rrf_with_replicas_has_no_duplicates(self):
def test_hybrid_search_structured_rrf_with_replicas_has_no_duplicates(self):
"""
Tests that show that running 100 searches on indexes with 3 replicas (structured text & unstructured text)
will not have duplicates in results.
Expand All @@ -242,29 +244,58 @@ def test_hybrid_search_rrf_with_replicas_has_no_duplicates(self):
if not self.client.config.is_marqo_cloud:
self.skipTest("Test is not relevant for non-Marqo Cloud instances")

index_test_cases = [CloudTestIndex.structured_text, CloudTestIndex.unstructured_text]
for cloud_test_index_to_use in index_test_cases:
test_index_name = self.get_test_index_name(
cloud_test_index_to_use=cloud_test_index_to_use,
open_source_test_index_name=None
)
self.client.index(test_index_name).add_documents(
self.docs_list,
tensor_fields=["text_field_1", "text_field_2", "text_field_3"] \
if "unstr" in test_index_name else None
# Split into 2 separate blocks to unblock (looping error occurring)
cloud_test_index_to_use = CloudTestIndex.structured_text
test_index_name = self.get_test_index_name(
cloud_test_index_to_use=cloud_test_index_to_use,
open_source_test_index_name=None
)
print(f"Running test for index: {test_index_name}", flush=True)
add_docs_res = self.client.index(test_index_name).add_documents(self.docs_list)
print(f"Add docs result: {add_docs_res}", flush=True)
for _ in range(100):
hybrid_res = self.client.index(test_index_name).search(
"dogs",
search_method="HYBRID",
limit=10
)

for _ in range(100):
hybrid_res = self.client.index(test_index_name).search(
"dogs",
search_method="HYBRID",
limit=10
)
# check for duplicates
hit_ids = [hit["_id"] for hit in hybrid_res["hits"]]
self.assertEqual(len(hit_ids), len(set(hit_ids)),
f"Duplicates found in results. Only {len(set(hit_ids))} unique results out of "
f"{len(hit_ids)}")

# check for duplicates
hit_ids = [hit["_id"] for hit in hybrid_res["hits"]]
self.assertEqual(len(hit_ids), len(set(hit_ids)),
f"Duplicates found in results. Only {len(set(hit_ids))} unique results out of "
f"{len(hit_ids)}")
def test_hybrid_search_unstructured_rrf_with_replicas_has_no_duplicates(self):
"""
Tests that show that running 100 searches on indexes with 3 replicas (structured text & unstructured text)
will not have duplicates in results.
Only relevant for cloud tests.
"""

if not self.client.config.is_marqo_cloud:
self.skipTest("Test is not relevant for non-Marqo Cloud instances")

cloud_test_index_to_use = CloudTestIndex.unstructured_text
test_index_name = self.get_test_index_name(
cloud_test_index_to_use=cloud_test_index_to_use,
open_source_test_index_name=None
)
print(f"Running test for index: {test_index_name}", flush=True)
add_docs_res = self.client.index(test_index_name).add_documents(
self.docs_list,
tensor_fields=["text_field_1", "text_field_2", "text_field_3"]
)
print(f"Add docs result: {add_docs_res}", flush=True)
for _ in range(100):
hybrid_res = self.client.index(test_index_name).search(
"dogs",
search_method="HYBRID",
limit=10
)

# check for duplicates
hit_ids = [hit["_id"] for hit in hybrid_res["hits"]]
self.assertEqual(len(hit_ids), len(set(hit_ids)),
f"Duplicates found in results. Only {len(set(hit_ids))} unique results out of "
f"{len(hit_ids)}")

0 comments on commit f6f9f45

Please sign in to comment.