Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Apply hidden_resources to suggest queries #2487

Merged
merged 2 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions nucliadb/src/nucliadb/ingest/orm/brain.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from nucliadb.common import ids
from nucliadb.ingest import logger
from nucliadb.ingest.orm.utils import compute_paragraph_key
from nucliadb_models.labels import BASE_LABELS, flatten_resource_labels
from nucliadb_models.labels import BASE_LABELS, LABEL_HIDDEN, flatten_resource_labels
from nucliadb_models.metadata import ResourceProcessingStatus
from nucliadb_protos import utils_pb2
from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
Expand Down Expand Up @@ -509,7 +509,8 @@ def _set_resource_labels(self, basic: Basic, origin: Optional[Origin]):

# hidden
if basic.hidden:
self.labels["q"].append("h")
_, p1, p2 = LABEL_HIDDEN.split("/")
self.labels[p1].append(p2)

self.compute_labels()

Expand Down
6 changes: 6 additions & 0 deletions nucliadb/src/nucliadb/search/api/v1/suggest.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from nucliadb.search.search.exceptions import InvalidQueryError
from nucliadb.search.search.merge import merge_suggest_results
from nucliadb.search.search.query import suggest_query_to_pb
from nucliadb.search.search.utils import filter_hidden_resources
from nucliadb_models.common import FieldTypeName
from nucliadb_models.resource import NucliaDBRoles
from nucliadb_models.search import (
Expand Down Expand Up @@ -81,6 +82,7 @@ async def suggest_knowledgebox(
x_forwarded_for: str = Header(""),
debug: bool = fastapi_query(SearchParamDefaults.debug),
highlight: bool = fastapi_query(SearchParamDefaults.highlight),
show_hidden: bool = fastapi_query(SearchParamDefaults.show_hidden, include_in_schema=False),
) -> Union[KnowledgeboxSuggestResults, HTTPClientError]:
try:
return await suggest(
Expand All @@ -102,6 +104,7 @@ async def suggest_knowledgebox(
x_forwarded_for,
debug,
highlight,
show_hidden,
)
except InvalidQueryError as exc:
return HTTPClientError(status_code=412, detail=str(exc))
Expand All @@ -126,8 +129,10 @@ async def suggest(
x_forwarded_for: str,
debug: bool,
highlight: bool,
show_hidden: bool,
) -> KnowledgeboxSuggestResults:
with cache.request_caches():
hidden = await filter_hidden_resources(kbid, show_hidden)
pb_query = suggest_query_to_pb(
features,
query,
Expand All @@ -138,6 +143,7 @@ async def suggest(
range_creation_end,
range_modification_start,
range_modification_end,
hidden,
)
results, incomplete_results, queried_nodes = await node_query(kbid, Method.SUGGEST, pb_query)

Expand Down
22 changes: 17 additions & 5 deletions nucliadb/src/nucliadb/search/search/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
)
from nucliadb.search.utilities import get_predict
from nucliadb_models.internal.predict import QueryInfo
from nucliadb_models.labels import translate_system_to_alias_label
from nucliadb_models.labels import LABEL_HIDDEN, translate_system_to_alias_label
from nucliadb_models.metadata import ResourceProcessingStatus
from nucliadb_models.search import (
Filter,
Expand Down Expand Up @@ -128,9 +128,9 @@ def __init__(
self.hidden = hidden
if self.hidden is not None:
if self.hidden:
label_filters.append(Filter(all=["/q/h"])) # type: ignore
label_filters.append(Filter(all=[LABEL_HIDDEN])) # type: ignore
else:
label_filters.append(Filter(none=["/q/h"])) # type: ignore
label_filters.append(Filter(none=[LABEL_HIDDEN])) # type: ignore

self.label_filters: dict[str, Any] = convert_to_node_filters(label_filters)
self.flat_label_filters: list[str] = []
Expand Down Expand Up @@ -709,6 +709,7 @@ def suggest_query_to_pb(
range_creation_end: Optional[datetime] = None,
range_modification_start: Optional[datetime] = None,
range_modification_end: Optional[datetime] = None,
hidden: Optional[bool] = None,
) -> nodereader_pb2.SuggestRequest:
request = nodereader_pb2.SuggestRequest()

Expand All @@ -718,10 +719,21 @@ def suggest_query_to_pb(

if SuggestOptions.PARAGRAPH in features:
request.features.append(nodereader_pb2.SuggestFeatures.PARAGRAPHS)
filters = [translate_label(fltr) for fltr in filters]
request.filter.field_labels.extend(filters)
request.fields.extend(fields)

if hidden is not None:
if hidden:
filters.append(Filter(all=[LABEL_HIDDEN])) # type: ignore
else:
filters.append(Filter(none=[LABEL_HIDDEN])) # type: ignore

expression = convert_to_node_filters(filters)
if expression:
expression = translate_label_filters(expression)

request.filter.field_labels.extend(flatten_filter_literals(expression))
request.filter.labels_expression = json.dumps(expression)

if range_creation_start is not None:
request.timestamps.from_created.FromDatetime(range_creation_start)
if range_creation_end is not None:
Expand Down
2 changes: 1 addition & 1 deletion nucliadb/tests/nucliadb/integration/search/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ async def _test_filtering(nucliadb_reader: AsyncClient, kbid: str, filters):
]

# Check that only the expected paragraphs were returned
assert len(paragraphs) == len(expected_paragraphs)
assert len(paragraphs) == len(expected_paragraphs), f"{filters}\n{paragraphs}\n{expected_paragraphs}"
not_yet_found = expected_paragraphs.copy()
for par in paragraphs:
if par["text"] not in expected_paragraphs:
Expand Down
16 changes: 14 additions & 2 deletions nucliadb/tests/nucliadb/integration/search/test_hidden.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@

from nucliadb.common.context import ApplicationContext
from nucliadb_protos.writer_pb2_grpc import WriterStub
from tests.utils import broker_resource, inject_message
from tests.utils import broker_resource_with_title_paragraph, inject_message


async def create_resource(kbid, nucliadb_grpc):
message = broker_resource(kbid)
message = broker_resource_with_title_paragraph(kbid)
await inject_message(nucliadb_grpc, message)
return message.uuid

Expand All @@ -53,6 +53,10 @@ async def test_hidden_search(
assert resp.status_code == 200
assert resp.json()["resources"].keys() == {r1, r2}

resp = await nucliadb_reader.get(f"/kb/{knowledgebox}/suggest?query=title")
assert resp.status_code == 200
assert set([r["rid"] for r in resp.json()["paragraphs"]["results"]]) == {r1, r2}

# Hide r1
resp = await nucliadb_writer.patch(f"/kb/{knowledgebox}/resource/{r1}", json={"hidden": True})
assert resp.status_code == 200
Expand All @@ -63,11 +67,19 @@ async def test_hidden_search(
assert resp.status_code == 200
assert resp.json()["resources"].keys() == {r2}

resp = await nucliadb_reader.get(f"/kb/{knowledgebox}/suggest?query=title")
assert resp.status_code == 200
assert set([r["rid"] for r in resp.json()["paragraphs"]["results"]]) == {r2}

# Unless show_hidden is passed, then both resources are returned
resp = await nucliadb_reader.get(f"/kb/{knowledgebox}/search?show_hidden=true")
assert resp.status_code == 200
assert resp.json()["resources"].keys() == {r1, r2}

resp = await nucliadb_reader.get(f"/kb/{knowledgebox}/suggest?query=title&show_hidden=true")
assert resp.status_code == 200
assert set([r["rid"] for r in resp.json()["paragraphs"]["results"]]) == {r1, r2}


@pytest.fixture()
async def app_context(natsd, storage, nucliadb):
Expand Down
52 changes: 52 additions & 0 deletions nucliadb/tests/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,58 @@ def broker_resource(kbid: str, rid=None, slug=None, title=None, summary=None) ->
return bm


def broker_resource_with_title_paragraph(
kbid: str, rid=None, slug=None, title=None, summary=None
) -> BrokerMessage:
"""
Returns a broker resource with barebones metadata.
"""
rid = rid or str(uuid.uuid4()).replace("-", "")
slug = slug or f"{rid}slug1"
bm: BrokerMessage = BrokerMessage(
kbid=kbid,
uuid=rid,
slug=slug,
type=BrokerMessage.AUTOCOMMIT,
)
title = title or "Title Resource"
summary = summary or "Summary of document"
bm.basic.icon = "text/plain"
bm.basic.title = title
bm.basic.summary = summary
bm.basic.thumbnail = "doc"
bm.basic.metadata.useful = True
bm.basic.metadata.language = "es"
bm.basic.created.FromDatetime(datetime.now())
bm.basic.modified.FromDatetime(datetime.now())
bm.origin.source = rpb.Origin.Source.WEB

etw = rpb.ExtractedTextWrapper()
etw.body.text = title
etw.field.field = "title"
etw.field.field_type = rpb.FieldType.GENERIC
bm.extracted_text.append(etw)

fcm = rpb.FieldComputedMetadataWrapper()
fcm.field.field = "title"
fcm.field.field_type = rpb.FieldType.GENERIC
p1 = rpb.Paragraph(
start=0,
end=5,
)
fcm.metadata.metadata.paragraphs.append(p1)
bm.field_metadata.append(fcm)

etw = rpb.ExtractedTextWrapper()
etw.body.text = summary
etw.field.field = "summary"
etw.field.field_type = rpb.FieldType.GENERIC
bm.extracted_text.append(etw)

bm.source = BrokerMessage.MessageSource.WRITER
return bm


async def inject_message(writer: WriterStub, message: BrokerMessage):
resp = await writer.ProcessMessage([message]) # type: ignore
assert resp.status == OpStatusWriter.Status.OK
2 changes: 2 additions & 0 deletions nucliadb_models/src/nucliadb_models/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@

LABEL_QUERY_ALIASES_REVERSED = {v: k for k, v in LABEL_QUERY_ALIASES.items()}

LABEL_HIDDEN = "/q/h"


def translate_alias_to_system_label(label: str) -> str:
parts = label.split("/")
Expand Down
31 changes: 24 additions & 7 deletions nucliadb_node/src/shards/shard_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ use nucliadb_core::protos::{
ShardFile, ShardFileChunk, ShardFileList, StreamRequest, SuggestFeatures, SuggestRequest, SuggestResponse,
VectorSearchRequest, VectorSearchResponse,
};
use nucliadb_core::query_language;
use nucliadb_core::query_language::BooleanExpression;
use nucliadb_core::query_language::BooleanOperation;
use nucliadb_core::query_language::Operator;
Expand Down Expand Up @@ -372,29 +373,45 @@ impl ShardReader {
// Prefilter to apply field label filters
if let Some(filter) = &mut request.filter {
if !filter.field_labels.is_empty() && suggest_paragraphs {
let labels = std::mem::take(&mut filter.field_labels);
let operands = labels.into_iter().map(BooleanExpression::Literal).collect();
let op = BooleanOperation {
operator: Operator::And,
operands,
let labels_formula = if filter.labels_expression.is_empty() {
// Backwards compatibility, take all labels to be AND'ed together
let labels = std::mem::take(&mut filter.field_labels);
let operands = labels.into_iter().map(BooleanExpression::Literal).collect();
let op = BooleanOperation {
operator: Operator::And,
operands,
};

Some(BooleanExpression::Operation(op))
} else {
// Parse the formula for labels, suggest only supports resource labels
let context = query_language::QueryContext {
field_labels: filter.field_labels.iter().cloned().collect(),
paragraph_labels: HashSet::new(),
};
let analysis = query_language::translate(Some(&filter.labels_expression), None, &context)?;
analysis.labels_prefilter_query
};

let prefilter = PreFilterRequest {
timestamp_filters: vec![],
security: None,
labels_formula: Some(BooleanExpression::Operation(op)),
labels_formula,
keywords_formula: None,
};

let prefiltered = read_rw_lock(&self.text_reader).prefilter(&prefilter)?;

// Apply prefilter to paragraphs query
// Apply prefilter to paragraphs query and clear filters
match prefiltered.valid_fields {
query_planner::ValidFieldCollector::All => {}
query_planner::ValidFieldCollector::Some(keys) => {
request.key_filters = keys.iter().map(|v| format!("{}{}", v.resource_id, v.field_id)).collect()
}
query_planner::ValidFieldCollector::None => suggest_paragraphs = false,
}
filter.labels_expression.clear();
filter.field_labels.clear();
}
}

Expand Down
36 changes: 36 additions & 0 deletions nucliadb_node/tests/test_suggest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,42 @@ async fn test_suggest_paragraphs(
paragraph_labels: vec![],
..Default::default()
}),
..request.clone()
}))
.await
.unwrap()
.into_inner();

expect_paragraphs(&response, &[]);

// Inverted filter
let response = reader
.suggest(Request::new(SuggestRequest {
filter: Some(Filter {
field_labels: vec!["/s/p/de".to_string()],
labels_expression: r#"{"not": {"literal": "/s/p/de"}}"#.to_string(),
paragraph_labels: vec![],
..Default::default()
}),
..request.clone()
}))
.await
.unwrap()
.into_inner();

expect_paragraphs(
&response,
&[(&shard.resources["little prince"], "/a/title"), (&shard.resources["little prince"], "/a/summary")],
);

let response = reader
.suggest(Request::new(SuggestRequest {
filter: Some(Filter {
field_labels: vec!["/s/p/en".to_string()],
labels_expression: r#"{"not": {"literal": "/s/p/en"}}"#.to_string(),
paragraph_labels: vec![],
..Default::default()
}),
..request
}))
.await
Expand Down
Loading