Commit b0df20f: wip
javitonino committed Sep 18, 2024
1 parent f5e49f8
Showing 11 changed files with 186 additions and 191 deletions.
1 change: 1 addition & 0 deletions nucliadb_models/src/nucliadb_models/internal/shards.py
@@ -33,6 +33,7 @@ class DocumentServiceEnum(str, Enum):
DOCUMENT_V0 = "DOCUMENT_V0"
DOCUMENT_V1 = "DOCUMENT_V1"
DOCUMENT_V2 = "DOCUMENT_V2"
DOCUMENT_V3 = "DOCUMENT_V3"


class ParagraphServiceEnum(str, Enum):
1 change: 1 addition & 0 deletions nucliadb_node/src/shards/shard_reader.rs
@@ -156,6 +156,7 @@ impl ShardReader {
0 => DocumentService::DocumentV0,
1 => DocumentService::DocumentV1,
2 => DocumentService::DocumentV2,
3 => DocumentService::DocumentV3,
i => panic!("Unknown document version {i}"),
}
}
1 change: 1 addition & 0 deletions nucliadb_node/src/shards/shard_writer.rs
@@ -72,6 +72,7 @@ impl ShardWriter {
0 => DocumentService::DocumentV0,
1 => DocumentService::DocumentV1,
2 => DocumentService::DocumentV2,
3 => DocumentService::DocumentV3,
i => panic!("Unknown document version {i}"),
}
}
1 change: 1 addition & 0 deletions nucliadb_protos/noderesources.proto
@@ -29,6 +29,7 @@ message ShardCreated {
DOCUMENT_V0 = 0;
DOCUMENT_V1 = 1;
DOCUMENT_V2 = 2;
DOCUMENT_V3 = 3;
}
DocumentService document_service = 2;
enum ParagraphService {
130 changes: 65 additions & 65 deletions nucliadb_protos/python/src/nucliadb_protos/noderesources_pb2.py

Large diffs are not rendered by default.

@@ -129,11 +129,13 @@ class ShardCreated(google.protobuf.message.Message):
DOCUMENT_V0: ShardCreated._DocumentService.ValueType # 0
DOCUMENT_V1: ShardCreated._DocumentService.ValueType # 1
DOCUMENT_V2: ShardCreated._DocumentService.ValueType # 2
DOCUMENT_V3: ShardCreated._DocumentService.ValueType # 3

class DocumentService(_DocumentService, metaclass=_DocumentServiceEnumTypeWrapper): ...
DOCUMENT_V0: ShardCreated.DocumentService.ValueType # 0
DOCUMENT_V1: ShardCreated.DocumentService.ValueType # 1
DOCUMENT_V2: ShardCreated.DocumentService.ValueType # 2
DOCUMENT_V3: ShardCreated.DocumentService.ValueType # 3

class _ParagraphService:
ValueType = typing.NewType("ValueType", builtins.int)
3 changes: 3 additions & 0 deletions nucliadb_protos/rust/src/noderesources.rs
@@ -61,6 +61,7 @@ pub mod shard_created {
DocumentV0 = 0,
DocumentV1 = 1,
DocumentV2 = 2,
DocumentV3 = 3,
}
impl DocumentService {
/// String value of the enum field names used in the ProtoBuf definition.
@@ -72,6 +73,7 @@ pub mod shard_created {
DocumentService::DocumentV0 => "DOCUMENT_V0",
DocumentService::DocumentV1 => "DOCUMENT_V1",
DocumentService::DocumentV2 => "DOCUMENT_V2",
DocumentService::DocumentV3 => "DOCUMENT_V3",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
@@ -80,6 +82,7 @@ pub mod shard_created {
"DOCUMENT_V0" => Some(Self::DocumentV0),
"DOCUMENT_V1" => Some(Self::DocumentV1),
"DOCUMENT_V2" => Some(Self::DocumentV2),
"DOCUMENT_V3" => Some(Self::DocumentV3),
_ => None,
}
}
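For reference, a minimal sketch (not part of this commit) of how the new variant behaves through the generated helpers shown above. The module path and the derives (Copy, PartialEq) are assumptions based on typical prost output.

use noderesources::shard_created::DocumentService;

fn document_v3_roundtrip() {
    let v = DocumentService::DocumentV3;
    // The numeric tag matches the proto definition (DOCUMENT_V3 = 3).
    assert_eq!(v as i32, 3);
    // String round-trip through the generated helpers added in this diff.
    assert_eq!(v.as_str_name(), "DOCUMENT_V3");
    assert_eq!(DocumentService::from_str_name("DOCUMENT_V3"), Some(DocumentService::DocumentV3));
}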
210 changes: 97 additions & 113 deletions nucliadb_texts3/src/reader.rs
@@ -95,42 +105,50 @@ impl Debug for TextReaderService {
}
}

struct PrefilterSegmentCollector {
reader: BytesFastFieldReader,
uuids: Vec<String>,
struct FieldUuidSegmentCollector {
uuid_reader: BytesFastFieldReader,
field_reader: BytesFastFieldReader,
results: Vec<ValidField>,
}

impl SegmentCollector for PrefilterSegmentCollector {
type Fruit = Vec<String>;
impl SegmentCollector for FieldUuidSegmentCollector {
type Fruit = Vec<ValidField>;

fn collect(&mut self, doc: tantivy::DocId, score: tantivy::Score) {
let bytes = self.reader.get_bytes(doc);
self.uuids.push(String::from_utf8_lossy(bytes).to_string());
fn collect(&mut self, doc: tantivy::DocId, _score: tantivy::Score) {
let uuid = self.uuid_reader.get_bytes(doc);
let field = self.field_reader.get_bytes(doc);
self.results.push(ValidField {
resource_id: String::from_utf8_lossy(uuid).to_string(),
field_id: String::from_utf8_lossy(field).to_string(),
});
}

fn harvest(self) -> Self::Fruit {
self.uuids
self.results
}
}

struct PrefilterCollector {
struct FieldUuidCollector {
uuid: Field,
field: Field,
}

impl Collector for PrefilterCollector {
type Fruit = Vec<String>;
impl Collector for FieldUuidCollector {
type Fruit = Vec<ValidField>;

type Child = PrefilterSegmentCollector;
type Child = FieldUuidSegmentCollector;

fn for_segment(
&self,
segment_local_id: tantivy::SegmentOrdinal,
_segment_local_id: tantivy::SegmentOrdinal,
segment: &tantivy::SegmentReader,
) -> tantivy::Result<Self::Child> {
let reader = segment.fast_fields().bytes(self.field)?;
Ok(PrefilterSegmentCollector {
reader,
uuids: vec![],
let uuid_reader = segment.fast_fields().bytes(self.uuid)?;
let field_reader = segment.fast_fields().bytes(self.field)?;
Ok(FieldUuidSegmentCollector {
uuid_reader,
field_reader,
results: vec![],
})
}

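The rest of the Collector impl is collapsed in the diff; a plausible sketch (assumed, not shown here) of the two remaining trait methods: pre-filtering needs no scores, and the per-segment Vec<ValidField> fruits are simply concatenated.

// Hypothetical continuation of the same `impl Collector for FieldUuidCollector` block.
fn requires_scoring(&self) -> bool {
    false
}

fn merge_fruits(&self, segment_fruits: Vec<Vec<ValidField>>) -> tantivy::Result<Vec<ValidField>> {
    Ok(segment_fruits.into_iter().flatten().collect())
}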
@@ -224,65 +232,29 @@ impl FieldReader for TextReaderService {

let prefilter_query: Box<dyn Query> = Box::new(BooleanQuery::intersection(subqueries));
let searcher = self.reader.searcher();
// let collector = PrefilterCollector {
// field: self.schema.uuid_field,
// };
// let mut docs_fulfilled = searcher.search(&prefilter_query, &collector)?;
// let mut inverted = false;

// // If none of the fields match the pre-filter, thats all the query planner needs to know.
// if docs_fulfilled.is_empty() {
// return Ok(PreFilterResponse {
// valid_fields: ValidFieldCollector::None,
// });
// }

// // If all the fields match the pre-filter, thats all the query planner needs to know
// if docs_fulfilled.len() as u64 == searcher.num_docs() {
// return Ok(PreFilterResponse {
// valid_fields: ValidFieldCollector::All,
// });
// }

// // More than half in results
// if docs_fulfilled.len() * 2 > searcher.num_docs() as usize {
// let inverted_prefilter_query: Box<dyn Query> =
// Box::new(BooleanQuery::new(vec![(Occur::MustNot, prefilter_query)]));
// docs_fulfilled = searcher.search(&inverted_prefilter_query, &DocSetCollector)?;
// inverted = true;
// }

// The fields matching the pre-filter are a non-empty subset of all the fields, so they are
// brought to memory
// let mut valid_fields = Vec::new();
// for fulfilled_doc in docs_fulfilled {
// let fulfilled_field = ValidField {
// resource_id: fulfilled_doc,
// field_id: String::new(),
// };
// valid_fields.push(fulfilled_field);
// }
// println!("FAST Prefiltered down to {} of {}", valid_fields.len(), searcher.num_docs());

let mut docs_fulfilled = searcher.search(&prefilter_query, &DocSetCollector)?;
println!("Prefiltered down to {} of {}", docs_fulfilled.len(), searcher.num_docs());
let mut inverted = false;

// The fields matching the pre-filter are a non-empty subset of all the fields, so they are
// brought to memory
let mut valid_fields = Vec::new();
for fulfilled_doc in docs_fulfilled {
if let Ok(doc) = searcher.doc(fulfilled_doc) {
let resource_id = doc.get_first(self.schema.uuid).unwrap().as_text().unwrap().to_string();
valid_fields.push(ValidField {
resource_id,
field_id: String::new(),
});
}
let collector = FieldUuidCollector {
uuid: self.schema.uuid,
field: self.schema.field,
};
let docs_fulfilled = searcher.search(&prefilter_query, &collector)?;

// If none of the fields match the pre-filter, that's all the query planner needs to know.
if docs_fulfilled.is_empty() {
return Ok(PreFilterResponse {
valid_fields: ValidFieldCollector::None,
});
}

// If all the fields match the pre-filter, that's all the query planner needs to know
if docs_fulfilled.len() as u64 == searcher.num_docs() {
return Ok(PreFilterResponse {
valid_fields: ValidFieldCollector::All,
});
}

// The fields matching the pre-filter are a non-empty subset of all the fields
Ok(PreFilterResponse {
valid_fields: ValidFieldCollector::Some(valid_fields),
valid_fields: ValidFieldCollector::Some(docs_fulfilled),
})
}

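Illustrative only (the consuming side is not part of this diff): how a caller might branch on the three pre-filter outcomes returned above. Everything except PreFilterResponse, ValidFieldCollector and ValidField is a made-up name.

fn apply_prefilter(response: &PreFilterResponse) -> String {
    match &response.valid_fields {
        // Nothing matched: the downstream search can be skipped entirely.
        ValidFieldCollector::None => "skip the search".to_string(),
        // Everything matched: run the search with no per-field restriction.
        ValidFieldCollector::All => "search without a field filter".to_string(),
        // Only these (resource_id, field_id) pairs remain candidates.
        ValidFieldCollector::Some(fields) => format!("restrict the search to {} fields", fields.len()),
    }
}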
@@ -394,19 +366,23 @@ impl TextReaderService {
bm25: 0.0,
booster: id as f32,
});
let uuid = doc
.get_first(self.schema.uuid)
.expect("document doesn't appear to have uuid.")
.as_text()
.unwrap()
.to_string();

let field = doc
.get_first(self.schema.field)
.expect("document doesn't appear to have field.")
.as_facet()
.unwrap()
.to_path_string();
let uuid = String::from_utf8(
doc.get_first(self.schema.uuid)
.expect("document doesn't appear to have uuid.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let field = String::from_utf8(
doc.get_first(self.schema.field)
.expect("document doesn't appear to have field.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let labels = doc
.get_all(self.schema.facets)
@@ -462,19 +438,23 @@ impl TextReaderService {
bm25: score,
booster: id as f32,
});
let uuid = doc
.get_first(self.schema.uuid)
.expect("document doesn't appear to have uuid.")
.as_text()
.unwrap()
.to_string();

let field = doc
.get_first(self.schema.field)
.expect("document doesn't appear to have field.")
.as_facet()
.unwrap()
.to_path_string();
let uuid = String::from_utf8(
doc.get_first(self.schema.uuid)
.expect("document doesn't appear to have uuid.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let field = String::from_utf8(
doc.get_first(self.schema.field)
.expect("document doesn't appear to have field.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let labels = doc
.get_all(self.schema.facets)
@@ -642,19 +622,23 @@ impl Iterator for BatchProducer {
let top_docs = self.searcher.search(&self.query, &top_docs).unwrap();
let mut items = vec![];
for doc in top_docs.into_iter().flat_map(|i| self.searcher.doc(i.1)) {
let uuid = doc
.get_first(self.uuid_field)
.expect("document doesn't appear to have uuid.")
.as_text()
.unwrap()
.to_string();

let field = doc
.get_first(self.field_field)
.expect("document doesn't appear to have field.")
.as_facet()
.unwrap()
.to_path_string();
let uuid = String::from_utf8(
doc.get_first(self.uuid_field)
.expect("document doesn't appear to have uuid.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let field = String::from_utf8(
doc.get_first(self.field_field)
.expect("document doesn't appear to have field.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let labels = doc
.get_all(self.facet_field)
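The stored-bytes-to-String pattern above now appears three times (uuid and field become bytes fields in the schema change below). A hypothetical helper, not part of the commit, that captures the same steps:

use tantivy::schema::{Document, Field};

fn stored_bytes_as_string(doc: &Document, field: Field, what: &str) -> String {
    let bytes = doc
        .get_first(field)
        .unwrap_or_else(|| panic!("document doesn't appear to have {}.", what))
        .as_bytes()
        .expect("expected a bytes value");
    String::from_utf8(bytes.to_vec()).expect("uuid/field bytes should be valid UTF-8")
}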
6 changes: 3 additions & 3 deletions nucliadb_texts3/src/schema.rs
@@ -19,7 +19,7 @@
//
use nucliadb_core::protos::*;
use tantivy::chrono::{DateTime, Utc};
use tantivy::schema::{Cardinality, FacetOptions, Field, NumericOptions, Schema, FAST, STORED, STRING, TEXT};
use tantivy::schema::{Cardinality, FacetOptions, Field, NumericOptions, Schema, FAST, INDEXED, STORED, TEXT};

#[derive(Debug, Clone)]
pub struct TextSchema {
@@ -55,8 +55,8 @@ impl TextSchema {

let facet_options = FacetOptions::default().set_stored();

let uuid = sb.add_bytes_field("uuid", FAST);
let field = sb.add_bytes_field("field", FAST);
let uuid = sb.add_bytes_field("uuid", STORED | FAST | INDEXED);
let field = sb.add_bytes_field("field", STORED | FAST | INDEXED);

let text = sb.add_text_field("text", TEXT);

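A stand-alone sketch of the flag combination introduced above and what each bit buys; the single field here is illustrative, not the full TextSchema.

use tantivy::schema::{Schema, FAST, INDEXED, STORED};

fn example_bytes_schema() -> Schema {
    let mut sb = Schema::builder();
    // STORED  -> the raw bytes can be read back from a retrieved Document
    //            (used by the search and iterator paths in reader.rs).
    // FAST    -> a per-segment fast-field column is built, so collectors such as
    //            FieldUuidSegmentCollector can read the value by DocId.
    // INDEXED -> Term::from_field_bytes(..) term queries can match on the value
    //            (used by the field-type filter in search_query.rs).
    let _uuid = sb.add_bytes_field("uuid", STORED | FAST | INDEXED);
    sb.build()
}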
18 changes: 10 additions & 8 deletions nucliadb_texts3/src/search_query.rs
@@ -80,16 +80,18 @@ pub fn create_query(
queries.push((Occur::Must, main_q));

// Field types filter
search
let field_filter: Vec<_> = search
.fields
.iter()
.map(|value| format!("/{}", value))
.flat_map(|facet_key| Facet::from_text(facet_key.as_str()).ok().into_iter())
.for_each(|facet| {
let facet_term = Term::from_facet(schema.field, &facet);
let facet_term_query = TermQuery::new(facet_term, IndexRecordOption::Basic);
queries.push((Occur::Must, Box::new(facet_term_query)));
});
.map(|field_name| {
let term = Term::from_field_bytes(schema.field, field_name.as_bytes());
let term_query: Box<dyn Query> = Box::new(TermQuery::new(term, IndexRecordOption::Basic));
(Occur::Should, term_query)
})
.collect();
if !field_filter.is_empty() {
queries.push((Occur::Must, Box::new(BooleanQuery::new(field_filter))));
}

if let Some(filter) = search.filter.as_ref() {
let context = QueryContext {
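Note the semantic change visible in this hunk: the old code pushed each selected field type as its own Must clause (every type had to match the same document), while the new code groups the field terms as Should clauses inside a single Must clause, i.e. main query AND (field1 OR field2 OR ...). A self-contained sketch of that clause shape, with illustrative field names:

use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::Term;

fn field_filter_query(field: Field, names: &[&str]) -> Box<dyn Query> {
    let any_field: Vec<(Occur, Box<dyn Query>)> = names
        .iter()
        .map(|name| {
            let term = Term::from_field_bytes(field, name.as_bytes());
            let q: Box<dyn Query> = Box::new(TermQuery::new(term, IndexRecordOption::Basic));
            (Occur::Should, q) // any one of the requested field types may match
        })
        .collect();
    // The caller pushes this whole disjunction as a single Must clause.
    Box::new(BooleanQuery::new(any_field))
}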