Commit b0df20f: wip
javitonino committed Sep 18, 2024
1 parent f5e49f8
Showing 11 changed files with 186 additions and 191 deletions.
1 change: 1 addition & 0 deletions nucliadb_models/src/nucliadb_models/internal/shards.py
@@ -33,6 +33,7 @@ class DocumentServiceEnum(str, Enum):
DOCUMENT_V0 = "DOCUMENT_V0"
DOCUMENT_V1 = "DOCUMENT_V1"
DOCUMENT_V2 = "DOCUMENT_V2"
DOCUMENT_V3 = "DOCUMENT_V3"


class ParagraphServiceEnum(str, Enum):
1 change: 1 addition & 0 deletions nucliadb_node/src/shards/shard_reader.rs
@@ -156,6 +156,7 @@ impl ShardReader {
0 => DocumentService::DocumentV0,
1 => DocumentService::DocumentV1,
2 => DocumentService::DocumentV2,
3 => DocumentService::DocumentV3,
i => panic!("Unknown document version {i}"),
}
}
1 change: 1 addition & 0 deletions nucliadb_node/src/shards/shard_writer.rs
@@ -72,6 +72,7 @@ impl ShardWriter {
0 => DocumentService::DocumentV0,
1 => DocumentService::DocumentV1,
2 => DocumentService::DocumentV2,
3 => DocumentService::DocumentV3,
i => panic!("Unknown document version {i}"),
}
}
1 change: 1 addition & 0 deletions nucliadb_protos/noderesources.proto
@@ -29,6 +29,7 @@ message ShardCreated {
DOCUMENT_V0 = 0;
DOCUMENT_V1 = 1;
DOCUMENT_V2 = 2;
DOCUMENT_V3 = 3;
}
DocumentService document_service = 2;
enum ParagraphService {
130 changes: 65 additions & 65 deletions nucliadb_protos/python/src/nucliadb_protos/noderesources_pb2.py

Large diffs are not rendered by default.

@@ -129,11 +129,13 @@ class ShardCreated(google.protobuf.message.Message):
DOCUMENT_V0: ShardCreated._DocumentService.ValueType # 0
DOCUMENT_V1: ShardCreated._DocumentService.ValueType # 1
DOCUMENT_V2: ShardCreated._DocumentService.ValueType # 2
DOCUMENT_V3: ShardCreated._DocumentService.ValueType # 3

class DocumentService(_DocumentService, metaclass=_DocumentServiceEnumTypeWrapper): ...
DOCUMENT_V0: ShardCreated.DocumentService.ValueType # 0
DOCUMENT_V1: ShardCreated.DocumentService.ValueType # 1
DOCUMENT_V2: ShardCreated.DocumentService.ValueType # 2
DOCUMENT_V3: ShardCreated.DocumentService.ValueType # 3

class _ParagraphService:
ValueType = typing.NewType("ValueType", builtins.int)
3 changes: 3 additions & 0 deletions nucliadb_protos/rust/src/noderesources.rs
@@ -61,6 +61,7 @@ pub mod shard_created {
DocumentV0 = 0,
DocumentV1 = 1,
DocumentV2 = 2,
DocumentV3 = 3,
}
impl DocumentService {
/// String value of the enum field names used in the ProtoBuf definition.
@@ -72,6 +73,7 @@ pub mod shard_created {
DocumentService::DocumentV0 => "DOCUMENT_V0",
DocumentService::DocumentV1 => "DOCUMENT_V1",
DocumentService::DocumentV2 => "DOCUMENT_V2",
DocumentService::DocumentV3 => "DOCUMENT_V3",
}
}
/// Creates an enum from field names used in the ProtoBuf definition.
@@ -80,6 +82,7 @@ pub mod shard_created {
"DOCUMENT_V0" => Some(Self::DocumentV0),
"DOCUMENT_V1" => Some(Self::DocumentV1),
"DOCUMENT_V2" => Some(Self::DocumentV2),
"DOCUMENT_V3" => Some(Self::DocumentV3),
_ => None,
}
}
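For reference, a minimal sketch (not part of this commit) of how the new variant behaves through the generated helpers shown above. The module path and the derives (Copy, PartialEq) are assumptions based on typical prost output.

use noderesources::shard_created::DocumentService;

fn document_v3_roundtrip() {
    let v = DocumentService::DocumentV3;
    // The numeric tag matches the proto definition (DOCUMENT_V3 = 3).
    assert_eq!(v as i32, 3);
    // String round-trip through the generated helpers added in this diff.
    assert_eq!(v.as_str_name(), "DOCUMENT_V3");
    assert_eq!(DocumentService::from_str_name("DOCUMENT_V3"), Some(DocumentService::DocumentV3));
}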
210 changes: 97 additions & 113 deletions nucliadb_texts3/src/reader.rs
@@ -95,42 +105,50 @@ impl Debug for TextReaderService {
}
}

struct PrefilterSegmentCollector {
reader: BytesFastFieldReader,
uuids: Vec<String>,
struct FieldUuidSegmentCollector {
uuid_reader: BytesFastFieldReader,
field_reader: BytesFastFieldReader,
results: Vec<ValidField>,
}

impl SegmentCollector for PrefilterSegmentCollector {
type Fruit = Vec<String>;
impl SegmentCollector for FieldUuidSegmentCollector {
type Fruit = Vec<ValidField>;

fn collect(&mut self, doc: tantivy::DocId, score: tantivy::Score) {
let bytes = self.reader.get_bytes(doc);
self.uuids.push(String::from_utf8_lossy(bytes).to_string());
fn collect(&mut self, doc: tantivy::DocId, _score: tantivy::Score) {
let uuid = self.uuid_reader.get_bytes(doc);
let field = self.field_reader.get_bytes(doc);
self.results.push(ValidField {
resource_id: String::from_utf8_lossy(uuid).to_string(),
field_id: String::from_utf8_lossy(field).to_string(),
});
}

fn harvest(self) -> Self::Fruit {
self.uuids
self.results
}
}

struct PrefilterCollector {
struct FieldUuidCollector {
uuid: Field,
field: Field,
}

impl Collector for PrefilterCollector {
type Fruit = Vec<String>;
impl Collector for FieldUuidCollector {
type Fruit = Vec<ValidField>;

type Child = PrefilterSegmentCollector;
type Child = FieldUuidSegmentCollector;

fn for_segment(
&self,
segment_local_id: tantivy::SegmentOrdinal,
_segment_local_id: tantivy::SegmentOrdinal,
segment: &tantivy::SegmentReader,
) -> tantivy::Result<Self::Child> {
let reader = segment.fast_fields().bytes(self.field)?;
Ok(PrefilterSegmentCollector {
reader,
uuids: vec![],
let uuid_reader = segment.fast_fields().bytes(self.uuid)?;
let field_reader = segment.fast_fields().bytes(self.field)?;
Ok(FieldUuidSegmentCollector {
uuid_reader,
field_reader,
results: vec![],
})
}

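The rest of the Collector impl is collapsed in the diff; a plausible sketch (assumed, not shown here) of the two remaining trait methods: pre-filtering needs no scores, and the per-segment Vec<ValidField> fruits are simply concatenated.

// Hypothetical continuation of the same `impl Collector for FieldUuidCollector` block.
fn requires_scoring(&self) -> bool {
    false
}

fn merge_fruits(&self, segment_fruits: Vec<Vec<ValidField>>) -> tantivy::Result<Vec<ValidField>> {
    Ok(segment_fruits.into_iter().flatten().collect())
}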
@@ -224,65 +232,29 @@ impl FieldReader for TextReaderService {

let prefilter_query: Box<dyn Query> = Box::new(BooleanQuery::intersection(subqueries));
let searcher = self.reader.searcher();
// let collector = PrefilterCollector {
// field: self.schema.uuid_field,
// };
// let mut docs_fulfilled = searcher.search(&prefilter_query, &collector)?;
// let mut inverted = false;

// // If none of the fields match the pre-filter, thats all the query planner needs to know.
// if docs_fulfilled.is_empty() {
// return Ok(PreFilterResponse {
// valid_fields: ValidFieldCollector::None,
// });
// }

// // If all the fields match the pre-filter, thats all the query planner needs to know
// if docs_fulfilled.len() as u64 == searcher.num_docs() {
// return Ok(PreFilterResponse {
// valid_fields: ValidFieldCollector::All,
// });
// }

// // More than half in results
// if docs_fulfilled.len() * 2 > searcher.num_docs() as usize {
// let inverted_prefilter_query: Box<dyn Query> =
// Box::new(BooleanQuery::new(vec![(Occur::MustNot, prefilter_query)]));
// docs_fulfilled = searcher.search(&inverted_prefilter_query, &DocSetCollector)?;
// inverted = true;
// }

// The fields matching the pre-filter are a non-empty subset of all the fields, so they are
// brought to memory
// let mut valid_fields = Vec::new();
// for fulfilled_doc in docs_fulfilled {
// let fulfilled_field = ValidField {
// resource_id: fulfilled_doc,
// field_id: String::new(),
// };
// valid_fields.push(fulfilled_field);
// }
// println!("FAST Prefiltered down to {} of {}", valid_fields.len(), searcher.num_docs());

let mut docs_fulfilled = searcher.search(&prefilter_query, &DocSetCollector)?;
println!("Prefiltered down to {} of {}", docs_fulfilled.len(), searcher.num_docs());
let mut inverted = false;

// The fields matching the pre-filter are a non-empty subset of all the fields, so they are
// brought to memory
let mut valid_fields = Vec::new();
for fulfilled_doc in docs_fulfilled {
if let Ok(doc) = searcher.doc(fulfilled_doc) {
let resource_id = doc.get_first(self.schema.uuid).unwrap().as_text().unwrap().to_string();
valid_fields.push(ValidField {
resource_id,
field_id: String::new(),
});
}
let collector = FieldUuidCollector {
uuid: self.schema.uuid,
field: self.schema.field,
};
let docs_fulfilled = searcher.search(&prefilter_query, &collector)?;

// If none of the fields match the pre-filter, that's all the query planner needs to know.
if docs_fulfilled.is_empty() {
return Ok(PreFilterResponse {
valid_fields: ValidFieldCollector::None,
});
}

// If all the fields match the pre-filter, that's all the query planner needs to know
if docs_fulfilled.len() as u64 == searcher.num_docs() {
return Ok(PreFilterResponse {
valid_fields: ValidFieldCollector::All,
});
}

// The fields matching the pre-filter are a non-empty subset of all the fields
Ok(PreFilterResponse {
valid_fields: ValidFieldCollector::Some(valid_fields),
valid_fields: ValidFieldCollector::Some(docs_fulfilled),
})
}

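Illustrative only (the consuming side is not part of this diff): how a caller might branch on the three pre-filter outcomes returned above. Everything except PreFilterResponse, ValidFieldCollector and ValidField is a made-up name.

fn apply_prefilter(response: &PreFilterResponse) -> String {
    match &response.valid_fields {
        // Nothing matched: the downstream search can be skipped entirely.
        ValidFieldCollector::None => "skip the search".to_string(),
        // Everything matched: run the search with no per-field restriction.
        ValidFieldCollector::All => "search without a field filter".to_string(),
        // Only these (resource_id, field_id) pairs remain candidates.
        ValidFieldCollector::Some(fields) => format!("restrict the search to {} fields", fields.len()),
    }
}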
@@ -394,19 +366,23 @@ impl TextReaderService {
bm25: 0.0,
booster: id as f32,
});
let uuid = doc
.get_first(self.schema.uuid)
.expect("document doesn't appear to have uuid.")
.as_text()
.unwrap()
.to_string();

let field = doc
.get_first(self.schema.field)
.expect("document doesn't appear to have field.")
.as_facet()
.unwrap()
.to_path_string();
let uuid = String::from_utf8(
doc.get_first(self.schema.uuid)
.expect("document doesn't appear to have uuid.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let field = String::from_utf8(
doc.get_first(self.schema.field)
.expect("document doesn't appear to have field.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let labels = doc
.get_all(self.schema.facets)
@@ -462,19 +438,23 @@ impl TextReaderService {
bm25: score,
booster: id as f32,
});
let uuid = doc
.get_first(self.schema.uuid)
.expect("document doesn't appear to have uuid.")
.as_text()
.unwrap()
.to_string();

let field = doc
.get_first(self.schema.field)
.expect("document doesn't appear to have field.")
.as_facet()
.unwrap()
.to_path_string();
let uuid = String::from_utf8(
doc.get_first(self.schema.uuid)
.expect("document doesn't appear to have uuid.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let field = String::from_utf8(
doc.get_first(self.schema.field)
.expect("document doesn't appear to have field.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let labels = doc
.get_all(self.schema.facets)
@@ -642,19 +622,23 @@ impl Iterator for BatchProducer {
let top_docs = self.searcher.search(&self.query, &top_docs).unwrap();
let mut items = vec![];
for doc in top_docs.into_iter().flat_map(|i| self.searcher.doc(i.1)) {
let uuid = doc
.get_first(self.uuid_field)
.expect("document doesn't appear to have uuid.")
.as_text()
.unwrap()
.to_string();

let field = doc
.get_first(self.field_field)
.expect("document doesn't appear to have field.")
.as_facet()
.unwrap()
.to_path_string();
let uuid = String::from_utf8(
doc.get_first(self.uuid_field)
.expect("document doesn't appear to have uuid.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let field = String::from_utf8(
doc.get_first(self.field_field)
.expect("document doesn't appear to have field.")
.as_bytes()
.unwrap()
.to_vec(),
)
.unwrap();

let labels = doc
.get_all(self.facet_field)
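The stored-bytes-to-String pattern above now appears three times (uuid and field become bytes fields in the schema change below). A hypothetical helper, not part of the commit, that captures the same steps:

use tantivy::schema::{Document, Field};

fn stored_bytes_as_string(doc: &Document, field: Field, what: &str) -> String {
    let bytes = doc
        .get_first(field)
        .unwrap_or_else(|| panic!("document doesn't appear to have {}.", what))
        .as_bytes()
        .expect("expected a bytes value");
    String::from_utf8(bytes.to_vec()).expect("uuid/field bytes should be valid UTF-8")
}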
6 changes: 3 additions & 3 deletions nucliadb_texts3/src/schema.rs
@@ -19,7 +19,7 @@
//
use nucliadb_core::protos::*;
use tantivy::chrono::{DateTime, Utc};
use tantivy::schema::{Cardinality, FacetOptions, Field, NumericOptions, Schema, FAST, STORED, STRING, TEXT};
use tantivy::schema::{Cardinality, FacetOptions, Field, NumericOptions, Schema, FAST, INDEXED, STORED, TEXT};

#[derive(Debug, Clone)]
pub struct TextSchema {
@@ -55,8 +55,8 @@ impl TextSchema {

let facet_options = FacetOptions::default().set_stored();

let uuid = sb.add_bytes_field("uuid", FAST);
let field = sb.add_bytes_field("field", FAST);
let uuid = sb.add_bytes_field("uuid", STORED | FAST | INDEXED);
let field = sb.add_bytes_field("field", STORED | FAST | INDEXED);

let text = sb.add_text_field("text", TEXT);

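A stand-alone sketch of the flag combination introduced above and what each bit buys; the single field here is illustrative, not the full TextSchema.

use tantivy::schema::{Schema, FAST, INDEXED, STORED};

fn example_bytes_schema() -> Schema {
    let mut sb = Schema::builder();
    // STORED  -> the raw bytes can be read back from a retrieved Document
    //            (used by the search and iterator paths in reader.rs).
    // FAST    -> a per-segment fast-field column is built, so collectors such as
    //            FieldUuidSegmentCollector can read the value by DocId.
    // INDEXED -> Term::from_field_bytes(..) term queries can match on the value
    //            (used by the field-type filter in search_query.rs).
    let _uuid = sb.add_bytes_field("uuid", STORED | FAST | INDEXED);
    sb.build()
}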
18 changes: 10 additions & 8 deletions nucliadb_texts3/src/search_query.rs
@@ -80,16 +80,18 @@ pub fn create_query(
queries.push((Occur::Must, main_q));

// Field types filter
search
let field_filter: Vec<_> = search
.fields
.iter()
.map(|value| format!("/{}", value))
.flat_map(|facet_key| Facet::from_text(facet_key.as_str()).ok().into_iter())
.for_each(|facet| {
let facet_term = Term::from_facet(schema.field, &facet);
let facet_term_query = TermQuery::new(facet_term, IndexRecordOption::Basic);
queries.push((Occur::Must, Box::new(facet_term_query)));
});
.map(|field_name| {
let term = Term::from_field_bytes(schema.field, field_name.as_bytes());
let term_query: Box<dyn Query> = Box::new(TermQuery::new(term, IndexRecordOption::Basic));
(Occur::Should, term_query)
})
.collect();
if !field_filter.is_empty() {
queries.push((Occur::Must, Box::new(BooleanQuery::new(field_filter))));
}

if let Some(filter) = search.filter.as_ref() {
let context = QueryContext {
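Note the semantic change visible in this hunk: the old code pushed each selected field type as its own Must clause (every type had to match the same document), while the new code groups the field terms as Should clauses inside a single Must clause, i.e. main query AND (field1 OR field2 OR ...). A self-contained sketch of that clause shape, with illustrative field names:

use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::Term;

fn field_filter_query(field: Field, names: &[&str]) -> Box<dyn Query> {
    let any_field: Vec<(Occur, Box<dyn Query>)> = names
        .iter()
        .map(|name| {
            let term = Term::from_field_bytes(field, name.as_bytes());
            let q: Box<dyn Query> = Box::new(TermQuery::new(term, IndexRecordOption::Basic));
            (Occur::Should, q) // any one of the requested field types may match
        })
        .collect();
    // The caller pushes this whole disjunction as a single Must clause.
    Box::new(BooleanQuery::new(any_field))
}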