add jsonl loader

FalkorDB · Jul 17, 2024 · d3f3430 · d3f3430
1 parent d99261d
commit d3f3430
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 4 deletions.
diff --git a/falkordb_gemini_kg/classes/source.py b/falkordb_gemini_kg/classes/source.py
@@ -29,6 +29,8 @@ def Source(path: str, instruction: str | None = None) -> "AbstractSource":
  s = URL(path)
  elif ".csv" in path.lower():
  s = CSV(path)
+ elif ".jsonl" in path.lower():
+ s = JSONL(path)
  else:
  s = TEXT(path)
 
@@ -109,3 +111,12 @@ class CSV(AbstractSource):
  def __init__(self, path, rows_per_document: int = 50):
  super().__init__(path)
  self.loader = CSVLoader(self.path, rows_per_document)
+
class JSONL(AbstractSource):
    """
    Source backed by a JSON Lines (.jsonl) file.

    The file is read through a JSONLLoader; ``rows_per_document`` is
    forwarded to that loader unchanged.
    """

    def __init__(self, path, rows_per_document: int = 50):
        super().__init__(path)
        # NOTE(review): relies on AbstractSource.__init__ having set
        # self.path from `path` -- confirm against the base class.
        self.loader = JSONLLoader(
            path=self.path,
            rows_per_document=rows_per_document,
        )
diff --git a/falkordb_gemini_kg/document_loaders/__init__.py b/falkordb_gemini_kg/document_loaders/__init__.py
@@ -3,6 +3,6 @@
 from .html import HTMLLoader
 from .csv import CSVLoader
 from .url import URLLoader
+from .jsonl import JSONLLoader
 
-
-__all__ = ["PDFLoader", "TextLoader", "HTMLLoader", "CSVLoader", "URLLoader"]
+__all__ = ["PDFLoader", "TextLoader", "HTMLLoader", "CSVLoader", "URLLoader", "JSONLLoader"]
diff --git a/falkordb_gemini_kg/document_loaders/jsonl.py b/falkordb_gemini_kg/document_loaders/jsonl.py
@@ -0,0 +1,27 @@
+from typing import Iterator
+from falkordb_gemini_kg.classes.Document import Document
+
+
class JSONLLoader:
    """
    Load a JSON Lines file as a stream of documents.

    Every yielded Document contains up to ``rows_per_document``
    consecutive rows of the file, joined with a single newline.
    """

    def __init__(self, path: str, rows_per_document: int = 500):
        """
        Initialize the loader.

        Args:
            path: Location of the JSONL file to read.
            rows_per_document: Maximum number of rows grouped into one
                Document.

        Raises:
            ValueError: If ``rows_per_document`` is smaller than 1.
        """
        # Fail early: a non-positive batch size can never produce documents.
        if rows_per_document < 1:
            raise ValueError("rows_per_document must be at least 1")
        self.path = path
        self.rows_per_document = rows_per_document

    def load(self) -> Iterator[Document]:
        """
        Read the file and yield its rows grouped into documents.

        The file is streamed line by line rather than read into memory
        at once, so it stays open while the returned generator is being
        consumed. Blank lines are skipped because they carry no record.
        The last document may hold fewer than ``rows_per_document`` rows.

        Returns:
            Iterator[Document]: One Document per batch of rows.
        """
        batch: list[str] = []
        # JSON Lines files are UTF-8; do not depend on the platform default.
        with open(self.path, "r", encoding="utf-8") as f:
            for line in f:
                # Drop the line terminator so that joining with "\n" below
                # does not leave an empty line between consecutive rows.
                row = line.rstrip("\r\n")
                if not row.strip():
                    continue
                batch.append(row)
                if len(batch) == self.rows_per_document:
                    yield Document("\n".join(batch))
                    batch = []
        # Emit the trailing partial batch instead of silently dropping it.
        if batch:
            yield Document("\n".join(batch))
diff --git a/falkordb_gemini_kg/fixtures/prompts.py b/falkordb_gemini_kg/fixtures/prompts.py
@@ -25,6 +25,7 @@
 Do not respond to any questions that might ask anything else than ontology creation.
 Do not include any text except ontology.
 Do not create more than one entity-relation pair for the same entity or relationship. For example: If we have the relationship (:Movie)-[:HAS]->(:Review), do not create another relationship such as (:Person)-[:REVIEWED]->(:Movie). Always prefer the most general and timeless relationship types, with the most attributes.
+Do not create an entity without an unique attribute. Each entity should have at least one unique attribute.
 
 ## 5. Format
 The ontology should be in JSON format and should follow the schema provided below.
@@ -178,6 +179,7 @@
 Do not create relationships without their corresponding entities.
 Do not allow duplicated inverse relationships, for example, if you have a relationship "OWNS" from Person to House, do not create another relationship "OWNED_BY" from House to Person.
 Do not use the example Movie context to assume the ontology. The ontology should be created based on the provided text only.
+Do not create an entity without an unique attribute. Each entity should have at least one unique attribute.
 
 Use the following instructions as boundaries for the ontology extraction process. 
 {boundaries}
@@ -212,14 +214,15 @@
 Given the following ontology, correct any mistakes or missing information in the ontology.
 Add any missing entities, relations, or attributes to the ontology.
 Make sure to connect all related entities in the ontology. For example, if a Person PLAYED a Character in a Movie, make sure to connect the Character back to the Movie, otherwise we won't be able to say which Movie the Character is from.
-Make sure each entity contains at least one unique attribute. For example, a Person entity should have a unique attribute like "name".
+Make sure each entity contains at least one unique attribute.
 Make sure all entities have relations.
 Make sure all relations have 2 entities (source and target).
 Make sure all entity labels are titlecase.
 Do not allow duplicated relationships, for example, if you have a relationship "OWNS" from Person to House, do not create another relationship "OWNS_HOUSE", or even "OWNED_BY" from House to Person.
 Relationship names must be timeless. For example "WROTE" and "WRITTEN" means the same thing, if the source and target entities are the same. Remove similar scenarios.
 Do not create relationships without their corresponding entities.
 Do not use the example Movie context to assume the ontology. The ontology should be created based on the provided text only.
+Do not allow entities without at least one unique attribute.
 
 Ontology:
 {ontology}
@@ -432,7 +435,6 @@
 Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC]
 Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks.
 
-If the provided information is empty, say that you don't know the answer.
 """
 
 GRAPH_QA_PROMPT = """