From d3f3430e1aacabb90adf1fcd8697eefadbf896d7 Mon Sep 17 00:00:00 2001 From: Dudi Zimberknopf Date: Wed, 17 Jul 2024 10:29:28 +0300 Subject: [PATCH] add jsonl loader --- falkordb_gemini_kg/classes/source.py | 11 ++++++++ .../document_loaders/__init__.py | 4 +-- falkordb_gemini_kg/document_loaders/jsonl.py | 27 +++++++++++++++++++ falkordb_gemini_kg/fixtures/prompts.py | 6 +++-- 4 files changed, 44 insertions(+), 4 deletions(-) create mode 100644 falkordb_gemini_kg/document_loaders/jsonl.py diff --git a/falkordb_gemini_kg/classes/source.py b/falkordb_gemini_kg/classes/source.py index 5737525..e7a2a92 100644 --- a/falkordb_gemini_kg/classes/source.py +++ b/falkordb_gemini_kg/classes/source.py @@ -29,6 +29,8 @@ def Source(path: str, instruction: str | None = None) -> "AbstractSource": s = URL(path) elif ".csv" in path.lower(): s = CSV(path) + elif ".jsonl" in path.lower(): + s = JSONL(path) else: s = TEXT(path) 
@@ -109,3 +111,22 @@ class CSV(AbstractSource):
     def __init__(self, path, rows_per_document: int = 50):
         super().__init__(path)
         self.loader = CSVLoader(self.path, rows_per_document)
+
+
+class JSONL(AbstractSource):
+    """
+    JSONL resource
+    """
+
+    def __init__(self, path, rows_per_document: int = 50):
+        """
+        Args:
+            path (str): path to the JSONL file
+            rows_per_document (int): number of rows handed to the loader per document
+        """
+        # JSONLLoader is introduced by this patch and no hunk adds it to this
+        # module's top-level imports, so import it here to avoid a NameError.
+        from falkordb_gemini_kg.document_loaders import JSONLLoader
+
+        super().__init__(path)
+        self.loader = JSONLLoader(self.path, rows_per_document)
diff --git a/falkordb_gemini_kg/document_loaders/__init__.py b/falkordb_gemini_kg/document_loaders/__init__.py index b16f945..a6b85b2 100644 --- a/falkordb_gemini_kg/document_loaders/__init__.py +++ b/falkordb_gemini_kg/document_loaders/__init__.py @@ -3,6 +3,6 @@ from .html import HTMLLoader from .csv import CSVLoader from .url import URLLoader +from .jsonl import JSONLLoader - -__all__ = ["PDFLoader", "TextLoader", "HTMLLoader", "CSVLoader", "URLLoader"] +__all__ = ["PDFLoader", "TextLoader", "HTMLLoader", "CSVLoader", "URLLoader", "JSONLLoader"] diff --git a/falkordb_gemini_kg/document_loaders/jsonl.py b/falkordb_gemini_kg/document_loaders/jsonl.py new file mode 100644 index 
0000000..fd12ec5
--- /dev/null
+++ b/falkordb_gemini_kg/document_loaders/jsonl.py
@@ -0,0 +1,44 @@
+from typing import Iterator
+from falkordb_gemini_kg.classes.Document import Document
+
+
+class JSONLLoader:
+    """
+    Load a JSONL (JSON Lines) file as a sequence of documents.
+
+    The file's rows are grouped into consecutive batches of at most
+    ``rows_per_document`` rows; each batch becomes one Document.
+    """
+
+    def __init__(self, path: str, rows_per_document: int = 500):
+        """
+        Args:
+            path (str): path of the JSONL file to read
+            rows_per_document (int): maximum number of rows per document
+
+        Raises:
+            ValueError: if rows_per_document is smaller than 1
+        """
+        if rows_per_document < 1:
+            raise ValueError("rows_per_document must be at least 1")
+        self.path = path
+        self.rows_per_document = rows_per_document
+
+    def load(self) -> Iterator[Document]:
+        """
+        Read the file and yield one Document per batch of rows.
+
+        Returns:
+            Iterator[Document]: document iterator
+        """
+        # JSON text is UTF-8 (RFC 8259); do not depend on the locale default.
+        with open(self.path, "r", encoding="utf-8") as f:
+            # Drop line terminators (they are re-added by the join below)
+            # and skip blank lines, which carry no JSONL record.
+            rows = [line.rstrip("\r\n") for line in f if line.strip()]
+
+        # Step through the rows batch by batch; the last batch may be
+        # shorter, so trailing rows are not dropped.
+        for start in range(0, len(rows), self.rows_per_document):
+            batch = rows[start : start + self.rows_per_document]
+            yield Document("\n".join(batch))
diff --git a/falkordb_gemini_kg/fixtures/prompts.py b/falkordb_gemini_kg/fixtures/prompts.py index cab5adf..277c53e 100644 --- a/falkordb_gemini_kg/fixtures/prompts.py +++ b/falkordb_gemini_kg/fixtures/prompts.py @@ -25,6 +25,7 @@ Do not respond to any questions that might ask anything else than ontology creation. Do not include any text except ontology. Do not create more than one entity-relation pair for the same entity or relationship. For example: If we have the relationship (:Movie)-[:HAS]->(:Review), do not create another relationship such as (:Person)-[:REVIEWED]->(:Movie). Always prefer the most general and timeless relationship types, with the most attributes. +Do not create an entity without an unique attribute. Each entity should have at least one unique attribute. ## 5. Format The ontology should be in JSON format and should follow the schema provided below. @@ -178,6 +179,7 @@ Do not create relationships without their corresponding entities. Do not allow duplicated inverse relationships, for example, if you have a relationship "OWNS" from Person to House, do not create another relationship "OWNED_BY" from House to Person. Do not use the example Movie context to assume the ontology. The ontology should be created based on the provided text only. 
+Do not create an entity without an unique attribute. Each entity should have at least one unique attribute. Use the following instructions as boundaries for the ontology extraction process. {boundaries} @@ -212,7 +214,7 @@ Given the following ontology, correct any mistakes or missing information in the ontology. Add any missing entities, relations, or attributes to the ontology. Make sure to connect all related entities in the ontology. For example, if a Person PLAYED a Character in a Movie, make sure to connect the Character back to the Movie, otherwise we won't be able to say which Movie the Character is from. -Make sure each entity contains at least one unique attribute. For example, a Person entity should have a unique attribute like "name". +Make sure each entity contains at least one unique attribute. Make sure all entities have relations. Make sure all relations have 2 entities (source and target). Make sure all entity labels are titlecase. @@ -220,6 +222,7 @@ Relationship names must be timeless. For example "WROTE" and "WRITTEN" means the same thing, if the source and target entities are the same. Remove similar scenarios. Do not create relationships without their corresponding entities. Do not use the example Movie context to assume the ontology. The ontology should be created based on the provided text only. +Do not allow entities without at least one unique attribute. Ontology: {ontology} @@ -432,7 +435,6 @@ Context:[manager:CTL LLC, manager:JANE STREET GROUP LLC] Helpful Answer: CTL LLC, JANE STREET GROUP LLC owns Neo4j stocks. -If the provided information is empty, say that you don't know the answer. """ GRAPH_QA_PROMPT = """