feat: add method - detect format / data_type (#380)

mem0ai · Aug 16, 2023 · 4c8876f · 4c8876f
1 parent f92e890
commit 4c8876f
Show file tree

Hide file tree

Showing 18 changed files with 472 additions and 121 deletions.
diff --git a/README.md b/README.md
@@ -28,8 +28,8 @@ pip install embedchain
   zuck_bot = Llama2App()
 
   # Embed your data
-  zuck_bot.add("youtube_video", "https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
-  zuck_bot.add("web_page", "https://en.wikipedia.org/wiki/Mark_Zuckerberg")
+  zuck_bot.add("https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
+  zuck_bot.add("https://en.wikipedia.org/wiki/Mark_Zuckerberg")
 
   # Nice, your bot is ready now. Start asking questions to your bot.
   zuck_bot.query("Who is Mark Zuckerberg?")
@@ -64,9 +64,9 @@ os.environ["OPENAI_API_KEY"] = "YOUR API KEY"
 elon_bot = App()
 
 # Embed online resources
-elon_bot.add("web_page", "https://en.wikipedia.org/wiki/Elon_Musk")
-elon_bot.add("web_page", "https://tesla.com/elon-musk")
-elon_bot.add("youtube_video", "https://www.youtube.com/watch?v=MxZpaJK74Y4")
+elon_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
+elon_bot.add("https://tesla.com/elon-musk")
+elon_bot.add("https://www.youtube.com/watch?v=MxZpaJK74Y4")
 
 # Query the bot
 elon_bot.query("How many companies does Elon Musk run?")

diff --git a/docs/advanced/adding_data.mdx b/docs/advanced/adding_data.mdx
@@ -6,20 +6,20 @@ title: '➕ Adding Data'
 
 - This step assumes that you have already created an `app` instance by either using `App`, `OpenSourceApp` or `CustomApp`. We are calling our app instance as `naval_chat_bot` 🤖
 
-- Now use `.add()` function to add any dataset.
+- Now use `.add` method to add any dataset.
 
 ```python
 # naval_chat_bot = App() or
 # naval_chat_bot = OpenSourceApp()
 
 # Embed Online Resources
-naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44")
-naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
-naval_chat_bot.add("web_page", "https://nav.al/feedback")
-naval_chat_bot.add("web_page", "https://nav.al/agi")
+naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
+naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
+naval_chat_bot.add("https://nav.al/feedback")
+naval_chat_bot.add("https://nav.al/agi")
 
 # Embed Local Resources
-naval_chat_bot.add_local("qna_pair", ("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
+naval_chat_bot.add(("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
 ```
 
 The possible formats to add data can be found on the [Supported Data Formats](/advanced/data_types) page.
diff --git a/docs/advanced/app_types.mdx b/docs/advanced/app_types.mdx
@@ -35,8 +35,8 @@ os.environ['REPLICATE_API_TOKEN'] = "REPLICATE API TOKEN"
 zuck_bot = Llama2App()
 
 # Embed your data
-zuck_bot.add("youtube_video", "https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
-zuck_bot.add("web_page", "https://en.wikipedia.org/wiki/Mark_Zuckerberg")
+zuck_bot.add("https://www.youtube.com/watch?v=Ff4fRgnuFgQ")
+zuck_bot.add("https://en.wikipedia.org/wiki/Mark_Zuckerberg")
 
 # Nice, your bot is ready now. Start asking questions to your bot.
 zuck_bot.query("Who is Mark Zuckerberg?")

diff --git a/docs/advanced/configuration.mdx b/docs/advanced/configuration.mdx
@@ -26,17 +26,17 @@ naval_chat_bot = App(config)
 
 # Example: define your own chunker config for `youtube_video`
 chunker_config = ChunkerConfig(chunk_size=1000, chunk_overlap=100, length_function=len)
-naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44", AddConfig(chunker=chunker_config))
+naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44", AddConfig(chunker=chunker_config))
 
 add_config = AddConfig()
-naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf", add_config)
-naval_chat_bot.add("web_page", "https://nav.al/feedback", add_config)
-naval_chat_bot.add("web_page", "https://nav.al/agi", add_config)
+naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf", config=add_config)
+naval_chat_bot.add("https://nav.al/feedback", config=add_config)
+naval_chat_bot.add("https://nav.al/agi", config=add_config)
 
-naval_chat_bot.add_local("qna_pair", ("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."), add_config)
+naval_chat_bot.add(("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."), config=add_config)
 
 query_config = QueryConfig()
-print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?", query_config))
+print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?", config=query_config))
 ```
 
 ### Custom prompt template
@@ -53,7 +53,7 @@ einstein_chat_bot = App()
 
 # Embed Wikipedia page
 page = wikipedia.page("Albert Einstein")
-einstein_chat_bot.add("text", page.content)
+einstein_chat_bot.add(page.content)
 
 # Example: use your own custom template with `$context` and `$query`
 einstein_chat_template = Template("""
@@ -75,7 +75,7 @@ queries = [
         "Why did you divorce your first wife?",
 ]
 for query in queries:
-        response = einstein_chat_bot.query(query, query_config)
+        response = einstein_chat_bot.query(query, config=query_config)
         print("Query: ", query)
         print("Response: ", response)
 

diff --git a/docs/advanced/data_types.mdx b/docs/advanced/data_types.mdx
@@ -2,22 +2,48 @@
 title: '📋 Supported data formats'
 ---
 
-Embedchain supports following data formats:
+## Automatic data type detection
+The add method automatically tries to detect the data_type, based on your input for the source argument. So `app.add('https://www.youtube.com/watch?v=dQw4w9WgXcQ')` is enough to embed a YouTube video.
+
+This detection is implemented for all formats. It is based on factors such as whether it's a URL, a local file, the source data type, etc.
+
+### Debugging automatic detection
+
+
+Set `log_level=DEBUG` (in [AppConfig](/advanced/query_configuration#appconfig)) and check the debug output to make sure the detection is working as intended.
+
+Otherwise, you will not know when, for instance, an invalid filepath is interpreted as raw text instead.
+
+### Forcing a data type
+
+To avoid any issues with the data type detection, you can **force** a data_type by passing it as an argument to the `add` method.
+The examples below show you the keyword to force the respective `data_type`.
+
+Forcing can also be used for edge cases, such as interpreting a sitemap as a web_page, to read its raw text instead of following links.
+
+## Remote Data Types
+
+<Tip>
+**Use local files in remote data types**
+
+Some data_types are meant for remote content and only work with URLs.
+You can pass local files by formatting the path using the `file:` [URI scheme](https://en.wikipedia.org/wiki/File_URI_scheme), e.g. `file:///info.pdf`.
+</Tip>
 
 ### Youtube video
 
 To add any youtube video to your app, use the data_type (the `data_type` argument of the `.add()` method) as `youtube_video`. Eg:
 
 ```python
-app.add('youtube_video', 'a_valid_youtube_url_here')
+app.add('a_valid_youtube_url_here', data_type='youtube_video')
 ```
 
 ### PDF file
 
 To add any pdf file, use the data_type as `pdf_file`. Eg:
 
 ```python
-app.add('pdf_file', 'a_valid_url_where_pdf_file_can_be_accessed')
+app.add('a_valid_url_where_pdf_file_can_be_accessed', data_type='pdf_file')
 ```
 
 Note that we do not support password protected pdfs.
@@ -27,51 +53,54 @@ Note that we do not support password protected pdfs.
 To add any web page, use the data_type as `web_page`. Eg:
 
 ```python
-app.add('web_page', 'a_valid_web_page_url')
+app.add('a_valid_web_page_url', data_type='web_page')
 ```
 
 ### Sitemap
 
 Add all web pages from an xml-sitemap. Filters non-text files. Use the data_type as `sitemap`. Eg:
 
 ```python
-app.add('sitemap', 'https://example.com/sitemap.xml')
+app.add('https://example.com/sitemap.xml', data_type='sitemap')
 ```
 
 ### Doc file
 
-To add any doc/docx file, use the data_type as `docx`. Eg:
+To add any doc/docx file, use the data_type as `docx`. `docx` allows remote urls and conventional file paths. Eg:
 
 ```python
-app.add('docx', 'a_local_docx_file_path')
+app.add('https://example.com/content/intro.docx', data_type="docx")
+app.add('content/intro.docx', data_type="docx")
 ```
 
 ### Code documentation website loader
 
 To add any code documentation website as a loader, use the data_type as `docs_site`. Eg:
 
 ```python
-app.add("docs_site", "https://docs.embedchain.ai/")
+app.add("https://docs.embedchain.ai/", data_type="docs_site")
 ```
 
 ### Notion
 To use notion you must install the extra dependencies with `pip install embedchain[notion]`.
 
-To load a notion page, use the data_type as `notion`.
+To load a notion page, use the data_type as `notion`. Since it is hard to automatically detect, forcing this is advised.
 The source argument must **end** with the `notion page id`. The id is a 32-character string. Eg:
 
 ```python
-app.add("notion", "cfbc134ca6464fc980d0391613959196")
-app.add("notion", "my-page-cfbc134ca6464fc980d0391613959196")
-app.add("notion", "https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196")
+app.add("cfbc134ca6464fc980d0391613959196", "notion")
+app.add("my-page-cfbc134ca6464fc980d0391613959196", "notion")
+app.add("https://www.notion.so/my-page-cfbc134ca6464fc980d0391613959196", "notion")
 ```
 
+## Local Data Types
+
 ### Text
 
 To supply your own text, use the data_type as `text` and enter a string. The text is not processed, this can be very versatile. Eg:
 
 ```python
-app.add_local('text', 'Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.')
+app.add('Seek wealth, not money or status. Wealth is having assets that earn while you sleep. Money is how we transfer time and wealth. Status is your place in the social hierarchy.', data_type='text')
 ```
 
 Note: This is not used in the examples because in most cases you will supply a whole paragraph or file, which did not fit.
@@ -81,7 +110,7 @@ Note: This is not used in the examples because in most cases you will supply a w
 To supply your own QnA pair, use the data_type as `qna_pair` and enter a tuple. Eg:
 
 ```python
-app.add_local('qna_pair', ("Question", "Answer"))
+app.add(("Question", "Answer"), data_type="qna_pair")
 ```
 
 ## Reusing a vector database
@@ -94,8 +123,8 @@ Create a local index:
 from embedchain import App
 
 naval_chat_bot = App()
-naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44")
-naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
+naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
+naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
 ```
 
 You can reuse the local index with the same code, but without adding new documents:
@@ -107,6 +136,6 @@ naval_chat_bot = App()
 print(naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?"))
 ```
 
-### More formats (coming soon!)
+## More formats (coming soon!)
 
-- If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchain/issues) and we will add it to the list of supported formats.
+- If you want to add any other format, please create an [issue](https://github.com/embedchain/embedchain/issues) and we will add it to the list of supported formats.
diff --git a/docs/advanced/query_configuration.mdx b/docs/advanced/query_configuration.mdx
@@ -25,7 +25,7 @@ Yes, you are passing `ChunkerConfig` to `AddConfig`, like so:
 ```python
 chunker_config = ChunkerConfig(chunk_size=100)
 add_config = AddConfig(chunker=chunker_config)
-app.add_local("text", "lorem ipsum", config=add_config)
+app.add("lorem ipsum", config=add_config)
 ```
 
 ### ChunkerConfig

diff --git a/docs/introduction.mdx b/docs/introduction.mdx
@@ -7,7 +7,7 @@ description: '📝 Embedchain is a framework to easily create LLM powered bots o
 
 Embedchain abstracts the entire process of loading a dataset, chunking it, creating embeddings, and storing it in a vector database.
 
-You can add a single or multiple datasets using the .add and .add_local functions. Then, simply use the .query function to find answers from the added datasets.
+You can add a single or multiple datasets using the `.add` method. Then, simply use the `.query` method to find answers from the added datasets.
 
 If you want to create a Naval Ravikant bot with a YouTube video, a book in PDF format, two blog posts, and a question and answer pair, all you need to do is add the respective links. Embedchain will take care of the rest, creating a bot for you.
 
@@ -16,13 +16,13 @@ from embedchain import App
 
 naval_chat_bot = App()
 # Embed Online Resources
-naval_chat_bot.add("youtube_video", "https://www.youtube.com/watch?v=3qHkcs3kG44")
-naval_chat_bot.add("pdf_file", "https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
-naval_chat_bot.add("web_page", "https://nav.al/feedback")
-naval_chat_bot.add("web_page", "https://nav.al/agi")
+naval_chat_bot.add("https://www.youtube.com/watch?v=3qHkcs3kG44")
+naval_chat_bot.add("https://navalmanack.s3.amazonaws.com/Eric-Jorgenson_The-Almanack-of-Naval-Ravikant_Final.pdf")
+naval_chat_bot.add("https://nav.al/feedback")
+naval_chat_bot.add("https://nav.al/agi")
 
 # Embed Local Resources
-naval_chat_bot.add_local("qna_pair", ("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
+naval_chat_bot.add(("Who is Naval Ravikant?", "Naval Ravikant is an Indian-American entrepreneur and investor."))
 
 naval_chat_bot.query("What unique capacity does Naval argue humans possess when it comes to understanding explanations or concepts?")
 # Answer: Naval argues that humans possess the unique capacity to understand explanations or concepts to the maximum extent possible in this physical reality.
@@ -32,7 +32,7 @@ naval_chat_bot.query("What unique capacity does Naval argue humans possess when
 
 Creating a chat bot over any dataset involves the following steps:
 
-1. Load the data
+1. Detect the data type and load the data
 2. Create meaningful chunks
 3. Create embeddings for each chunk
 4. Store the chunks in a vector database
@@ -53,4 +53,4 @@ The process of loading the dataset and querying involves multiple steps, each wi
 
 Embedchain takes care of all these nuances and provides a simple interface to create bots over any dataset.
 
-In the first release, we make it easier for anyone to get a chatbot over any dataset up and running in less than a minute. Just create an app instance, add the datasets using the `.add()` function, and use the `.query()` function to get the relevant answers.
+In the first release, we make it easier for anyone to get a chatbot over any dataset up and running in less than a minute. Just create an app instance, add the datasets using the `.add` method, and use the `.query` method to get the relevant answers.
diff --git a/docs/mint.json b/docs/mint.json
@@ -32,7 +32,7 @@
     },
     {
       "group": "Advanced",
-      "pages": ["advanced/app_types", "advanced/interface_types", "advanced/adding_data","advanced/data_types", "advanced/query_configuration", "advanced/configuration", "advanced/testing", "advanced/vector_database", "advanced/showcase"]
+      "pages": ["advanced/app_types", "advanced/interface_types", "advanced/adding_data", "advanced/data_types", "advanced/query_configuration", "advanced/configuration", "advanced/testing", "advanced/vector_database", "advanced/showcase"]
     },
     {
       "group": "Examples",

diff --git a/docs/quickstart.mdx b/docs/quickstart.mdx
@@ -26,8 +26,8 @@ os.environ["OPENAI_API_KEY"] = "xxx"
 elon_musk_bot = App()
 
 # Embed Online Resources
-elon_musk_bot.add("web_page", "https://en.wikipedia.org/wiki/Elon_Musk")
-elon_musk_bot.add("web_page", "https://www.tesla.com/elon-musk")
+elon_musk_bot.add("https://en.wikipedia.org/wiki/Elon_Musk")
+elon_musk_bot.add("https://www.tesla.com/elon-musk")
 
 response = elon_musk_bot.query("How many companies does Elon Musk run?")
 print(response)

diff --git a/embedchain/chunkers/base_chunker.py b/embedchain/chunkers/base_chunker.py
@@ -1,5 +1,7 @@
 import hashlib
 
+from embedchain.models.data_type import DataType
+
 
 class BaseChunker:
     def __init__(self, text_splitter):
@@ -26,7 +28,7 @@ def create_chunks(self, loader, src):
 
             meta_data = data["meta_data"]
             # add data type to meta data to allow query using data type
-            meta_data["data_type"] = self.data_type
+            meta_data["data_type"] = self.data_type.value
             url = meta_data["url"]
 
             chunks = self.get_chunks(content)
@@ -52,8 +54,10 @@ def get_chunks(self, content):
         """
         return self.text_splitter.split_text(content)
 
-    def set_data_type(self, data_type):
+    def set_data_type(self, data_type: DataType):
         """
         set the data type of chunker
         """
         self.data_type = data_type
+
+        # TODO: This should be done during initialization. This means it has to be done in the child classes.