From a5700b98aec97d6a9574111c551915cc414b271e Mon Sep 17 00:00:00 2001 From: "Amir A. Zohrenejad" Date: Wed, 13 Sep 2023 23:56:08 -0700 Subject: [PATCH] DH-4598 ported changes in other branch for docs into the refactor branch --- README.md | 2 + docs/api.process_nl_query_response.rst | 2 +- docs/api.rst | 94 ++++++++++++++++++++++++ docs/api.update_nl_query_response.rst | 4 +- docs/conf.py | 5 +- docs/contributing.projects.rst | 11 +++ docs/envars.rst | 63 ++++++++++++++++ docs/getting_started.rst | 19 ----- docs/index.rst | 32 +++++++- docs/introduction.rst | 10 +-- docs/tutorial.chatgpt_plugin.rst | 4 + docs/tutorial.finetune_sql_generator.rst | 4 + docs/tutorial.sample_database.rst | 4 + 13 files changed, 221 insertions(+), 33 deletions(-) create mode 100644 docs/contributing.projects.rst create mode 100644 docs/envars.rst delete mode 100644 docs/getting_started.rst create mode 100644 docs/tutorial.chatgpt_plugin.rst create mode 100644 docs/tutorial.finetune_sql_generator.rst create mode 100644 docs/tutorial.sample_database.rst diff --git a/README.md b/README.md index 83e0be8e..9185e9bb 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ Dataherald is a natural language-to-SQL engine built for enterprise-level questi This project is undergoing swift development, and as such, the API may be subject to change at any time. +If you would like to learn more, you can join the Discord or read the docs. 
+ ## Overview ### Background diff --git a/docs/api.process_nl_query_response.rst b/docs/api.process_nl_query_response.rst index 3b569307..6f14c5ab 100644 --- a/docs/api.process_nl_query_response.rst +++ b/docs/api.process_nl_query_response.rst @@ -1,5 +1,5 @@ Process a NL query response -======================= +============================= Once you made a question you can try sending a new sql query to improve the response, this is not stored diff --git a/docs/api.rst b/docs/api.rst index f156e0f7..cd91503d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,6 +1,100 @@ API ======================= +The Dataherald Engine exposes RESTful APIs that can be used to: + +* 🔌 Connect to and manage connections to databases +* 🔑 Add context to the engine through scanning the databases, adding descriptions to tables and columns and adding golden records +* 🙋‍♀️ Ask natural language questions from the relational data + +Our APIs have resource-oriented URLs built around standard HTTP response codes and verbs. The core resources are described below. + + +Database Connections +------------------------------ + +The ``database-connections`` object allows you to define connections to your relational data stores. + +Related endpoints are: + +* :doc:`Create database connection ` -- ``POST api/v1/database-connections`` +* :doc:`List database connections ` -- ``GET api/v1/database-connections`` +* :doc:`Update a database connection ` -- ``PUT api/v1/database-connections/{alias}`` + + +.. 
code-block:: json + + { + "alias": "string", + "use_ssh": false, + "connection_uri": "string", + "path_to_credentials_file": "string", + "ssh_settings": { + "db_name": "string", + "host": "string", + "username": "string", + "password": "string", + "remote_host": "string", + "remote_db_name": "string", + "remote_db_password": "string", + "private_key_path": "string", + "private_key_password": "string", + "db_driver": "string" + } + } + + +Query Response +------------------ +The ``query-response`` object is created from answering natural language questions from the relational data. + +The related endpoints are: + +* :doc:`process_nl_query_response ` -- ``POST api/v1/nl-query-responses`` +* :doc:`update_nl_query_response ` -- ``PATCH api/v1/nl-query-responses/{query_id}`` + + +.. code-block:: json + + { + "confidence_score": "string", + "error_message": "string", + "exec_time": "float", + "intermediate_steps":["string"], + "nl_question_id": "string", + "nl_response": "string", + "sql_generation_status": "string", + "sql_query": "string", + "sql_query_result": {}, + "total_cost": "float", + "total_tokens": "int" + } + + +Table Descriptions +--------------------- +The ``table-descriptions`` object is used to add context about the tables and columns in the relational database. +These are then used to help the LLM build valid SQL to answer natural language questions. + +Related endpoints are: + +* :doc:`Scan table description ` -- ``POST api/v1/table-descriptions/scan`` +* :doc:`Add table description ` -- ``PATCH api/v1/table-descriptions/{table_description_id}`` +* :doc:`List table descriptions ` -- ``GET api/v1/table-descriptions`` + +.. code-block:: json + + { + "columns": [{}], + "db_connection_id": "string", + "description": "string", + "examples": [{}], + "table_name": "string", + "table_schema": "string" + } + + + .. 
toctree:: :hidden: diff --git a/docs/api.update_nl_query_response.rst b/docs/api.update_nl_query_response.rst index 11f3c100..9447afa0 100644 --- a/docs/api.update_nl_query_response.rst +++ b/docs/api.update_nl_query_response.rst @@ -1,7 +1,7 @@ Update a NL query response -======================= +============================ -Once you made a question, you can give feedback to improve the queries +Once you ask a question, you can give feedback to improve the queries Request this ``PATCH`` endpoint:: diff --git a/docs/conf.py b/docs/conf.py index dc5cde16..e99f88df 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,10 +11,11 @@ sys.path.insert(0, os.path.abspath("..")) -project = "Dataherald" +project = "Dataherald AI" copyright = "2023, Dataherald" author = "Dataherald" -release = "0.0.1" +release = "main" +html_title = project # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/contributing.projects.rst b/docs/contributing.projects.rst new file mode 100644 index 00000000..79724f15 --- /dev/null +++ b/docs/contributing.projects.rst @@ -0,0 +1,11 @@ +Jumping in +==================== + +We are beyond thrilled that you are considering joining this project. There are a number of +community projects that are in development, spanning areas such as: + +* Connecting to public data sources +* Building integrations with front-end frameworks +* Testing and benchmarking new NL-to-SQL approaches proposed in academic literature + +The best place to jump in is to hop on the #projects channel on our :ref:`Discord server `_ \ No newline at end of file diff --git a/docs/envars.rst b/docs/envars.rst new file mode 100644 index 00000000..78e80012 --- /dev/null +++ b/docs/envars.rst @@ -0,0 +1,63 @@ +Environment Variables +======================= +The Dataherald engine has a number of environment variables that need to be set in order for it to work. 
The following is the sample +provided in the .env.example file with the default values. + + +.. code-block:: bash + + OPENAI_API_KEY = + ORG_ID = + LLM_MODEL = 'gpt-4-32k' + + + GOLDEN_RECORD_COLLECTION = 'my-golden-records' + PINECONE_API_KEY = + PINECONE_ENVIRONMENT = + + + API_SERVER = "dataherald.api.fastapi.FastAPI" + SQL_GENERATOR = "dataherald.sql_generator.dataherald_sqlagent.DataheraldSQLAgent" + EVALUATOR = "dataherald.eval.simple_evaluator.SimpleEvaluator" + DB = "dataherald.db.mongo.MongoDB" + VECTOR_STORE = 'dataherald.vector_store.chroma.Chroma' + CONTEXT_STORE = 'dataherald.context_store.default.DefaultContextStore' + DB_SCANNER = 'dataherald.db_scanner.sqlalchemy.SqlAlchemyScanner' + + + MONGODB_URI = "mongodb://admin:admin@mongodb:27017" + MONGODB_DB_NAME = 'dataherald' + MONGODB_DB_USERNAME = 'admin' + MONGODB_DB_PASSWORD = 'admin' + + ENCRYPT_KEY = + + S3_AWS_ACCESS_KEY_ID = + S3_AWS_SECRET_ACCESS_KEY = + + + +.. csv-table:: + :header: "Variable Name", "Description", "Default Value", "Required" + :widths: 15, 55, 25, 5 + + "OPENAI_API_KEY", "The OpenAI key used by the Dataherald Engine", "None", "Yes" + "ORG_ID", "The OpenAI Organization ID used by the Dataherald Engine", "None", "Yes" + "LLM_MODEL", "The Language Model used by the Dataherald Engine. Supported values include gpt-4-32k, gpt-4, gpt-3.5-turbo, gpt-3.5-turbo-16k", "``gpt-4-32k``", "No" + "GOLDEN_RECORD_COLLECTION", "The name of the collection in Mongo where golden records will be stored", "``my-golden-records``", "No" + "PINECONE_API_KEY", "The Pinecone API key used", "None", "Yes if using the Pinecone vector store" + "PINECONE_ENVIRONMENT", "The Pinecone environment", "None", "Yes if using the Pinecone vector store" + "API_SERVER", "The implementation of the API Module used by the Dataherald Engine.", "``dataherald.api.fastapi.FastAPI``", "Yes" + "SQL_GENERATOR", "The implementation of the SQLGenerator Module to be used.", "``dataherald.sql_generator. dataherald_sqlagent. 
DataheraldSQLAgent``", "Yes" + "EVALUATOR", "The implementation of the Evaluator Module to be used.", "``dataherald.eval. simple_evaluator.SimpleEvaluator``", "Yes" + "DB", "The implementation of the DB Module to be used.", "``dataherald.db.mongo.MongoDB``", "Yes" + "VECTOR_STORE", "The implementation of the Vector Store Module to be used. Chroma and Pinecone modules are currently included.", "``dataherald.vector_store. chroma.Chroma``", "Yes" + "CONTEXT_STORE", "The implementation of the Context Store Module to be used.", "``dataherald.context_store. default.DefaultContextStore``", "Yes" + "DB_SCANNER", "The implementation of the DB Scanner Module to be used.", "``dataherald.db_scanner. sqlalchemy.SqlAlchemyScanner``", "Yes" + "MONGODB_URI", "The URI of the MongoDB that will be used for application storage.", "``mongodb:// admin:admin@mongodb:27017``", "Yes" + "MONGODB_DB_NAME", "The name of the MongoDB database that will be used.", "``dataherald``", "Yes" + "MONGODB_DB_USERNAME", "The username of the MongoDB database", "``admin``", "Yes" + "MONGODB_DB_PASSWORD", "The password of the MongoDB database", "``admin``", "Yes" + "ENCRYPT_KEY", "The key that will be used to encrypt data at rest before storing", "None", "Yes" + "S3_AWS_ACCESS_KEY_ID", "The key used to access credential files if saved to S3", "None", "No" + "S3_AWS_SECRET_ACCESS_KEY", "The key used to access credential files if saved to S3", "None", "No" diff --git a/docs/getting_started.rst b/docs/getting_started.rst deleted file mode 100644 index 4c0fe45c..00000000 --- a/docs/getting_started.rst +++ /dev/null @@ -1,19 +0,0 @@ -.. _getting_started: - -Getting started -======================== - -Dataherald AI comes "batteries included." While the engine is modular and core modules can be easily replaced, we have included best-in-class implementations of core modules so you can get set up in minutes. - - - -.. 
toctree:: - :hidden: - - introduction - quickstart - - - - - diff --git a/docs/index.rst b/docs/index.rst index 1b5ceb23..cba4eb79 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,14 +7,38 @@ Dataherald AI ======================================== Welcome to the official documentation page of the Dataherald AI engine. This documentation is intended for developers who want to: -* Use the Dataherald AI engine to set up Natural Language interfaces from structured data in their own projects. -* Contribute to the Dataherald AI engine. +* 🖥️ Use the Dataherald AI engine to set up Natural Language interfaces from structured data in their own projects. +* 🏍️ Contribute to the Dataherald AI engine. These documents will cover how to get started, how to set up an API from your database that can answer questions in plain English and how to extend the core engine's functionality. .. toctree:: + :maxdepth: 1 + :caption: Getting Started + :hidden: + + introduction + quickstart + +.. toctree:: + :caption: References :hidden: - getting_started api - modules \ No newline at end of file + envars + modules + +.. toctree:: + :caption: Tutorials + :hidden: + + tutorial.sample_database + tutorial.finetune_sql_generator + tutorial.chatgpt_plugin + + +.. 
toctree:: + :caption: Contributing + :hidden: + + contributing.projects diff --git a/docs/introduction.rst b/docs/introduction.rst index 46cfccfc..e18bf8b2 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -12,8 +12,8 @@ You can use Dataherald to: Dataherald is built to: -* Be modular, allowing different implementations of core modules to be plugged-in -* Come batteries included: Have best-in-class implementations for modules like text to SQL, evaluation -* Be easy to set-up and use with major data warehouses -* Allow for Active Learning, allowing you to improve the performance with usage -* Be fast +* 🔌 Be modular, allowing different implementations of core modules to be plugged in +* 🔋 Come batteries included: Have best-in-class implementations for modules like text to SQL, evaluation +* 📀 Be easy to set up and use with major data warehouses +* 👨‍🏫 Allow for Active Learning, allowing you to improve the performance with usage +* 🏎️ Be fast \ No newline at end of file diff --git a/docs/tutorial.chatgpt_plugin.rst b/docs/tutorial.chatgpt_plugin.rst new file mode 100644 index 00000000..ebbea8f0 --- /dev/null +++ b/docs/tutorial.chatgpt_plugin.rst @@ -0,0 +1,4 @@ +Create a ChatGPT plug-in from your structured data +===================================================== + +Coming soon ... \ No newline at end of file diff --git a/docs/tutorial.finetune_sql_generator.rst b/docs/tutorial.finetune_sql_generator.rst new file mode 100644 index 00000000..1d67e80d --- /dev/null +++ b/docs/tutorial.finetune_sql_generator.rst @@ -0,0 +1,4 @@ +Using a Custom Text to SQL Engine +================================== + +Coming soon ... diff --git a/docs/tutorial.sample_database.rst b/docs/tutorial.sample_database.rst new file mode 100644 index 00000000..65a8fd8a --- /dev/null +++ b/docs/tutorial.sample_database.rst @@ -0,0 +1,4 @@ +Setting up a sample Database for accurate NL-to-SQL +==================================================== + +Coming soon ... 
\ No newline at end of file