Skip to content

Commit

Permalink
[Release] Version 0.1.1
Browse files Browse the repository at this point in the history
What's changed:

- Update the version of ChromaDB to 0.4.13.
- Re-generate the `poetry.lock` file.
- Update Python scripts to migrate ChromaDB from 0.3.21 to 0.4.13
  (See https://docs.trychroma.com/migration).
- Update the README to clarify when to delete an existing vector database.
- Update the chatbot UI template to use "PaLM" to refer to the AI model.

Sept 28, 2023
  • Loading branch information
kyolee415 committed Sep 29, 2023
1 parent e437473 commit c88997c
Show file tree
Hide file tree
Showing 8 changed files with 1,790 additions and 1,975 deletions.
11 changes: 6 additions & 5 deletions demos/palm/python/docs-agent/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -487,15 +487,16 @@ To convert Markdown files to plain text files:

### 2. Populate a new vector database

**Important**: If the `vector_stores/chroma` directory already exists, delete
(or move) the `chroma` directory before populating a new vector database. Also,
if the Docs Agent chat app is already running using this `chroma` directory, shut down
the app before deleting the directory.

Once you have plain text files processed and stored in the `output_path` directory,
you can run the `populate_vector_database.py` script to populate a vector database
with the contents of the plain text files and their embeddings (and metadata).

**Important**: For a clean setup, if the `vector_stores/chroma` directory already
exists, delete (or move) the `chroma` directory before populating a new vector
database. (Otherwise, new entries will be added to your existing vector database.)
Also, if the Docs Agent chat app is already running using this `chroma` directory,
shut down the app before deleting the directory.

To populate a new vector database:

1. Go to the Docs Agent project directory, for example:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ <h4 id="rewrite-question-header">Question:</h4>
<span id="rewrite-question-span">
<p>{{ question | replace("+", " ") | replace("%3F", "?")}}</p>
</span>
<h4 id="rewrite-response-header">Bard's response:</h4>
<h4 id="rewrite-response-header">PaLM's response:</h4>
<span id="rewrite-original-response-span">
{{ response_in_html | safe }}
</span>
Expand Down
7 changes: 1 addition & 6 deletions demos/palm/python/docs-agent/chroma.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,7 @@ class Chroma:
"""Chroma wrapper"""

def __init__(self, chroma_dir) -> None:
self.client = chromadb.Client(
Settings(
chroma_db_impl="duckdb+parquet",
persist_directory=chroma_dir,
)
)
self.client = chromadb.PersistentClient(path=chroma_dir)

def list_collections(self):
return self.client.list_collections()
Expand Down
3,716 changes: 1,764 additions & 1,952 deletions demos/palm/python/docs-agent/poetry.lock

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions demos/palm/python/docs-agent/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docs-agent"
version = "0.1.0"
version = "0.1.1"
description = ""
authors = ["Docs Agent contributors"]
readme = "README.md"
Expand All @@ -11,7 +11,7 @@ rich = "^13.3.5"
Markdown = "^3.4.3"
beautifulsoup4 = "^4.12.2"
protobuf = ">=3.20"
chromadb = "^0.3.21"
chromadb = "==0.4.13"
sentence-transformers = "^2.2.2"
ratelimit = "^2.2.1"
absl-py = "^1.4.0"
Expand All @@ -21,6 +21,7 @@ google-generativeai = "^0.1.0"
grpcio = "^1.57.0"
grpcio-tools = "^1.57.0"
uuid = "^1.30"
pytz = ">=2020.1"

[tool.poetry.group.dev.dependencies]
ipython = "^8.13.2"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ def process_page_and_section_titles(markdown_text):
page_title = data["title"]
markdown_text = data.content
metadata = data.metadata
if "URL" in data:
final_url = data["URL"]
metadata["URL"] = final_url
for line in markdown_text.split("\n"):
new_line = ""
skip_this_line = False
Expand All @@ -173,7 +176,7 @@ def process_page_and_section_titles(markdown_text):
# Detect Markdown heading levels
if heading == "#":
page_title = captured_title.strip()
metadata = {"title": page_title}
metadata["title"] = page_title
subsection_title = ""
section_title = ""
elif heading == "##":
Expand Down
15 changes: 10 additions & 5 deletions demos/palm/python/docs-agent/scripts/populate_vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,7 @@
MODEL = os.path.join(BASE_DIR, "models/all-mpnet-base-v2")
emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=MODEL)

chroma_client = chromadb.Client(
Settings(chroma_db_impl="duckdb+parquet", persist_directory=LOCAL_VECTOR_DB_DIR)
)

chroma_client = chromadb.PersistentClient(path=LOCAL_VECTOR_DB_DIR)

# Create embed function for PaLM
# API call limit to 5 qps
Expand Down Expand Up @@ -175,6 +172,8 @@ def embed_function(texts: Documents) -> Embeddings:
# Using the full path avoids mismatches
full_file_name = FULL_BASE_DIR + clean_filename + file
metadata_dict_extra = {}
# Flag to see if there is a predefined URL from frontmatter
final_url = False
# Reads the metadata associated with files
for key in index:
if full_file_name in index[key]:
Expand All @@ -197,6 +196,10 @@ def embed_function(texts: Documents) -> Embeddings:
index[key][full_file_name]["metadata"], delimiter="_"
)
metadata_dict_extra = dict(metadata_dict_extra)
# Extracts user specified URL
if "URL" in metadata_dict_extra:
final_url = True
final_url_value = metadata_dict_extra["URL"]
else:
metadata_dict_extra = {}
if "UUID" in index[key][full_file_name]:
Expand All @@ -216,6 +219,9 @@ def embed_function(texts: Documents) -> Embeddings:
# Remove .md at the end of URLs by default.
match3 = re.search(r"(.*)\.md$", url)
url = match3[1]
# Replaces the URL if it comes from frontmatter
if (final_url):
url = final_url_value
# Creates a dictionary with basic metadata values
# (i.e. source, URL, and md_hash)
metadata_dict_main = {
Expand Down Expand Up @@ -287,7 +293,6 @@ def embed_function(texts: Documents) -> Embeddings:
print("[Warning] Empty file!")
print("")
auto.close()
chroma_client.persist()
# results = collection.query(
# query_texts=["What are some differences between apples and oranges?"],
# n_results=3,
Expand Down
4 changes: 1 addition & 3 deletions demos/palm/python/docs-agent/scripts/test_vector_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,7 @@ def embed_palm(texts: Documents) -> Embeddings:
ai_console = Console(width=160)
ai_console.rule("Fold")

chroma_client = chromadb.Client(
Settings(chroma_db_impl="duckdb+parquet", persist_directory=LOCAL_VECTOR_DB_DIR)
)
chroma_client = chromadb.PersistentClient(path=LOCAL_VECTOR_DB_DIR)

if EMBEDDINGS_TYPE == "PALM":
PALM_EMBEDDING_MODEL = "models/embedding-gecko-001"
Expand Down

0 comments on commit c88997c

Please sign in to comment.