Skip to content
This repository has been archived by the owner on Nov 13, 2024. It is now read-only.

Commit

Permalink
Merge remote-tracking branch 'origin/dev' into v0.1-e2e-tests
Browse files Browse the repository at this point in the history
  • Loading branch information
igiloh-pinecone committed Oct 1, 2023
2 parents 0f0931c + d359098 commit 09508e0
Show file tree
Hide file tree
Showing 34 changed files with 321 additions and 91 deletions.
50 changes: 50 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
tokenizer:
type: OpenAITokenizer
params:
model_name: gpt-3.5-turbo

knowledge_base:
params:
default_top_k: 5

record_encoder:
type: OpenAIEncoder
params:
model_name: "text-embedding-ada-002"
batch_size: 100

chunker:
type: MarkdownChunker
params:
chunk_size: 256
chunk_overlap: 0
keep_separator: true

context_engine:
context_builder:
type: StuffingContextBuilder

params:
global_metadata_filter: null # An optional metadata filter to apply to all queries

llm:
type: OpenAILLM
params:
model_name: gpt-3.5-turbo
model_params: null # Model-specific parameters. May change depending on the model.

chat_engine:
params:
max_prompt_tokens: 3000
max_generated_tokens: null # Will use the LLM's default max generated tokens
max_context_tokens: 2500
system_prompt: null # Will use the default system prompt
history_pruning: recent # Options: [raise, recent]
min_history_messages: 1

query_builder:
type: FunctionCallingQueryGenerator
params:
top_k: 5
prompt: null # Will use the default prompt
function_description: null # Will use the default function description
12 changes: 12 additions & 0 deletions config/minimal_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
knowledge_base: {} # This field is mandatory; an empty mapping initializes the default KnowledgeBase class

context_engine: {} # This field is mandatory; an empty mapping initializes the default ContextEngine class

# The `llm` field is optional. If it is omitted, no LLM will be initialized (see context_engine_only.yaml)
llm:
type: OpenAILLM
params:
model_name: gpt-3.5-turbo

# The `chat_engine` field is optional (see context_engine_only.yaml)
chat_engine: {}
9 changes: 5 additions & 4 deletions resin/chat_engine/chat_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from resin.chat_engine.query_generator import (QueryGenerator,
FunctionCallingQueryGenerator, )
from resin.context_engine import ContextEngine
from resin.knoweldge_base.tokenizer import Tokenizer
from resin.llm import BaseLLM
from resin.tokenizer import Tokenizer
from resin.llm import BaseLLM, OpenAILLM
from resin.llm.models import ModelParams, SystemMessage
from resin.models.api_models import (StreamingChatChunk, ChatResponse,
StreamingChatResponse, )
Expand Down Expand Up @@ -54,11 +54,12 @@ async def aget_context(self, messages: Messages) -> Context:
class ChatEngine(BaseChatEngine):

DEFAULT_QUERY_GENERATOR = FunctionCallingQueryGenerator
DEFAULT_LLM = OpenAILLM

def __init__(self,
*,
llm: BaseLLM,
context_engine: ContextEngine,
llm: Optional[BaseLLM] = None,
max_prompt_tokens: int = 4096,
max_generated_tokens: Optional[int] = None,
max_context_tokens: Optional[int] = None,
Expand All @@ -67,7 +68,7 @@ def __init__(self,
history_pruning: str = "recent",
min_history_messages: int = 1
):
self.llm = llm
self.llm = llm if llm is not None else self.DEFAULT_LLM()
self.context_engine = context_engine
self.max_prompt_tokens = max_prompt_tokens
self.max_generated_tokens = max_generated_tokens
Expand Down
2 changes: 1 addition & 1 deletion resin/chat_engine/history_pruner/base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from typing import Tuple

from resin.knoweldge_base.tokenizer.tokenizer import Tokenizer
from resin.tokenizer import Tokenizer
from resin.models.data_models import Messages


Expand Down
2 changes: 1 addition & 1 deletion resin/chat_engine/prompt_builder/prompt_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
RecentHistoryPruner, )
from resin.chat_engine.history_pruner.base import HistoryPruner
from resin.chat_engine.models import HistoryPruningMethod
from resin.knoweldge_base.tokenizer import Tokenizer
from resin.tokenizer import Tokenizer
from resin.models.data_models import Messages, Role, MessageBase


Expand Down
2 changes: 1 addition & 1 deletion resin/context_engine/context_builder/stuffing.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from resin.context_engine.context_builder.base import BaseContextBuilder
from resin.context_engine.models import ContextQueryResult, ContextSnippet
from resin.knoweldge_base.models import QueryResult, DocumentWithScore
from resin.knoweldge_base.tokenizer.tokenizer import Tokenizer
from resin.tokenizer import Tokenizer
from resin.models.data_models import Context


Expand Down
6 changes: 2 additions & 4 deletions resin/knoweldge_base/chunker/langchain_text_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -449,8 +449,6 @@ def get_separators_for_language(language: Language) -> List[str]:
]
elif language == Language.MARKDOWN:
return [
# add first level header to langchain implementation
"\n# ",
"\n## ",
"\n### ",
"\n#### ",
Expand All @@ -460,8 +458,8 @@ def get_separators_for_language(language: Language) -> List[str]:
# Heading level 2
# ---------------
# End of code block
"```\n\n",
# add table rows to langchain implementation
"```\n",
# add end of table to langchain implementation
'|\n\n'
# Horizontal lines
"\n\n***\n\n",
Expand Down
3 changes: 2 additions & 1 deletion resin/knoweldge_base/chunker/recursive_character.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from resin.knoweldge_base.chunker.base import Chunker
from resin.knoweldge_base.models import KBDocChunk
from resin.knoweldge_base.tokenizer.tokenizer import Tokenizer
from resin.tokenizer import Tokenizer
from resin.models.data_models import Document


Expand All @@ -30,6 +30,7 @@ def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
return [KBDocChunk(id=f"{document.id}_{i}",
document_id=document.id,
text=text_chunk,
source=document.source,
metadata=deepcopy(document.metadata))
for i, text_chunk in enumerate(text_chunks)]

Expand Down
3 changes: 2 additions & 1 deletion resin/knoweldge_base/chunker/token_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from .base import Chunker
from ..models import KBDocChunk
from ..tokenizer.tokenizer import Tokenizer
from resin.tokenizer import Tokenizer
from ...models.data_models import Document


Expand Down Expand Up @@ -45,6 +45,7 @@ def chunk_single_document(self, document: Document) -> List[KBDocChunk]:
return [KBDocChunk(id=f"{document.id}_{i}",
document_id=document.id,
text=text_chunk,
source=document.source,
metadata=document.metadata)
for i, text_chunk in enumerate(text_chunks)]

Expand Down
57 changes: 41 additions & 16 deletions resin/knoweldge_base/knowledge_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
except ImportError:
from pinecone import Index

from pinecone_datasets import Dataset, DatasetMetadata, DenseModelMetadata
from pinecone_datasets import Dataset
from pinecone_datasets import DenseModelMetadata, DatasetMetadata

from resin.knoweldge_base.base import BaseKnowledgeBase
from resin.knoweldge_base.chunker import Chunker, MarkdownChunker
from resin.knoweldge_base.record_encoder import (RecordEncoder,
DenseRecordEncoder)
OpenAIRecordEncoder)
from resin.knoweldge_base.models import (KBQueryResult, KBQuery, QueryResult,
KBDocChunkWithScore, )
from resin.knoweldge_base.reranker import Reranker, TransparentReranker
Expand All @@ -27,28 +28,29 @@
TIMEOUT_INDEX_CREATE = 300
TIMEOUT_INDEX_PROVISION = 30
INDEX_PROVISION_TIME_INTERVAL = 3
RESERVED_METADATA_KEYS = {"document_id", "text", "source"}


class KnowledgeBase(BaseKnowledgeBase):

DEFAULT_RECORD_ENCODER = DenseRecordEncoder
DEFAULT_RECORD_ENCODER = OpenAIRecordEncoder
DEFAULT_CHUNKER = MarkdownChunker
DEFAULT_RERANKER = TransparentReranker

def __init__(self,
index_name: str,
*,
encoder: Optional[RecordEncoder] = None,
record_encoder: Optional[RecordEncoder] = None,
chunker: Optional[Chunker] = None,
reranker: Optional[Reranker] = None,
default_top_k: int = 10,
default_top_k: int = 5,
):
if default_top_k < 1:
raise ValueError("default_top_k must be greater than 0")

self._index_name = self._get_full_index_name(index_name)
self._default_top_k = default_top_k
self._encoder = encoder if encoder is not None else self.DEFAULT_RECORD_ENCODER() # noqa: E501
self._encoder = record_encoder if record_encoder is not None else self.DEFAULT_RECORD_ENCODER() # noqa: E501
self._chunker = chunker if chunker is not None else self.DEFAULT_CHUNKER()
self._reranker = reranker if reranker is not None else self.DEFAULT_RERANKER()

Expand Down Expand Up @@ -114,8 +116,8 @@ def verify_connection_health(self) -> None:
def create_with_new_index(cls,
index_name: str,
*,
encoder: RecordEncoder,
chunker: Chunker,
record_encoder: Optional[RecordEncoder] = None,
chunker: Optional[Chunker] = None,
reranker: Optional[Reranker] = None,
default_top_k: int = 10,
indexed_fields: Optional[List[str]] = None,
Expand All @@ -134,8 +136,9 @@ def create_with_new_index(cls,
"Please remove it from indexed_fields")

if dimension is None:
if encoder.dimension is not None:
dimension = encoder.dimension
record_encoder = record_encoder if record_encoder is not None else cls.DEFAULT_RECORD_ENCODER() # noqa: E501
if record_encoder.dimension is not None:
dimension = record_encoder.dimension
else:
raise ValueError("Could not infer dimension from encoder. "
"Please provide the vectors' dimension")
Expand Down Expand Up @@ -174,7 +177,7 @@ def create_with_new_index(cls,

# initialize KnowledgeBase
return cls(index_name=index_name,
encoder=encoder,
record_encoder=record_encoder,
chunker=chunker,
reranker=reranker,
default_top_k=default_top_k)
Expand Down Expand Up @@ -267,6 +270,7 @@ def _query_index(self,
text=text,
document_id=document_id,
score=match['score'],
source=metadata.pop('source', ''),
metadata=metadata)
)
return KBQueryResult(query=query.text, documents=documents)
Expand All @@ -277,6 +281,15 @@ def upsert(self,
batch_size: int = 100):
self._verify_not_deleted()

for doc in documents:
metadata_keys = set(doc.metadata.keys())
forbidden_keys = metadata_keys.intersection(RESERVED_METADATA_KEYS)
if forbidden_keys:
raise ValueError(
f"Document with id {doc.id} contains reserved metadata keys: "
f"{forbidden_keys}. Please remove them and try again."
)

chunks = self._chunker.chunk_documents(documents)
encoded_chunks = self._encoder.encode_documents(chunks)

Expand Down Expand Up @@ -315,13 +328,25 @@ def upsert_dataframe(self,
batch_size: int = 100):
self._verify_not_deleted()

expected_columns = ["id", "text", "metadata"]
if not all([c in df.columns for c in expected_columns]):
required_columns = {"id", "text"}
optional_columns = {"source", "metadata"}

df_columns = set(df.columns)
if not df_columns.issuperset(required_columns):
raise ValueError(
f"Dataframe must contain the following columns: {expected_columns}"
f"Got: {df.columns}"
f"Dataframe must contain the following columns: "
f"{list(required_columns)}, Got: {list(df.columns)}"
)
documents = [Document(id=row.id, text=row.text, metadata=row.metadata)

redundant_columns = df_columns - required_columns - optional_columns
if redundant_columns:
raise ValueError(
f"Dataframe contains unknown columns: {list(redundant_columns)}. "
f"Only the following columns are allowed: "
f"{list(required_columns) + list(optional_columns)}"
)

documents = [Document(**row._asdict())
for row in df.itertuples()]
self.upsert(documents, namespace=namespace, batch_size=batch_size)

Expand Down
1 change: 1 addition & 0 deletions resin/knoweldge_base/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def to_db_record(self):
metadata = deepcopy(self.metadata)
metadata["text"] = self.text
metadata["document_id"] = self.document_id
metadata["source"] = self.source

return {
"id": self.id,
Expand Down
3 changes: 2 additions & 1 deletion resin/knoweldge_base/record_encoder/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .base import RecordEncoder
from .dense_record_encoder import DenseRecordEncoder
from .dense import DenseRecordEncoder
from .openai import OpenAIRecordEncoder
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from typing import List, Optional
from typing import List
from functools import cached_property

from pinecone_text.dense.base_dense_ecoder import BaseDenseEncoder
from pinecone_text.dense.openai_encoder import OpenAIEncoder

from .base import RecordEncoder
from resin.knoweldge_base.models import KBQuery, KBEncodedDocChunk, KBDocChunk
Expand All @@ -11,17 +10,10 @@

class DenseRecordEncoder(RecordEncoder):

DEFAULT_DENSE_ENCODER = OpenAIEncoder
DEFAULT_MODEL_NAME = "text-embedding-ada-002"

def __init__(self,
dense_encoder: Optional[BaseDenseEncoder] = None,
*,
batch_size: int = 500,
dense_encoder: BaseDenseEncoder,
**kwargs):
super().__init__(batch_size=batch_size, **kwargs)
if dense_encoder is None:
dense_encoder = self.DEFAULT_DENSE_ENCODER(self.DEFAULT_MODEL_NAME)
super().__init__(**kwargs)
self._dense_encoder = dense_encoder

def _encode_documents_batch(self,
Expand Down
25 changes: 25 additions & 0 deletions resin/knoweldge_base/record_encoder/openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from typing import List

from pinecone_text.dense.openai_encoder import OpenAIEncoder
from resin.knoweldge_base.models import KBDocChunk, KBEncodedDocChunk, KBQuery
from resin.knoweldge_base.record_encoder.dense import DenseRecordEncoder
from resin.models.data_models import Query


class OpenAIRecordEncoder(DenseRecordEncoder):
    """Dense record encoder that uses pinecone-text's ``OpenAIEncoder``.

    The dense encoder is built from ``model_name`` and handed to
    ``DenseRecordEncoder``; only the synchronous encoding path is
    available, both async batch hooks raise ``NotImplementedError``.
    """

    def __init__(self,
                 *,
                 model_name: str = "text-embedding-ada-002",
                 batch_size: int = 100,
                 **kwargs):
        """Create the OpenAI dense encoder and initialize the base class.

        Args:
            model_name: Name passed to ``OpenAIEncoder``.
            batch_size: Forwarded to the parent constructor.
            **kwargs: Any additional keyword arguments, forwarded unchanged
                to the parent constructor.
        """
        super().__init__(dense_encoder=OpenAIEncoder(model_name),
                         batch_size=batch_size,
                         **kwargs)

    async def _aencode_documents_batch(self,
                                       documents: List[KBDocChunk]
                                       ) -> List[KBEncodedDocChunk]:
        """Async document encoding hook; not implemented for this encoder."""
        raise NotImplementedError

    async def _aencode_queries_batch(self, queries: List[Query]) -> List[KBQuery]:
        """Async query encoding hook; not implemented for this encoder."""
        raise NotImplementedError
2 changes: 1 addition & 1 deletion resin/llm/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
class OpenAILLM(BaseLLM):

def __init__(self,
model_name: str,
model_name: str = "gpt-3.5-turbo",
*,
model_params: Optional[ModelParams] = None,
):
Expand Down
File renamed without changes.
Loading

0 comments on commit 09508e0

Please sign in to comment.