From 73a42cfd8bb37f5df3c57b6b1e9c94d171d3a385 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 23 Sep 2024 11:04:47 -0400 Subject: [PATCH 01/26] Add new LangChain llm module. --- ads/llm/langchain/plugins/llms/__init__.py | 0 ..._data_science_model_deployment_endpoint.py | 927 ++++++++++++++++++ 2 files changed, 927 insertions(+) create mode 100644 ads/llm/langchain/plugins/llms/__init__.py create mode 100644 ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py diff --git a/ads/llm/langchain/plugins/llms/__init__.py b/ads/llm/langchain/plugins/llms/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py new file mode 100644 index 000000000..cb67a5855 --- /dev/null +++ b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py @@ -0,0 +1,927 @@ +import json +import logging +from typing import ( + Any, + AsyncIterator, + Callable, + Dict, + Iterator, + List, + Literal, + Optional, + Union, +) + +import aiohttp +import requests +from langchain_core.callbacks import ( + AsyncCallbackManagerForLLMRun, + CallbackManagerForLLMRun, +) +from langchain_core.language_models.llms import BaseLLM, create_base_retry_decorator +from langchain_core.load.serializable import Serializable +from langchain_core.outputs import Generation, GenerationChunk, LLMResult +from langchain_core.pydantic_v1 import Field +from langchain_core.utils import get_from_dict_or_env, pre_init + +from langchain_community.utilities.requests import Requests + +logger = logging.getLogger(__name__) + + +DEFAULT_TIME_OUT = 300 +DEFAULT_CONTENT_TYPE_JSON = "application/json" +DEFAULT_MODEL_NAME = "odsc-llm" + + +class TokenExpiredError(Exception): + """Raises when token expired.""" + + +class ServerError(Exception): + """Raises when encounter server error when making inference.""" + + +def _create_retry_decorator( + llm: "BaseOCIModelDeployment", + *, + run_manager: Optional[ + Union[AsyncCallbackManagerForLLMRun, CallbackManagerForLLMRun] + ] = None, +) -> Callable[[Any], Any]: + """Create a retry decorator.""" + errors = [requests.exceptions.ConnectTimeout, TokenExpiredError] + decorator = create_base_retry_decorator( + error_types=errors, max_retries=llm.max_retries, run_manager=run_manager + ) + return decorator + + +class BaseOCIModelDeployment(Serializable): + """Base class for LLM deployed on OCI Data Science Model Deployment.""" + + auth: dict = Field(default_factory=dict, exclude=True) + """ADS auth dictionary for OCI authentication: + https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/authentication.html. + This can be generated by calling `ads.common.auth.api_keys()` + or `ads.common.auth.resource_principal()`. If this is not + provided then the `ads.common.default_signer()` will be used.""" + + endpoint: str = "" + """The uri of the endpoint from the deployed Model Deployment model.""" + + streaming: bool = False + """Whether to stream the results or not.""" + + max_retries: int = 3 + """Maximum number of retries to make when generating.""" + + @pre_init + def validate_environment( # pylint: disable=no-self-argument + cls, values: Dict + ) -> Dict: + """Validate that python package exists in environment.""" + try: + import ads + + except ImportError as ex: + raise ImportError( + "Could not import ads python package. " + "Please install it with `pip install oracle_ads`." + ) from ex + + if not values.get("auth", None): + values["auth"] = ads.common.auth.default_signer() + + values["endpoint"] = get_from_dict_or_env( + values, + "endpoint", + "OCI_LLM_ENDPOINT", + ) + return values + + def _headers( + self, is_async: Optional[bool] = False, body: Optional[dict] = None + ) -> Dict: + """Construct and return the headers for a request. + + Args: + is_async (bool, optional): Indicates if the request is asynchronous. + Defaults to `False`. + body (optional): The request body to be included in the headers if + the request is asynchronous. + + Returns: + Dict: A dictionary containing the appropriate headers for the request. + """ + if is_async: + signer = self.auth["signer"] + _req = requests.Request("POST", self.endpoint, json=body) + req = _req.prepare() + req = signer(req) + headers = {} + for key, value in req.headers.items(): + headers[key] = value + + if self.streaming: + headers.update( + {"enable-streaming": "true", "Accept": "text/event-stream"} + ) + return headers + + return ( + { + "Content-Type": DEFAULT_CONTENT_TYPE_JSON, + "enable-streaming": "true", + "Accept": "text/event-stream", + } + if self.streaming + else { + "Content-Type": DEFAULT_CONTENT_TYPE_JSON, + } + ) + + def completion_with_retry( + self, run_manager: Optional[CallbackManagerForLLMRun] = None, **kwargs: Any + ) -> Any: + """Use tenacity to retry the completion call.""" + retry_decorator = _create_retry_decorator(self, run_manager=run_manager) + + @retry_decorator + def _completion_with_retry(**kwargs: Any) -> Any: + try: + request_timeout = kwargs.pop("request_timeout", DEFAULT_TIME_OUT) + data = kwargs.pop("data") + stream = kwargs.pop("stream", self.streaming) + + request = Requests( + headers=self._headers(), auth=self.auth.get("signer") + ) + response = request.post( + url=self.endpoint, + data=data, + timeout=request_timeout, + stream=stream, + **kwargs, + ) + self._check_response(response) + return response + except TokenExpiredError as e: + raise e + except Exception as err: + logger.debug( + f"Requests payload: {data}. Requests arguments: " + f"url={self.endpoint},timeout={request_timeout},stream={stream}. " + f"Additional request kwargs={kwargs}." + ) + raise RuntimeError( + f"Error occurs by inference endpoint: {str(err)}" + ) from err + + return _completion_with_retry(**kwargs) + + async def acompletion_with_retry( + self, + run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> Any: + """Use tenacity to retry the async completion call.""" + retry_decorator = _create_retry_decorator(self, run_manager=run_manager) + + @retry_decorator + async def _completion_with_retry(**kwargs: Any) -> Any: + try: + request_timeout = kwargs.pop("request_timeout", DEFAULT_TIME_OUT) + data = kwargs.pop("data") + stream = kwargs.pop("stream", self.streaming) + + request = Requests(headers=self._headers(is_async=True, body=data)) + if stream: + response = request.apost( + url=self.endpoint, + data=data, + timeout=request_timeout, + ) + return self._aiter_sse(response) + else: + async with request.apost( + url=self.endpoint, + data=data, + timeout=request_timeout, + ) as resp: + self._check_response(resp) + data = await resp.json() + return data + except TokenExpiredError as e: + raise e + except Exception as err: + logger.debug( + f"Requests payload: `{data}`. " + f"Stream mode={stream}. " + f"Requests kwargs: url={self.endpoint}, timeout={request_timeout}." + ) + raise RuntimeError( + f"Error occurs by inference endpoint: {str(err)}" + ) from err + + return await _completion_with_retry(**kwargs) + + def _check_response(self, response: Any) -> None: + """Handle server error by checking the response status. + + Args: + response: + The response object from either `requests` or `aiohttp` library. + + Raises: + TokenExpiredError: + If the response status code is 401 and the token refresh is successful. + ServerError: + If any other HTTP error occurs. + """ + try: + response.raise_for_status() + except requests.exceptions.HTTPError as http_err: + status_code = ( + response.status_code + if hasattr(response, "status_code") + else response.status + ) + if status_code == 401 and self._refresh_signer(): + raise TokenExpiredError() from http_err + + raise ServerError( + f"Server error: {str(http_err)}. \nMessage: {response.text}" + ) from http_err + + def _parse_stream(self, lines: Iterator[bytes]) -> Iterator[str]: + """Parse a stream of byte lines and yield parsed string lines. + + Args: + lines (Iterator[bytes]): + An iterator that yields lines in byte format. + + Yields: + Iterator[str]: + An iterator that yields parsed lines as strings. + """ + for line in lines: + _line = self._parse_stream_line(line) + if _line is not None: + yield _line + + async def _parse_stream_async( + self, + lines: aiohttp.StreamReader, + ) -> AsyncIterator[str]: + """ + Asynchronously parse a stream of byte lines and yield parsed string lines. + + Args: + lines (aiohttp.StreamReader): + An `aiohttp.StreamReader` object that yields lines in byte format. + + Yields: + AsyncIterator[str]: + An asynchronous iterator that yields parsed lines as strings. + """ + async for line in lines: + _line = self._parse_stream_line(line) + if _line is not None: + yield _line + + def _parse_stream_line(self, line: bytes) -> Optional[str]: + """Parse a single byte line and return a processed string line if valid. + + Args: + line (bytes): A single line in byte format. + + Returns: + Optional[str]: + The processed line as a string if valid, otherwise `None`. + """ + line = line.strip() + if line: + _line = line.decode("utf-8") + if "[DONE]" in _line: + return None + + if _line.lower().startswith("data:"): + return _line[5:].lstrip() + return None + + async def _aiter_sse( + self, + async_cntx_mgr: Any, + ) -> AsyncIterator[str]: + """Asynchronously iterate over server-sent events (SSE). + + Args: + async_cntx_mgr: An asynchronous context manager that yields a client + response object. + + Yields: + AsyncIterator[str]: An asynchronous iterator that yields parsed server-sent + event lines as json string. + """ + async with async_cntx_mgr as client_resp: + self._check_response(client_resp) + async for line in self._parse_stream_async(client_resp.content): + yield line + + def _refresh_signer(self) -> bool: + """Attempt to refresh the security token using the signer. + + Returns: + bool: `True` if the token was successfully refreshed, `False` otherwise. + """ + if self.auth.get("signer", None) and hasattr( + self.auth["signer"], "refresh_security_token" + ): + self.auth["signer"].refresh_security_token() + return True + return False + + +class OCIModelDeploymentLLM(BaseLLM, BaseOCIModelDeployment): + """LLM deployed on OCI Data Science Model Deployment. + + To use, you must provide the model HTTP endpoint from your deployed + model, e.g. https://modeldeployment..oci.customer-oci.com//predict. + + To authenticate, `oracle-ads` has been used to automatically load + credentials: https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/authentication.html + + Make sure to have the required policies to access the OCI Data + Science Model Deployment endpoint. See: + https://docs.oracle.com/en-us/iaas/data-science/using/model-dep-policies-auth.htm#model_dep_policies_auth__predict-endpoint + + Example: + + .. code-block:: python + + from langchain_community.llms import OCIModelDeploymentLLM + + llm = OCIModelDeploymentLLM( + endpoint="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predict", + model="odsc-llm", + streaming=True, + model_kwargs={"frequency_penalty": 1.0}, + ) + llm.invoke("tell me a joke.") + + Customized Usage: + + User can inherit from our base class and overrwrite the `_process_response`, `_process_stream_response`, + `_construct_json_body` for satisfying customized needed. + + .. code-block:: python + + from langchain_community.llms import OCIModelDeploymentLLM + + class MyCutomizedModel(OCIModelDeploymentLLM): + def _process_stream_response(self, response_json:dict) -> GenerationChunk: + print("My customized output stream handler.") + return GenerationChunk() + + def _process_response(self, response_json:dict) -> List[Generation]: + print("My customized output handler.") + return [Generation()] + + def _construct_json_body(self, prompt: str, param:dict) -> dict: + print("My customized input handler.") + return {} + + llm = MyCutomizedModel( + endpoint=f"https://modeldeployment.us-ashburn-1.oci.customer-oci.com/{ocid}/predict", + model="", + } + + llm.invoke("tell me a joke.") + + """ # noqa: E501 + + model: str = DEFAULT_MODEL_NAME + """The name of the model.""" + + max_tokens: int = 256 + """Denotes the number of tokens to predict per generation.""" + + temperature: float = 0.2 + """A non-negative float that tunes the degree of randomness in generation.""" + + k: int = -1 + """Number of most likely tokens to consider at each step.""" + + p: float = 0.75 + """Total probability mass of tokens to consider at each step.""" + + best_of: int = 1 + """Generates best_of completions server-side and returns the "best" + (the one with the highest log probability per token). + """ + + stop: Optional[List[str]] = None + """Stop words to use when generating. Model output is cut off + at the first occurrence of any of these substrings.""" + + model_kwargs: Dict[str, Any] = Field(default_factory=dict) + """Keyword arguments to pass to the model.""" + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "oci_model_deployment_endpoint" + + @classmethod + def is_lc_serializable(cls) -> bool: + """Return whether this model can be serialized by Langchain.""" + return True + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters.""" + return { + "best_of": self.best_of, + "max_tokens": self.max_tokens, + "model": self.model, + "stop": self.stop, + "stream": self.streaming, + "temperature": self.temperature, + "top_k": self.k, + "top_p": self.p, + } + + @property + def _identifying_params(self) -> Dict[str, Any]: + """Get the identifying parameters.""" + _model_kwargs = self.model_kwargs or {} + return { + **{"endpoint": self.endpoint, "model_kwargs": _model_kwargs}, + **self._default_params, + } + + def _generate( + self, + prompts: List[str], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> LLMResult: + """Call out to OCI Data Science Model Deployment endpoint with k unique prompts. + + Args: + prompts: The prompts to pass into the service. + stop: Optional list of stop words to use when generating. + + Returns: + The full LLM output. + + Example: + .. code-block:: python + + response = llm.invoke("Tell me a joke.") + response = llm.generate(["Tell me a joke."]) + """ + generations: List[List[Generation]] = [] + params = self._invocation_params(stop, **kwargs) + for prompt in prompts: + body = self._construct_json_body(prompt, params) + if self.streaming: + generation = GenerationChunk(text="") + for chunk in self._stream( + prompt, stop=stop, run_manager=run_manager, **kwargs + ): + generation += chunk + generations.append([generation]) + else: + res = self.completion_with_retry( + data=body, + run_manager=run_manager, + **kwargs, + ) + generations.append(self._process_response(res.json())) + return LLMResult(generations=generations) + + async def _agenerate( + self, + prompts: List[str], + stop: Optional[List[str]] = None, + run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> LLMResult: + """Call out to OCI Data Science Model Deployment endpoint async with k unique prompts. + + Args: + prompts: The prompts to pass into the service. + stop: Optional list of stop words to use when generating. + + Returns: + The full LLM output. + + Example: + .. code-block:: python + + response = await llm.ainvoke("Tell me a joke.") + response = await llm.agenerate(["Tell me a joke."]) + """ # noqa: E501 + generations: List[List[Generation]] = [] + params = self._invocation_params(stop, **kwargs) + for prompt in prompts: + body = self._construct_json_body(prompt, params) + if self.streaming: + generation = GenerationChunk(text="") + async for chunk in self._astream( + prompt, stop=stop, run_manager=run_manager, **kwargs + ): + generation += chunk + generations.append([generation]) + else: + res = await self.acompletion_with_retry( + data=body, + run_manager=run_manager, + **kwargs, + ) + generations.append(self._process_response(res)) + return LLMResult(generations=generations) + + def _stream( + self, + prompt: str, + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> Iterator[GenerationChunk]: + """Stream OCI Data Science Model Deployment endpoint on given prompt. + + + Args: + prompt (str): + The prompt to pass into the model. + stop (List[str], Optional): + List of stop words to use when generating. + kwargs: + requests_kwargs: + Additional ``**kwargs`` to pass to requests.post + + Returns: + An iterator of GenerationChunks. + + + Example: + + .. code-block:: python + + response = llm.stream("Tell me a joke.") + + """ + requests_kwargs = kwargs.pop("requests_kwargs", {}) + self.streaming = True + params = self._invocation_params(stop, **kwargs) + body = self._construct_json_body(prompt, params) + + response = self.completion_with_retry( + data=body, run_manager=run_manager, stream=True, **requests_kwargs + ) + + for line in self._parse_stream(response.iter_lines()): + chunk = self._handle_sse_line(line) + if run_manager: + run_manager.on_llm_new_token(chunk.text, chunk=chunk) + yield chunk + + async def _astream( + self, + prompt: str, + stop: Optional[List[str]] = None, + run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> AsyncIterator[GenerationChunk]: + """Stream OCI Data Science Model Deployment endpoint async on given prompt. + + + Args: + prompt (str): + The prompt to pass into the model. + stop (List[str], Optional): + List of stop words to use when generating. + kwargs: + requests_kwargs: + Additional ``**kwargs`` to pass to requests.post + + Returns: + An iterator of GenerationChunks. + + + Example: + + .. code-block:: python + + async for chunk in llm.astream(("Tell me a joke."): + print(chunk, end="", flush=True) + + """ + requests_kwargs = kwargs.pop("requests_kwargs", {}) + self.streaming = True + params = self._invocation_params(stop, **kwargs) + body = self._construct_json_body(prompt, params) + + async for line in await self.acompletion_with_retry( + data=body, run_manager=run_manager, stream=True, **requests_kwargs + ): + chunk = self._handle_sse_line(line) + if run_manager: + await run_manager.on_llm_new_token(chunk.text, chunk=chunk) + yield chunk + + def _construct_json_body(self, prompt: str, params: dict) -> dict: + """Constructs the request body as a dictionary (JSON).""" + return { + "prompt": prompt, + **params, + } + + def _invocation_params( + self, stop: Optional[List[str]] = None, **kwargs: Any + ) -> dict: + """Combines the invocation parameters with default parameters.""" + params = self._default_params + _model_kwargs = self.model_kwargs or {} + params["stop"] = stop or params.get("stop", []) + return {**params, **_model_kwargs, **kwargs} + + def _process_stream_response(self, response_json: dict) -> GenerationChunk: + """Formats streaming response for OpenAI spec into GenerationChunk.""" + try: + choice = response_json["choices"][0] + if not isinstance(choice, dict): + raise TypeError("Endpoint response is not well formed.") + except (KeyError, IndexError, TypeError) as e: + raise ValueError("Error while formatting response payload.") from e + + return GenerationChunk(text=choice.get("text", "")) + + def _process_response(self, response_json: dict) -> List[Generation]: + """Formats response in OpenAI spec. + + Args: + response_json (dict): The JSON response from the chat model endpoint. + + Returns: + ChatResult: An object containing the list of `ChatGeneration` objects + and additional LLM output information. + + Raises: + ValueError: If the response JSON is not well-formed or does not + contain the expected structure. + + """ + generations = [] + try: + choices = response_json["choices"] + if not isinstance(choices, list): + raise TypeError("Endpoint response is not well formed.") + except (KeyError, TypeError) as e: + raise ValueError("Error while formatting response payload.") from e + + for choice in choices: + gen = Generation( + text=choice.get("text"), + generation_info=self._generate_info(choice), + ) + generations.append(gen) + + return generations + + def _generate_info(self, choice: dict) -> Any: + """Extracts generation info from the response.""" + gen_info = {} + finish_reason = choice.get("finish_reason", None) + logprobs = choice.get("logprobs", None) + index = choice.get("index", None) + if finish_reason: + gen_info.update({"finish_reason": finish_reason}) + if logprobs is not None: + gen_info.update({"logprobs": logprobs}) + if index is not None: + gen_info.update({"index": index}) + + return gen_info or None + + def _handle_sse_line(self, line: str) -> GenerationChunk: + try: + obj = json.loads(line) + return self._process_stream_response(obj) + except Exception: + return GenerationChunk(text="") + + +class OCIModelDeploymentTGI(OCIModelDeploymentLLM): + """OCI Data Science Model Deployment TGI Endpoint. + + To use, you must provide the model HTTP endpoint from your deployed + model, e.g. https://modeldeployment..oci.customer-oci.com//predict. + + To authenticate, `oracle-ads` has been used to automatically load + credentials: https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/authentication.html + + Make sure to have the required policies to access the OCI Data + Science Model Deployment endpoint. See: + https://docs.oracle.com/en-us/iaas/data-science/using/model-dep-policies-auth.htm#model_dep_policies_auth__predict-endpoint + + Example: + .. code-block:: python + + from langchain_community.llms import OCIModelDeploymentTGI + + llm = OCIModelDeploymentTGI( + endpoint="https://modeldeployment..oci.customer-oci.com//predict", + api="/v1/completions", + streaming=True, + temperature=0.2, + seed=42, + # other model parameters ... + ) + + """ + + api: Literal["/generate", "/v1/completions"] = "/generate" + """Api spec.""" + + frequency_penalty: float = 0.0 + """Penalizes repeated tokens according to frequency. Between 0 and 1.""" + + seed: Optional[int] = None + """Random sampling seed""" + + repetition_penalty: Optional[float] = None + """The parameter for repetition penalty. 1.0 means no penalty.""" + + suffix: Optional[str] = None + """The text to append to the prompt. """ + + do_sample: bool = True + """If set to True, this parameter enables decoding strategies such as + multi-nominal sampling, beam-search multi-nominal sampling, Top-K + sampling and Top-p sampling. + """ + + watermark: bool = True + """Watermarking with `A Watermark for Large Language Models `_. + Defaults to True.""" + + return_full_text: bool = False + """Whether to prepend the prompt to the generated text. Defaults to False.""" + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "oci_model_deployment_tgi_endpoint" + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters for invoking OCI model deployment TGI endpoint.""" + return ( + { + "model": self.model, # can be any + "frequency_penalty": self.frequency_penalty, + "max_tokens": self.max_tokens, + "repetition_penalty": self.repetition_penalty, + "temperature": self.temperature, + "top_p": self.p, + "seed": self.seed, + "stream": self.streaming, + "suffix": self.suffix, + "stop": self.stop, + } + if self.api == "/v1/completions" + else { + "best_of": self.best_of, + "max_new_tokens": self.max_tokens, + "temperature": self.temperature, + "top_k": ( + self.k if self.k > 0 else None + ), # `top_k` must be strictly positive' + "top_p": self.p, + "do_sample": self.do_sample, + "return_full_text": self.return_full_text, + "watermark": self.watermark, + "stop": self.stop, + } + ) + + @property + def _identifying_params(self) -> Dict[str, Any]: + """Get the identifying parameters.""" + _model_kwargs = self.model_kwargs or {} + return { + **{ + "endpoint": self.endpoint, + "api": self.api, + "model_kwargs": _model_kwargs, + }, + **self._default_params, + } + + def _construct_json_body(self, prompt: str, params: dict) -> dict: + """Construct request payload.""" + if self.api == "/v1/completions": + return super()._construct_json_body(prompt, params) + + return { + "inputs": prompt, + "parameters": params, + } + + def _process_response(self, response_json: dict) -> List[Generation]: + """Formats response.""" + if self.api == "/v1/completions": + return super()._process_response(response_json) + + try: + text = response_json["generated_text"] + except KeyError as e: + raise ValueError( + f"Error while formatting response payload.response_json={response_json}" + ) from e + + return [Generation(text=text)] + + +class OCIModelDeploymentVLLM(OCIModelDeploymentLLM): + """VLLM deployed on OCI Data Science Model Deployment + + To use, you must provide the model HTTP endpoint from your deployed + model, e.g. https://modeldeployment..oci.customer-oci.com//predict. + + To authenticate, `oracle-ads` has been used to automatically load + credentials: https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/authentication.html + + Make sure to have the required policies to access the OCI Data + Science Model Deployment endpoint. See: + https://docs.oracle.com/en-us/iaas/data-science/using/model-dep-policies-auth.htm#model_dep_policies_auth__predict-endpoint + + Example: + .. code-block:: python + + from langchain_community.llms import OCIModelDeploymentVLLM + + llm = OCIModelDeploymentVLLM( + endpoint="https://modeldeployment..oci.customer-oci.com//predict", + model="odsc-llm", + streaming=False, + temperature=0.2, + max_tokens=512, + n=3, + best_of=3, + # other model parameters + ) + + """ + + n: int = 1 + """Number of output sequences to return for the given prompt.""" + + k: int = -1 + """Number of most likely tokens to consider at each step.""" + + frequency_penalty: float = 0.0 + """Penalizes repeated tokens according to frequency. Between 0 and 1.""" + + presence_penalty: float = 0.0 + """Penalizes repeated tokens. Between 0 and 1.""" + + use_beam_search: bool = False + """Whether to use beam search instead of sampling.""" + + ignore_eos: bool = False + """Whether to ignore the EOS token and continue generating tokens after + the EOS token is generated.""" + + logprobs: Optional[int] = None + """Number of log probabilities to return per output token.""" + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "oci_model_deployment_vllm_endpoint" + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters for calling vllm.""" + return { + "best_of": self.best_of, + "frequency_penalty": self.frequency_penalty, + "ignore_eos": self.ignore_eos, + "logprobs": self.logprobs, + "max_tokens": self.max_tokens, + "model": self.model, + "n": self.n, + "presence_penalty": self.presence_penalty, + "stop": self.stop, + "stream": self.streaming, + "temperature": self.temperature, + "top_k": self.k, + "top_p": self.p, + "use_beam_search": self.use_beam_search, + } From 42859cf8a3d06e8f3f6cee12fbc91f17e536f585 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 23 Sep 2024 11:05:00 -0400 Subject: [PATCH 02/26] Add LangChain chat model module. --- .../langchain/plugins/chat_models/__init__.py | 0 .../plugins/chat_models/oci_data_science.py | 894 ++++++++++++++++++ 2 files changed, 894 insertions(+) create mode 100644 ads/llm/langchain/plugins/chat_models/__init__.py create mode 100644 ads/llm/langchain/plugins/chat_models/oci_data_science.py diff --git a/ads/llm/langchain/plugins/chat_models/__init__.py b/ads/llm/langchain/plugins/chat_models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ads/llm/langchain/plugins/chat_models/oci_data_science.py b/ads/llm/langchain/plugins/chat_models/oci_data_science.py new file mode 100644 index 000000000..251590a78 --- /dev/null +++ b/ads/llm/langchain/plugins/chat_models/oci_data_science.py @@ -0,0 +1,894 @@ +import json +import logging +from operator import itemgetter +from typing import ( + Any, + AsyncIterator, + Dict, + Iterator, + List, + Literal, + Optional, + Type, + Union, +) + +from langchain_core.callbacks import ( + AsyncCallbackManagerForLLMRun, + CallbackManagerForLLMRun, +) +from langchain_core.language_models import LanguageModelInput +from langchain_core.language_models.chat_models import ( + BaseChatModel, + agenerate_from_stream, + generate_from_stream, +) +from langchain_core.messages import AIMessageChunk, BaseMessage, BaseMessageChunk +from langchain_core.output_parsers import ( + JsonOutputParser, + PydanticOutputParser, +) +from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult +from langchain_core.pydantic_v1 import BaseModel, Field +from langchain_core.runnables import Runnable, RunnableMap, RunnablePassthrough + +from langchain_community.adapters.openai import ( + convert_dict_to_message, + convert_message_to_dict, +) +from langchain_community.chat_models.openai import _convert_delta_to_message_chunk +from langchain_community.llms.oci_data_science_model_deployment_endpoint import ( + DEFAULT_MODEL_NAME, + BaseOCIModelDeployment, +) + +logger = logging.getLogger(__name__) + + +def _is_pydantic_class(obj: Any) -> bool: + return isinstance(obj, type) and issubclass(obj, BaseModel) + + +class ChatOCIModelDeployment(BaseChatModel, BaseOCIModelDeployment): + """OCI Data Science Model Deployment chat model integration. + + To use, you must provide the model HTTP endpoint from your deployed + chat model, e.g. https://modeldeployment..oci.customer-oci.com//predict. + + To authenticate, `oracle-ads` has been used to automatically load + credentials: https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/authentication.html + + Make sure to have the required policies to access the OCI Data + Science Model Deployment endpoint. See: + https://docs.oracle.com/en-us/iaas/data-science/using/model-dep-policies-auth.htm#model_dep_policies_auth__predict-endpoint + + Instantiate: + .. code-block:: python + + from langchain_community.chat_models import ChatOCIModelDeployment + + chat = ChatOCIModelDeployment( + endpoint="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predict", + model="odsc-llm", + streaming=True, + max_retries=3, + model_kwargs={ + "max_token": 512, + "temperature": 0.2, + # other model parameters ... + }, + ) + + Invocation: + .. code-block:: python + + messages = [ + ("system", "You are a helpful translator. Translate the user sentence to French."), + ("human", "Hello World!"), + ] + chat.invoke(messages) + + .. code-block:: python + + AIMessage( + content='Bonjour le monde!',response_metadata={'token_usage': {'prompt_tokens': 40, 'total_tokens': 50, 'completion_tokens': 10},'model_name': 'odsc-llm','system_fingerprint': '','finish_reason': 'stop'},id='run-cbed62da-e1b3-4abd-9df3-ec89d69ca012-0') + + Streaming: + .. code-block:: python + + for chunk in chat.stream(messages): + print(chunk) + + .. code-block:: python + + content='' id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + content='\n' id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + content='B' id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + content='on' id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + content='j' id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + content='our' id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + content=' le' id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + content=' monde' id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + content='!' id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + content='' response_metadata={'finish_reason': 'stop'} id='run-23df02c6-c43f-42de-87c6-8ad382e125c3' + + Asyc: + .. code-block:: python + + await chat.ainvoke(messages) + + # stream: + # async for chunk in (await chat.astream(messages)) + + .. code-block:: python + + AIMessage(content='Bonjour le monde!', response_metadata={'finish_reason': 'stop'}, id='run-8657a105-96b7-4bb6-b98e-b69ca420e5d1-0') + + Structured output: + .. code-block:: python + + from typing import Optional + + from langchain_core.pydantic_v1 import BaseModel, Field + + class Joke(BaseModel): + setup: str = Field(description="The setup of the joke") + punchline: str = Field(description="The punchline to the joke") + + structured_llm = chat.with_structured_output(Joke, method="json_mode") + structured_llm.invoke( + "Tell me a joke about cats, respond in JSON with `setup` and `punchline` keys" + ) + + .. code-block:: python + + Joke(setup='Why did the cat get stuck in the tree?',punchline='Because it was chasing its tail!') + + See ``ChatOCIModelDeployment.with_structured_output()`` for more. + + Customized Usage: + + You can inherit from base class and overrwrite the `_process_response`, `_process_stream_response`, + `_construct_json_body` for satisfying customized needed. + + .. code-block:: python + + class MyChatModel(ChatOCIModelDeployment): + def _process_stream_response(self, response_json: dict) -> ChatGenerationChunk: + print("My customized streaming result handler.") + return GenerationChunk(...) + + def _process_response(self, response_json:dict) -> ChatResult: + print("My customized output handler.") + return ChatResult(...) + + def _construct_json_body(self, messages: list, params: dict) -> dict: + print("My customized payload handler.") + return { + "messages": messages, + **params, + } + + chat = MyChatModel( + endpoint=f"https://modeldeployment.us-ashburn-1.oci.customer-oci.com/{ocid}/predict", + model="odsc-llm", + } + + chat.invoke("tell me a joke") + + """ # noqa: E501 + + model_kwargs: Dict[str, Any] = Field(default_factory=dict) + """Keyword arguments to pass to the model.""" + + model: str = DEFAULT_MODEL_NAME + """The name of the model.""" + + stop: Optional[List[str]] = None + """Stop words to use when generating. Model output is cut off + at the first occurrence of any of these substrings.""" + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "oci_model_depolyment_chat_endpoint" + + @property + def _identifying_params(self) -> Dict[str, Any]: + """Get the identifying parameters.""" + _model_kwargs = self.model_kwargs or {} + return { + **{"endpoint": self.endpoint, "model_kwargs": _model_kwargs}, + **self._default_params, + } + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters.""" + return { + "model": self.model, + "stop": self.stop, + "stream": self.streaming, + } + + def _generate( + self, + messages: List[BaseMessage], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> ChatResult: + """Call out to an OCI Model Deployment Online endpoint. + + Args: + messages: The messages in the conversation with the chat model. + stop: Optional list of stop words to use when generating. + + Returns: + LangChain ChatResult + + Raises: + RuntimeError: + Raise when invoking endpoint fails. + + Example: + + .. code-block:: python + + messages = [ + ( + "system", + "You are a helpful assistant that translates English to French. Translate the user sentence.", + ), + ("human", "Hello World!"), + ] + + response = chat.invoke(messages) + """ # noqa: E501 + if self.streaming: + stream_iter = self._stream( + messages, stop=stop, run_manager=run_manager, **kwargs + ) + return generate_from_stream(stream_iter) + + requests_kwargs = kwargs.pop("requests_kwargs", {}) + params = self._invocation_params(stop, **kwargs) + body = self._construct_json_body(messages, params) + res = self.completion_with_retry( + data=body, run_manager=run_manager, **requests_kwargs + ) + return self._process_response(res.json()) + + def _stream( + self, + messages: List[BaseMessage], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> Iterator[ChatGenerationChunk]: + """Stream OCI Data Science Model Deployment endpoint on given messages. + + Args: + messages (List[BaseMessage]): + The messagaes to pass into the model. + stop (List[str], Optional): + List of stop words to use when generating. + kwargs: + requests_kwargs: + Additional ``**kwargs`` to pass to requests.post + + Returns: + An iterator of ChatGenerationChunk. + + Raises: + RuntimeError: + Raise when invoking endpoint fails. + + Example: + + .. code-block:: python + + messages = [ + ( + "system", + "You are a helpful assistant that translates English to French. Translate the user sentence.", + ), + ("human", "Hello World!"), + ] + + chunk_iter = chat.stream(messages) + + """ # noqa: E501 + requests_kwargs = kwargs.pop("requests_kwargs", {}) + self.streaming = True + params = self._invocation_params(stop, **kwargs) + body = self._construct_json_body(messages, params) # request json body + + response = self.completion_with_retry( + data=body, run_manager=run_manager, stream=True, **requests_kwargs + ) + default_chunk_class = AIMessageChunk + for line in self._parse_stream(response.iter_lines()): + # print(">>>" + str(line)) + chunk = self._handle_sse_line(line, default_chunk_class) + if run_manager: + run_manager.on_llm_new_token(chunk.text, chunk=chunk) + yield chunk + + async def _agenerate( + self, + messages: List[BaseMessage], + stop: Optional[List[str]] = None, + run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> ChatResult: + """Asynchronously call out to OCI Data Science Model Deployment + endpoint on given messages. + + Args: + messages (List[BaseMessage]): + The messagaes to pass into the model. + stop (List[str], Optional): + List of stop words to use when generating. + kwargs: + requests_kwargs: + Additional ``**kwargs`` to pass to requests.post + + Returns: + LangChain ChatResult. + + Raises: + ValueError: + Raise when invoking endpoint fails. + + Example: + + .. code-block:: python + + messages = [ + ( + "system", + "You are a helpful assistant that translates English to French. Translate the user sentence.", + ), + ("human", "I love programming."), + ] + + resp = await chat.ainvoke(messages) + + """ # noqa: E501 + if self.streaming: + stream_iter = self._astream( + messages, stop=stop, run_manager=run_manager, **kwargs + ) + return await agenerate_from_stream(stream_iter) + + requests_kwargs = kwargs.pop("requests_kwargs", {}) + params = self._invocation_params(stop, **kwargs) + body = self._construct_json_body(messages, params) + response = await self.acompletion_with_retry( + data=body, + run_manager=run_manager, + **requests_kwargs, + ) + return self._process_response(response) + + async def _astream( + self, + messages: List[BaseMessage], + stop: Optional[List[str]] = None, + run_manager: Optional[AsyncCallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> AsyncIterator[ChatGenerationChunk]: + """Asynchronously streaming OCI Data Science Model Deployment + endpoint on given messages. + + Args: + messages (List[BaseMessage]): + The messagaes to pass into the model. + stop (List[str], Optional): + List of stop words to use when generating. + kwargs: + requests_kwargs: + Additional ``**kwargs`` to pass to requests.post + + Returns: + An Asynciterator of ChatGenerationChunk. + + Raises: + ValueError: + Raise when invoking endpoint fails. + + Example: + + .. code-block:: python + + messages = [ + ( + "system", + "You are a helpful assistant that translates English to French. Translate the user sentence.", + ), + ("human", "I love programming."), + ] + + chunk_iter = await chat.astream(messages) + + """ # noqa: E501 + requests_kwargs = kwargs.pop("requests_kwargs", {}) + self.streaming = True + params = self._invocation_params(stop, **kwargs) + body = self._construct_json_body(messages, params) # request json body + + default_chunk_class = AIMessageChunk + async for line in await self.acompletion_with_retry( + data=body, run_manager=run_manager, stream=True, **requests_kwargs + ): + chunk = self._handle_sse_line(line, default_chunk_class) + if run_manager: + await run_manager.on_llm_new_token(chunk.text, chunk=chunk) + yield chunk + + def with_structured_output( + self, + schema: Optional[Union[Dict, Type[BaseModel]]] = None, + *, + method: Literal["json_mode"] = "json_mode", + include_raw: bool = False, + **kwargs: Any, + ) -> Runnable[LanguageModelInput, Union[Dict, BaseModel]]: + """Model wrapper that returns outputs formatted to match the given schema. + + Args: + schema: The output schema as a dict or a Pydantic class. If a Pydantic class + then the model output will be an object of that class. If a dict then + the model output will be a dict. With a Pydantic class the returned + attributes will be validated, whereas with a dict they will not be. If + `method` is "function_calling" and `schema` is a dict, then the dict + must match the OpenAI function-calling spec. + method: The method for steering model generation, currently only support + for "json_mode". If "json_mode" then JSON mode will be used. Note that + if using "json_mode" then you must include instructions for formatting + the output into the desired schema into the model call. + include_raw: If False then only the parsed structured output is returned. If + an error occurs during model output parsing it will be raised. If True + then both the raw model response (a BaseMessage) and the parsed model + response will be returned. If an error occurs during output parsing it + will be caught and returned as well. The final output is always a dict + with keys "raw", "parsed", and "parsing_error". + + Returns: + A Runnable that takes any ChatModel input and returns as output: + + If include_raw is True then a dict with keys: + raw: BaseMessage + parsed: Optional[_DictOrPydantic] + parsing_error: Optional[BaseException] + + If include_raw is False then just _DictOrPydantic is returned, + where _DictOrPydantic depends on the schema: + + If schema is a Pydantic class then _DictOrPydantic is the Pydantic + class. + + If schema is a dict then _DictOrPydantic is a dict. + + """ # noqa: E501 + if kwargs: + raise ValueError(f"Received unsupported arguments {kwargs}") + is_pydantic_schema = _is_pydantic_class(schema) + if method == "json_mode": + llm = self.bind(response_format={"type": "json_object"}) + output_parser = ( + PydanticOutputParser(pydantic_object=schema) # type: ignore[type-var, arg-type] + if is_pydantic_schema + else JsonOutputParser() + ) + else: + raise ValueError( + f"Unrecognized method argument. Expected `json_mode`." + f"Received: `{method}`." + ) + + if include_raw: + parser_assign = RunnablePassthrough.assign( + parsed=itemgetter("raw") | output_parser, parsing_error=lambda _: None + ) + parser_none = RunnablePassthrough.assign(parsed=lambda _: None) + parser_with_fallback = parser_assign.with_fallbacks( + [parser_none], exception_key="parsing_error" + ) + return RunnableMap(raw=llm) | parser_with_fallback + else: + return llm | output_parser + + def _invocation_params(self, stop: Optional[List[str]], **kwargs: Any) -> dict: + """Combines the invocation parameters with default parameters.""" + params = self._default_params + _model_kwargs = self.model_kwargs or {} + params["stop"] = stop or params.get("stop", []) + return {**params, **_model_kwargs, **kwargs} + + def _handle_sse_line( + self, line: str, default_chunk_cls: Type[BaseMessageChunk] = AIMessageChunk + ) -> ChatGenerationChunk: + """Handle a single Server-Sent Events (SSE) line and process it into + a chat generation chunk. + + Args: + line (str): A single line from the SSE stream in string format. + default_chunk_cls (AIMessageChunk): The default class for message + chunks to be used during the processing of the stream response. + + Returns: + ChatGenerationChunk: The processed chat generation chunk. If an error + occurs, an empty `ChatGenerationChunk` is returned. + """ + try: + obj = json.loads(line) + return self._process_stream_response(obj, default_chunk_cls) + except Exception as e: + logger.debug(f"Error occurs when processing line={line}: {str(e)}") + return ChatGenerationChunk(message=AIMessageChunk(content="")) + + def _construct_json_body(self, messages: list, params: dict) -> dict: + """Constructs the request body as a dictionary (JSON). + + Args: + messages (list): A list of message objects to be included in the + request body. + params (dict): A dictionary of additional parameters to be included + in the request body. + + Returns: + dict: A dictionary representing the JSON request body, including + converted messages and additional parameters. + + """ + return { + "messages": [convert_message_to_dict(m) for m in messages], + **params, + } + + def _process_stream_response( + self, + response_json: dict, + default_chunk_cls: Type[BaseMessageChunk] = AIMessageChunk, + ) -> ChatGenerationChunk: + """Formats streaming response in OpenAI spec. + + Args: + response_json (dict): The JSON response from the streaming endpoint. + default_chunk_cls (type, optional): The default class to use for + creating message chunks. Defaults to `AIMessageChunk`. + + Returns: + ChatGenerationChunk: An object containing the processed message + chunk and any relevant generation information such as finish + reason and usage. + + Raises: + ValueError: If the response JSON is not well-formed or does not + contain the expected structure. + """ + try: + choice = response_json["choices"][0] + if not isinstance(choice, dict): + raise TypeError("Endpoint response is not well formed.") + except (KeyError, IndexError, TypeError) as e: + raise ValueError( + "Error while formatting response payload for chat model of type" + ) from e + + print("***" + choice["delta"]) + + chunk = _convert_delta_to_message_chunk(choice["delta"], default_chunk_cls) + default_chunk_cls = chunk.__class__ + finish_reason = choice.get("finish_reason") + usage = choice.get("usage") + gen_info = {} + if finish_reason is not None: + gen_info.update({"finish_reason": finish_reason}) + if usage is not None: + gen_info.update({"usage": usage}) + + return ChatGenerationChunk( + message=chunk, generation_info=gen_info if gen_info else None + ) + + def _process_response(self, response_json: dict) -> ChatResult: + """Formats response in OpenAI spec. + + Args: + response_json (dict): The JSON response from the chat model endpoint. + + Returns: + ChatResult: An object containing the list of `ChatGeneration` objects + and additional LLM output information. + + Raises: + ValueError: If the response JSON is not well-formed or does not + contain the expected structure. + + """ + generations = [] + try: + choices = response_json["choices"] + if not isinstance(choices, list): + raise TypeError("Endpoint response is not well formed.") + except (KeyError, TypeError) as e: + raise ValueError( + "Error while formatting response payload for chat model of type" + ) from e + + for choice in choices: + message = convert_dict_to_message(choice["message"]) + generation_info = dict(finish_reason=choice.get("finish_reason")) + if "logprobs" in choice: + generation_info["logprobs"] = choice["logprobs"] + + gen = ChatGeneration( + message=message, + generation_info=generation_info, + ) + generations.append(gen) + + token_usage = response_json.get("usage", {}) + llm_output = { + "token_usage": token_usage, + "model_name": self.model, + "system_fingerprint": response_json.get("system_fingerprint", ""), + } + return ChatResult(generations=generations, llm_output=llm_output) + + +class ChatOCIModelDeploymentVLLM(ChatOCIModelDeployment): + """OCI large language chat models deployed with vLLM. + + To use, you must provide the model HTTP endpoint from your deployed + model, e.g. https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predict. + + To authenticate, `oracle-ads` has been used to automatically load + credentials: https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/authentication.html + + Make sure to have the required policies to access the OCI Data + Science Model Deployment endpoint. See: + https://docs.oracle.com/en-us/iaas/data-science/using/model-dep-policies-auth.htm#model_dep_policies_auth__predict-endpoint + + Example: + + .. code-block:: python + + from langchain_community.chat_models import ChatOCIModelDeploymentVLLM + + chat = ChatOCIModelDeploymentVLLM( + endpoint="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predict", + frequency_penalty=0.1, + max_tokens=512, + temperature=0.2, + top_p=1.0, + # other model parameters... + ) + + """ # noqa: E501 + + frequency_penalty: float = 0.0 + """Penalizes repeated tokens according to frequency. Between 0 and 1.""" + + logit_bias: Optional[Dict[str, float]] = None + """Adjust the probability of specific tokens being generated.""" + + max_tokens: Optional[int] = 256 + """The maximum number of tokens to generate in the completion.""" + + n: int = 1 + """Number of output sequences to return for the given prompt.""" + + presence_penalty: float = 0.0 + """Penalizes repeated tokens. Between 0 and 1.""" + + temperature: float = 0.2 + """What sampling temperature to use.""" + + top_p: float = 1.0 + """Total probability mass of tokens to consider at each step.""" + + best_of: Optional[int] = None + """Generates best_of completions server-side and returns the "best" + (the one with the highest log probability per token). + """ + + use_beam_search: Optional[bool] = False + """Whether to use beam search instead of sampling.""" + + top_k: Optional[int] = -1 + """Number of most likely tokens to consider at each step.""" + + min_p: Optional[float] = 0.0 + """Float that represents the minimum probability for a token to be considered. + Must be in [0,1]. 0 to disable this.""" + + repetition_penalty: Optional[float] = 1.0 + """Float that penalizes new tokens based on their frequency in the + generated text. Values > 1 encourage the model to use new tokens.""" + + length_penalty: Optional[float] = 1.0 + """Float that penalizes sequences based on their length. Used only + when `use_beam_search` is True.""" + + early_stopping: Optional[bool] = False + """Controls the stopping condition for beam search. It accepts the + following values: `True`, where the generation stops as soon as there + are `best_of` complete candidates; `False`, where a heuristic is applied + to the generation stops when it is very unlikely to find better candidates; + `never`, where the beam search procedure only stops where there cannot be + better candidates (canonical beam search algorithm).""" + + ignore_eos: Optional[bool] = False + """Whether to ignore the EOS token and continue generating tokens after + the EOS token is generated.""" + + min_tokens: Optional[int] = 0 + """Minimum number of tokens to generate per output sequence before + EOS or stop_token_ids can be generated""" + + stop_token_ids: Optional[List[int]] = None + """List of tokens that stop the generation when they are generated. + The returned output will contain the stop tokens unless the stop tokens + are special tokens.""" + + skip_special_tokens: Optional[bool] = True + """Whether to skip special tokens in the output. Defaults to True.""" + + spaces_between_special_tokens: Optional[bool] = True + """Whether to add spaces between special tokens in the output. + Defaults to True.""" + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "oci_model_depolyment_chat_endpoint_vllm" + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters.""" + params = { + "model": self.model, + "stop": self.stop, + "stream": self.streaming, + } + for attr_name in self._get_model_params(): + try: + value = getattr(self, attr_name) + if value is not None: + params.update({attr_name: value}) + except Exception: + pass + + return params + + def _get_model_params(self) -> List[str]: + """Gets the name of model parameters.""" + return [ + "best_of", + "early_stopping", + "frequency_penalty", + "ignore_eos", + "length_penalty", + "logit_bias", + "logprobs", + "max_tokens", + "min_p", + "min_tokens", + "n", + "presence_penalty", + "repetition_penalty", + "skip_special_tokens", + "spaces_between_special_tokens", + "stop_token_ids", + "temperature", + "top_k", + "top_p", + "use_beam_search", + ] + + +class ChatOCIModelDeploymentTGI(ChatOCIModelDeployment): + """OCI large language chat models deployed with Text Generation Inference. + + To use, you must provide the model HTTP endpoint from your deployed + model, e.g. https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predict. + + To authenticate, `oracle-ads` has been used to automatically load + credentials: https://accelerated-data-science.readthedocs.io/en/latest/user_guide/cli/authentication.html + + Make sure to have the required policies to access the OCI Data + Science Model Deployment endpoint. See: + https://docs.oracle.com/en-us/iaas/data-science/using/model-dep-policies-auth.htm#model_dep_policies_auth__predict-endpoint + + Example: + + .. code-block:: python + + from langchain_community.chat_models import ChatOCIModelDeploymentTGI + + chat = ChatOCIModelDeploymentTGI( + endpoint="https://modeldeployment.us-ashburn-1.oci.customer-oci.com//predict", + max_token=512, + temperature=0.2, + frequency_penalty=0.1, + seed=42, + # other model parameters... + ) + + """ # noqa: E501 + + frequency_penalty: Optional[float] = None + """Penalizes repeated tokens according to frequency. Between 0 and 1.""" + + logit_bias: Optional[Dict[str, float]] = None + """Adjust the probability of specific tokens being generated.""" + + logprobs: Optional[bool] = None + """Whether to return log probabilities of the output tokens or not.""" + + max_tokens: int = 256 + """The maximum number of tokens to generate in the completion.""" + + n: int = 1 + """Number of output sequences to return for the given prompt.""" + + presence_penalty: Optional[float] = None + """Penalizes repeated tokens. Between 0 and 1.""" + + seed: Optional[int] = None + """To sample deterministically,""" + + temperature: float = 0.2 + """What sampling temperature to use.""" + + top_p: Optional[float] = None + """Total probability mass of tokens to consider at each step.""" + + top_logprobs: Optional[int] = None + """An integer between 0 and 5 specifying the number of most + likely tokens to return at each token position, each with an + associated log probability. logprobs must be set to true if + this parameter is used.""" + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "oci_model_depolyment_chat_endpoint_tgi" + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters.""" + params = { + "model": self.model, + "stop": self.stop, + "stream": self.streaming, + } + for attr_name in self._get_model_params(): + try: + value = getattr(self, attr_name) + if value is not None: + params.update({attr_name: value}) + except Exception: + pass + + return params + + def _get_model_params(self) -> List[str]: + """Gets the name of model parameters.""" + return [ + "frequency_penalty", + "logit_bias", + "logprobs", + "max_tokens", + "n", + "presence_penalty", + "seed", + "temperature", + "top_k", + "top_p", + "top_logprobs", + ] From 1494347db85eda4d8106238c0aa05accfb968bd8 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 23 Sep 2024 11:07:02 -0400 Subject: [PATCH 03/26] Remove LangChain plugins for OCI generative AI. --- ads/llm/langchain/plugins/embeddings.py | 64 ----- ads/llm/langchain/plugins/llm_gen_ai.py | 301 ------------------------ 2 files changed, 365 deletions(-) delete mode 100644 ads/llm/langchain/plugins/embeddings.py delete mode 100644 ads/llm/langchain/plugins/llm_gen_ai.py diff --git a/ads/llm/langchain/plugins/embeddings.py b/ads/llm/langchain/plugins/embeddings.py deleted file mode 100644 index 4dc9a77a4..000000000 --- a/ads/llm/langchain/plugins/embeddings.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*-- - -# Copyright (c) 2023, 2024 Oracle and/or its affiliates. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ - -from typing import List, Optional -from langchain.load.serializable import Serializable -from langchain.schema.embeddings import Embeddings -from ads.llm.langchain.plugins.base import GenerativeAiClientModel - - -class GenerativeAIEmbeddings(GenerativeAiClientModel, Embeddings, Serializable): - """OCI Generative AI embedding models.""" - - model: str = "cohere.embed-english-light-v2.0" - """Model name to use.""" - - truncate: Optional[str] = None - """Truncate embeddings that are too long from start or end ("NONE"|"START"|"END")""" - - @classmethod - def get_lc_namespace(cls) -> List[str]: - """Get the namespace of the LangChain object.""" - return ["ads", "llm"] - - @classmethod - def is_lc_serializable(cls) -> bool: - """This class can be serialized with default LangChain serialization.""" - return True - - def embed_documents(self, texts: List[str]) -> List[List[float]]: - """Embeds a list of strings. - - Args: - texts: The list of texts to embed. - - Returns: - List of embeddings, one for each text. - """ - from oci.generative_ai_inference.models import ( - EmbedTextDetails, - OnDemandServingMode, - ) - - details = EmbedTextDetails( - compartment_id=self.compartment_id, - inputs=texts, - serving_mode=OnDemandServingMode(model_id=self.model), - truncate=self.truncate, - ) - embeddings = self.client.embed_text(details).data.embeddings - return [list(map(float, e)) for e in embeddings] - - def embed_query(self, text: str) -> List[float]: - """Embeds a single string. - - Args: - text: The text to embed. - - Returns: - Embeddings for the text. - """ - return self.embed_documents([text])[0] diff --git a/ads/llm/langchain/plugins/llm_gen_ai.py b/ads/llm/langchain/plugins/llm_gen_ai.py deleted file mode 100644 index 6aafe9e03..000000000 --- a/ads/llm/langchain/plugins/llm_gen_ai.py +++ /dev/null @@ -1,301 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*-- - -# Copyright (c) 2023, 2024 Oracle and/or its affiliates. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ - -import logging -from typing import Any, Dict, List, Optional - -from langchain.callbacks.manager import CallbackManagerForLLMRun - -from ads.llm.langchain.plugins.base import BaseLLM, GenerativeAiClientModel -from ads.llm.langchain.plugins.contant import Task - -logger = logging.getLogger(__name__) - - -class GenerativeAI(GenerativeAiClientModel, BaseLLM): - """GenerativeAI Service. - - To use, you should have the ``oci`` python package installed. - - Example - ------- - - .. code-block:: python - - from ads.llm import GenerativeAI - - gen_ai = GenerativeAI(compartment_id="ocid1.compartment.oc1..") - - """ - - task: str = "text_generation" - """Task can be either text_generation or text_summarization.""" - - model: Optional[str] = "cohere.command" - """Model name to use.""" - - frequency_penalty: float = None - """Penalizes repeated tokens according to frequency. Between 0 and 1.""" - - presence_penalty: float = None - """Penalizes repeated tokens. Between 0 and 1.""" - - truncate: Optional[str] = None - """Specify how the client handles inputs longer than the maximum token.""" - - length: str = "AUTO" - """Indicates the approximate length of the summary. """ - - format: str = "PARAGRAPH" - """Indicates the style in which the summary will be delivered - in a free form paragraph or in bullet points.""" - - extractiveness: str = "AUTO" - """Controls how close to the original text the summary is. High extractiveness summaries will lean towards reusing sentences verbatim, while low extractiveness summaries will tend to paraphrase more.""" - - additional_command: str = "" - """A free-form instruction for modifying how the summaries get generated. """ - - @property - def _identifying_params(self) -> Dict[str, Any]: - """Get the identifying parameters.""" - return { - **{ - "model": self.model, - "task": self.task, - "client_kwargs": self.client_kwargs, - "endpoint_kwargs": self.endpoint_kwargs, - }, - **self._default_params, - } - - @property - def _llm_type(self) -> str: - """Return type of llm.""" - return "GenerativeAI" - - @property - def _default_params(self) -> Dict[str, Any]: - """Get the default parameters for calling OCIGenerativeAI API.""" - # This property is used by _identifying_params(), which then used for serialization - # All parameters returning here should be JSON serializable. - - return ( - { - "compartment_id": self.compartment_id, - "temperature": self.temperature, - "max_tokens": self.max_tokens, - "top_k": self.k, - "top_p": self.p, - "frequency_penalty": self.frequency_penalty, - "presence_penalty": self.presence_penalty, - "truncate": self.truncate, - } - if self.task == Task.TEXT_GENERATION - else { - "compartment_id": self.compartment_id, - "temperature": self.temperature, - "length": self.length, - "format": self.format, - "extractiveness": self.extractiveness, - "additional_command": self.additional_command, - } - ) - - def _invocation_params(self, stop: Optional[List[str]], **kwargs: Any) -> dict: - params = self._default_params - if self.task == Task.TEXT_SUMMARIZATION: - return {**params} - - if self.stop is not None and stop is not None: - raise ValueError("`stop` found in both the input and default params.") - elif self.stop is not None: - params["stop_sequences"] = self.stop - else: - params["stop_sequences"] = stop - return {**params, **kwargs} - - def _call( - self, - prompt: str, - stop: Optional[List[str]] = None, - run_manager: Optional[CallbackManagerForLLMRun] = None, - **kwargs: Any, - ): - """Call out to GenerativeAI's generate endpoint. - - Parameters - ---------- - prompt (str): - The prompt to pass into the model. - stop (List[str], Optional): - List of stop words to use when generating. - - Returns - ------- - The string generated by the model. - - Example - ------- - - .. code-block:: python - - response = gen_ai("Tell me a joke.") - """ - - params = self._invocation_params(stop, **kwargs) - self._print_request(prompt, params) - - try: - completion = self.completion_with_retry(prompt=prompt, **params) - except Exception: - logger.error( - "Error occur when invoking oci service api." - "DEBUG INTO: task=%s, params=%s, prompt=%s", - self.task, - params, - prompt, - ) - raise - - return completion - - def _text_generation(self, request_class, serving_mode, **kwargs): - from oci.generative_ai_inference.models import ( - GenerateTextDetails, - GenerateTextResult, - ) - - compartment_id = kwargs.pop("compartment_id") - inference_request = request_class(**kwargs) - response = self.client.generate_text( - GenerateTextDetails( - compartment_id=compartment_id, - serving_mode=serving_mode, - inference_request=inference_request, - ), - **self.endpoint_kwargs, - ).data - response: GenerateTextResult - return response.inference_response - - def _cohere_completion(self, serving_mode, **kwargs) -> str: - from oci.generative_ai_inference.models import ( - CohereLlmInferenceRequest, - CohereLlmInferenceResponse, - ) - - response = self._text_generation( - CohereLlmInferenceRequest, serving_mode, **kwargs - ) - response: CohereLlmInferenceResponse - if kwargs.get("num_generations", 1) == 1: - completion = response.generated_texts[0].text - else: - completion = [result.text for result in response.generated_texts] - self._print_response(completion, response) - return completion - - def _llama_completion(self, serving_mode, **kwargs) -> str: - from oci.generative_ai_inference.models import ( - LlamaLlmInferenceRequest, - LlamaLlmInferenceResponse, - ) - - # truncate and stop_sequence are not supported. - kwargs.pop("truncate", None) - kwargs.pop("stop_sequences", None) - # top_k must be >1 or -1 - if "top_k" in kwargs and kwargs["top_k"] == 0: - kwargs.pop("top_k") - - # top_p must be 1 when temperature is 0 - if kwargs.get("temperature") == 0: - kwargs["top_p"] = 1 - - response = self._text_generation( - LlamaLlmInferenceRequest, serving_mode, **kwargs - ) - response: LlamaLlmInferenceResponse - if kwargs.get("num_generations", 1) == 1: - completion = response.choices[0].text - else: - completion = [result.text for result in response.choices] - self._print_response(completion, response) - return completion - - def _cohere_summarize(self, serving_mode, **kwargs) -> str: - from oci.generative_ai_inference.models import SummarizeTextDetails - - kwargs["input"] = kwargs.pop("prompt") - - response = self.client.summarize_text( - SummarizeTextDetails(serving_mode=serving_mode, **kwargs), - **self.endpoint_kwargs, - ) - return response.data.summary - - def completion_with_retry(self, **kwargs: Any) -> Any: - from oci.generative_ai_inference.models import OnDemandServingMode - - serving_mode = OnDemandServingMode(model_id=self.model) - - if self.task == Task.TEXT_SUMMARIZATION: - return self._cohere_summarize(serving_mode, **kwargs) - elif self.model.startswith("cohere"): - return self._cohere_completion(serving_mode, **kwargs) - elif self.model.startswith("meta.llama"): - return self._llama_completion(serving_mode, **kwargs) - raise ValueError(f"Model {self.model} is not supported.") - - def batch_completion( - self, - prompt: str, - stop: Optional[List[str]] = None, - run_manager: Optional[CallbackManagerForLLMRun] = None, - num_generations: int = 1, - **kwargs: Any, - ) -> List[str]: - """Generates multiple completion for the given prompt. - - Parameters - ---------- - prompt (str): - The prompt to pass into the model. - stop: (List[str], optional): - Optional list of stop words to use when generating. Defaults to None. - num_generations (int, optional): - Number of completions aims to get. Defaults to 1. - - Raises - ------ - NotImplementedError - Raise when invoking batch_completion under summarization task. - - Returns - ------- - List[str] - List of multiple completions. - - Example - ------- - - .. code-block:: python - - responses = gen_ai.batch_completion("Tell me a joke.", num_generations=5) - - """ - if self.task == Task.TEXT_SUMMARIZATION: - raise NotImplementedError( - f"task={Task.TEXT_SUMMARIZATION} does not support batch_completion. " - ) - - return self._call( - prompt=prompt, - stop=stop, - run_manager=run_manager, - num_generations=num_generations, - **kwargs, - ) From 1ff5374c281fd1e290d2e1d2850c2090849be435 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 23 Sep 2024 11:42:27 -0400 Subject: [PATCH 04/26] Remove old versions of llm plugins. --- ads/llm/__init__.py | 13 +- ads/llm/langchain/plugins/base.py | 118 ---------- ads/llm/langchain/plugins/contant.py | 44 ---- ads/llm/langchain/plugins/llm_md.py | 316 --------------------------- ads/llm/serialize.py | 7 +- 5 files changed, 12 insertions(+), 486 deletions(-) delete mode 100644 ads/llm/langchain/plugins/base.py delete mode 100644 ads/llm/langchain/plugins/contant.py delete mode 100644 ads/llm/langchain/plugins/llm_md.py diff --git a/ads/llm/__init__.py b/ads/llm/__init__.py index 35f552f82..8f5fb1370 100644 --- a/ads/llm/__init__.py +++ b/ads/llm/__init__.py @@ -6,10 +6,15 @@ try: import langchain - from ads.llm.langchain.plugins.llm_gen_ai import GenerativeAI - from ads.llm.langchain.plugins.llm_md import ModelDeploymentTGI - from ads.llm.langchain.plugins.llm_md import ModelDeploymentVLLM - from ads.llm.langchain.plugins.embeddings import GenerativeAIEmbeddings + from ads.llm.langchain.plugins.llms.oci_data_science_model_deployment_endpoint import ( + OCIModelDeploymentVLLM, + OCIModelDeploymentTGI, + ) + from ads.llm.langchain.plugins.chat_models.oci_data_science import ( + ChatOCIModelDeployment, + ChatOCIModelDeploymentVLLM, + ChatOCIModelDeploymentTGI, + ) except ImportError as ex: if ex.name == "langchain": raise ImportError( diff --git a/ads/llm/langchain/plugins/base.py b/ads/llm/langchain/plugins/base.py deleted file mode 100644 index d9f260832..000000000 --- a/ads/llm/langchain/plugins/base.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*-- - -# Copyright (c) 2023, 2024 Oracle and/or its affiliates. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -from typing import Any, Dict, List, Optional - -from langchain.llms.base import LLM -from langchain.pydantic_v1 import BaseModel, Field, root_validator - -from ads import logger -from ads.common.auth import default_signer -from ads.config import COMPARTMENT_OCID - - -class BaseLLM(LLM): - """Base OCI LLM class. Contains common attributes.""" - - auth: dict = Field(default_factory=default_signer, exclude=True) - """ADS auth dictionary for OCI authentication. - This can be generated by calling `ads.common.auth.api_keys()` or `ads.common.auth.resource_principal()`. - If this is not provided then the `ads.common.default_signer()` will be used.""" - - max_tokens: int = 256 - """Denotes the number of tokens to predict per generation.""" - - temperature: float = 0.2 - """A non-negative float that tunes the degree of randomness in generation.""" - - k: int = 0 - """Number of most likely tokens to consider at each step.""" - - p: int = 0.75 - """Total probability mass of tokens to consider at each step.""" - - stop: Optional[List[str]] = None - """Stop words to use when generating. Model output is cut off at the first occurrence of any of these substrings.""" - - def _print_request(self, prompt, params): - if self.verbose: - print(f"LLM API Request:\n{prompt}") - - def _print_response(self, completion, response): - if self.verbose: - print(f"LLM API Completion:\n{completion}") - - @classmethod - def get_lc_namespace(cls) -> List[str]: - """Get the namespace of the LangChain object.""" - return ["ads", "llm"] - - @classmethod - def is_lc_serializable(cls) -> bool: - """This class can be serialized with default LangChain serialization.""" - return True - - -class GenerativeAiClientModel(BaseModel): - """Base model for generative AI embedding model and LLM.""" - - # This auth is the same as the auth in BaseLLM class. - # However, this is needed for the Gen AI embedding model. - # Do not remove this attribute - auth: dict = Field(default_factory=default_signer, exclude=True) - """ADS auth dictionary for OCI authentication. - This can be generated by calling `ads.common.auth.api_keys()` or `ads.common.auth.resource_principal()`. - If this is not provided then the `ads.common.default_signer()` will be used.""" - - client: Any #: :meta private: - """OCI GenerativeAiClient.""" - - compartment_id: str = None - """Compartment ID of the caller.""" - - endpoint_kwargs: Dict[str, Any] = {} - """Optional attributes passed to the OCI API call.""" - - client_kwargs: Dict[str, Any] = {} - """Holds any client parameters for creating GenerativeAiClient""" - - @staticmethod - def _import_client(): - try: - from oci.generative_ai_inference import GenerativeAiInferenceClient - except ImportError as ex: - raise ImportError( - "Could not import GenerativeAiInferenceClient from oci. " - "The OCI SDK installed does not support generative AI service." - ) from ex - return GenerativeAiInferenceClient - - @root_validator() - def validate_environment( # pylint: disable=no-self-argument - cls, values: Dict - ) -> Dict: - """Validate that python package exists in environment.""" - # Initialize client only if user does not pass in client. - # Users may choose to initialize the OCI client by themselves and pass it into this model. - logger.warning( - f"The ads langchain plugin {cls.__name__} will be deprecated soon. " - "Please refer to https://python.langchain.com/v0.2/docs/integrations/providers/oci/ " - "for the latest support." - ) - if not values.get("client"): - auth = values.get("auth", {}) - client_kwargs = auth.get("client_kwargs") or {} - client_kwargs.update(values["client_kwargs"]) - # Import the GenerativeAIClient here so that there will be no error when user import ads.llm - # and the install OCI SDK does not support generative AI service yet. - client_class = cls._import_client() - values["client"] = client_class(**auth, **client_kwargs) - # Set default compartment ID - if not values.get("compartment_id"): - if COMPARTMENT_OCID: - values["compartment_id"] = COMPARTMENT_OCID - else: - raise ValueError("Please specify compartment_id.") - return values diff --git a/ads/llm/langchain/plugins/contant.py b/ads/llm/langchain/plugins/contant.py deleted file mode 100644 index 7116de1fa..000000000 --- a/ads/llm/langchain/plugins/contant.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*-- - -# Copyright (c) 2023, 2024 Oracle and/or its affiliates. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -from enum import Enum - - -class StrEnum(str, Enum): - """Enum with string members - https://docs.python.org/3.11/library/enum.html#enum.StrEnum - """ - - # Pydantic uses Python's standard enum classes to define choices. - # https://docs.pydantic.dev/latest/api/standard_library_types/#enum - - -DEFAULT_TIME_OUT = 300 -DEFAULT_CONTENT_TYPE_JSON = "application/json" - - -class Task(StrEnum): - TEXT_GENERATION = "text_generation" - TEXT_SUMMARIZATION = "text_summarization" - - -class LengthParam(StrEnum): - SHORT = "SHORT" - MEDIUM = "MEDIUM" - LONG = "LONG" - AUTO = "AUTO" - - -class FormatParam(StrEnum): - PARAGRAPH = "PARAGRAPH" - BULLETS = "BULLETS" - AUTO = "AUTO" - - -class ExtractivenessParam(StrEnum): - LOW = "LOW" - MEDIUM = "MEDIUM" - HIGH = "HIGH" - AUTO = "AUTO" diff --git a/ads/llm/langchain/plugins/llm_md.py b/ads/llm/langchain/plugins/llm_md.py deleted file mode 100644 index 90767a58c..000000000 --- a/ads/llm/langchain/plugins/llm_md.py +++ /dev/null @@ -1,316 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*-- - -# Copyright (c) 2023 Oracle and/or its affiliates. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ - -import logging -from typing import Any, Dict, List, Optional - -import requests -from langchain.callbacks.manager import CallbackManagerForLLMRun -from langchain.pydantic_v1 import root_validator -from langchain.utils import get_from_dict_or_env -from oci.auth import signers - -from ads.llm.langchain.plugins.base import BaseLLM -from ads.llm.langchain.plugins.contant import ( - DEFAULT_CONTENT_TYPE_JSON, - DEFAULT_TIME_OUT, -) - -logger = logging.getLogger(__name__) - - -class ModelDeploymentLLM(BaseLLM): - """Base class for LLM deployed on OCI Model Deployment.""" - - endpoint: str = "" - """The uri of the endpoint from the deployed Model Deployment model.""" - - best_of: int = 1 - """Generates best_of completions server-side and returns the "best" - (the one with the highest log probability per token). - """ - - @root_validator() - def validate_environment( # pylint: disable=no-self-argument - cls, values: Dict - ) -> Dict: - """Fetch endpoint from environment variable or arguments.""" - values["endpoint"] = get_from_dict_or_env( - values, - "endpoint", - "OCI_LLM_ENDPOINT", - ) - return values - - @property - def _default_params(self) -> Dict[str, Any]: - """Default parameters for the model.""" - raise NotImplementedError() - - @property - def _identifying_params(self) -> Dict[str, Any]: - """Get the identifying parameters.""" - return { - **{"endpoint": self.endpoint}, - **self._default_params, - } - - def _construct_json_body(self, prompt, params): - """Constructs the request body as a dictionary (JSON).""" - raise NotImplementedError - - def _invocation_params(self, stop: Optional[List[str]], **kwargs: Any) -> dict: - """Combines the invocation parameters with default parameters.""" - params = self._default_params - if self.stop is not None and stop is not None: - raise ValueError("`stop` found in both the input and default params.") - elif self.stop is not None: - params["stop"] = self.stop - elif stop is not None: - params["stop"] = stop - else: - # Don't set "stop" in param as None. It should be a list. - params["stop"] = [] - - return {**params, **kwargs} - - def _process_response(self, response_json: dict): - return response_json - - def _call( - self, - prompt: str, - stop: Optional[List[str]] = None, - run_manager: Optional[CallbackManagerForLLMRun] = None, - **kwargs: Any, - ) -> str: - """Call out to OCI Data Science Model Deployment endpoint. - - Parameters - ---------- - prompt (str): - The prompt to pass into the model. - stop (List[str], Optional): - List of stop words to use when generating. - - Returns - ------- - The string generated by the model. - - Example - ------- - - .. code-block:: python - - response = oci_md("Tell me a joke.") - - """ - params = self._invocation_params(stop, **kwargs) - body = self._construct_json_body(prompt, params) - self._print_request(prompt, params) - response = self.send_request(data=body, endpoint=self.endpoint) - completion = self._process_response(response) - self._print_response(completion, response) - return completion - - def send_request( - self, - data, - endpoint: str, - header: dict = None, - **kwargs, - ) -> Dict: - """Sends request to the model deployment endpoint. - - Parameters - ---------- - data (Json serializable): - data need to be sent to the endpoint. - endpoint (str): - The model HTTP endpoint. - header (dict, optional): - A dictionary of HTTP headers to send to the specified url. Defaults to {}. - - Raises - ------ - Exception: - Raise when invoking fails. - - Returns - ------- - A JSON representation of a requests.Response object. - """ - if not header: - header = {} - header["Content-Type"] = ( - header.pop("content_type", DEFAULT_CONTENT_TYPE_JSON) - or DEFAULT_CONTENT_TYPE_JSON - ) - timeout = kwargs.pop("timeout", DEFAULT_TIME_OUT) - request_kwargs = {"json": data} - request_kwargs["headers"] = header - signer = self.auth.get("signer") - - attempts = 0 - while attempts < 2: - request_kwargs["auth"] = signer - response = requests.post( - endpoint, timeout=timeout, **request_kwargs, **kwargs - ) - if response.status_code == 401 and self.is_principal_signer(signer): - signer.refresh_security_token() - attempts += 1 - continue - break - - try: - response.raise_for_status() - response_json = response.json() - - except Exception: - logger.error( - "DEBUG INFO: request_kwargs=%s, status_code=%s, content=%s", - request_kwargs, - response.status_code, - response.content, - ) - raise - - return response_json - - @staticmethod - def is_principal_signer(signer): - """Checks if the signer is instance principal or resource principal signer.""" - if ( - isinstance(signer, signers.InstancePrincipalsSecurityTokenSigner) - or isinstance(signer, signers.ResourcePrincipalsFederationSigner) - or isinstance(signer, signers.EphemeralResourcePrincipalSigner) - or isinstance(signer, signers.EphemeralResourcePrincipalV21Signer) - or isinstance(signer, signers.NestedResourcePrincipals) - or isinstance(signer, signers.OkeWorkloadIdentityResourcePrincipalSigner) - ): - return True - else: - return False - - -class ModelDeploymentTGI(ModelDeploymentLLM): - """OCI Data Science Model Deployment TGI Endpoint. - - Example - ------- - - .. code-block:: python - - from ads.llm import ModelDeploymentTGI - - oci_md = ModelDeploymentTGI(endpoint="") - - """ - - do_sample: bool = True - """if set to True, this parameter enables decoding strategies such as - multi-nominal sampling, beam-search multi-nominal sampling, Top-K sampling and Top-p sampling. - """ - - watermark = True - """Watermarking with `A Watermark for Large Language Models `_. - Defaults to True.""" - - return_full_text = False - """Whether to prepend the prompt to the generated text. Defaults to False.""" - - @property - def _llm_type(self) -> str: - """Return type of llm.""" - return "oci_model_deployment_tgi_endpoint" - - @property - def _default_params(self) -> Dict[str, Any]: - """Get the default parameters for invoking OCI model deployment TGI endpoint.""" - return { - "best_of": self.best_of, - "max_new_tokens": self.max_tokens, - "temperature": self.temperature, - "top_k": self.k - if self.k > 0 - else None, # `top_k` must be strictly positive' - "top_p": self.p, - "do_sample": self.do_sample, - "return_full_text": self.return_full_text, - "watermark": self.watermark, - } - - def _construct_json_body(self, prompt, params): - return { - "inputs": prompt, - "parameters": params, - } - - def _process_response(self, response_json: dict): - return str(response_json.get("generated_text", response_json)) - - -class ModelDeploymentVLLM(ModelDeploymentLLM): - """VLLM deployed on OCI Model Deployment""" - - model: str - """Name of the model.""" - - n: int = 1 - """Number of output sequences to return for the given prompt.""" - - k: int = -1 - """Number of most likely tokens to consider at each step.""" - - frequency_penalty: float = 0.0 - """Penalizes repeated tokens according to frequency. Between 0 and 1.""" - - presence_penalty: float = 0.0 - """Penalizes repeated tokens. Between 0 and 1.""" - - use_beam_search: bool = False - """Whether to use beam search instead of sampling.""" - - ignore_eos: bool = False - """Whether to ignore the EOS token and continue generating tokens after - the EOS token is generated.""" - - logprobs: Optional[int] = None - """Number of log probabilities to return per output token.""" - - @property - def _llm_type(self) -> str: - """Return type of llm.""" - return "oci_model_deployment_vllm_endpoint" - - @property - def _default_params(self) -> Dict[str, Any]: - """Get the default parameters for calling vllm.""" - return { - "n": self.n, - "best_of": self.best_of, - "max_tokens": self.max_tokens, - "top_k": self.k, - "top_p": self.p, - "temperature": self.temperature, - "presence_penalty": self.presence_penalty, - "frequency_penalty": self.frequency_penalty, - "stop": self.stop, - "ignore_eos": self.ignore_eos, - "use_beam_search": self.use_beam_search, - "logprobs": self.logprobs, - "model": self.model, - } - - def _construct_json_body(self, prompt, params): - return { - "prompt": prompt, - **params, - } - - def _process_response(self, response_json: dict): - return response_json["choices"][0]["text"] diff --git a/ads/llm/serialize.py b/ads/llm/serialize.py index 39d31b95d..968398650 100644 --- a/ads/llm/serialize.py +++ b/ads/llm/serialize.py @@ -21,7 +21,7 @@ from ads.common.auth import default_signer from ads.common.object_storage_details import ObjectStorageDetails -from ads.llm import GenerativeAI, ModelDeploymentTGI, ModelDeploymentVLLM +from ads.llm import OCIModelDeploymentVLLM, OCIModelDeploymentTGI from ads.llm.chain import GuardrailSequence from ads.llm.guardrails.base import CustomGuardrailBase from ads.llm.serializers.runnable_parallel import RunnableParallelSerializer @@ -29,9 +29,8 @@ # This is a temp solution for supporting custom LLM in legacy load_chain __lc_llm_dict = llms.get_type_to_cls_dict() -__lc_llm_dict[GenerativeAI.__name__] = lambda: GenerativeAI -__lc_llm_dict[ModelDeploymentTGI.__name__] = lambda: ModelDeploymentTGI -__lc_llm_dict[ModelDeploymentVLLM.__name__] = lambda: ModelDeploymentVLLM +__lc_llm_dict[OCIModelDeploymentTGI.__name__] = lambda: OCIModelDeploymentTGI +__lc_llm_dict[OCIModelDeploymentVLLM.__name__] = lambda: OCIModelDeploymentVLLM def __new_type_to_cls_dict(): From f37d14c1a71c31dc980ec9b6c82fe0708bed4bca Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 23 Sep 2024 11:43:09 -0400 Subject: [PATCH 05/26] Remove RetrievalQA from serialize. --- ads/llm/serialize.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ads/llm/serialize.py b/ads/llm/serialize.py index 968398650..52f2f15ba 100644 --- a/ads/llm/serialize.py +++ b/ads/llm/serialize.py @@ -12,7 +12,6 @@ import fsspec import yaml from langchain import llms -from langchain.chains import RetrievalQA from langchain.chains.loading import load_chain_from_config from langchain.llms import loading from langchain.load.load import Reviver @@ -46,7 +45,6 @@ def __new_type_to_cls_dict(): GuardrailSequence: GuardrailSequence.save, CustomGuardrailBase: CustomGuardrailBase.save, RunnableParallel: RunnableParallelSerializer.save, - RetrievalQA: RetrievalQASerializer.save, } # Mapping _type to custom deserialization functions From 96e16caa1b06a169533553a40dac0e88c1729bb7 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 23 Sep 2024 12:31:52 -0400 Subject: [PATCH 06/26] Update to support pydantic v2 --- ads/llm/guardrails/base.py | 5 +++-- ads/llm/guardrails/huggingface.py | 2 +- ads/llm/langchain/plugins/chat_models/oci_data_science.py | 7 +++---- .../llms/oci_data_science_model_deployment_endpoint.py | 3 +-- ads/llm/requirements.txt | 4 ++-- pyproject.toml | 2 +- 6 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ads/llm/guardrails/base.py b/ads/llm/guardrails/base.py index d1b86d11f..555503afc 100644 --- a/ads/llm/guardrails/base.py +++ b/ads/llm/guardrails/base.py @@ -14,7 +14,7 @@ from typing import Any, List, Dict, Tuple from langchain.schema.prompt import PromptValue from langchain.tools.base import BaseTool, ToolException -from langchain.pydantic_v1 import BaseModel, root_validator +from pydantic import BaseModel, model_validator class RunInfo(BaseModel): @@ -190,7 +190,8 @@ class Config: This is used by the ``apply_filter()`` method. """ - @root_validator + @model_validator(mode="before") + @classmethod def default_name(cls, values): """Sets the default name of the guardrail.""" if not values.get("name"): diff --git a/ads/llm/guardrails/huggingface.py b/ads/llm/guardrails/huggingface.py index bb260b480..2298ac493 100644 --- a/ads/llm/guardrails/huggingface.py +++ b/ads/llm/guardrails/huggingface.py @@ -6,7 +6,7 @@ import evaluate -from langchain.pydantic_v1 import root_validator +from pydantic.v1 import root_validator from .base import Guardrail diff --git a/ads/llm/langchain/plugins/chat_models/oci_data_science.py b/ads/llm/langchain/plugins/chat_models/oci_data_science.py index 251590a78..becdafcd9 100644 --- a/ads/llm/langchain/plugins/chat_models/oci_data_science.py +++ b/ads/llm/langchain/plugins/chat_models/oci_data_science.py @@ -29,7 +29,6 @@ PydanticOutputParser, ) from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult -from langchain_core.pydantic_v1 import BaseModel, Field from langchain_core.runnables import Runnable, RunnableMap, RunnablePassthrough from langchain_community.adapters.openai import ( @@ -37,7 +36,8 @@ convert_message_to_dict, ) from langchain_community.chat_models.openai import _convert_delta_to_message_chunk -from langchain_community.llms.oci_data_science_model_deployment_endpoint import ( +from pydantic import BaseModel, Field +from ads.llm.langchain.plugins.llms.oci_data_science_model_deployment_endpoint import ( DEFAULT_MODEL_NAME, BaseOCIModelDeployment, ) @@ -128,8 +128,7 @@ class ChatOCIModelDeployment(BaseChatModel, BaseOCIModelDeployment): .. code-block:: python from typing import Optional - - from langchain_core.pydantic_v1 import BaseModel, Field + from pydantic import BaseModel, Field class Joke(BaseModel): setup: str = Field(description="The setup of the joke") diff --git a/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py index cb67a5855..83cbb0556 100644 --- a/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py +++ b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py @@ -21,10 +21,9 @@ from langchain_core.language_models.llms import BaseLLM, create_base_retry_decorator from langchain_core.load.serializable import Serializable from langchain_core.outputs import Generation, GenerationChunk, LLMResult -from langchain_core.pydantic_v1 import Field from langchain_core.utils import get_from_dict_or_env, pre_init - from langchain_community.utilities.requests import Requests +from pydantic import Field logger = logging.getLogger(__name__) diff --git a/ads/llm/requirements.txt b/ads/llm/requirements.txt index f3d35a82c..4b4846fb8 100644 --- a/ads/llm/requirements.txt +++ b/ads/llm/requirements.txt @@ -1,3 +1,3 @@ -langchain>=0.0.295 -pydantic>=1.10.13,<3 +langchain>=0.3 +pydantic>=2,<3 typing-extensions>=4.2.0 diff --git a/pyproject.toml b/pyproject.toml index 8a711f482..cda8fc75d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -202,7 +202,7 @@ pii = [ "spacy==3.6.1", "report-creator==1.0.9", ] -llm = ["langchain-community<0.0.32", "langchain>=0.1.10,<0.1.14", "evaluate>=0.4.0"] +llm = ["langchain>=0.3", "pydantic>=2,<3", "evaluate>=0.4.0"] aqua = ["jupyter_server"] # To reduce backtracking (decrese deps install time) during test/dev env setup reducing number of versions pip is From 9dbbe5fad7e980407d41ab6622a65c2eae83ffd7 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 23 Sep 2024 13:51:26 -0400 Subject: [PATCH 07/26] Add copyrights. --- ads/llm/langchain/plugins/chat_models/__init__.py | 5 +++++ ads/llm/langchain/plugins/chat_models/oci_data_science.py | 7 +++++++ ads/llm/langchain/plugins/llms/__init__.py | 5 +++++ .../llms/oci_data_science_model_deployment_endpoint.py | 7 +++++++ 4 files changed, 24 insertions(+) diff --git a/ads/llm/langchain/plugins/chat_models/__init__.py b/ads/llm/langchain/plugins/chat_models/__init__.py index e69de29bb..b8d0460f5 100644 --- a/ads/llm/langchain/plugins/chat_models/__init__.py +++ b/ads/llm/langchain/plugins/chat_models/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/llm/langchain/plugins/chat_models/oci_data_science.py b/ads/llm/langchain/plugins/chat_models/oci_data_science.py index becdafcd9..9cc03f2b3 100644 --- a/ads/llm/langchain/plugins/chat_models/oci_data_science.py +++ b/ads/llm/langchain/plugins/chat_models/oci_data_science.py @@ -1,3 +1,10 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + import json import logging from operator import itemgetter diff --git a/ads/llm/langchain/plugins/llms/__init__.py b/ads/llm/langchain/plugins/llms/__init__.py index e69de29bb..b8d0460f5 100644 --- a/ads/llm/langchain/plugins/llms/__init__.py +++ b/ads/llm/langchain/plugins/llms/__init__.py @@ -0,0 +1,5 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ diff --git a/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py index 83cbb0556..4dce63dc8 100644 --- a/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py +++ b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py @@ -1,3 +1,10 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + import json import logging from typing import ( From d9721b9cc433c6426905db172d5f78e02d6ec2e7 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 23 Sep 2024 16:01:43 -0400 Subject: [PATCH 08/26] Add chat template. --- ads/llm/__init__.py | 1 + ads/llm/chat_template.py | 26 ++++ .../templates/tool_chat_template_hermes.jinja | 130 ++++++++++++++++++ .../tool_chat_template_mistral_parallel.jinja | 94 +++++++++++++ 4 files changed, 251 insertions(+) create mode 100644 ads/llm/chat_template.py create mode 100644 ads/llm/templates/tool_chat_template_hermes.jinja create mode 100644 ads/llm/templates/tool_chat_template_mistral_parallel.jinja diff --git a/ads/llm/__init__.py b/ads/llm/__init__.py index 8f5fb1370..b6e9bcab6 100644 --- a/ads/llm/__init__.py +++ b/ads/llm/__init__.py @@ -15,6 +15,7 @@ ChatOCIModelDeploymentVLLM, ChatOCIModelDeploymentTGI, ) + from ads.llm.chat_template import ChatTemplates except ImportError as ex: if ex.name == "langchain": raise ImportError( diff --git a/ads/llm/chat_template.py b/ads/llm/chat_template.py new file mode 100644 index 000000000..9cdd198f8 --- /dev/null +++ b/ads/llm/chat_template.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2023 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + + +import os + + +class ChatTemplates: + """Contains chat templates.""" + + @staticmethod + def _read_template(filename): + with open( + os.path.join(os.path.dirname(__file__), "templates", filename), + mode="r", + encoding="utf-8", + ) as f: + return f.read() + + @staticmethod + def mistral(): + """Chat template for auto tool calling with Mistral model deploy with vLLM.""" + return ChatTemplates._read_template("tool_chat_template_mistral_parallel.jinja") diff --git a/ads/llm/templates/tool_chat_template_hermes.jinja b/ads/llm/templates/tool_chat_template_hermes.jinja new file mode 100644 index 000000000..0b0902c8e --- /dev/null +++ b/ads/llm/templates/tool_chat_template_hermes.jinja @@ -0,0 +1,130 @@ +{%- macro json_to_python_type(json_spec) %} + {%- set basic_type_map = { + "string": "str", + "number": "float", + "integer": "int", + "boolean": "bool" +} %} + + {%- if basic_type_map[json_spec.type] is defined %} + {{- basic_type_map[json_spec.type] }} + {%- elif json_spec.type == "array" %} + {{- "list[" + json_to_python_type(json_spec|items) + "]" }} + {%- elif json_spec.type == "object" %} + {%- if json_spec.additionalProperties is defined %} + {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }} + {%- else %} + {{- "dict" }} + {%- endif %} + {%- elif json_spec.type is iterable %} + {{- "Union[" }} + {%- for t in json_spec.type %} + {{- json_to_python_type({"type": t}) }} + {%- if not loop.last %} + {{- "," }} + {%- endif %} + {%- endfor %} + {{- "]" }} + {%- else %} + {{- "Any" }} + {%- endif %} +{%- endmacro %} + + +{{- bos_token }} +{{- "<|im_start|>system\nYou are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: " }} +{%- if tools is iterable and tools | length > 0 %} + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- '{"type": "function", "function": ' }} + {{- '{"name": "' + tool.name + '", ' }} + {{- '"description": "' + tool.name + '(' }} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- param_name + ": " + json_to_python_type(param_fields) }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- if tool.return is defined %} + {{- " -> " + json_to_python_type(tool.return) }} + {%- endif %} + {{- " - " + tool.description + "\n\n" }} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {%- if loop.first %} + {{- " Args:\n" }} + {%- endif %} + {{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }} + {%- endfor %} + {%- if tool.return is defined and tool.return.description is defined %} + {{- "\n Returns:\n " + tool.return.description }} + {%- endif %} + {{- '"' }} + {{- ', "parameters": ' }} + {%- if tool.parameters.properties | length == 0 %} + {{- "{}" }} + {%- else %} + {{- tool.parameters|tojson }} + {%- endif %} + {{- "}" }} + {%- if not loop.last %} + {{- "\n" }} + {%- endif %} + {%- endfor %} +{%- endif %} +{{- " " }} +{{- 'Use the following pydantic model json schema for each tool call you will make: {"properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"], "title": "FunctionCall", "type": "object"}} +' }} +{{- "For each function call return a json object with function name and arguments within XML tags as follows: +" }} +{{- " +" }} +{{- '{"name": , "arguments": } +' }} +{{- '<|im_end|>' }} +{%- for message in messages %} + {%- if message.role == "user" or message.role == "system" or (message.role == "assistant" and message.tool_calls is not defined) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" and message.tool_calls is defined %} + {{- '<|im_start|>' + message.role }} + {%- for tool_call in message.tool_calls %} + {{- '\n\n' }} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '{' }} + {{- '"name": "' }} + {{- tool_call.name }} + {{- '"' }} + {%- if tool_call.arguments is defined %} + {{- ', ' }} + {{- '"arguments": ' }} + {{- tool_call.arguments|tojson }} + {%- endif %} + {{- '}' }} + {{- '\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>tool\n' }} + {%- endif %} + {{- '\n' }} + {{- message.content }} + {%- if not loop.last %} + {{- '\n\n' }} + {%- else %} + {{- '\n' }} + {%- endif %} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>' }} + {%- elif loop.last %} + {{- '<|im_end|>' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/ads/llm/templates/tool_chat_template_mistral_parallel.jinja b/ads/llm/templates/tool_chat_template_mistral_parallel.jinja new file mode 100644 index 000000000..a294cbfd0 --- /dev/null +++ b/ads/llm/templates/tool_chat_template_mistral_parallel.jinja @@ -0,0 +1,94 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} +{%- if tools is defined %} + {%- set parallel_tool_prompt = "You are a helpful assistant that can call tools. If you call one or more tools, format them in a single JSON array or objects, where each object is a tool call, not as separate objects outside of an array or multiple arrays. Use the format [{\"name\": tool call name, \"arguments\": tool call arguments}, additional tool calls] if you call more than one tool. If you call tools, do not attempt to interpret them or otherwise provide a response until you receive a tool call result that you can interpret for the user." %} + {%- if system_message is defined %} + {%- set system_message = parallel_tool_prompt + "\n\n" + system_message %} + {%- else %} + {%- set system_message = parallel_tool_prompt %} + {%- endif %} +{%- endif %} +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("role", "equalto", "tool_results") | selectattr("tool_calls", "undefined") %} + {%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} +{%- endfor %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if message["role"] == "user" %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- "[AVAILABLE_TOOLS] [" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- '{"type": "function", "function": {' }} + {%- for key, val in tool.items() if key != "return" %} + {%- if val is string %} + {{- '"' + key + '": "' + val + '"' }} + {%- else %} + {{- '"' + key + '": ' + val|tojson }} + {%- endif %} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- "}}" }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" }} + {%- endif %} + {%- endfor %} + {{- "[/AVAILABLE_TOOLS]" }} + {%- endif %} + {%- if loop.last and system_message is defined %} + {{- "[INST] " + system_message + "\n\n" + message["content"] + "[/INST]" }} + {%- else %} + {{- "[INST] " + message["content"] + "[/INST]" }} + {%- endif %} + {%- elif message["role"] == "tool_calls" or message.tool_calls is defined %} + {%- if message.tool_calls is defined %} + {%- set tool_calls = message.tool_calls %} + {%- else %} + {%- set tool_calls = message.content %} + {%- endif %} + {{- "[TOOL_CALLS] [" }} + {%- for tool_call in tool_calls %} + {%- set out = tool_call.function|tojson %} + {{- out[:-1] }} + {%- if not tool_call.id is defined or tool_call.id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (1)" + tool_call.id) }} + {%- endif %} + {{- ', "id": "' + tool_call.id[-9:] + '"}' }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" + eos_token }} + {%- endif %} + {%- endfor %} + {%- elif message["role"] == "assistant" %} + {{- " " + message["content"] + eos_token }} + {%- elif message["role"] == "tool_results" or message["role"] == "tool" %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }} + {%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (2)" + message.tool_call_id) }} + {%- endif %} + {{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} From 6d3ed3004f4e7026c21e1efa530fbc658ad3ed48 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 23 Sep 2024 16:03:17 -0400 Subject: [PATCH 09/26] Add tool calling for vLLM. --- .../plugins/chat_models/oci_data_science.py | 45 ++++++++++++++----- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/ads/llm/langchain/plugins/chat_models/oci_data_science.py b/ads/llm/langchain/plugins/chat_models/oci_data_science.py index 9cc03f2b3..fe4821e7e 100644 --- a/ads/llm/langchain/plugins/chat_models/oci_data_science.py +++ b/ads/llm/langchain/plugins/chat_models/oci_data_science.py @@ -18,6 +18,8 @@ Optional, Type, Union, + Sequence, + Callable, ) from langchain_core.callbacks import ( @@ -31,18 +33,20 @@ generate_from_stream, ) from langchain_core.messages import AIMessageChunk, BaseMessage, BaseMessageChunk +from langchain_core.tools import BaseTool from langchain_core.output_parsers import ( JsonOutputParser, PydanticOutputParser, ) from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult from langchain_core.runnables import Runnable, RunnableMap, RunnablePassthrough - -from langchain_community.adapters.openai import ( - convert_dict_to_message, - convert_message_to_dict, +from langchain_core.utils.function_calling import convert_to_openai_tool +from langchain_openai.chat_models.base import ( + _convert_delta_to_message_chunk, + _convert_message_to_dict, + _convert_dict_to_message, ) -from langchain_community.chat_models.openai import _convert_delta_to_message_chunk + from pydantic import BaseModel, Field from ads.llm.langchain.plugins.llms.oci_data_science_model_deployment_endpoint import ( DEFAULT_MODEL_NAME, @@ -154,7 +158,7 @@ class Joke(BaseModel): Customized Usage: - You can inherit from base class and overrwrite the `_process_response`, `_process_stream_response`, + You can inherit from base class and overwrite the `_process_response`, `_process_stream_response`, `_construct_json_body` for satisfying customized needed. .. code-block:: python @@ -550,7 +554,7 @@ def _construct_json_body(self, messages: list, params: dict) -> dict: """ return { - "messages": [convert_message_to_dict(m) for m in messages], + "messages": [_convert_message_to_dict(m) for m in messages], **params, } @@ -584,8 +588,6 @@ def _process_stream_response( "Error while formatting response payload for chat model of type" ) from e - print("***" + choice["delta"]) - chunk = _convert_delta_to_message_chunk(choice["delta"], default_chunk_cls) default_chunk_cls = chunk.__class__ finish_reason = choice.get("finish_reason") @@ -626,7 +628,7 @@ def _process_response(self, response_json: dict) -> ChatResult: ) from e for choice in choices: - message = convert_dict_to_message(choice["message"]) + message = _convert_dict_to_message(choice["message"]) generation_info = dict(finish_reason=choice.get("finish_reason")) if "logprobs" in choice: generation_info["logprobs"] = choice["logprobs"] @@ -645,6 +647,14 @@ def _process_response(self, response_json: dict) -> ChatResult: } return ChatResult(generations=generations, llm_output=llm_output) + def bind_tools( + self, + tools: Sequence[Union[Dict[str, Any], Type[BaseModel], Callable, BaseTool]], + **kwargs: Any, + ) -> Runnable[LanguageModelInput, BaseMessage]: + formatted_tools = [convert_to_openai_tool(tool) for tool in tools] + return super().bind(tools=formatted_tools, **kwargs) + class ChatOCIModelDeploymentVLLM(ChatOCIModelDeployment): """OCI large language chat models deployed with vLLM. @@ -748,6 +758,19 @@ class ChatOCIModelDeploymentVLLM(ChatOCIModelDeployment): """Whether to add spaces between special tokens in the output. Defaults to True.""" + tool_choice: Optional[str] = None + """Whether to use tool calling. + Defaults to None, tool calling is disabled. + Tool calling requires model support and vLLM to be configured with `--tool-call-parser`. + Set this to `auto` for the model to determine whether to make tool calls automatically. + Set this to `required` to force the model to always call one or more tools. + """ + + chat_template: Optional[str] = None + """Use customized chat template. + Defaults to None. The chat template from the tokenizer will be used. + """ + @property def _llm_type(self) -> str: """Return type of llm.""" @@ -794,6 +817,8 @@ def _get_model_params(self) -> List[str]: "top_k", "top_p", "use_beam_search", + "tool_choice", + "chat_template", ] From 845c5e4b3eb20dac2567fffb0f452c83a67a9ad3 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Tue, 24 Sep 2024 11:09:11 -0400 Subject: [PATCH 10/26] Add chat template for hermes model. --- ads/llm/chat_template.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ads/llm/chat_template.py b/ads/llm/chat_template.py index 9cdd198f8..0aa1831c0 100644 --- a/ads/llm/chat_template.py +++ b/ads/llm/chat_template.py @@ -24,3 +24,8 @@ def _read_template(filename): def mistral(): """Chat template for auto tool calling with Mistral model deploy with vLLM.""" return ChatTemplates._read_template("tool_chat_template_mistral_parallel.jinja") + + @staticmethod + def hermes(): + """Chat template for auto tool calling with Hermes model deploy with vLLM.""" + return ChatTemplates._read_template("tool_chat_template_hermes.jinja") From ce6d4e5ddfb1ae48c252300891d3fa26c3eaf4a3 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Wed, 25 Sep 2024 13:50:11 -0400 Subject: [PATCH 11/26] Update test for LangChain LLMs. --- .../test_oci_model_deployment_endpoint.py | 148 ++++++++++++++++++ .../with_extras/langchain/test_llm_plugins.py | 65 -------- 2 files changed, 148 insertions(+), 65 deletions(-) create mode 100644 tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py delete mode 100644 tests/unitary/with_extras/langchain/test_llm_plugins.py diff --git a/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py b/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py new file mode 100644 index 000000000..0b25869ab --- /dev/null +++ b/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py @@ -0,0 +1,148 @@ +"""Test OCI Data Science Model Deployment Endpoint.""" + +from unittest import mock +import pytest +from requests.exceptions import HTTPError +from ads.llm import OCIModelDeploymentTGI, OCIModelDeploymentVLLM + + +CONST_MODEL_NAME = "odsc-vllm" +CONST_ENDPOINT = "https://oci.endpoint/ocid/predict" +CONST_PROMPT_FOR_COMPLETION = "This is a prompt." +CONST_COMPLETION = "This is a completion." +CONST_COMPLETION_RESPONSE = { + "choices": [{"index": 0, "text": CONST_COMPLETION}], +} +CONST_COMPLETION_RESPONSE_TGI = {"generated_text": CONST_COMPLETION} +CONST_STREAM_TEMPLATE = ( + 'data: {"id":"","object":"text_completion","created":123456,' + + '"choices":[{"index":0,"text":"","finish_reason":""}]}' +) +CONST_STREAM_RESPONSE = ( + CONST_STREAM_TEMPLATE.replace("", " " + word).encode() + for word in CONST_COMPLETION.split(" ") +) + +CONST_ASYNC_STREAM_TEMPLATE = ( + '{"id":"","object":"text_completion","created":123456,' + + '"choices":[{"index":0,"text":"","finish_reason":""}]}' +) +CONST_ASYNC_STREAM_RESPONSE = ( + CONST_ASYNC_STREAM_TEMPLATE.replace("", " " + word).encode() + for word in CONST_COMPLETION.split(" ") +) + + +def mocked_requests_post(self, **kwargs): + """Method to mock post requests""" + + class MockResponse: + """Represents a mocked response.""" + + def __init__(self, json_data, status_code=200): + self.json_data = json_data + self.status_code = status_code + + def raise_for_status(self): + """Mocked raise for status.""" + if 400 <= self.status_code < 600: + raise HTTPError("", response=self) + + def json(self): + """Returns mocked json data.""" + return self.json_data + + def iter_lines(self, chunk_size=4096): + """Returns a generator of mocked streaming response.""" + return CONST_STREAM_RESPONSE + + @property + def text(self): + return "" + + payload = kwargs.get("json") + if "inputs" in payload: + prompt = payload.get("inputs") + is_tgi = True + else: + prompt = payload.get("prompt") + is_tgi = False + + if prompt == CONST_PROMPT_FOR_COMPLETION: + if is_tgi: + return MockResponse(json_data=CONST_COMPLETION_RESPONSE_TGI) + return MockResponse(json_data=CONST_COMPLETION_RESPONSE) + + return MockResponse( + json_data={}, + status_code=404, + ) + + +async def mocked_async_streaming_response(*args, **kwargs): + """Returns mocked response for async streaming.""" + for item in CONST_ASYNC_STREAM_RESPONSE: + yield item + + +@pytest.mark.requires("ads") +@mock.patch("ads.common.auth.default_signer", return_value=dict(signer=None)) +@mock.patch("requests.post", side_effect=mocked_requests_post) +def test_invoke_vllm(mock_post, mock_auth) -> None: + """Tests invoking vLLM endpoint.""" + llm = OCIModelDeploymentVLLM(endpoint=CONST_ENDPOINT, model=CONST_MODEL_NAME) + output = llm.invoke(CONST_PROMPT_FOR_COMPLETION) + assert output == CONST_COMPLETION + + +@pytest.mark.requires("ads") +@mock.patch("ads.common.auth.default_signer", return_value=dict(signer=None)) +@mock.patch("requests.post", side_effect=mocked_requests_post) +def test_stream_tgi(mock_post, mock_auth) -> None: + """Tests streaming with TGI endpoint using OpenAI spec.""" + llm = OCIModelDeploymentTGI( + endpoint=CONST_ENDPOINT, model=CONST_MODEL_NAME, streaming=True + ) + output = "" + count = 0 + for chunk in llm.stream(CONST_PROMPT_FOR_COMPLETION): + output += chunk + count += 1 + assert count == 4 + assert output.strip() == CONST_COMPLETION + + +@pytest.mark.requires("ads") +@mock.patch("ads.common.auth.default_signer", return_value=dict(signer=None)) +@mock.patch("requests.post", side_effect=mocked_requests_post) +def test_generate_tgi(mock_post, mock_auth) -> None: + """Tests invoking TGI endpoint using TGI generate spec.""" + llm = OCIModelDeploymentTGI( + endpoint=CONST_ENDPOINT, api="/generate", model=CONST_MODEL_NAME + ) + output = llm.invoke(CONST_PROMPT_FOR_COMPLETION) + assert output == CONST_COMPLETION + + +@pytest.mark.asyncio +@pytest.mark.requires("ads") +@mock.patch( + "ads.common.auth.default_signer", return_value=dict(signer=mock.MagicMock()) +) +@mock.patch( + "langchain_community.utilities.requests.Requests.apost", + mock.MagicMock(), +) +async def test_stream_async(mock_auth): + """Tests async streaming.""" + llm = OCIModelDeploymentTGI( + endpoint=CONST_ENDPOINT, model=CONST_MODEL_NAME, streaming=True + ) + with mock.patch.object( + llm, + "_aiter_sse", + mock.MagicMock(return_value=mocked_async_streaming_response()), + ): + + chunks = [chunk async for chunk in llm.astream(CONST_PROMPT_FOR_COMPLETION)] + assert "".join(chunks).strip() == CONST_COMPLETION diff --git a/tests/unitary/with_extras/langchain/test_llm_plugins.py b/tests/unitary/with_extras/langchain/test_llm_plugins.py deleted file mode 100644 index 3c21e0e8c..000000000 --- a/tests/unitary/with_extras/langchain/test_llm_plugins.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*-- - -# Copyright (c) 2023 Oracle and/or its affiliates. -# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ - -import pytest -import unittest -from unittest.mock import patch - -from ads.llm import ModelDeploymentTGI -from oci.signer import Signer - - -class LangChainPluginsTest(unittest.TestCase): - mock_endpoint = "https://mock_endpoint/predict" - - def mocked_requests_post(endpoint, headers, json, auth, **kwargs): - class MockResponse: - def __init__(self, json_data, status_code): - self.json_data = json_data - self.status_code = status_code - - def json(self): - return self.json_data - - def raise_for_status(self): - pass - - assert endpoint.startswith("https://") - assert json - assert headers - prompt = json.get("inputs") - assert prompt and isinstance(prompt, str) - completion = "ads" if "who" in prompt else "Unknown" - assert auth - assert isinstance(auth, Signer) - - return MockResponse( - json_data={"generated_text": completion}, - status_code=200, - ) - - def test_oci_model_deployment_model_param(self): - llm = ModelDeploymentTGI(endpoint=self.mock_endpoint, temperature=0.9) - model_params_keys = [ - "best_of", - "max_new_tokens", - "temperature", - "top_k", - "top_p", - "do_sample", - "return_full_text", - "watermark", - ] - assert llm.endpoint == self.mock_endpoint - assert all(key in llm._default_params for key in model_params_keys) - assert llm.temperature == 0.9 - - @patch("requests.post", mocked_requests_post) - def test_oci_model_deployment_call(self): - llm = ModelDeploymentTGI(endpoint=self.mock_endpoint) - response = llm("who am i") - completion = "ads" - assert response == completion From 3d9ecfd9f108c241c137141d2600fe18f99dc873 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Wed, 25 Sep 2024 14:01:09 -0400 Subject: [PATCH 12/26] Update tests. --- ..._data_science_model_deployment_endpoint.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py index 4dce63dc8..d9fc1c3ac 100644 --- a/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py +++ b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py @@ -21,6 +21,7 @@ import aiohttp import requests +import traceback from langchain_core.callbacks import ( AsyncCallbackManagerForLLMRun, CallbackManagerForLLMRun, @@ -175,6 +176,7 @@ def _completion_with_retry(**kwargs: Any) -> Any: except TokenExpiredError as e: raise e except Exception as err: + traceback.print_exc() logger.debug( f"Requests payload: {data}. Requests arguments: " f"url={self.endpoint},timeout={request_timeout},stream={stream}. " @@ -221,6 +223,7 @@ async def _completion_with_retry(**kwargs: Any) -> Any: except TokenExpiredError as e: raise e except Exception as err: + traceback.print_exc() logger.debug( f"Requests payload: `{data}`. " f"Stream mode={stream}. " @@ -272,6 +275,7 @@ def _parse_stream(self, lines: Iterator[bytes]) -> Iterator[str]: An iterator that yields parsed lines as strings. """ for line in lines: + print("***" + str(line)) _line = self._parse_stream_line(line) if _line is not None: yield _line @@ -307,13 +311,16 @@ def _parse_stream_line(self, line: bytes) -> Optional[str]: The processed line as a string if valid, otherwise `None`. """ line = line.strip() - if line: - _line = line.decode("utf-8") - if "[DONE]" in _line: - return None + if not line: + return None + _line = line.decode("utf-8") + + if _line.lower().startswith("data:"): + _line = _line[5:].lstrip() - if _line.lower().startswith("data:"): - return _line[5:].lstrip() + if _line.startswith("[DONE]"): + return None + return _line return None async def _aiter_sse( @@ -589,11 +596,11 @@ def _stream( response = self.completion_with_retry( data=body, run_manager=run_manager, stream=True, **requests_kwargs ) - for line in self._parse_stream(response.iter_lines()): chunk = self._handle_sse_line(line) if run_manager: run_manager.on_llm_new_token(chunk.text, chunk=chunk) + yield chunk async def _astream( @@ -751,7 +758,7 @@ class OCIModelDeploymentTGI(OCIModelDeploymentLLM): """ - api: Literal["/generate", "/v1/completions"] = "/generate" + api: Literal["/generate", "/v1/completions"] = "/v1/completions" """Api spec.""" frequency_penalty: float = 0.0 From bee6a927de2227bc1c4680297736b725b85b24e6 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 12:43:08 -0400 Subject: [PATCH 13/26] Add tests for chat model. --- .../chat_models/test_oci_data_science.py | 169 ++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py diff --git a/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py b/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py new file mode 100644 index 000000000..34ca4aa70 --- /dev/null +++ b/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py @@ -0,0 +1,169 @@ +"""Test OCI Data Science Model Deployment Endpoint.""" + +from unittest import mock +import pytest +from langchain_core.messages import AIMessage, AIMessageChunk +from requests.exceptions import HTTPError +from ads.llm import ChatOCIModelDeploymentVLLM, ChatOCIModelDeploymentTGI + + +CONST_MODEL_NAME = "odsc-vllm" +CONST_ENDPOINT = "https://oci.endpoint/ocid/predict" +CONST_PROMPT = "This is a prompt." +CONST_COMPLETION = "This is a completion." +CONST_COMPLETION_RESPONSE = { + "id": "chat-123456789", + "object": "chat.completion", + "created": 123456789, + "model": "mistral", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": CONST_COMPLETION, + "tool_calls": [], + }, + "logprobs": None, + "finish_reason": "length", + "stop_reason": None, + } + ], + "usage": {"prompt_tokens": 115, "total_tokens": 371, "completion_tokens": 256}, + "prompt_logprobs": None, +} +CONST_COMPLETION_RESPONSE_TGI = {"generated_text": CONST_COMPLETION} +CONST_STREAM_TEMPLATE = ( + 'data: {"id":"chat-123456","object":"chat.completion.chunk","created":123456789,' + '"model":"odsc-llm","choices":[{"index":0,"delta":,"finish_reason":null}]}' +) +CONST_STREAM_DELTAS = ['{"role":"assistant","content":""}'] + [ + '{"content":" ' + word + '"}' for word in CONST_COMPLETION.split(" ") +] +CONST_STREAM_RESPONSE = ( + content + for content in [ + CONST_STREAM_TEMPLATE.replace("", delta).encode() + for delta in CONST_STREAM_DELTAS + ] + + [b"data: [DONE]"] +) + +CONST_ASYNC_STREAM_TEMPLATE = ( + '{"id":"chat-123456","object":"chat.completion.chunk","created":123456789,' + '"model":"odsc-llm","choices":[{"index":0,"delta":,"finish_reason":null}]}' +) +CONST_ASYNC_STREAM_RESPONSE = ( + CONST_ASYNC_STREAM_TEMPLATE.replace("", delta).encode() + for delta in CONST_STREAM_DELTAS +) + + +def mocked_requests_post(self, **kwargs): + """Method to mock post requests""" + + class MockResponse: + """Represents a mocked response.""" + + def __init__(self, json_data, status_code=200): + self.json_data = json_data + self.status_code = status_code + + def raise_for_status(self): + """Mocked raise for status.""" + if 400 <= self.status_code < 600: + raise HTTPError("", response=self) + + def json(self): + """Returns mocked json data.""" + return self.json_data + + def iter_lines(self, chunk_size=4096): + """Returns a generator of mocked streaming response.""" + return CONST_STREAM_RESPONSE + + @property + def text(self): + return "" + + payload = kwargs.get("json") + messages = payload.get("messages") + prompt = messages[0].get("content") + + if prompt == CONST_PROMPT: + return MockResponse(json_data=CONST_COMPLETION_RESPONSE) + + return MockResponse( + json_data={}, + status_code=404, + ) + + +@pytest.mark.requires("ads") +@mock.patch("ads.common.auth.default_signer", return_value=dict(signer=None)) +@mock.patch("requests.post", side_effect=mocked_requests_post) +def test_invoke_vllm(mock_post, mock_auth) -> None: + """Tests invoking vLLM endpoint.""" + llm = ChatOCIModelDeploymentVLLM(endpoint=CONST_ENDPOINT, model=CONST_MODEL_NAME) + output = llm.invoke(CONST_PROMPT) + assert isinstance(output, AIMessage) + assert output.content == CONST_COMPLETION + + +@pytest.mark.requires("ads") +@mock.patch("ads.common.auth.default_signer", return_value=dict(signer=None)) +@mock.patch("requests.post", side_effect=mocked_requests_post) +def test_invoke_tgi(mock_post, mock_auth) -> None: + """Tests invoking TGI endpoint using OpenAI Spec.""" + llm = ChatOCIModelDeploymentTGI(endpoint=CONST_ENDPOINT, model=CONST_MODEL_NAME) + output = llm.invoke(CONST_PROMPT) + assert isinstance(output, AIMessage) + assert output.content == CONST_COMPLETION + + +@pytest.mark.requires("ads") +@mock.patch("ads.common.auth.default_signer", return_value=dict(signer=None)) +@mock.patch("requests.post", side_effect=mocked_requests_post) +def test_stream_vllm(mock_post, mock_auth) -> None: + """Tests streaming with vLLM endpoint using OpenAI spec.""" + llm = ChatOCIModelDeploymentVLLM( + endpoint=CONST_ENDPOINT, model=CONST_MODEL_NAME, streaming=True + ) + output = AIMessageChunk("") + count = 0 + for chunk in llm.stream(CONST_PROMPT): + assert isinstance(chunk, AIMessageChunk) + output += chunk + count += 1 + assert count == 5 + assert output.content.strip() == CONST_COMPLETION + + +async def mocked_async_streaming_response(*args, **kwargs): + """Returns mocked response for async streaming.""" + for item in CONST_ASYNC_STREAM_RESPONSE: + yield item + + +@pytest.mark.asyncio +@pytest.mark.requires("ads") +@mock.patch( + "ads.common.auth.default_signer", return_value=dict(signer=mock.MagicMock()) +) +@mock.patch( + "langchain_community.utilities.requests.Requests.apost", + mock.MagicMock(), +) +async def test_stream_async(mock_auth): + """Tests async streaming.""" + llm = ChatOCIModelDeploymentVLLM( + endpoint=CONST_ENDPOINT, model=CONST_MODEL_NAME, streaming=True + ) + with mock.patch.object( + llm, + "_aiter_sse", + mock.MagicMock(return_value=mocked_async_streaming_response()), + ): + + chunks = [chunk.content async for chunk in llm.astream(CONST_PROMPT)] + assert "".join(chunks).strip() == CONST_COMPLETION From 2e2ee0e2fcbca921a6aa43960697bd23e078a531 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 12:43:32 -0400 Subject: [PATCH 14/26] Update tests. --- .../langchain/chat_models/__init__.py | 0 .../with_extras/langchain/llms/__init__.py | 0 .../test_oci_model_deployment_endpoint.py | 21 ++++++++++++------- 3 files changed, 14 insertions(+), 7 deletions(-) create mode 100644 tests/unitary/with_extras/langchain/chat_models/__init__.py create mode 100644 tests/unitary/with_extras/langchain/llms/__init__.py diff --git a/tests/unitary/with_extras/langchain/chat_models/__init__.py b/tests/unitary/with_extras/langchain/chat_models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unitary/with_extras/langchain/llms/__init__.py b/tests/unitary/with_extras/langchain/llms/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py b/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py index 0b25869ab..784bdaab9 100644 --- a/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py +++ b/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py @@ -8,10 +8,17 @@ CONST_MODEL_NAME = "odsc-vllm" CONST_ENDPOINT = "https://oci.endpoint/ocid/predict" -CONST_PROMPT_FOR_COMPLETION = "This is a prompt." +CONST_PROMPT = "This is a prompt." CONST_COMPLETION = "This is a completion." CONST_COMPLETION_RESPONSE = { - "choices": [{"index": 0, "text": CONST_COMPLETION}], + "choices": [ + { + "index": 0, + "text": CONST_COMPLETION, + "logprobs": 0.1, + "finish_reason": "length", + } + ], } CONST_COMPLETION_RESPONSE_TGI = {"generated_text": CONST_COMPLETION} CONST_STREAM_TEMPLATE = ( @@ -68,7 +75,7 @@ def text(self): prompt = payload.get("prompt") is_tgi = False - if prompt == CONST_PROMPT_FOR_COMPLETION: + if prompt == CONST_PROMPT: if is_tgi: return MockResponse(json_data=CONST_COMPLETION_RESPONSE_TGI) return MockResponse(json_data=CONST_COMPLETION_RESPONSE) @@ -91,7 +98,7 @@ async def mocked_async_streaming_response(*args, **kwargs): def test_invoke_vllm(mock_post, mock_auth) -> None: """Tests invoking vLLM endpoint.""" llm = OCIModelDeploymentVLLM(endpoint=CONST_ENDPOINT, model=CONST_MODEL_NAME) - output = llm.invoke(CONST_PROMPT_FOR_COMPLETION) + output = llm.invoke(CONST_PROMPT) assert output == CONST_COMPLETION @@ -105,7 +112,7 @@ def test_stream_tgi(mock_post, mock_auth) -> None: ) output = "" count = 0 - for chunk in llm.stream(CONST_PROMPT_FOR_COMPLETION): + for chunk in llm.stream(CONST_PROMPT): output += chunk count += 1 assert count == 4 @@ -120,7 +127,7 @@ def test_generate_tgi(mock_post, mock_auth) -> None: llm = OCIModelDeploymentTGI( endpoint=CONST_ENDPOINT, api="/generate", model=CONST_MODEL_NAME ) - output = llm.invoke(CONST_PROMPT_FOR_COMPLETION) + output = llm.invoke(CONST_PROMPT) assert output == CONST_COMPLETION @@ -144,5 +151,5 @@ async def test_stream_async(mock_auth): mock.MagicMock(return_value=mocked_async_streaming_response()), ): - chunks = [chunk async for chunk in llm.astream(CONST_PROMPT_FOR_COMPLETION)] + chunks = [chunk async for chunk in llm.astream(CONST_PROMPT)] assert "".join(chunks).strip() == CONST_COMPLETION From 3eca98a28ccaf8ea1e9fb7874ecff0424319137b Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 12:43:50 -0400 Subject: [PATCH 15/26] Remove extra print statements. --- ads/llm/langchain/plugins/chat_models/oci_data_science.py | 1 - .../plugins/llms/oci_data_science_model_deployment_endpoint.py | 1 - 2 files changed, 2 deletions(-) diff --git a/ads/llm/langchain/plugins/chat_models/oci_data_science.py b/ads/llm/langchain/plugins/chat_models/oci_data_science.py index fe4821e7e..89d812b6e 100644 --- a/ads/llm/langchain/plugins/chat_models/oci_data_science.py +++ b/ads/llm/langchain/plugins/chat_models/oci_data_science.py @@ -319,7 +319,6 @@ def _stream( ) default_chunk_class = AIMessageChunk for line in self._parse_stream(response.iter_lines()): - # print(">>>" + str(line)) chunk = self._handle_sse_line(line, default_chunk_class) if run_manager: run_manager.on_llm_new_token(chunk.text, chunk=chunk) diff --git a/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py index d9fc1c3ac..134266644 100644 --- a/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py +++ b/ads/llm/langchain/plugins/llms/oci_data_science_model_deployment_endpoint.py @@ -275,7 +275,6 @@ def _parse_stream(self, lines: Iterator[bytes]) -> Iterator[str]: An iterator that yields parsed lines as strings. """ for line in lines: - print("***" + str(line)) _line = self._parse_stream_line(line) if _line is not None: yield _line From f6a5248191f11f63681b9c554114f7080b2fdad0 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 12:44:37 -0400 Subject: [PATCH 16/26] Skip tests that needs to be updated. --- .../with_extras/langchain/test_deploy.py | 2 + .../langchain/test_serialization.py | 69 +++---------------- .../with_extras/langchain/test_serializers.py | 6 ++ 3 files changed, 18 insertions(+), 59 deletions(-) diff --git a/tests/unitary/with_extras/langchain/test_deploy.py b/tests/unitary/with_extras/langchain/test_deploy.py index 6af80066e..8441e8f5c 100644 --- a/tests/unitary/with_extras/langchain/test_deploy.py +++ b/tests/unitary/with_extras/langchain/test_deploy.py @@ -2,6 +2,8 @@ import tempfile from unittest.mock import MagicMock, patch import pytest +pytest.skip(allow_module_level=True) +# TODO: Tests need to be updated from ads.llm.deploy import ChainDeployment diff --git a/tests/unitary/with_extras/langchain/test_serialization.py b/tests/unitary/with_extras/langchain/test_serialization.py index 831be24b2..d9284a838 100644 --- a/tests/unitary/with_extras/langchain/test_serialization.py +++ b/tests/unitary/with_extras/langchain/test_serialization.py @@ -6,9 +6,14 @@ import os -from copy import deepcopy + from unittest import SkipTest, TestCase, mock, skipIf +import pytest + +pytest.skip(allow_module_level=True) +# TODO: Tests need to be updated + import langchain_core from langchain.chains import LLMChain from langchain.llms import Cohere @@ -16,10 +21,8 @@ from langchain.schema.runnable import RunnableParallel, RunnablePassthrough from ads.llm import ( - GenerativeAI, - GenerativeAIEmbeddings, - ModelDeploymentTGI, - ModelDeploymentVLLM, + OCIModelDeploymentTGI, + OCIModelDeploymentVLLM, ) from ads.llm.serialize import dump, load @@ -132,63 +135,11 @@ def test_llm_chain_serialization_with_cohere(self): self.assertIsInstance(llm_chain.llm, Cohere) self.assertEqual(llm_chain.input_keys, ["subject"]) - def test_llm_chain_serialization_with_oci(self): - """Tests serialization of LLMChain with OCI Gen AI.""" - llm = ModelDeploymentVLLM(endpoint=self.ENDPOINT, model="my_model") - template = PromptTemplate.from_template(self.PROMPT_TEMPLATE) - llm_chain = LLMChain(prompt=template, llm=llm) - serialized = dump(llm_chain) - llm_chain = load(serialized) - self.assertIsInstance(llm_chain, LLMChain) - self.assertIsInstance(llm_chain.prompt, PromptTemplate) - self.assertEqual(llm_chain.prompt.template, self.PROMPT_TEMPLATE) - self.assertIsInstance(llm_chain.llm, ModelDeploymentVLLM) - self.assertEqual(llm_chain.llm.endpoint, self.ENDPOINT) - self.assertEqual(llm_chain.llm.model, "my_model") - self.assertEqual(llm_chain.input_keys, ["subject"]) - - @skipIf( - version_tuple(langchain_core.__version__) > (0, 1, 50), - "Serialization not supported in this langchain_core version", - ) - def test_oci_gen_ai_serialization(self): - """Tests serialization of OCI Gen AI LLM.""" - try: - llm = GenerativeAI( - compartment_id=self.COMPARTMENT_ID, - client_kwargs=self.GEN_AI_KWARGS, - ) - except ImportError as ex: - raise SkipTest("OCI SDK does not support Generative AI.") from ex - serialized = dump(llm) - llm = load(serialized) - self.assertIsInstance(llm, GenerativeAI) - self.assertEqual(llm.compartment_id, self.COMPARTMENT_ID) - self.assertEqual(llm.client_kwargs, self.GEN_AI_KWARGS) - - @skipIf( - version_tuple(langchain_core.__version__) > (0, 1, 50), - "Serialization not supported in this langchain_core version", - ) - def test_gen_ai_embeddings_serialization(self): - """Tests serialization of OCI Gen AI embeddings.""" - try: - embeddings = GenerativeAIEmbeddings( - compartment_id=self.COMPARTMENT_ID, client_kwargs=self.GEN_AI_KWARGS - ) - except ImportError as ex: - raise SkipTest("OCI SDK does not support Generative AI.") from ex - serialized = dump(embeddings) - self.assertEqual(serialized, self.EXPECTED_GEN_AI_EMBEDDINGS) - embeddings = load(serialized) - self.assertIsInstance(embeddings, GenerativeAIEmbeddings) - self.assertEqual(embeddings.compartment_id, self.COMPARTMENT_ID) - def test_runnable_sequence_serialization(self): """Tests serialization of runnable sequence.""" map_input = RunnableParallel(text=RunnablePassthrough()) template = PromptTemplate.from_template(self.PROMPT_TEMPLATE) - llm = ModelDeploymentTGI(endpoint=self.ENDPOINT) + llm = OCIModelDeploymentTGI(endpoint=self.ENDPOINT) chain = map_input | template | llm serialized = dump(chain) @@ -244,5 +195,5 @@ def test_runnable_sequence_serialization(self): [], ) self.assertIsInstance(chain.steps[1], PromptTemplate) - self.assertIsInstance(chain.steps[2], ModelDeploymentTGI) + self.assertIsInstance(chain.steps[2], OCIModelDeploymentTGI) self.assertEqual(chain.steps[2].endpoint, self.ENDPOINT) diff --git a/tests/unitary/with_extras/langchain/test_serializers.py b/tests/unitary/with_extras/langchain/test_serializers.py index 0ee07dd59..b9c29ecca 100644 --- a/tests/unitary/with_extras/langchain/test_serializers.py +++ b/tests/unitary/with_extras/langchain/test_serializers.py @@ -9,6 +9,12 @@ import unittest from unittest import mock from typing import List + +import pytest + +pytest.skip(allow_module_level=True) +# TODO: Tests need to be updated + from langchain.load.serializable import Serializable from langchain.schema.embeddings import Embeddings from langchain.vectorstores import OpenSearchVectorSearch, FAISS From a423f7561e9eee2e211f555a71fad0e92c79e95d Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 12:50:06 -0400 Subject: [PATCH 17/26] Update dependency and licenses. --- THIRD_PARTY_LICENSES.txt | 12 ++++++++++++ pyproject.toml | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/THIRD_PARTY_LICENSES.txt b/THIRD_PARTY_LICENSES.txt index 197066ec8..9b138fbea 100644 --- a/THIRD_PARTY_LICENSES.txt +++ b/THIRD_PARTY_LICENSES.txt @@ -157,6 +157,18 @@ langchain * Source code: https://github.com/langchain-ai/langchain * Project home: https://www.langchain.com/ +langchain-community +* Copyright (c) 2023 LangChain, Inc. +* License: MIT license +* Source code: https://github.com/langchain-ai/langchain/tree/master/libs/community +* Project home: https://github.com/langchain-ai/langchain/tree/master/libs/community + +langchain-openai +* Copyright (c) 2023 LangChain, Inc. +* License: MIT license +* Source code: https://github.com/langchain-ai/langchain/tree/master/libs/partners/openai +* Project home: https://github.com/langchain-ai/langchain/tree/master/libs/partners/openai + lightgbm * Copyright (c) 2023 Microsoft Corporation * License: MIT license diff --git a/pyproject.toml b/pyproject.toml index cda8fc75d..f5033b7dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -202,7 +202,7 @@ pii = [ "spacy==3.6.1", "report-creator==1.0.9", ] -llm = ["langchain>=0.3", "pydantic>=2,<3", "evaluate>=0.4.0"] +llm = ["langchain>=0.3", "langchain-community", "pydantic>=2,<3", "evaluate>=0.4.0"] aqua = ["jupyter_server"] # To reduce backtracking (decrese deps install time) during test/dev env setup reducing number of versions pip is From 56bc9a71d26606772170d1a88d7b3e2a9014fe1e Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 12:56:12 -0400 Subject: [PATCH 18/26] Update dependency to langchain>=0.2 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index f5033b7dd..cef84a753 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -202,7 +202,7 @@ pii = [ "spacy==3.6.1", "report-creator==1.0.9", ] -llm = ["langchain>=0.3", "langchain-community", "pydantic>=2,<3", "evaluate>=0.4.0"] +llm = ["langchain>=0.2", "langchain-community", "pydantic>=2,<3", "evaluate>=0.4.0"] aqua = ["jupyter_server"] # To reduce backtracking (decrese deps install time) during test/dev env setup reducing number of versions pip is From 5a32624355d94df30573fd081df00dbb624c9acb Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 13:29:01 -0400 Subject: [PATCH 19/26] Add langchain_openai as dependency for LangChain integration. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index cef84a753..dbcb3cf10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -202,7 +202,7 @@ pii = [ "spacy==3.6.1", "report-creator==1.0.9", ] -llm = ["langchain>=0.2", "langchain-community", "pydantic>=2,<3", "evaluate>=0.4.0"] +llm = ["langchain>=0.2", "langchain-community", "langchain_openai", "pydantic>=2,<3", "evaluate>=0.4.0"] aqua = ["jupyter_server"] # To reduce backtracking (decrese deps install time) during test/dev env setup reducing number of versions pip is From 777400ef09b96fa779e56287d852e04e7e3d30a2 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 16:37:28 -0400 Subject: [PATCH 20/26] Update docs for LangChain. --- .../large_language_model/langchain_models.rst | 209 ++++++++++-------- 1 file changed, 117 insertions(+), 92 deletions(-) diff --git a/docs/source/user_guide/large_language_model/langchain_models.rst b/docs/source/user_guide/large_language_model/langchain_models.rst index 4a262a9dc..6e4f96cda 100644 --- a/docs/source/user_guide/large_language_model/langchain_models.rst +++ b/docs/source/user_guide/large_language_model/langchain_models.rst @@ -1,142 +1,167 @@ LangChain Integration ********************* -.. versionadded:: 2.9.1 +.. versionadded:: 2.11.19 -LangChain compatible models/interfaces are needed for LangChain applications to invoke OCI generative AI service or LLMs deployed on OCI data science model deployment service. +LangChain compatible models/interfaces are needed for LangChain applications to invoke LLMs deployed on OCI data science model deployment service. -.. admonition:: Preview Feature +.. admonition:: LangChain Community :class: note - While the official integration of OCI and LangChain will be added to the LangChain library, ADS provides a preview version of the integration. - It it important to note that the APIs of the preview version may change in the future. + Integrations from ADS may provide additional or experimental features in the latest updates, while the stable integrations (such as ``OCIModelDeploymentVLLM`` and ``OCIModelDeploymentTGI``) are also available from `LangChain Community `_. -Integration with Generative AI -============================== +If you deploy LLM on OCI model deployment service using `AI Quick Actions `_ or `HuggingFace TGI `_ , you can use the integration models described in this page to build your application with LangChain. -The `OCI Generative AI service `_ provide text generation, summarization and embedding models. +Authentication +============== -To use the text generation model as LLM in LangChain: +By default, the integration uses the same authentication method configured with ``ads.set_auth()``. Optionally, you can also pass the ``auth`` keyword argument when initializing the model to use specific authentication method for the model. For example, to use resource principal for all OCI authentication: .. code-block:: python3 - from ads.llm import GenerativeAI - - llm = GenerativeAI( - compartment_id="", - # Optionally you can specify keyword arguments for the OCI client, e.g. service_endpoint. - client_kwargs={ - "service_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com" - }, + import ads + from ads.llm import ChatOCIModelDeploymentVLLM + + ads.set_auth(auth="resource_principal") + + llm = ChatOCIModelDeploymentVLLM( + model="odsc-llm", + endpoint= f"https://modeldeployment.oci.customer-oci.com//predict", + # Optionally you can specify additional keyword arguments for the model, e.g. temperature. + temperature=0.1, ) - llm.invoke("Translate the following sentence into French:\nHow are you?\n") - -Here is an example of using prompt template and OCI generative AI LLM to build a translation app: +Alternatively, you may use specific authentication for the model: .. code-block:: python3 - from langchain.prompts import PromptTemplate - from langchain.schema.runnable import RunnableParallel, RunnablePassthrough - from ads.llm import GenerativeAI - - # Map the input into a dictionary - map_input = RunnableParallel(text=RunnablePassthrough()) - # Template for the input text. - template = PromptTemplate.from_template( - "Translate English into French. Do not ask any questions.\nEnglish: Hello!\nFrench: " - ) - llm = GenerativeAI( - compartment_id="", - # Optionally you can specify keyword arguments for the OCI client, e.g. service_endpoint. - client_kwargs={ - "service_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com" - }, - ) + import ads + from ads.llm import ChatOCIModelDeploymentVLLM - # Build the app as a chain - translation_app = map_input | template | llm + llm = ChatOCIModelDeploymentVLLM( + model="odsc-llm", + endpoint= f"https://modeldeployment.oci.customer-oci.com//predict", + # Use security token authentication for the model + auth=ads.auth.security_token(profile="my_profile"), + # Optionally you can specify additional keyword arguments for the model, e.g. temperature. + temperature=0.1, + ) - # Now you have a translation app. - translation_app.invoke("Hello!") - # "Bonjour!" +Completion Models +================= -Similarly, you can use the embedding model: +Completion models takes a text string and input and returns a string with completions. To use completion models, your model should be deployed with the completion endpoint (``/v1/completions``). The following example shows how you can use the ``OCIModelDeploymentVLLM`` class for model deployed with vLLM container. If you deployed the model with TGI container, you can use ``OCIModelDeploymentTGI`` similarly. .. code-block:: python3 - from ads.llm import GenerativeAIEmbeddings + from ads.llm import OCIModelDeploymentVLLM - embed = GenerativeAIEmbeddings( - compartment_id="", - # Optionally you can specify keyword arguments for the OCI client, e.g. service_endpoint. - client_kwargs={ - "service_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com" - }, + llm = OCIModelDeploymentVLLM( + model="odsc-llm", + endpoint= f"https://modeldeployment.oci.customer-oci.com//predict", + # Optionally you can specify additional keyword arguments for the model. + max_tokens=32, ) - embed.embed_query("How are you?") + # Invoke the LLM. The completion will be a string. + completion = llm.invoke("Who is the first president of United States?") -Integration with Model Deployment -================================= + # Stream the completion + for chunk in llm.stream("Who is the first president of United States?"): + print(chunk, end="", flush=True) + + # Invoke asynchronously + completion = await llm.ainvoke("Who is the first president of United States?") + + # Stream asynchronously + async for chunk in llm.astream("Who is the first president of United States?"): + print(chunk, end="", flush=True) -.. admonition:: Available in LangChain - :class: note - The same ``OCIModelDeploymentVLLM`` and ``ModelDeploymentTGI`` classes are also `available from LangChain `_. +Chat Models +=========== -If you deploy open-source or your own LLM on OCI model deployment service using `vLLM `_ or `HuggingFace TGI `_ , you can use the ``ModelDeploymentVLLM`` or ``ModelDeploymentTGI`` to integrate your model with LangChain. +Chat models takes `chat messages `_ as inputs and returns additional chat message (usually `AIMessage `_) as output. To use chat models, your models must be deployed with chat completion endpoint (``/v1/chat/completions``). The following example shows how you can use the ``ChatOCIModelDeploymentVLLM`` class for model deployed with vLLM container. If you deployed the model with TGI container, you can use ``ChatOCIModelDeploymentTGI`` similarly. .. code-block:: python3 - from ads.llm import ModelDeploymentVLLM + from langchain_core.messages import HumanMessage, SystemMessage + from ads.llm import ChatOCIModelDeploymentVLLM - llm = ModelDeploymentVLLM( - endpoint="https:///predict", - model="" + llm = ChatOCIModelDeploymentVLLM( + model="odsc-llm", + endpoint= f"https://modeldeployment.oci.customer-oci.com//predict", + # Optionally you can specify additional keyword arguments for the model. + max_tokens=32, ) -.. code-block:: python3 + messages = [ + SystemMessage(content="You're a helpful assistant providing concise answers."), + HumanMessage(content="Who's the first president of United States?"), + ] - from ads.llm import ModelDeploymentTGI + # Invoke the LLM. The response will be `AIMessage` + response = llm.invoke(messages) + # Print the text of the response + print(response.content) - llm = ModelDeploymentTGI( - endpoint="https:///predict", - ) + # Stream the response. Note that each chunk is an `AIMessageChunk`` + for chunk in llm.stream(messages): + print(chunk.content, end="", flush=True) -Authentication -============== + # Invoke asynchronously + response = await llm.ainvoke(messages) + print(response.content) -By default, the integration uses the same authentication method configured with ``ads.set_auth()``. Optionally, you can also pass the ``auth`` keyword argument when initializing the model to use specific authentication method for the model. For example, to use resource principal for all OCI authentication: + # Stream asynchronously + async for chunk in llm.astream(messages): + print(chunk.content, end="") + + +Tool Calling +============ + +The vLLM container support `tool/function calling `_ on some models (e.g. Mistral and Hermes models). To use tool calling, you must customize the "Model deployment configuration" to use ``--enable-auto-tool-choice`` and specify ``--tool-call-parser`` when deploying the model with vLLM container. A customized ``chat_template`` is also needed for tool/function calling to work with vLLM. ADS includes a convenience way to import the example templates provided by vLLM. .. code-block:: python3 - import ads - from ads.llm import GenerativeAI - - ads.set_auth(auth="resource_principal") - - llm = GenerativeAI( - compartment_id="", - # Optionally you can specify keyword arguments for the OCI client, e.g. service_endpoint. - client_kwargs={ - "service_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com" - }, + from ads.llm import ChatOCIModelDeploymentVLLM, ChatTemplates + + llm = ChatOCIModelDeploymentVLLM( + model="odsc-llm", + endpoint= f"https://modeldeployment.oci.customer-oci.com//predict", + # Set tool_choice to "auto" to enable tool/function calling. + tool_choice="auto", + # Use the modified mistral template provided by vLLM + chat_template=ChatTemplates.mistral() ) -Alternatively, you may use specific authentication for the model: +Following is an example of creating an agent with a tool to get current exchange rate: .. code-block:: python3 - import ads - from ads.llm import GenerativeAI - - llm = GenerativeAI( - # Use security token authentication for the model - auth=ads.auth.security_token(profile="my_profile"), - compartment_id="", - # Optionally you can specify keyword arguments for the OCI client, e.g. service_endpoint. - client_kwargs={ - "service_endpoint": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com" - }, + import requests + from langchain_core.tools import tool + from langchain_core.prompts import ChatPromptTemplate + from langchain.agents import create_tool_calling_agent, AgentExecutor + + @tool + def get_exchange_rate(currency:str) -> str: + """Obtain the current exchange rates of currency in ISO 4217 Three Letter Currency Code""" + + response = requests.get(f"https://open.er-api.com/v6/latest/{currency}") + return response.json() + + tools = [get_exchange_rate] + prompt = ChatPromptTemplate.from_messages( + [ + ("system", "You are a helpful assistant"), + ("placeholder", "{chat_history}"), + ("human", "{input}"), + ("placeholder", "{agent_scratchpad}"), + ] ) + + agent = create_tool_calling_agent(llm, tools, prompt) + agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, return_intermediate_steps=True) + agent_executor.invoke({"input": "what's the currency conversion of USD to Yen"}) From 97f536440213d228ae26a2353ab87972f14a75ca Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 16:49:30 -0400 Subject: [PATCH 21/26] Skip LangChain test for Python 3.8. --- .../langchain/chat_models/test_oci_data_science.py | 6 ++++++ .../langchain/llms/test_oci_model_deployment_endpoint.py | 5 +++++ tests/unitary/with_extras/langchain/test_guardrails.py | 3 +++ 3 files changed, 14 insertions(+) diff --git a/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py b/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py index 34ca4aa70..206ab7081 100644 --- a/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py +++ b/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py @@ -1,5 +1,6 @@ """Test OCI Data Science Model Deployment Endpoint.""" +import sys from unittest import mock import pytest from langchain_core.messages import AIMessage, AIMessageChunk @@ -7,6 +8,11 @@ from ads.llm import ChatOCIModelDeploymentVLLM, ChatOCIModelDeploymentTGI +pytestmark = pytest.mark.skipif( + sys.version_info < (3, 9), reason="Requires Python 3.9 or higher" +) + + CONST_MODEL_NAME = "odsc-vllm" CONST_ENDPOINT = "https://oci.endpoint/ocid/predict" CONST_PROMPT = "This is a prompt." diff --git a/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py b/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py index 784bdaab9..8ee95af41 100644 --- a/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py +++ b/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py @@ -1,10 +1,15 @@ """Test OCI Data Science Model Deployment Endpoint.""" +import sys from unittest import mock import pytest from requests.exceptions import HTTPError from ads.llm import OCIModelDeploymentTGI, OCIModelDeploymentVLLM +pytestmark = pytest.mark.skipif( + sys.version_info < (3, 9), reason="Requires Python 3.9 or higher" +) + CONST_MODEL_NAME = "odsc-vllm" CONST_ENDPOINT = "https://oci.endpoint/ocid/predict" diff --git a/tests/unitary/with_extras/langchain/test_guardrails.py b/tests/unitary/with_extras/langchain/test_guardrails.py index ae3fedc5f..67da7ec28 100644 --- a/tests/unitary/with_extras/langchain/test_guardrails.py +++ b/tests/unitary/with_extras/langchain/test_guardrails.py @@ -7,6 +7,7 @@ import json import os import tempfile +import sys from typing import Any, List, Dict, Mapping, Optional from unittest import TestCase from langchain.callbacks.manager import CallbackManagerForLLMRun @@ -19,6 +20,8 @@ from ads.llm.serialize import load, dump import pytest +pytestmark = pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires Python 3.9 or higher") + class FakeLLM(LLM): """Fake LLM for testing purpose.""" From d3b9aeb00bd7e9cbd2d4b2488cd6595bff54854d Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 16:53:29 -0400 Subject: [PATCH 22/26] Add python 3.9 requirement to LangChain docs. --- .../large_language_model/langchain_models.rst | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/source/user_guide/large_language_model/langchain_models.rst b/docs/source/user_guide/large_language_model/langchain_models.rst index 6e4f96cda..66d756e97 100644 --- a/docs/source/user_guide/large_language_model/langchain_models.rst +++ b/docs/source/user_guide/large_language_model/langchain_models.rst @@ -3,12 +3,18 @@ LangChain Integration .. versionadded:: 2.11.19 -LangChain compatible models/interfaces are needed for LangChain applications to invoke LLMs deployed on OCI data science model deployment service. - .. admonition:: LangChain Community :class: note - Integrations from ADS may provide additional or experimental features in the latest updates, while the stable integrations (such as ``OCIModelDeploymentVLLM`` and ``OCIModelDeploymentTGI``) are also available from `LangChain Community `_. + While the stable integrations (such as ``OCIModelDeploymentVLLM`` and ``OCIModelDeploymentTGI``) are also available from `LangChain Community `_, integrations from ADS may provide additional or experimental features in the latest updates, . + +.. admonition:: Requirements + :class: note + + The LangChain integration requires ``python>=3.9`` and ``LangChain>=0.3`` + + +LangChain compatible models/interfaces are needed for LangChain applications to invoke LLMs deployed on OCI data science model deployment service. If you deploy LLM on OCI model deployment service using `AI Quick Actions `_ or `HuggingFace TGI `_ , you can use the integration models described in this page to build your application with LangChain. From 8b849231c3c2cd63cb3d75148b98a2f1d909a74b Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 16:54:02 -0400 Subject: [PATCH 23/26] Update LangChain docs. --- .../source/user_guide/large_language_model/langchain_models.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/user_guide/large_language_model/langchain_models.rst b/docs/source/user_guide/large_language_model/langchain_models.rst index 66d756e97..fb701212d 100644 --- a/docs/source/user_guide/large_language_model/langchain_models.rst +++ b/docs/source/user_guide/large_language_model/langchain_models.rst @@ -11,7 +11,7 @@ LangChain Integration .. admonition:: Requirements :class: note - The LangChain integration requires ``python>=3.9`` and ``LangChain>=0.3`` + The LangChain integration requires ``python>=3.9`` and ``langchain>=0.3`` LangChain compatible models/interfaces are needed for LangChain applications to invoke LLMs deployed on OCI data science model deployment service. From 7ad8616fda16af151cbe5efb7dccbef1cdb32adc Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 22:09:15 -0400 Subject: [PATCH 24/26] Skip guardrail test for python<=3.8 --- tests/unitary/with_extras/langchain/test_guardrails.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/unitary/with_extras/langchain/test_guardrails.py b/tests/unitary/with_extras/langchain/test_guardrails.py index 67da7ec28..5abecdf77 100644 --- a/tests/unitary/with_extras/langchain/test_guardrails.py +++ b/tests/unitary/with_extras/langchain/test_guardrails.py @@ -10,6 +10,12 @@ import sys from typing import Any, List, Dict, Mapping, Optional from unittest import TestCase +import pytest + +pytestmark = pytest.mark.skipif( + sys.version_info < (3, 9), reason="Requires Python 3.9 or higher" +) + from langchain.callbacks.manager import CallbackManagerForLLMRun from langchain.llms.base import LLM from langchain.prompts import PromptTemplate @@ -18,9 +24,6 @@ from ads.llm.guardrails.base import BlockedByGuardrail, GuardrailIO from ads.llm.chain import GuardrailSequence from ads.llm.serialize import load, dump -import pytest - -pytestmark = pytest.mark.skipif(sys.version_info < (3, 9), reason="Requires Python 3.9 or higher") class FakeLLM(LLM): From 8cec51295f5fc632f7683b46bc5b5de1c5f6dc00 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Thu, 26 Sep 2024 22:09:26 -0400 Subject: [PATCH 25/26] Update copyright info in tests. --- .../langchain/chat_models/test_oci_data_science.py | 6 ++++++ .../langchain/llms/test_oci_model_deployment_endpoint.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py b/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py index 206ab7081..89ebce844 100644 --- a/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py +++ b/tests/unitary/with_extras/langchain/chat_models/test_oci_data_science.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2024 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + """Test OCI Data Science Model Deployment Endpoint.""" import sys diff --git a/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py b/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py index 8ee95af41..16e2f04e6 100644 --- a/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py +++ b/tests/unitary/with_extras/langchain/llms/test_oci_model_deployment_endpoint.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*-- + +# Copyright (c) 2024 Oracle and/or its affiliates. +# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ + """Test OCI Data Science Model Deployment Endpoint.""" import sys From ecbc0c8d8b30669adbb784dbf92e90a851b38199 Mon Sep 17 00:00:00 2001 From: Qiu Qin Date: Mon, 30 Sep 2024 10:25:36 -0400 Subject: [PATCH 26/26] Skip guardrail tests on python 3.8 --- tests/unitary/with_extras/langchain/test_guardrails.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/unitary/with_extras/langchain/test_guardrails.py b/tests/unitary/with_extras/langchain/test_guardrails.py index 5abecdf77..3a97e3c3d 100644 --- a/tests/unitary/with_extras/langchain/test_guardrails.py +++ b/tests/unitary/with_extras/langchain/test_guardrails.py @@ -12,9 +12,8 @@ from unittest import TestCase import pytest -pytestmark = pytest.mark.skipif( - sys.version_info < (3, 9), reason="Requires Python 3.9 or higher" -) +if sys.version_info < (3, 9): + pytest.skip("Requires Python 3.9 or higher", allow_module_level=True) from langchain.callbacks.manager import CallbackManagerForLLMRun from langchain.llms.base import LLM