Batch OpenAI embeddings (#2715)
* fix: updating deployments schemas

* openai embeddings

* black

* tags

* size

* size
santiagxf authored Oct 12, 2023
1 parent 85f2e7c commit f30476c
Showing 22 changed files with 8,624 additions and 0 deletions.
@@ -0,0 +1,88 @@
import os
import glob
import mlflow
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List
from datasets import load_dataset

# Maps supported file extensions to the Hugging Face ``datasets`` loader used to read them.
DATA_READERS = {
    ".csv": "csv",
    ".tsv": "tsv",
    ".parquet": "parquet",
    ".json": "json",
    ".jsonl": "json",
    ".arrow": "arrow",
    ".txt": "text",
}


def init():
    global model
    global output_file
    global task_name
    global text_column

    # AZUREML_MODEL_DIR is the path where the model is located.
    # For an MLflow model, no further path information is needed.
    model_path = glob.glob(os.environ["AZUREML_MODEL_DIR"] + "/*/")[0]
    # AZUREML_BI_TEXT_COLUMN is an environment variable you can use to indicate
    # which column you want to run the model on. It can be used only if the
    # model has a single input.
    text_column = os.environ.get("AZUREML_BI_TEXT_COLUMN", None)

    model = mlflow.pyfunc.load_model(model_path)
    model_info = mlflow.models.get_model_info(model_path)

    if mlflow.openai.FLAVOR_NAME not in model_info.flavors:
        raise ValueError(
            "The indicated model doesn't have an OpenAI flavor. Use "
            "``mlflow.openai.log_model`` to log OpenAI models."
        )

    if text_column:
        if (
            model.metadata
            and model.metadata.signature
            and len(model.metadata.signature.inputs) > 1
        ):
            raise ValueError(
                "The model requires more than 1 input column to run. You can't use "
                "AZUREML_BI_TEXT_COLUMN to indicate which column to send to the model. Format your "
                f"data with columns {model.metadata.signature.inputs.input_names()} instead."
            )

    task_name = model._model_impl.model["task"]
    output_path = os.environ["AZUREML_BI_OUTPUT_PATH"]
    output_file = os.path.join(output_path, f"{task_name}.jsonl")


def run(mini_batch: List[str]):
    if mini_batch:
        filtered_files = filter(lambda x: Path(x).suffix in DATA_READERS, mini_batch)
        results = []

        for file in filtered_files:
            data_format = Path(file).suffix
            data = load_dataset(DATA_READERS[data_format], data_files={"data": file})[
                "data"
            ].data.to_pandas()
            if text_column:
                # Keep only the column the model should score.
                data = data.loc[:, [text_column]]
            scores = model.predict(data)
            results.append(
                pd.DataFrame(
                    {
                        "file": np.repeat(Path(file).name, len(scores)),
                        "row": range(0, len(scores)),
                        task_name: scores,
                    }
                )
            )

        pd.concat(results, axis="rows").to_json(
            output_file, orient="records", mode="a", lines=True
        )

    return mini_batch
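The scoring script above expects the registered model to carry MLflow's OpenAI flavor, which is what the check in init() enforces. As a rough, hypothetical sketch of how such an embeddings model could have been produced before being registered as the "model" folder shown later in this commit (illustrative only, using the mlflow and openai versions pinned in the conda and requirements files below):

import mlflow
import openai

# Illustrative only: log an OpenAI embeddings model with the MLflow OpenAI
# flavor so batch_driver.py can load it. The model name matches the MLmodel
# and model.yaml files in this commit; everything else is an assumption.
mlflow.openai.log_model(
    model="text-embedding-ada-002",
    task=openai.Embedding,  # persisted as "task: embeddings" in model.yaml
    artifact_path="model",
)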
3,721 changes: 3,721 additions & 0 deletions cli/endpoints/batch/deploy-models/openai-embeddings/data/billsum-0.csv


@@ -0,0 +1,77 @@
set -e

# <set_variables>
export ENDPOINT_NAME="<YOUR_ENDPOINT_NAME>"
# </set_variables>

# <set_openai>
OPENAI_API_BASE="https://<deployment>.openai.azure.com/"
# </set_openai>

# <name_endpoint>
ENDPOINT_NAME="text-davinci-002"
# </name_endpoint>

# The following code ensures the created endpoint has a unique name
ENDPOINT_SUFFIX=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w ${1:-5} | head -n 1)
ENDPOINT_NAME="$ENDPOINT_NAME-$ENDPOINT_SUFFIX"

echo "Register the model"
# <register_model>
MODEL_NAME='text-embedding-ada-002'
az ml model create --name $MODEL_NAME --path "model"
# </register_model>

echo "Creating batch endpoint $ENDPOINT_NAME"
# <create_endpoint>
az ml batch-endpoint create -n $ENDPOINT_NAME -f endpoint.yml
# </create_endpoint>

echo "Creating batch deployment $DEPLOYMENT_NAME for endpoint $ENDPOINT_NAME"
# <create_deployment>
az ml batch-deployment create --file deployment.yml \
--endpoint-name $ENDPOINT_NAME \
--set-default \
--set settings.environment_variables.OPENAI_API_BASE=$OPENAI_API_BASE
# </create_deployment>

echo "Invoking batch endpoint"
# <start_batch_scoring_job>
JOB_NAME=$(az ml batch-endpoint invoke --name $ENDPOINT_NAME --input data --query name -o tsv)
# </start_batch_scoring_job>

echo "Showing job detail"
# <show_job_in_studio>
az ml job show -n $JOB_NAME --web
# </show_job_in_studio>

echo "Stream job logs to console"
# <stream_job_logs>
az ml job stream -n $JOB_NAME
# </stream_job_logs>

# <check_job_status>
STATUS=$(az ml job show -n $JOB_NAME --query status -o tsv)
echo $STATUS
if [[ $STATUS == "Completed" ]]
then
    echo "Job completed"
elif [[ $STATUS == "Failed" ]]
then
    echo "Job failed"
    exit 1
else
    echo "Job status not failed or completed"
    exit 2
fi
# </check_job_status>

echo "Download scores to local path"
# <download_outputs>
az ml job download --name $JOB_NAME --output-name score --download-path ./
# </download_outputs>

echo "Delete resources"
# <delete_endpoint>
az ml batch-endpoint delete --name $ENDPOINT_NAME --yes
# </delete_endpoint>
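Once the job finishes, the downloaded output is a JSON Lines file named after the model task (embeddings.jsonl for this model), with one record per input row. A small sketch of how it might be inspected locally; the named-outputs/score folder is an assumption about where az ml job download places the named output:

import pandas as pd

# Assumed layout: az ml job download puts named outputs under named-outputs/<name>/.
embeddings = pd.read_json("named-outputs/score/embeddings.jsonl", lines=True)
print(embeddings[["file", "row"]].head())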
28 changes: 28 additions & 0 deletions cli/endpoints/batch/deploy-models/openai-embeddings/deployment.yml
@@ -0,0 +1,28 @@
$schema: https://azuremlschemas.azureedge.net/latest/batchDeployment.schema.json
endpoint_name: text-embedding-ada-qwerty
name: default
description: The default deployment for generating embeddings
type: model
model: azureml:text-embedding-ada-002@latest
environment:
  name: batch-openai-mlflow
  image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest
  conda_file: environment/conda.yaml
code_configuration:
  code: code
  scoring_script: batch_driver.py
compute: azureml:batch-cluster-lp
resources:
  instance_count: 1
settings:
  max_concurrency_per_instance: 1
  mini_batch_size: 1
  output_action: summary_only
  retry_settings:
    max_retries: 1
    timeout: 9999
  logging_level: info
  environment_variables:
    OPENAI_API_TYPE: azure_ad
    OPENAI_API_BASE: $OPENAI_API_BASE
    OPENAI_API_VERSION: 2023-03-15-preview
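The deployment sets OPENAI_API_TYPE to azure_ad, so calls to Azure OpenAI are authenticated with a Microsoft Entra ID token rather than an API key. The MLflow OpenAI flavor handles this internally; the sketch below only illustrates what those environment variables amount to with the pinned openai 0.27.x SDK, assuming azure-identity (already listed in environment/conda.yaml) can obtain a credential:

import os
import openai
from azure.identity import DefaultAzureCredential

# Illustrative only: mirror the azure_ad settings from deployment.yml.
openai.api_type = "azure_ad"
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_version = os.environ.get("OPENAI_API_VERSION", "2023-03-15-preview")
openai.api_key = DefaultAzureCredential().get_token(
    "https://cognitiveservices.azure.com/.default"
).token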
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/batchEndpoint.schema.json
name: text-embedding-ada-qwerty
description: An endpoint to generate embeddings in batch for the ADA-002 model from OpenAI
auth_mode: aad_token
@@ -0,0 +1,14 @@
channels:
- conda-forge
dependencies:
- python=3.8.5
- pip<=23.2.1
- pip:
  - openai==0.27.8
  - requests==2.31.0
  - tenacity==8.2.2
  - tiktoken==0.4.0
  - azureml-core
  - azure-identity
  - datasets
  - mlflow
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: batch-openai-mlflow
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
conda_file: conda.yaml
19 changes: 19 additions & 0 deletions cli/endpoints/batch/deploy-models/openai-embeddings/model/MLmodel
@@ -0,0 +1,19 @@
flavors:
  openai:
    code: null
    data: model.yaml
    openai_version: 0.27.8
  python_function:
    data: model.yaml
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.openai
    python_version: 3.8.5
mlflow_version: 2.5.1.dev0
model_uuid: b9a39a71f54e41efbd83b8307294b4d8
signature:
  inputs: '[{"type": "string"}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
  params: '[{"name": "batch_size", "dtype": "long", "default": 16, "shape": null}]'
utc_time_created: '2023-08-15 05:08:52.461694'
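The signature above declares a single string input and a float64 tensor output per row, so the model can be smoke-tested locally through the pyfunc flavor before deploying. A minimal sketch, assuming the Azure OpenAI connection environment variables (endpoint, API version, and an API key or Entra token) are exported in the shell and the model folder sits in the working directory:

import mlflow
import pandas as pd

# Illustrative only: load the logged model and embed one sample sentence.
model = mlflow.pyfunc.load_model("model")
sample = pd.DataFrame({"text": ["An example sentence to embed."]})
print(model.predict(sample))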
@@ -0,0 +1,14 @@
channels:
- conda-forge
dependencies:
- python=3.8.5
- pip<=23.2.1
- pip:
  - mlflow==2.5.0
  - gunicorn==20.1.0
  - numpy==1.24.4
  - openai==0.27.8
  - requests==2.31.0
  - tenacity==8.2.2
  - tiktoken==0.4.0
name: mlflow-env
@@ -0,0 +1,3 @@
engine: text-embedding-ada-002
model: text-embedding-ada-002
task: embeddings
@@ -0,0 +1,7 @@
python: 3.8.5
build_dependencies:
- pip==23.2.1
- setuptools
- wheel==0.38.4
dependencies:
- -r requirements.txt
@@ -0,0 +1,7 @@
mlflow==2.7.0
gunicorn==20.1.0
numpy==1.24.4
openai==0.27.8
requests==2.31.0
tenacity==8.2.2
tiktoken==0.4.0