Batch OpenAI embeddings (#2715)
* fix: updating deployments schemas

* openai embeddings

* black

* tags

* size

* size
santiagxf authored Oct 12, 2023
1 parent 85f2e7c commit f30476c
Showing 22 changed files with 8,624 additions and 0 deletions.
@@ -0,0 +1,88 @@
import os
import glob
import mlflow
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List
from datasets import load_dataset

# Maps supported file extensions to the Hugging Face ``datasets`` loader used to read them.
DATA_READERS = {
    ".csv": "csv",
    ".tsv": "tsv",
    ".parquet": "parquet",
    ".json": "json",
    ".jsonl": "json",
    ".arrow": "arrow",
    ".txt": "text",
}


def init():
    global model
    global output_file
    global task_name
    global text_column

    # AZUREML_MODEL_DIR is the path where the model is located.
    # For an MLflow model, no further path information is needed.
    model_path = glob.glob(os.environ["AZUREML_MODEL_DIR"] + "/*/")[0]
    # AZUREML_BI_TEXT_COLUMN is an environment variable you can use to indicate
    # which column you want to run the model on. It can be used only if the
    # model has a single input.
    text_column = os.environ.get("AZUREML_BI_TEXT_COLUMN", None)

    model = mlflow.pyfunc.load_model(model_path)
    model_info = mlflow.models.get_model_info(model_path)

    if mlflow.openai.FLAVOR_NAME not in model_info.flavors:
        raise ValueError(
            "The indicated model doesn't have an OpenAI flavor. Use "
            "``mlflow.openai.log_model`` to log OpenAI models."
        )

    if text_column:
        if (
            model.metadata
            and model.metadata.signature
            and len(model.metadata.signature.inputs) > 1
        ):
            raise ValueError(
                "The model requires more than 1 input column to run. You can't use "
                "AZUREML_BI_TEXT_COLUMN to indicate which column to send to the model. Format your "
                f"data with columns {model.metadata.signature.inputs.input_names()} instead."
            )

    task_name = model._model_impl.model["task"]
    output_path = os.environ["AZUREML_BI_OUTPUT_PATH"]
    output_file = os.path.join(output_path, f"{task_name}.jsonl")


def run(mini_batch: List[str]):
    if mini_batch:
        filtered_files = filter(lambda x: Path(x).suffix in DATA_READERS, mini_batch)
        results = []

        for file in filtered_files:
            data_format = Path(file).suffix
            data = load_dataset(DATA_READERS[data_format], data_files={"data": file})[
                "data"
            ].data.to_pandas()
            if text_column:
                # Keep only the column the model should score.
                data = data.loc[:, [text_column]]
            scores = model.predict(data)
            results.append(
                pd.DataFrame(
                    {
                        "file": np.repeat(Path(file).name, len(scores)),
                        "row": range(0, len(scores)),
                        task_name: scores,
                    }
                )
            )

        pd.concat(results, axis="rows").to_json(
            output_file, orient="records", mode="a", lines=True
        )

    return mini_batch
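The scoring script above expects the registered model to carry MLflow's OpenAI flavor, which is what the check in init() enforces. As a rough, hypothetical sketch of how such an embeddings model could have been produced before being registered as the "model" folder shown later in this commit (illustrative only, using the mlflow and openai versions pinned in the conda and requirements files below):

import mlflow
import openai

# Illustrative only: log an OpenAI embeddings model with the MLflow OpenAI
# flavor so batch_driver.py can load it. The model name matches the MLmodel
# and model.yaml files in this commit; everything else is an assumption.
mlflow.openai.log_model(
    model="text-embedding-ada-002",
    task=openai.Embedding,  # persisted as "task: embeddings" in model.yaml
    artifact_path="model",
)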
3,721 changes: 3,721 additions & 0 deletions cli/endpoints/batch/deploy-models/openai-embeddings/data/billsum-0.csv


@@ -0,0 +1,77 @@
set -e

# <set_variables>
export ENDPOINT_NAME="<YOUR_ENDPOINT_NAME>"
# </set_variables>

# <set_openai>
OPENAI_API_BASE="https://<deployment>.openai.azure.com/"
# </set_openai>

# <name_endpoint>
ENDPOINT_NAME="text-davinci-002"
# </name_endpoint>

# The following code ensures the created endpoint has a unique name
ENDPOINT_SUFFIX=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w ${1:-5} | head -n 1)
ENDPOINT_NAME="$ENDPOINT_NAME-$ENDPOINT_SUFFIX"

echo "Register the model"
# <register_model>
MODEL_NAME='text-embedding-ada-002'
az ml model create --name $MODEL_NAME --path "model"
# </register_model>

echo "Creating batch endpoint $ENDPOINT_NAME"
# <create_endpoint>
az ml batch-endpoint create -n $ENDPOINT_NAME -f endpoint.yml
# </create_endpoint>

echo "Creating batch deployment $DEPLOYMENT_NAME for endpoint $ENDPOINT_NAME"
# <create_deployment>
az ml batch-deployment create --file deployment.yml \
--endpoint-name $ENDPOINT_NAME \
--set-default \
--set settings.environment_variables.OPENAI_API_BASE=$OPENAI_API_BASE
# </create_deployment>

echo "Invoking batch endpoint"
# <start_batch_scoring_job>
JOB_NAME=$(az ml batch-endpoint invoke --name $ENDPOINT_NAME --input data --query name -o tsv)
# </start_batch_scoring_job>

echo "Showing job detail"
# <show_job_in_studio>
az ml job show -n $JOB_NAME --web
# </show_job_in_studio>

echo "Stream job logs to console"
# <stream_job_logs>
az ml job stream -n $JOB_NAME
# </stream_job_logs>

# <check_job_status>
STATUS=$(az ml job show -n $JOB_NAME --query status -o tsv)
echo $STATUS
if [[ $STATUS == "Completed" ]]
then
    echo "Job completed"
elif [[ $STATUS == "Failed" ]]
then
    echo "Job failed"
    exit 1
else
    echo "Job status not failed or completed"
    exit 2
fi
# </check_job_status>

echo "Download scores to local path"
# <download_outputs>
az ml job download --name $JOB_NAME --output-name score --download-path ./
# </download_outputs>

echo "Delete resources"
# <delete_endpoint>
az ml batch-endpoint delete --name $ENDPOINT_NAME --yes
# </delete_endpoint>
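Once the job finishes, the downloaded output is a JSON Lines file named after the model task (embeddings.jsonl for this model), with one record per input row. A small sketch of how it might be inspected locally; the named-outputs/score folder is an assumption about where az ml job download places the named output:

import pandas as pd

# Assumed layout: az ml job download puts named outputs under named-outputs/<name>/.
embeddings = pd.read_json("named-outputs/score/embeddings.jsonl", lines=True)
print(embeddings[["file", "row"]].head())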
28 changes: 28 additions & 0 deletions cli/endpoints/batch/deploy-models/openai-embeddings/deployment.yml
@@ -0,0 +1,28 @@
$schema: https://azuremlschemas.azureedge.net/latest/batchDeployment.schema.json
endpoint_name: text-embedding-ada-qwerty
name: default
description: The default deployment for generating embeddings
type: model
model: azureml:text-embedding-ada-002@latest
environment:
  name: batch-openai-mlflow
  image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest
  conda_file: environment/conda.yaml
code_configuration:
  code: code
  scoring_script: batch_driver.py
compute: azureml:batch-cluster-lp
resources:
  instance_count: 1
settings:
  max_concurrency_per_instance: 1
  mini_batch_size: 1
  output_action: summary_only
  retry_settings:
    max_retries: 1
    timeout: 9999
  logging_level: info
  environment_variables:
    OPENAI_API_TYPE: azure_ad
    OPENAI_API_BASE: $OPENAI_API_BASE
    OPENAI_API_VERSION: 2023-03-15-preview
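The deployment sets OPENAI_API_TYPE to azure_ad, so calls to Azure OpenAI are authenticated with a Microsoft Entra ID token rather than an API key. The MLflow OpenAI flavor handles this internally; the sketch below only illustrates what those environment variables amount to with the pinned openai 0.27.x SDK, assuming azure-identity (already listed in environment/conda.yaml) can obtain a credential:

import os
import openai
from azure.identity import DefaultAzureCredential

# Illustrative only: mirror the azure_ad settings from deployment.yml.
openai.api_type = "azure_ad"
openai.api_base = os.environ["OPENAI_API_BASE"]
openai.api_version = os.environ.get("OPENAI_API_VERSION", "2023-03-15-preview")
openai.api_key = DefaultAzureCredential().get_token(
    "https://cognitiveservices.azure.com/.default"
).token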
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/batchEndpoint.schema.json
name: text-embedding-ada-qwerty
description: An endpoint to generate embeddings in batch for the ADA-002 model from OpenAI
auth_mode: aad_token
@@ -0,0 +1,14 @@
channels:
- conda-forge
dependencies:
- python=3.8.5
- pip<=23.2.1
- pip:
  - openai==0.27.8
  - requests==2.31.0
  - tenacity==8.2.2
  - tiktoken==0.4.0
  - azureml-core
  - azure-identity
  - datasets
  - mlflow
@@ -0,0 +1,4 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: batch-openai-mlflow
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04
conda_file: conda.yaml
19 changes: 19 additions & 0 deletions cli/endpoints/batch/deploy-models/openai-embeddings/model/MLmodel
@@ -0,0 +1,19 @@
flavors:
  openai:
    code: null
    data: model.yaml
    openai_version: 0.27.8
  python_function:
    data: model.yaml
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.openai
    python_version: 3.8.5
mlflow_version: 2.5.1.dev0
model_uuid: b9a39a71f54e41efbd83b8307294b4d8
signature:
  inputs: '[{"type": "string"}]'
  outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "float64", "shape": [-1]}}]'
  params: '[{"name": "batch_size", "dtype": "long", "default": 16, "shape": null}]'
utc_time_created: '2023-08-15 05:08:52.461694'
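The signature above declares a single string input and a float64 tensor output per row, so the model can be smoke-tested locally through the pyfunc flavor before deploying. A minimal sketch, assuming the Azure OpenAI connection environment variables (endpoint, API version, and an API key or Entra token) are exported in the shell and the model folder sits in the working directory:

import mlflow
import pandas as pd

# Illustrative only: load the logged model and embed one sample sentence.
model = mlflow.pyfunc.load_model("model")
sample = pd.DataFrame({"text": ["An example sentence to embed."]})
print(model.predict(sample))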
@@ -0,0 +1,14 @@
channels:
- conda-forge
dependencies:
- python=3.8.5
- pip<=23.2.1
- pip:
  - mlflow==2.5.0
  - gunicorn==20.1.0
  - numpy==1.24.4
  - openai==0.27.8
  - requests==2.31.0
  - tenacity==8.2.2
  - tiktoken==0.4.0
name: mlflow-env
@@ -0,0 +1,3 @@
engine: text-embedding-ada-002
model: text-embedding-ada-002
task: embeddings
@@ -0,0 +1,7 @@
python: 3.8.5
build_dependencies:
- pip==23.2.1
- setuptools
- wheel==0.38.4
dependencies:
- -r requirements.txt
@@ -0,0 +1,7 @@
mlflow==2.7.0
gunicorn==20.1.0
numpy==1.24.4
openai==0.27.8
requests==2.31.0
tenacity==8.2.2
tiktoken==0.4.0