-
Notifications
You must be signed in to change notification settings - Fork 3
/
infinity_snowflake_arctic_embed_l_335m.py
79 lines (66 loc) · 2.58 KB
/
infinity_snowflake_arctic_embed_l_335m.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# # Fast inference with Infinity (Snowflake/snowflake-arctic-embed-l)
import os
import subprocess
import secrets
from modal import Image, Secret, App, enter, gpu, method, web_server
MODEL_DIR = "/model"
BASE_MODEL = "Snowflake/snowflake-arctic-embed-l"
# ## Define a container image
# We want to create a Modal image which has the model weights pre-saved to a directory. The benefit of this
# is that the container no longer has to re-download the model from Huggingface - instead, it will take
# advantage of Modal's internal filesystem for faster cold starts.
#
# ### Download the weights
# We can download the model to a particular directory using the HuggingFace utility function `snapshot_download`.
#
# Tip: avoid using global variables in this function. Changes to code outside this function will not be detected and the download step will not re-run.
def download_model_to_folder(model_dir=None, base_model=None):
    """Download the embedding model's weights into the image filesystem.

    Run at image-build time (via ``Image.run_function``) so containers start
    with the weights already on disk instead of re-downloading from
    Hugging Face on every cold start.

    Args:
        model_dir: Target directory for the weights. Defaults to the
            module-level ``MODEL_DIR`` when not given (kept as a parameter
            because changes to globals referenced here are not detected by
            Modal's build cache — see the tip comment above).
        base_model: Hugging Face repo id to download. Defaults to the
            module-level ``BASE_MODEL``.
    """
    from huggingface_hub import snapshot_download
    from transformers.utils import move_cache

    # Resolve defaults at call time so the function stays importable even
    # where the module constants are absent, and callers can override both.
    if model_dir is None:
        model_dir = MODEL_DIR
    if base_model is None:
        base_model = BASE_MODEL

    os.makedirs(model_dir, exist_ok=True)
    snapshot_download(
        base_model,
        local_dir=model_dir,
        ignore_patterns=["*.pt", "*.bin"],  # Using safetensors
    )
    # Normalize the HF cache layout left behind by older transformers versions.
    move_cache()
# ### Image definition
# We'll start from an NVIDIA CUDA base image from Docker Hub and pip-install
# torch, transformers, and `infinity_emb` (the Infinity embedding server).
# Then we'll use `run_function` to run the function defined above to ensure the weights of
# the model are saved within the container image.
# NOTE: the builder chain is order-sensitive — each call adds an image layer,
# and `run_function` must come after the pip installs and env vars it relies on.
image = (
    Image.from_registry("nvidia/cuda:12.1.1-devel-ubuntu22.04", add_python="3.10")
    .pip_install(
        "wheel==0.44.0",
        "huggingface_hub==0.25.0",
        "hf-transfer==0.1.8",
        "torch==2.4.1",
        "transformers==4.44.2",
        "sentence-transformers==3.1.0",
        "infinity_emb[all]==0.0.56"
    )
    .apt_install("git")
    # Use the barebones hf-transfer package for maximum download speeds. No progress bar, but expect 700MB/s.
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
    # Bake the model weights into the image so cold starts skip the download.
    .run_function(
        download_model_to_folder,
        secrets=[Secret.from_name("huggingface")],
        timeout=60 * 20,  # allow up to 20 minutes for the weight download
    )
)
# Modal app that groups this deployment's functions; all of them share `image`.
app = App("infinity-snowflake-arctic-embed-l-335m", image=image)
# A single T4 is sufficient for this ~335M-parameter embedding model.
GPU_CONFIG = gpu.T4(count=1)
# Run the Infinity embedding server and expose it on port 7997
# (Infinity's default), serving an OpenAI-compatible embeddings API.
@app.function(
    allow_concurrent_inputs=100,
    container_idle_timeout=15,  # scale to zero quickly when idle
    gpu=GPU_CONFIG,
    secrets=[
        Secret.from_name("huggingface"),
        Secret.from_dotenv(),
    ],
)
@web_server(7997, startup_timeout=300)
def infinity_embeddings_server():
    """Launch `infinity_emb` as a subprocess; Modal proxies port 7997.

    The decorator waits up to `startup_timeout` seconds for the port to
    start accepting connections, so the non-blocking `Popen` is intentional.
    """
    # Use an argv list with shell=False: no shell parsing of the model id,
    # and no injection risk if BASE_MODEL ever becomes configurable.
    subprocess.Popen(
        [
            "infinity_emb",
            "v2",
            "--device", "cuda",
            "--engine", "torch",
            "--model-id", BASE_MODEL,
        ]
    )