forked from theroyallab/tabbyAPI
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
182 lines (139 loc) · 5.84 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
"""The main tabbyAPI module. Contains the FastAPI server and endpoints."""
import asyncio
import json
import os
import pathlib
import platform
import signal
from loguru import logger
from typing import Optional
from common import config, gen_logging, sampling, model
from common.args import convert_args_to_dict, init_argparser
from common.auth import load_auth_keys
from common.logger import setup_logger
from common.networking import is_port_in_use
from common.signals import signal_handler
from common.utils import unwrap
from endpoints.server import export_openapi, start_api
from endpoints.utils import do_export_openapi
if not do_export_openapi:
from backends.exllamav2.utils import check_exllama_version
async def entrypoint_async():
    """Prepare runtime state from the loaded config, then start the API.

    In order: picks the host/port (trying ``port + 1`` once if the
    configured port is busy), loads auth keys, applies generation-logging
    and sampler-override settings, optionally loads the initial model
    (plus loras) and the embedding model, and finally awaits ``start_api``.

    Returns early, without starting the server, when both the configured
    port and its fallback are already in use.
    """

    net_opts = config.network_config()
    host = unwrap(net_opts.get("host"), "127.0.0.1")
    port = unwrap(net_opts.get("port"), 5000)

    # Port selection: try the configured port, then exactly one fallback.
    if is_port_in_use(port):
        fallback_port = port + 1

        if is_port_in_use(fallback_port):
            logger.error(
                f"Ports {port} and {fallback_port} are in use by different services.\n"
                "Please free up those ports or specify a different one.\n"
                "Exiting."
            )
            return

        logger.warning(
            f"Port {port} is currently in use. Switching to {fallback_port}."
        )
        port = fallback_port

    # Authentication keys
    auth_disabled = unwrap(net_opts.get("disable_auth"), False)
    load_auth_keys(auth_disabled)

    # Generation logging: apply overrides only when a config is present,
    # but always broadcast the resulting status.
    gen_log_opts = config.logging_config()
    if gen_log_opts:
        gen_logging.update_from_dict(gen_log_opts)
    gen_logging.broadcast_status()

    # Sampler overrides: a missing preset file is reported, not fatal.
    override_preset = config.sampling_config().get("override_preset")
    if override_preset:
        try:
            sampling.overrides_from_file(override_preset)
        except FileNotFoundError as exc:
            logger.warning(str(exc))

    # Initial model, followed by loras (loras are only considered when a
    # model name was configured).
    model_opts = config.model_config()
    initial_model = model_opts.get("model_name")
    if initial_model:
        models_root = pathlib.Path(unwrap(model_opts.get("model_dir"), "models"))
        await model.load_model((models_root / initial_model).resolve(), **model_opts)

        lora_opts = config.lora_config()
        if lora_opts.get("loras"):
            loras_root = pathlib.Path(unwrap(lora_opts.get("lora_dir"), "loras"))
            await model.container.load_loras(loras_root.resolve(), **lora_opts)

    # Initial embedding model, loaded through its own loader call.
    embed_opts = config.embeddings_config()
    initial_embed_model = embed_opts.get("embedding_model_name")
    if initial_embed_model:
        embed_root = pathlib.Path(
            unwrap(embed_opts.get("embedding_model_dir"), "models")
        )
        embed_path = embed_root / initial_embed_model

        try:
            await model.load_embedding_model(embed_path, **embed_opts)
        except ImportError as exc:
            # An ImportError here is logged and startup continues.
            logger.error(exc.msg)

    await start_api(host, port)
def entrypoint(arguments: Optional[dict] = None) -> None:
    """Synchronous program entry point.

    Sets up logging and SIGINT/SIGTERM handlers, loads ``config.yml``,
    applies argument overrides, and then either writes the OpenAPI spec to
    ``openapi.json`` and returns, or applies the developer options and runs
    ``entrypoint_async`` inside a new asyncio event loop.

    Args:
        arguments: Config overrides handed to ``config.from_args``. When
            ``None``, they are parsed from the command line using the
            parser returned by ``init_argparser``.
    """

    setup_logger()

    # Set up signal aborting
    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)

    # Load from YAML config
    config.from_file(pathlib.Path("config.yml"))

    # Parse and override config from args
    if arguments is None:
        parser = init_argparser()
        arguments = convert_args_to_dict(parser.parse_args(), parser)

    config.from_args(arguments)

    # Export mode: dump the spec and stop before any backend setup.
    if do_export_openapi:
        openapi_json = export_openapi()

        with open("openapi.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(openapi_json))

        # Report success only after the file has been flushed and closed.
        logger.info("Successfully wrote OpenAPI spec to openapi.json")
        return

    developer_config = config.developer_config()

    # Check exllamav2 version and give a descriptive error if it's too old
    # Skip if launching unsafely
    if unwrap(developer_config.get("unsafe_launch"), False):
        logger.warning(
            "UNSAFE: Skipping ExllamaV2 version check.\n"
            "If you aren't a developer, please keep this off!"
        )
    else:
        check_exllama_version()

    # Enable CUDA malloc backend
    if unwrap(developer_config.get("cuda_malloc_backend"), False):
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"
        logger.warning("EXPERIMENTAL: Enabled the pytorch CUDA malloc backend.")

    # Use Uvloop/Winloop
    if unwrap(developer_config.get("uvloop"), False):
        if platform.system() == "Windows":
            from winloop import install
        else:
            from uvloop import install

        # Set loop event policy
        install()

        logger.warning("EXPERIMENTAL: Running program with Uvloop/Winloop.")

    # Set the process priority
    if unwrap(developer_config.get("realtime_process_priority"), False):
        import psutil

        current_process = psutil.Process(os.getpid())
        try:
            if platform.system() == "Windows":
                current_process.nice(psutil.REALTIME_PRIORITY_CLASS)
            else:
                # On POSIX, nice() takes a niceness value where lower means
                # higher priority. IOPRIO_CLASS_RT is an ionice() I/O class
                # (and Linux-only); passing it to nice() would *lower* the
                # CPU priority. -20 is the highest CPU priority.
                current_process.nice(-20)
        except psutil.AccessDenied:
            # Raising priority needs elevated privileges; keep running at
            # the current priority instead of aborting startup.
            logger.warning(
                "Could not raise the process priority: insufficient privileges.\n"
                "Continuing with the current priority."
            )
        else:
            logger.warning(
                "EXPERIMENTAL: Process priority set to Realtime. \n"
                "If you're not running on administrator/sudo, the priority is set to high."
            )

    # Enter into the async event loop
    asyncio.run(entrypoint_async())
# Start the server only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    entrypoint()