import shared_resources
from shared_resources import initialize_globals, download_models, is_gpu_available
from logger_config import setup_logger
from database_functions import AsyncSessionLocal
from ramdisk_functions import clear_ramdisk
from misc_utility_functions import build_faiss_indexes, configure_redis_optimally
from embeddings_data_models import DocumentEmbedding, ShowLogsIncrementalModel
from embeddings_data_models import EmbeddingRequest, SemanticSearchRequest, AdvancedSemanticSearchRequest, SimilarityRequest, TextCompletionRequest, AddGrammarRequest
from embeddings_data_models import EmbeddingResponse, SemanticSearchResponse, AdvancedSemanticSearchResponse, SimilarityResponse, AllStringsResponse, AllDocumentsResponse, TextCompletionResponse, AudioTranscriptResponse, ImageQuestionResponse, AddGrammarResponse
from service_functions import get_or_compute_embedding, get_or_compute_transcript, add_model_url, download_file, start_resource_monitoring, end_resource_monitoring, decompress_data
from service_functions import get_list_of_corpus_identifiers_from_list_of_embedding_texts, compute_embeddings_for_document, parse_submitted_document_file_into_sentence_strings_func
from service_functions import generate_completion_from_llm, ask_question_about_image, validate_bnf_grammar_func, convert_document_to_sentences_func, get_audio_duration_seconds, prepare_string_for_embedding
from grammar_builder import GrammarBuilder
from log_viewer_functions import show_logs_incremental_func, show_logs_func
from uvicorn_config import option
import asyncio
import glob
import json
import os
import sys
import random
import tempfile
import traceback
import zipfile
from pathlib import Path
from datetime import datetime
from hashlib import sha3_256
from typing import List, Optional, Dict, Any
from urllib.parse import unquote
import numpy as np
from decouple import config
import uvicorn
import fastapi
from fastapi import FastAPI, HTTPException, Request, UploadFile, File, Form
from fastapi.responses import JSONResponse, FileResponse, HTMLResponse
from fastapi.openapi.docs import get_swagger_ui_html
from contextlib import asynccontextmanager
from sqlalchemy import select
from sqlalchemy import text as sql_text
from sqlalchemy.exc import SQLAlchemyError
import faiss
import fast_vector_similarity as fvs
import uvloop
from magika import Magika
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
logger = setup_logger()
magika = Magika()
gpu_check_results = is_gpu_available()
configure_redis_optimally()
logger.info(f"\nGPU check results:\n {gpu_check_results}\n")
class GracefulExit(BaseException):
    pass

def raise_graceful_exit():
    raise GracefulExit()

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup code
    await initialize_globals()
    yield
    # Shutdown code (if any)
    pass
# Note: the Ramdisk setup and teardown requires sudo; to enable password-less sudo, edit your sudoers file with `sudo visudo`.
# Add the following lines, replacing username with your actual username
# username ALL=(ALL) NOPASSWD: /bin/mount -t tmpfs -o size=*G tmpfs /mnt/ramdisk
# username ALL=(ALL) NOPASSWD: /bin/umount /mnt/ramdisk
# Global variables
use_hardcoded_security_token = 0
if use_hardcoded_security_token:
    SECURITY_TOKEN = "Test123$"
    USE_SECURITY_TOKEN = config("USE_SECURITY_TOKEN", default=False, cast=bool)
else:
    USE_SECURITY_TOKEN = False
DEFAULT_MODEL_NAME = config("DEFAULT_MODEL_NAME", default="Meta-Llama-3-8B-Instruct.Q3_K_S", cast=str)
DEFAULT_EMBEDDING_MODEL_NAME = config("DEFAULT_EMBEDDING_MODEL_NAME", default="nomic-embed-text-v1.5.Q6_K", cast=str)
DEFAULT_MULTI_MODAL_MODEL_NAME = config("DEFAULT_MULTI_MODAL_MODEL_NAME", default="llava-llama-3-8b-v1_1-int4", cast=str)
USE_RAMDISK = config("USE_RAMDISK", default=False, cast=bool)
USE_RESOURCE_MONITORING = config("USE_RESOURCE_MONITORING", default=1, cast=bool)
RAMDISK_PATH = config("RAMDISK_PATH", default="/mnt/ramdisk", cast=str)
BASE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
MAX_THOUSANDS_OF_WORDS_FOR_DOCUMENT_EMBEDDING = config("MAX_THOUSANDS_OF_WORDS_FOR_DOCUMENT_EMBEDDING", default=100, cast=int)
DEFAULT_COMPLETION_TEMPERATURE = config("DEFAULT_COMPLETION_TEMPERATURE", default=0.7, cast=float)
DEFAULT_MAX_COMPLETION_TOKENS = config("DEFAULT_MAX_COMPLETION_TOKENS", default=1000, cast=int)
DEFAULT_NUMBER_OF_COMPLETIONS_TO_GENERATE = config("DEFAULT_NUMBER_OF_COMPLETIONS_TO_GENERATE", default=1, cast=int)
DEFAULT_EMBEDDING_POOLING_METHOD = config("DEFAULT_EMBEDDING_POOLING_METHOD", default="mean", cast=str)
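# Illustrative .env sketch for the settings above (the keys mirror the config()
# calls; the values shown are assumptions, not recommendations — python-decouple
# falls back to the defaults above when a key is absent):
#
#   USE_SECURITY_TOKEN=False
#   DEFAULT_MODEL_NAME=Meta-Llama-3-8B-Instruct.Q3_K_S
#   DEFAULT_EMBEDDING_MODEL_NAME=nomic-embed-text-v1.5.Q6_K
#   USE_RAMDISK=False
#   RAMDISK_PATH=/mnt/ramdisk
#   MAX_THOUSANDS_OF_WORDS_FOR_DOCUMENT_EMBEDDING=100
#   DEFAULT_EMBEDDING_POOLING_METHOD=mean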
logger.info(f"USE_RAMDISK is set to: {USE_RAMDISK}")
description_string = """
🇨🇭🎖️🦙 Swiss Army Llama is your One-Stop-Shop to Quickly and Conveniently Integrate Powerful Local LLM Functionality into your Project via a REST API.
"""
app = FastAPI(title="Swiss Army Llama", description=description_string, docs_url="/", lifespan=lifespan) # Set the Swagger UI to root
@app.exception_handler(SQLAlchemyError)
async def sqlalchemy_exception_handler(request: Request, exc: SQLAlchemyError) -> JSONResponse:
    logger.exception(exc)
    return JSONResponse(status_code=500, content={"message": "Database error occurred"})

@app.exception_handler(Exception)
async def general_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    logger.exception(exc)
    return JSONResponse(status_code=500, content={"message": "An unexpected error occurred"})
@app.get("/", include_in_schema=False)
async def custom_swagger_ui_html():
return fastapi.templating.get_swagger_ui_html(openapi_url="/openapi.json", title=app.title, swagger_favicon_url=app.swagger_ui_favicon_url)
@app.get("/get_list_of_available_model_names/",
summary="Retrieve Available Model Names",
description="""Retrieve the list of available model names for generating embeddings.
### Parameters:
- `token`: Security token (optional).
### Response:
The response will include a JSON object containing the list of available model names. Note that these are all GGML format models designed to work with llama_cpp.
### Example Response:
```json
{
"model_names": ["Meta-Llama-3-8B-Instruct.Q3_K_S", "Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M", "my_super_custom_model"]
}
```""",
response_description="A JSON object containing the list of available model names.")
async def get_list_of_available_model_names(token: str = None) -> Dict[str, List[str]]:
if USE_SECURITY_TOKEN and (token is None or token != SECURITY_TOKEN):
raise HTTPException(status_code=403, detail="Unauthorized")
models_dir = os.path.join(RAMDISK_PATH, 'models') if USE_RAMDISK else os.path.join(BASE_DIRECTORY, 'models')
logger.info(f"Looking for models in: {models_dir}") # Add this line for debugging
logger.info(f"Directory content: {os.listdir(models_dir)}") # Add this line for debugging
model_files = glob.glob(os.path.join(models_dir, "*.bin")) + glob.glob(os.path.join(models_dir, "*.gguf")) # Find all files with .bin or .gguf extension
model_names = sorted([os.path.splitext(os.path.basename(model_file))[0] for model_file in model_files]) # Remove both extensions, but ignore other periods in the filename
return {"model_names": model_names}
@app.get("/get_list_of_available_bnf_grammars",
response_model=List[str],
summary="Get Available BNF Grammars",
description="Returns a list of all the valid .gbnf files in the grammar_files directory.",
response_description="A list containing the names of all valid .gbnf files.")
async def get_list_of_available_bnf_grammars(token: str = None) -> List[str]:
if USE_SECURITY_TOKEN and (token is None or token != SECURITY_TOKEN):
raise HTTPException(status_code=403, detail="Unauthorized")
try:
grammar_files_dir = 'grammar_files'
if not os.path.exists(grammar_files_dir):
os.makedirs(grammar_files_dir)
valid_files = [f for f in os.listdir(grammar_files_dir) if f.endswith('.gbnf')]
return valid_files
except Exception as e:
raise HTTPException(status_code=500, detail=f"An error occurred: {e}")
@app.get("/get_all_stored_strings/",
summary="Retrieve All Strings",
description="""Retrieve a list of all stored strings from the database for which embeddings have been computed.
### Parameters:
- `token`: Security token (optional).
### Response:
The response will include a JSON object containing the list of all stored strings with computed embeddings.
### Example Response:
```json
{
"strings": ["The quick brown fox jumps over the lazy dog", "To be or not to be", "Hello, World!"]
}
```""",
response_description="A JSON object containing the list of all strings with computed embeddings.")
async def get_all_stored_strings(req: Request, token: str = None) -> AllStringsResponse:
logger.info("Received request to retrieve all stored strings for which embeddings have been computed")
if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
logger.warning(f"Unauthorized request to retrieve all stored strings for which embeddings have been computed from {req.client.host}")
raise HTTPException(status_code=403, detail="Unauthorized")
try:
logger.info("Retrieving all stored strings with computed embeddings from the database")
async with AsyncSessionLocal() as session:
result = await session.execute(sql_text("SELECT DISTINCT text FROM embeddings"))
all_strings = [row[0] for row in result.fetchall()]
logger.info(f"Retrieved {len(all_strings):,} stored strings with computed embeddings from the database; Last 10 embedded strings: {all_strings[-10:]}")
return {"strings": all_strings}
except Exception as e:
logger.error(f"An error occurred while processing the request: {e}")
logger.error(traceback.format_exc()) # Print the traceback
raise HTTPException(status_code=500, detail="Internal Server Error")
@app.get("/get_all_stored_documents/",
summary="Retrieve All Stored Documents",
description="""Retrieve a list of all stored documents from the database for which embeddings have been computed.
### Parameters:
- `token`: Security token (optional).
### Response:
The response will include a JSON object containing the list of all stored documents with computed embeddings.
### Example Response:
```json
{
"documents": ["document1.pdf", "document2.txt", "document3.md", "document4.json"]
}
```""",
response_description="A JSON object containing the list of all documents with computed embeddings.")
async def get_all_stored_documents(req: Request, token: str = None) -> AllDocumentsResponse:
logger.info("Received request to retrieve all stored documents with computed embeddings")
if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
logger.warning(f"Unauthorized request to retrieve all stored documents for which all sentence embeddings have been computed from {req.client.host}")
raise HTTPException(status_code=403, detail="Unauthorized")
try:
logger.info("Retrieving all stored documents with computed embeddings from the database")
async with AsyncSessionLocal() as session:
result = await session.execute(sql_text("SELECT DISTINCT filename FROM document_embeddings"))
all_documents = [row[0] for row in result.fetchall()]
logger.info(f"Retrieved {len(all_documents):,} stored documents with computed embeddings from the database; Last 10 processed document filenames: {all_documents[-10:]}")
return {"documents": all_documents}
except Exception as e:
logger.error(f"An error occurred while processing the request: {e}")
logger.error(traceback.format_exc()) # Print the traceback
raise HTTPException(status_code=500, detail="Internal Server Error")
@app.post("/add_new_model/",
summary="Add New Model by URL",
description="""Submit a new model URL for download and use. The model must satisfy the following criteria:
1. Must be in `.gguf` format.
2. Must be larger than 100 MB to ensure it's a valid model file.
### Parameters:
- `model_url`: The URL of the model weight file, which must end with `.gguf`. For example: `https://huggingface.co/kirp/TinyLlama-1.1B-Chat-v0.2-gguf/blob/main/ggml-model-q5_k_m.gguf`
- `token`: Security token (optional).
### Response:
The response will include a JSON object indicating whether the model was successfully added and downloaded. Possible status values are:
- `success`: Model was added and downloaded successfully.
- `failure`: Model download failed, likely because it's not a valid model file.
- `error`: The URL did not point to a `.gguf` file.
- `unknown`: An unexpected error occurred.
### Example Response:
```json
{
"status": "success",
"message": "Model added and downloaded successfully."
}
```
""",
response_description="A JSON object indicating the status of the model addition and download.")
async def add_new_model(model_url: str, token: str = None) -> Dict[str, Any]:
if USE_SECURITY_TOKEN and (token is None or token != SECURITY_TOKEN):
raise HTTPException(status_code=403, detail="Unauthorized")
unique_id = f"add_model_{hash(model_url)}" # Generate a unique lock ID based on the model_url
lock = await shared_resources.lock_manager.lock(unique_id)
if lock.valid:
try:
decoded_model_url = unquote(model_url)
if not decoded_model_url.endswith('.gguf'):
return {"status": "error", "message": "Model URL must point to a .gguf file."}
corrected_model_url = add_model_url(decoded_model_url)
_, download_status = download_models()
status_dict = {status["url"]: status for status in download_status}
if corrected_model_url in status_dict:
return {"status": status_dict[corrected_model_url]["status"], "message": status_dict[corrected_model_url]["message"]}
return {"status": "unknown", "message": "Unexpected error."}
finally:
await shared_resources.lock_manager.unlock(lock)
else:
return {"status": "already processing", "message": "Another worker is already processing this model URL."}
@app.post("/get_embedding_vector_for_string/",
response_model=EmbeddingResponse,
summary="Retrieve Embedding Vector for a Given Text String",
description="""Retrieve the embedding vector for a given input text string using the specified model.
### Parameters:
- `request`: A JSON object containing the input text string (`text`) and the model name.
- `token`: Security token (optional).
- `document_file_hash`: The SHA3-256 hash of the document file, if applicable (optional).
### Request JSON Format:
The request must contain the following attributes:
- `text`: The input text for which the embedding vector is to be retrieved.
- `llm_model_name`: The model used to calculate the embedding (optional, will use the default model if not provided).
- `embedding_pooling_method`: The method used to pool the embeddings (Choices: 'mean', 'mins_maxes', 'svd', 'svd_first_four', 'ica', 'factor_analysis', 'gaussian_random_projection'; default is 'mean').
### Example (note that `llm_model_name` is optional):
```json
{
"text": "This is a sample text.",
"llm_model_name": "nomic-embed-text-v1.5.Q6_K",
"embedding_pooling_method": "svd",
"corpus_identifier_string": "pastel_related_documentation_corpus"
}
```
### Response:
The response will include the embedding vector for the input text string.
### Example Response:
```json
{
"embedding": [0.1234, 0.5678, ...]
}
```""", response_description="A JSON object containing the embedding vector for the input text.")
async def get_embedding_vector_for_string(request: EmbeddingRequest, req: Request = None, token: str = None, client_ip: str = None, document_file_hash: str = None) -> EmbeddingResponse:
if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
logger.warning(f"Unauthorized request from client IP {client_ip}")
raise HTTPException(status_code=403, detail="Unauthorized")
try:
request.text = prepare_string_for_embedding(request.text)
unique_id = f"get_embedding_{request.text}_{request.llm_model_name}_{request.embedding_pooling_method}"
lock = await shared_resources.lock_manager.lock(unique_id)
if lock.valid:
try:
input_data = {"text": request.text}
context = start_resource_monitoring("get_embedding_vector_for_string", input_data, req.client.host if req else "localhost")
return await get_or_compute_embedding(request, req, client_ip, document_file_hash)
finally:
end_resource_monitoring(context)
await shared_resources.lock_manager.unlock(lock)
else:
return {"status": "already processing"}
except Exception as e:
logger.error(f"An error occurred while processing the request: {e}")
logger.error(traceback.format_exc()) # Print the traceback
raise HTTPException(status_code=500, detail="Internal Server Error")
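# Illustrative client-side sketch for the embedding endpoint (payload mirrors the
# docstring example; BASE_URL is an assumption):
#
#   import requests
#   payload = {"text": "This is a sample text.", "embedding_pooling_method": "mean"}
#   r = requests.post(f"{BASE_URL}/get_embedding_vector_for_string/", json=payload)
#   print(r.json()["embedding"][:5])  # first few components of the vector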
@app.post("/compute_similarity_between_strings/",
response_model=SimilarityResponse,
summary="Compute Similarity Between Two Strings",
description="""Compute the similarity between two given input strings using specified model embeddings and a selected similarity measure.
### Parameters:
- `request`: A JSON object containing the two strings, the model name, and the similarity measure.
- `token`: Security token (optional).
### Request JSON Format:
The request must contain the following attributes:
- `text1`: The first input text.
- `text2`: The second input text.
- `llm_model_name`: The model used to calculate embeddings (optional).
- `similarity_measure`: The similarity measure to be used. Supported measures include `all`, `spearman_rho`, `kendall_tau`, `approximate_distance_correlation`, `jensen_shannon_dependency_measure`, `normalized_mutual_information`, and `hoeffding_d` (optional, default is `all`).
### Example Request (note that `llm_model_name` and `similarity_measure` are optional):
```json
{
"text1": "This is a sample text.",
"text2": "This is another sample text.",
"llm_model_name": "nomic-embed-text-v1.5.Q6_K",
"similarity_measure": "all"
}
```""")
async def compute_similarity_between_strings(request: SimilarityRequest, req: Request, token: str = None) -> SimilarityResponse:
if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
raise HTTPException(status_code=403, detail="Unauthorized")
logger.info(f"Received request: {request}")
request_time = datetime.utcnow()
request.text1 = prepare_string_for_embedding(request.text1)
request.text2 = prepare_string_for_embedding(request.text2)
similarity_measure = request.similarity_measure.lower()
unique_id = f"compute_similarity_{request.text1}_{request.text2}_{request.llm_model_name}_{request.embedding_pooling_method}_{similarity_measure}"
lock = await shared_resources.lock_manager.lock(unique_id)
if lock.valid:
try:
client_ip = req.client.host if req else "localhost"
embedding_request1 = EmbeddingRequest(text=request.text1, llm_model_name=request.llm_model_name, embedding_pooling_method=request.embedding_pooling_method)
embedding_request2 = EmbeddingRequest(text=request.text2, llm_model_name=request.llm_model_name, embedding_pooling_method=request.embedding_pooling_method)
embedding1_response = await get_or_compute_embedding(request=embedding_request1, req=req, client_ip=client_ip, use_verbose=False)
embedding2_response = await get_or_compute_embedding(request=embedding_request2, req=req, client_ip=client_ip, use_verbose=False)
embedding1 = np.array(embedding1_response["embedding"])
embedding2 = np.array(embedding2_response["embedding"])
if embedding1.size == 0 or embedding2.size == 0:
raise HTTPException(status_code=400, detail="Could not calculate embeddings for the given texts")
params = {
"vector_1": embedding1.tolist(),
"vector_2": embedding2.tolist(),
"similarity_measure": similarity_measure
}
similarity_stats_str = fvs.py_compute_vector_similarity_stats(json.dumps(params))
similarity_stats_json = json.loads(similarity_stats_str)
if similarity_measure == 'all':
similarity_score = similarity_stats_json
else:
similarity_score = similarity_stats_json.get(similarity_measure, None)
if similarity_score is None:
raise HTTPException(status_code=400, detail="Invalid similarity measure specified")
response_time = datetime.utcnow()
total_time = (response_time - request_time).total_seconds()
logger.info(f"Computed similarity using {similarity_measure} in {total_time:,.2f} seconds; similarity score: {similarity_score}")
return {
"text1": request.text1,
"text2": request.text2,
"similarity_measure": similarity_measure,
"similarity_score": similarity_score,
"embedding1": embedding1.tolist(),
"embedding2": embedding2.tolist()
}
except Exception as e:
logger.error(f"An error occurred while processing the request: {e}")
raise HTTPException(status_code=500, detail="Internal Server Error")
finally:
await shared_resources.lock_manager.unlock(lock)
else:
return {"status": "already processing"}
@app.post("/search_stored_embeddings_with_query_string_for_semantic_similarity/",
response_model=SemanticSearchResponse,
summary="Get Most Similar Strings from Stored Embeddings in Database",
description="""Find the most similar strings in the database to the given input "query" text. This endpoint uses a pre-computed FAISS index to quickly search for the closest matching strings.
### Parameters:
- `request`: A JSON object containing the query text, model name, an optional corpus identifier string to restrict the search to a specific corpus, and an optional number of most semantically similar strings to return.
- `req`: HTTP request object (internal use).
- `token`: Security token (optional).
### Request JSON Format:
The request must contain the following attributes:
- `query_text`: The input text for which to find the most similar string.
- `llm_model_name`: The model used to calculate embeddings.
- `embedding_pooling_method`: The method used to pool the embeddings (Choices: 'mean', 'mins_maxes', 'svd', 'svd_first_four', 'ica', 'factor_analysis', 'gaussian_random_projection'; default is 'mean').
- `corpus_identifier_string`: An optional string identifier to restrict the search to a specific corpus.
- `number_of_most_similar_strings_to_return`: (Optional) The number of most similar strings to return, defaults to 10.
### Example:
```json
{
"query_text": "Find me the most similar string!",
"llm_model_name": "nomic-embed-text-v1.5.Q6_K",
"corpus_identifier_string": "pastel_related_documentation_corpus",
"embedding_pooling_method": "mean",
"number_of_most_similar_strings_to_return": 5
}
```
### Response:
The response will include the most similar strings found in the database, along with the similarity scores and the corpus identifier string used for the search.
### Example Response:
```json
{
"query_text": "Find me the most similar string!",
"results": [
{"search_result_text": "This is the most similar string!", "similarity_to_query_text": 0.9823},
{"search_result_text": "Another similar string.", "similarity_to_query_text": 0.9721},
...
]
}
```""",
response_description="A JSON object containing the query text along with the most similar strings and similarity scores.")
async def search_stored_embeddings_with_query_string_for_semantic_similarity(request: SemanticSearchRequest, req: Request, token: str = None) -> SemanticSearchResponse:
if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
raise HTTPException(status_code=403, detail="Unauthorized")
global faiss_indexes, associated_texts_by_model_and_pooling_method
request_time = datetime.utcnow()
request.query_text = prepare_string_for_embedding(request.query_text)
unique_id = f"semantic_search_{request.query_text}_{request.llm_model_name}_{request.embedding_pooling_method}_{request.corpus_identifier_string}_{request.number_of_most_similar_strings_to_return}" # Unique ID for this operation
lock = await shared_resources.lock_manager.lock(unique_id)
if lock.valid:
try:
faiss_indexes, associated_texts_by_model_and_pooling_method = await build_faiss_indexes(force_rebuild=True)
try:
faiss_index = faiss_indexes[(request.llm_model_name, request.embedding_pooling_method)]
except KeyError:
raise HTTPException(status_code=400, detail=f"No FAISS index found for model: {request.llm_model_name} and pooling method: {request.embedding_pooling_method}")
llm_model_name = request.llm_model_name
embedding_pooling_method = request.embedding_pooling_method
num_results = request.number_of_most_similar_strings_to_return
num_results_before_corpus_filter = num_results*25
total_entries = len(associated_texts_by_model_and_pooling_method[llm_model_name][embedding_pooling_method]) # Get the total number of entries for the model and pooling method
num_results = min(num_results, total_entries) # Ensure num_results doesn't exceed the total number of entries
num_results_before_corpus_filter = min(num_results_before_corpus_filter, total_entries) # Ensure num_results_before_corpus_filter doesn't exceed the total number of entries
logger.info(f"Received request to find {num_results:,} most similar strings for query text: `{request.query_text}` using model: {llm_model_name}, pooling method: {embedding_pooling_method}, and corpus: {request.corpus_identifier_string}")
try:
logger.info(f"Computing embedding for input text: {request.query_text}")
embedding_request = EmbeddingRequest(text=request.query_text, llm_model_name=request.llm_model_name, embedding_pooling_method=request.embedding_pooling_method, corpus_identifier_string=request.corpus_identifier_string)
embedding_response = await get_or_compute_embedding(embedding_request, req)
embedding_json = embedding_response["text_embedding_dict"]["embedding_json"]
embedding_vector = json.loads(embedding_json)
input_embedding = np.array(embedding_vector).astype('float32').reshape(1, -1)
faiss.normalize_L2(input_embedding) # Normalize the input vector for cosine similarity
results = [] # Create an empty list to store the results
faiss_index = faiss_indexes[(llm_model_name, embedding_pooling_method)]
associated_texts = associated_texts_by_model_and_pooling_method[llm_model_name][embedding_pooling_method]
list_of_corpus_identifier_strings = await get_list_of_corpus_identifiers_from_list_of_embedding_texts(associated_texts, llm_model_name, embedding_pooling_method)
logger.info(f"Searching for the most similar string in the FAISS index using {embedding_pooling_method} embeddings")
if faiss_index is None:
raise HTTPException(status_code=400, detail=f"No FAISS index found for model: {llm_model_name} and pooling method: {embedding_pooling_method}")
similarities, indices = faiss_index.search(input_embedding.reshape(1, -1), num_results_before_corpus_filter) # Search for num_results similar strings
for ii in range(num_results_before_corpus_filter):
index = indices[0][ii]
if index < len(associated_texts):
similarity = float(similarities[0][ii]) # Convert numpy.float32 to native float
most_similar_text = associated_texts[index]
corpus_identifier_string = list_of_corpus_identifier_strings[index]
if (corpus_identifier_string == request.corpus_identifier_string) and (most_similar_text != request.query_text) and (len(results) <= num_results):
results.append({"search_result_text": most_similar_text, "similarity_to_query_text": similarity})
else:
logger.warning(f"Index {index} out of range for model {llm_model_name} and pooling method {embedding_pooling_method}")
response_time = datetime.utcnow()
total_time = (response_time - request_time).total_seconds()
logger.info(f"Finished searching for the most similar string in the FAISS index in {total_time:,.2f} seconds. Found {len(results):,} results, returning the top {num_results:,}.")
logger.info(f"Found most similar strings for query string {request.query_text}: {results}")
if len(results) == 0:
logger.info(f"No results found for query string {request.query_text}.")
raise HTTPException(status_code=400, detail=f"No results found for query string {request.query_text} and model {llm_model_name} and pooling method {embedding_pooling_method} and corpus {request.corpus_identifier_string}.")
return {"query_text": request.query_text, "corpus_identifier_string": request.corpus_identifier_string, "embedding_pooling_method": embedding_pooling_method, "results": results} # Return the response matching the SemanticSearchResponse model
except Exception as e:
logger.error(f"An error occurred while processing the request: {e}")
logger.error(traceback.format_exc()) # Print the traceback
raise HTTPException(status_code=500, detail="Internal Server Error")
finally:
await shared_resources.lock_manager.unlock(lock)
else:
return {"status": "already processing"}
@app.post("/advanced_search_stored_embeddings_with_query_string_for_semantic_similarity/",
response_model=AdvancedSemanticSearchResponse,
summary="Advanced Semantic Search with Two-Step Similarity Measures",
description="""Perform an advanced semantic search by first using FAISS and cosine similarity to narrow down the most similar strings in the database, and then applying additional similarity measures for finer comparison.
### Parameters:
- `request`: A JSON object containing the query text, model name, an optional corpus identifier string to restrict the search to a specific corpus, an optional similarity filter percentage, and an optional number of most similar strings to return.
- `req`: HTTP request object (internal use).
- `token`: Security token (optional).
### Request JSON Format:
The request must contain the following attributes:
- `query_text`: The input text for which to find the most similar string.
- `llm_model_name`: The model used to calculate embeddings.
- `embedding_pooling_method`: The method used to pool the embeddings (Choices: 'mean', 'mins_maxes', 'svd', 'svd_first_four', 'ica', 'factor_analysis', 'gaussian_random_projection'; default is 'mean').
- `corpus_identifier_string`: An optional string identifier to restrict the search to a specific corpus.
- `similarity_filter_percentage`: (Optional) The percentage of embeddings to filter based on cosine similarity, defaults to 0.02 (i.e., top 2%).
- `number_of_most_similar_strings_to_return`: (Optional) The number of most similar strings to return after applying the second similarity measure, defaults to 10.
- `result_sorting_metric`: (Optional) The metric to sort the results by, defaults to 'hoeffding_d'. Choices: 'hoeffding_d', 'cosine_similarity', 'spearman_rho', 'kendall_tau', 'approximate_distance_correlation', 'jensen_shannon_dependency_measure', 'hamming_distance'.
### Example:
```json
{
"query_text": "Find me the most similar string!",
"llm_model_name": "nomic-embed-text-v1.5.Q6_K",
"embedding_pooling_method": "mean",
"corpus_identifier_string": "specific_corpus"
"similarity_filter_percentage": 0.02,
"number_of_most_similar_strings_to_return": 5,
"result_sorting_metric": "hoeffding_d"
}
```
### Response:
The response will include the most similar strings found in the database, along with their similarity scores for multiple measures.
### Example Response:
```json
{
"query_text": "Find me the most similar string!",
"results": [
{"search_result_text": "This is the most similar string!", "similarity_to_query_text": {"cosine_similarity": 0.9823, "spearman_rho": 0.8, ... }},
{"search_result_text": "Another similar string.", "similarity_to_query_text": {"cosine_similarity": 0.9721, "spearman_rho": 0.75, ... }},
...
]
}
```""", response_description="A JSON object containing the query text and the most similar strings, along with their similarity scores for multiple measures.")
async def advanced_search_stored_embeddings_with_query_string_for_semantic_similarity(request: AdvancedSemanticSearchRequest, req: Request, token: str = None) -> AdvancedSemanticSearchResponse:
if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
raise HTTPException(status_code=403, detail="Unauthorized")
global faiss_indexes, associated_texts_by_model_and_pooling_method
request_time = datetime.utcnow()
request.query_text = prepare_string_for_embedding(request.query_text)
unique_id = f"advanced_semantic_search_{request.query_text}_{request.llm_model_name}_{request.embedding_pooling_method}_{request.similarity_filter_percentage}_{request.number_of_most_similar_strings_to_return}"
lock = await shared_resources.lock_manager.lock(unique_id)
if lock.valid:
try:
context = start_resource_monitoring("advanced_search_stored_embeddings_with_query_string_for_semantic_similarity", request.dict(), req.client.host if req else "localhost")
faiss_indexes, associated_texts_by_model_and_pooling_method = await build_faiss_indexes(force_rebuild=True)
try:
faiss_index = faiss_indexes[(request.llm_model_name, request.embedding_pooling_method)]
except KeyError:
raise HTTPException(status_code=400, detail=f"No FAISS index found for model: {request.llm_model_name} and pooling method: {request.embedding_pooling_method}")
llm_model_name = request.llm_model_name
embedding_pooling_method = request.embedding_pooling_method
num_results_before_corpus_filter = request.number_of_most_similar_strings_to_return*25
logger.info(f"Received request to find most similar strings for query text: `{request.query_text}` using model: {llm_model_name}")
try:
logger.info(f"Computing embedding for input text: {request.query_text}")
embedding_request = EmbeddingRequest(text=request.query_text, llm_model_name=llm_model_name, embedding_pooling_method=embedding_pooling_method)
embedding_response = await get_or_compute_embedding(embedding_request, req)
embedding_json = embedding_response["text_embedding_dict"]["embedding_json"]
embedding_vector = json.loads(embedding_json)
input_embedding = np.array(embedding_vector).astype('float32').reshape(1, -1)
faiss.normalize_L2(input_embedding)
logger.info(f"Computed embedding for input text: {request.query_text}")
final_results = []
faiss_index = faiss_indexes[(llm_model_name, embedding_pooling_method)]
if faiss_index is None:
raise HTTPException(status_code=400, detail=f"No FAISS index found for model: {llm_model_name} and pooling method: {embedding_pooling_method}")
num_results = max([1, int((1 - request.similarity_filter_percentage) * len(associated_texts_by_model_and_pooling_method[llm_model_name][embedding_pooling_method]))])
num_results_before_corpus_filter = min(num_results_before_corpus_filter, len(associated_texts_by_model_and_pooling_method[llm_model_name][embedding_pooling_method]))
similarities, indices = faiss_index.search(input_embedding, num_results_before_corpus_filter)
filtered_indices = indices[0]
filtered_similarities = similarities[0]
similarity_results = []
associated_texts = associated_texts_by_model_and_pooling_method[llm_model_name][embedding_pooling_method]
list_of_corpus_identifier_strings = await get_list_of_corpus_identifiers_from_list_of_embedding_texts(associated_texts, llm_model_name, embedding_pooling_method)
for idx, similarity in zip(filtered_indices, filtered_similarities):
if idx < len(associated_texts) and list_of_corpus_identifier_strings[idx] == request.corpus_identifier_string:
associated_text = associated_texts[idx]
similarity_results.append((similarity, associated_text))
similarity_results = sorted(similarity_results, key=lambda x: x[0], reverse=True)[:num_results]
for _, associated_text in similarity_results:
embedding_request = EmbeddingRequest(text=associated_text, llm_model_name=llm_model_name, embedding_pooling_method=embedding_pooling_method)
embedding_response = await get_or_compute_embedding(request=embedding_request, req=req, use_verbose=False)
embedding_json = embedding_response["text_embedding_dict"]["embedding_json"]
embedding_vector = json.loads(embedding_json)
comparison__embedding = np.array(embedding_vector).astype('float32').reshape(1, -1)
params = {
"vector_1": input_embedding.tolist()[0],
"vector_2": comparison__embedding.tolist()[0],
"similarity_measure": "all"
}
similarity_stats_str = fvs.py_compute_vector_similarity_stats(json.dumps(params))
similarity_stats_json = json.loads(similarity_stats_str)
final_results.append({
"search_result_text": associated_text,
"similarity_to_query_text": similarity_stats_json
})
num_to_return = request.number_of_most_similar_strings_to_return if request.number_of_most_similar_strings_to_return is not None else len(final_results)
results = sorted(final_results, key=lambda x: x["similarity_to_query_text"][request.result_sorting_metric], reverse=True)[:num_to_return]
response_time = datetime.utcnow()
total_time = (response_time - request_time).total_seconds()
logger.info(f"Finished advanced search in {total_time} seconds. Found {len(results)} results.")
return {"query_text": request.query_text, "corpus_identifier_string": request.corpus_identifier_string, "embedding_pooling_method": request.embedding_pooling_method, "results": results}
except Exception as e:
logger.error(f"An error occurred while processing the request: {e}")
traceback.print_exc()
raise HTTPException(status_code=500, detail="Internal Server Error")
finally:
await shared_resources.lock_manager.unlock(lock)
end_resource_monitoring(context)
else:
return {"status": "already processing"}
@app.post("/get_all_embedding_vectors_for_document/",
summary="Get Embeddings for a Document",
description="""Extract text embeddings for a document. This endpoint supports plain text, .doc/.docx (MS Word), PDF files, images (using Tesseract OCR), and many other file types supported by the textract library.
### Parameters:
- `file`: The uploaded document file (either plain text, .doc/.docx, PDF, etc.).
- `url`: URL of the document file to download (optional; in lieu of `file`).
- `hash`: SHA3-256 hash of the document file to verify integrity (optional; in lieu of `file`).
- `size`: Size of the document file in bytes to verify completeness (optional; in lieu of `file`).
- `llm_model_name`: The model used to calculate embeddings (optional).
- `embedding_pooling_method`: The method used to pool the embeddings (Choices: 'mean', 'mins_maxes', 'svd', 'svd_first_four', 'ica', 'factor_analysis', 'gaussian_random_projection'; default is 'mean').
- `corpus_identifier_string`: An optional string identifier for grouping documents into a specific corpus.
- `json_format`: The format of the JSON response (optional, see details below).
- `send_back_json_or_zip_file`: Whether to return a JSON file or a ZIP file containing the embeddings file (optional, defaults to `zip`).
- `query_text`: An optional query text to perform a semantic search with the same parameters used for the document embedding request.
- `token`: Security token (optional).
### JSON Format Options:
The format of the JSON string returned by the endpoint (default is `records`; these are the options supported by the Pandas `to_json()` function):
- `split` : dict like {`index` -> [index], `columns` -> [columns], `data` -> [values]}
- `records` : list like [{column -> value}, … , {column -> value}]
- `index` : dict like {index -> {column -> value}}
- `columns` : dict like {column -> {index -> value}}
- `values` : just the values array
- `table` : dict like {`schema`: {schema}, `data`: {data}}
### Examples:
- Plain Text: Submit a file containing plain text.
- MS Word: Submit a `.doc` or `.docx` file.
- PDF: Submit a `.pdf` file.""",
response_description="Either a ZIP file containing the embeddings JSON file or a direct JSON response, depending on the value of `send_back_json_or_zip_file`.")
async def get_all_embedding_vectors_for_document(
    file: UploadFile = File(None),
    url: str = Form(None),
    hash: str = Form(None),
    size: int = Form(None),
    llm_model_name: str = DEFAULT_EMBEDDING_MODEL_NAME,
    embedding_pooling_method: str = DEFAULT_EMBEDDING_POOLING_METHOD,
    corpus_identifier_string: str = "",
    json_format: str = 'records',
    send_back_json_or_zip_file: str = 'zip',
    query_text: str = None,
    token: str = None,
    req: Request = None
):
    logger.info(f"Received request with embedding_pooling_method: {embedding_pooling_method}")
    if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
        raise HTTPException(status_code=403, detail="Unauthorized")
    client_ip = req.client.host if req else "localhost"
    request_time = datetime.utcnow()
    if file:
        input_data_binary = await file.read()
        result = magika.identify_bytes(input_data_binary)
        detected_data_type = result.output.ct_label
        temp_file = tempfile.NamedTemporaryFile(suffix=f".{detected_data_type}", delete=False)
        temp_file_path = temp_file.name
        logger.info(f"Temp file path: {temp_file_path}")
        with open(temp_file_path, 'wb') as buffer:
            buffer.write(input_data_binary)
    elif url and hash and size:
        temp_file_path = await download_file(url, size, hash)
        with open(temp_file_path, 'rb') as f:
            input_data_binary = f.read()
        result = magika.identify_bytes(input_data_binary)
        detected_data_type = result.output.ct_label
        new_temp_file_path = temp_file_path + f".{detected_data_type}"
        os.rename(temp_file_path, new_temp_file_path)
        temp_file_path = new_temp_file_path
    else:
        raise HTTPException(status_code=400, detail="Invalid input. Provide either a file or URL with hash and size.")
    try:
        hash_obj = sha3_256()
        with open(temp_file_path, 'rb') as buffer:
            for chunk in iter(lambda: buffer.read(1024), b''):
                hash_obj.update(chunk)
        document_file_hash = hash_obj.hexdigest()
        logger.info(f"SHA3-256 hash of submitted file: {document_file_hash}")
        if corpus_identifier_string == "":
            corpus_identifier_string = document_file_hash
        unique_id = f"document_embedding_{document_file_hash}_{llm_model_name}_{embedding_pooling_method}"
        # Exponential backoff with jitter for acquiring the lock
        lock = None
        max_retries = 5
        for attempt in range(max_retries):
            try:
                lock = await shared_resources.lock_manager.lock(unique_id)
                if lock.valid:
                    break
            except Exception as e:
                wait_time = (2 ** attempt) + (random.randint(0, 1000) / 1000)
                logger.warning(f"Attempt {attempt + 1}: Failed to acquire lock: {e}. Retrying in {wait_time:,.2f} seconds.")
                await asyncio.sleep(wait_time)  # Wait before retrying
        if not lock or not lock.valid:
            raise HTTPException(status_code=503, detail="Service temporarily unavailable. Please try again later.")
        try:
            async with AsyncSessionLocal() as session:
                result = await session.execute(select(DocumentEmbedding).filter(DocumentEmbedding.document_file_hash == document_file_hash, DocumentEmbedding.llm_model_name == llm_model_name, DocumentEmbedding.embedding_pooling_method == embedding_pooling_method))
                existing_document_embedding = result.scalar_one_or_none()
                if existing_document_embedding:
                    logger.info("Document has been processed before, returning existing result")
                    sentences = existing_document_embedding.sentences
                    document_embedding_results_json_compressed_binary = existing_document_embedding.document_embedding_results_json_compressed_binary
                    document_embedding_results_json_decompressed_binary = decompress_data(document_embedding_results_json_compressed_binary)
                    json_content = document_embedding_results_json_decompressed_binary.decode('utf-8')
                    if len(json_content) == 0:
                        raise HTTPException(status_code=400, detail="Could not retrieve document embedding results.")
                    existing_document = 1
                    document_embedding_request = {}
                else:
                    document_embedding_request = {}
                    existing_document = 0
                    with open(temp_file_path, 'rb') as f:
                        input_data_binary = f.read()
                    result = magika.identify_bytes(input_data_binary)
                    mime_type = result.output.mime_type
                    sentences, thousands_of_input_words = await parse_submitted_document_file_into_sentence_strings_func(temp_file_path, mime_type)
                    document_embedding_request['mime_type'] = mime_type
                    document_embedding_request['sentences'] = sentences
                    document_embedding_request['total_number_of_sentences'] = len(sentences)
                    document_embedding_request['total_words'] = sum(len(sentence.split()) for sentence in sentences)
                    document_embedding_request['total_characters'] = sum(len(sentence) for sentence in sentences)
                    document_embedding_request['thousands_of_input_words'] = thousands_of_input_words
                    document_embedding_request['file_size_mb'] = os.path.getsize(temp_file_path) / (1024 * 1024)
                    document_embedding_request['corpus_identifier_string'] = corpus_identifier_string
                    document_embedding_request['embedding_pooling_method'] = embedding_pooling_method
                    document_embedding_request['llm_model_name'] = llm_model_name
                    document_embedding_request['document_file_hash'] = document_file_hash
                    if thousands_of_input_words > MAX_THOUSANDS_OF_WORDS_FOR_DOCUMENT_EMBEDDING:
                        raise HTTPException(status_code=400, detail=f"Document contains ~{int(thousands_of_input_words*1000):,} words, more than the maximum of {MAX_THOUSANDS_OF_WORDS_FOR_DOCUMENT_EMBEDDING*1000:,} words, which would take too long to compute embeddings for. Please submit a smaller document.")
                    first_10_words_of_input_text = ' '.join(' '.join(sentences).split()[:10])
                    logger.info(f"Received request to extract embeddings for document with MIME type: {mime_type} and size: {os.path.getsize(temp_file_path):,} bytes from IP address: {client_ip}; First 10 words of the document: '{first_10_words_of_input_text}...'")
                    logger.info(f"Document contains ~{int(thousands_of_input_words*1000):,} words, which is within the maximum of {MAX_THOUSANDS_OF_WORDS_FOR_DOCUMENT_EMBEDDING*1000:,} words. Proceeding with embedding computation using {llm_model_name} and pooling method {embedding_pooling_method}.")
                    input_data = {
                        "sentences": sentences,
                        "file_size_mb": os.path.getsize(temp_file_path) / (1024 * 1024),
                        "mime_type": mime_type
                    }
                    context = start_resource_monitoring("get_all_embedding_vectors_for_document", input_data, client_ip)
                    try:
                        json_content = await compute_embeddings_for_document(sentences=sentences, llm_model_name=llm_model_name, embedding_pooling_method=embedding_pooling_method, corpus_identifier_string=corpus_identifier_string, client_ip=client_ip, document_file_hash=document_file_hash, file=file, original_file_content=input_data_binary, json_format=json_format)
                        logger.info(f"Done getting all regular embeddings for document containing {len(sentences):,} sentences with model {llm_model_name} and embedding pooling method {embedding_pooling_method} and corpus {corpus_identifier_string}")
                    except Exception as e:
                        logger.error(f"Error while computing embeddings for document: {e}")
                        traceback.print_exc()
                        raise HTTPException(status_code=400, detail="Error while computing embeddings for document")
                    finally:
                        end_resource_monitoring(context)
            if query_text:
                use_advanced_semantic_search = 0
                if use_advanced_semantic_search:
                    search_request = AdvancedSemanticSearchRequest(
                        query_text=query_text,
                        llm_model_name=llm_model_name,
                        embedding_pooling_method=embedding_pooling_method,
                        corpus_identifier_string=corpus_identifier_string,
                        similarity_filter_percentage=0.01,
                        result_sorting_metric="hoeffding_d",
                        number_of_most_similar_strings_to_return=10
                    )
                    logger.info(f"Performing advanced semantic search for model {llm_model_name} and pooling method {embedding_pooling_method}...")
                    search_response = await advanced_search_stored_embeddings_with_query_string_for_semantic_similarity(search_request, req, token)
                    search_results = search_response["results"]
                else:
                    search_request = SemanticSearchRequest(
                        query_text=query_text,
                        llm_model_name=llm_model_name,
                        embedding_pooling_method=embedding_pooling_method,
                        corpus_identifier_string=corpus_identifier_string,
                        number_of_most_similar_strings_to_return=10
                    )
                    logger.info(f"Performing semantic search for model {llm_model_name} and pooling method {embedding_pooling_method}...")
                    search_response = await search_stored_embeddings_with_query_string_for_semantic_similarity(search_request, req, token)
                    search_results = search_response["results"]
                logger.info(f"Semantic search completed. Results for query text '{query_text}':\n{search_results}")
                json_content_dict = {"document_embedding_request": document_embedding_request, "document_embedding_results": json.loads(json_content), "semantic_search_request": dict(search_request), "semantic_search_results": search_results}
                json_content = json.dumps(json_content_dict)
            else:
                json_content_dict = {"document_embedding_request": document_embedding_request, "document_embedding_results": json.loads(json_content)}
                json_content = json.dumps(json_content_dict)
            overall_total_time = (datetime.utcnow() - request_time).total_seconds()
            json_content_length = len(json_content)
            if json_content_length > 0:
                if not existing_document:
                    logger.info(f"The response took {overall_total_time:,.2f} seconds to generate, or {float(overall_total_time / (thousands_of_input_words)):,.2f} seconds per thousand input tokens and {overall_total_time / (float(json_content_length) / 1000000.0):,.2f} seconds per million output characters.")
                if send_back_json_or_zip_file == 'json':
                    logger.info(f"Returning JSON response for document containing {len(sentences):,} sentences with model {llm_model_name}; first 100 characters out of {json_content_length:,} total of JSON response: {json_content[:100]}" if 'sentences' in locals() else f"Returning JSON response; first 100 characters out of {json_content_length:,} total of JSON response: {json_content[:100]}")
                    return JSONResponse(content=json.loads(json_content))
                else:
                    original_filename_without_extension, _ = os.path.splitext(file.filename if file else os.path.basename(url))
                    json_file_path = f"/tmp/{original_filename_without_extension}.json"
                    with open(json_file_path, 'w') as json_file:
                        json_file.write(json_content)
                    zip_file_path = f"/tmp/{original_filename_without_extension}.zip"
                    with zipfile.ZipFile(zip_file_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
                        zipf.write(json_file_path, os.path.basename(json_file_path))
                    logger.info(f"Returning ZIP response for document containing {len(sentences):,} sentences with model {llm_model_name}; first 100 characters out of {json_content_length:,} total of JSON response: {json_content[:100]}")
                    return FileResponse(zip_file_path, headers={"Content-Disposition": f"attachment; filename={original_filename_without_extension}.zip"})
        finally:
            await shared_resources.lock_manager.unlock(lock)
    except Exception as e:
        logger.error(f"Error in processing: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail="An error occurred while processing the request.")
@app.post("/get_text_completions_from_input_prompt/",
response_model=List[TextCompletionResponse],
summary="Generate Text Completions for a Given Input Prompt",
description="""Generate text completions for a given input prompt string using the specified model.
### Parameters:
- `request`: A JSON object containing the input prompt string (`input_prompt`), the model name, an optional grammar file, an optional number of tokens to generate, and an optional number of completions to generate.
- `token`: Security token (optional).
### Request JSON Format:
The request must contain the following attributes:
- `input_prompt`: The input prompt from which to generate a completion with the LLM model.
- `llm_model_name`: The model used to calculate the embedding (optional, will use the default model if not provided).
- `temperature`: The temperature to use for text generation (optional, defaults to 0.7).
- `grammar_file_string`: The grammar file used to restrict text generation (optional; default is to not use any grammar file). Examples: `json`, `list`)
- `number_of_completions_to_generate`: The number of completions to generate (optional, defaults to 1).
- `number_of_tokens_to_generate`: The number of tokens to generate (optional, defaults to 1000).
### Example (note that `llm_model_name` is optional):
```json
{
"input_prompt": "The Kings of France in the 17th Century:",
"llm_model_name": "Meta-Llama-3-8B-Instruct.Q3_K_S",
"temperature": 0.95,
"grammar_file_string": "json",
"number_of_tokens_to_generate": 500,
"number_of_completions_to_generate": 3
}
```
### Response:
The response will include the generated text completion, the time taken to compute the generation in seconds, and the request details (input prompt, model name, grammar file, and number of tokens to generate).
### Example Response:
```json
[
{
"input_prompt": "The Kings of France in the 17th Century:",
"llm_model_name": "Meta-Llama-3-8B-Instruct.Q3_K_S",
"grammar_file_string": "json",
"number_of_tokens_to_generate": 500,
"number_of_completions_to_generate": 3,
"time_taken_in_seconds": 67.17,
"generated_text": "{\"kings\":[\\n {\\n \"name\": \"Henry IV\",\\n \"reign_start\": 1589,\\n \"reign_end\": 1610\\n },\\n {\\n \"name\": \"Louis XIII\",\\n \"reign_start\": 1610,\\n \"reign_end\": 1643\\n },\\n {\\n \"name\": \"Louis XIV\",\\n \"reign_start\": 1643,\\n \"reign_end\": 1715\\n },\\n {\\n \"name\": \"Louis XV\",\\n \"reign_start\": 1715,\\n \"reign_end\": 1774\\n },\\n {\\n \"name\": \"Louis XVI\",\\n \"reign_start\": 1774,\\n \"reign_end\": 1792\\n }\\n]}",
"finish_reason": "stop",
"llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 218, \"total_tokens\": 231}"
},
{
"input_prompt": "The Kings of France in the 17th Century:",
"llm_model_name": "Meta-Llama-3-8B-Instruct.Q3_K_S",
"grammar_file_string": "json",
"number_of_tokens_to_generate": 500,
"number_of_completions_to_generate": 3,
"time_taken_in_seconds": 67.17,
"generated_text": "{\"kings\":\\n [ {\"name\": \"Henry IV\",\\n \"reignStart\": \"1589\",\\n \"reignEnd\": \"1610\"},\\n {\"name\": \"Louis XIII\",\\n \"reignStart\": \"1610\",\\n \"reignEnd\": \"1643\"},\\n {\"name\": \"Louis XIV\",\\n \"reignStart\": \"1643\",\\n \"reignEnd\": \"1715\"}\\n ]}",
"finish_reason": "stop",
"llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 115, \"total_tokens\": 128}"
},
{
"input_prompt": "The Kings of France in the 17th Century:",
"llm_model_name": "Meta-Llama-3-8B-Instruct.Q3_K_S",
"grammar_file_string": "json",
"number_of_tokens_to_generate": 500,
"number_of_completions_to_generate": 3,
"time_taken_in_seconds": 67.17,
"generated_text": "{\\n\"Henri IV\": \"1589-1610\",\\n\"Louis XIII\": \"1610-1643\",\\n\"Louis XIV\": \"1643-1715\",\\n\"Louis XV\": \"1715-1774\",\\n\"Louis XVI\": \"1774-1792\",\\n\"Louis XVIII\": \"1814-1824\",\\n\"Charles X\": \"1824-1830\",\\n\"Louis XIX (previously known as Charles X): \" \\n : \"1824-1830\",\\n\"Charles X (previously known as Louis XIX)\": \"1824-1830\"}",
"finish_reason": "stop",
"llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 168, \"total_tokens\": 181}"
}
]
```""", response_description="A JSON object containing the the generated text completion of the input prompt and the request details.")
async def get_text_completions_from_input_prompt(request: TextCompletionRequest, req: Request = None, token: str = None, client_ip: str = None) -> List[TextCompletionResponse]:
    if USE_SECURITY_TOKEN and use_hardcoded_security_token and (token is None or token != SECURITY_TOKEN):
        logger.warning(f"Unauthorized request from client IP {client_ip}")
        raise HTTPException(status_code=403, detail="Unauthorized")
    context = start_resource_monitoring("get_text_completions_from_input_prompt", request.dict(), client_ip)
    try:
        unique_id = f"text_completion_{hash(request.input_prompt)}_{request.llm_model_name}"
        lock = await shared_resources.lock_manager.lock(unique_id)
        if lock.valid:
            try:
                response = await generate_completion_from_llm(request, req, client_ip)
                return response
            finally:
                await shared_resources.lock_manager.unlock(lock)
        else:
            return {"status": "already processing"}
    except Exception as e:
        logger.error(f"An error occurred while processing the request: {e}")
        logger.error(traceback.format_exc())  # Print the traceback
        raise HTTPException(status_code=500, detail="Internal Server Error")
    finally:
        end_resource_monitoring(context)
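# Illustrative client-side sketch (payload mirrors the docstring example; BASE_URL
# is an assumption):
#
#   import requests
#   payload = {
#       "input_prompt": "The Kings of France in the 17th Century:",
#       "temperature": 0.95,
#       "grammar_file_string": "json",
#       "number_of_tokens_to_generate": 500,
#       "number_of_completions_to_generate": 3,
#   }
#   r = requests.post(f"{BASE_URL}/get_text_completions_from_input_prompt/", json=payload)
#   for completion in r.json():
#       print(completion["generated_text"])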
@app.post("/ask_question_about_image/",
response_model=List[ImageQuestionResponse],
summary="Ask a Question About an Image",
description="""Ask a question about an image using a specified LLaVA model.
### Parameters:
- `image`: The image file to ask a question about. Can be any common image format, such as PNG, JPEG, or GIF (it will be automatically converted to a PNG file for processing).
- `question`: The question to ask about the image.
- `llm_model_name`: The model used to answer the question (must include 'llava').
- `temperature`: The temperature to use for text generation (optional, defaults to 0.7).
- `number_of_tokens_to_generate`: The number of tokens to generate (optional, defaults to 256).
- `number_of_completions_to_generate`: The number of completions to generate (optional, defaults to 1).
- `token`: Security token (optional).
### Example Request:
Submit a file and a JSON request for processing.
### Example Response:
```json
[
{
"question": "What is happening in this image?",
"llm_model_name": "llava-llama-3-8b-v1_1-int4",
"image_hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
"time_taken_in_seconds": 12.34,
"number_of_tokens_to_generate": 256,
"number_of_completions_to_generate": 1,
"generated_text": "The image shows a sunset over a mountain range.",
"finish_reason": "stop",
"llm_model_usage_json": "{\"prompt_tokens\": 13, \"completion_tokens\": 218, \"total_tokens\": 231}"
}
]
""",
response_description="A JSON object containing the generated answer(s) to the question about the image and the request details.")
async def ask_question_about_image_endpoint(