diff --git a/.gitignore b/.gitignore
index 738f01d..1da5499 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,6 @@
 .env
 api.env
 *.env
+
+# Python
+__pycache__
diff --git a/Makefile b/Makefile
index d44679e..7d1e4eb 100644
--- a/Makefile
+++ b/Makefile
@@ -43,5 +43,5 @@ restart:docker-compose.yaml
 
 # No internal docker connection to TDS
 .PHONY:up-prod
 up-prod:docker-compose.yaml
-	$(DOCKER_COMPOSE) compose up -f docker-compose.prod.yaml -d
+	$(DOCKER_COMPOSE) compose -f docker-compose.prod.yaml up -d
diff --git a/api.env.sample b/api.env.sample
index ce76db5..a74b151 100644
--- a/api.env.sample
+++ b/api.env.sample
@@ -1,7 +1,6 @@
 REDIS_HOST=redis
 REDIS_PORT=6379
-TDS_URL=http://data-service-api:8000
-SKEMA_RS_URL=http://skema-rs:8080
-TA1_UNIFIED_URL=http://skema-unified:8000
-INTEGRATED_TR_URL=http://integrated-tr:7778
-MIT_TR_URL=http://mit-tr:8000
\ No newline at end of file
+TDS_URL=http://data-service.staging.terarium.ai
+SKEMA_RS_URL=http://skema-rs.staging.terarium.ai
+TA1_UNIFIED_URL=http://skema-unified.staging.terarium.ai
+MIT_TR_URL=http://mit-tr.staging.terarium.ai
\ No newline at end of file
diff --git a/api/__pycache__/server.cpython-310.pyc b/api/__pycache__/server.cpython-310.pyc
deleted file mode 100644
index f76f333..0000000
Binary files a/api/__pycache__/server.cpython-310.pyc and /dev/null differ
diff --git a/api/server.py b/api/server.py
index e5eb42d..4ba767e 100644
--- a/api/server.py
+++ b/api/server.py
@@ -34,14 +34,14 @@ def build_api(*args) -> FastAPI:
 
 app = build_api()
 
 
-@app.get("/status/{simulation_id}")
-def get_status(simulation_id: str):
+@app.get("/status/{extraction_job_id}")
+def get_status(extraction_job_id: str):
     """
     Retrieve the status of a simulation
     """
     from utils import fetch_job_status
 
-    status, result = fetch_job_status(simulation_id)
+    status, result = fetch_job_status(extraction_job_id)
     if not isinstance(status, str):
         return status
@@ -70,7 +70,7 @@ def mathml_to_amr(payload: List[str], model: str = "petrinet"):
 
 @app.post("/pdf_extractions")
 async def pdf_extractions(
-    pdf: UploadFile = File(...),
+    artifact_id: str,
     annotate_skema: bool = True,
     annotate_mit: bool = True,
     name: str = None,
@@ -84,36 +84,14 @@ async def pdf_extractions(
 
     from utils import create_job
 
-    # Create a file-like object from the file content
-    filename = pdf.filename
-    pdf_file = io.BytesIO(await pdf.read())
-
-    if filename.split(".")[-1] == "pdf":
-        # Open the PDF file and extract text content
-        pdf_reader = pypdf.PdfReader(pdf_file)
-        num_pages = len(pdf_reader.pages)
-
-        text_content = ""
-        for page_number in range(num_pages):
-            page = pdf_reader.pages[page_number]
-            text_content += page.extract_text()
-    else:
-        # Open the TXT file and extract text content
-        with pdf_file as pdf:
-            text_content = ""
-            for page in pdf:
-                text_content += page.decode("utf-8")
-
     operation_name = "operations.pdf_extractions"
     # text_content = text_content[: len(text_content) // 2]
 
     options = {
-        "text_content": text_content,
+        "artifact_id": artifact_id,
         "annotate_skema": annotate_skema,
         "annotate_mit": annotate_mit,
-        "bytes_obj": pdf_file.seek(0),
-        "filename": filename,
-        "name": filename.split(".")[0] if name is None else name,
+        "name": name,
         "description": description,
     }
 
@@ -136,3 +114,19 @@ def profile_dataset(dataset_id, document_text):
     resp = create_job(operation_name=operation_name, options=options)
 
     return resp
+
+
+@app.post("/link_amr")
+def link_amr(artifact_id, model_id):
+    from utils import create_job
+
+    operation_name = "operations.link_amr"
+
+    options = {
+        "artifact_id": artifact_id,
+        "model_id": model_id,
+    }
+
+    resp = create_job(operation_name=operation_name, options=options)
+
+    return resp
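Reviewer note on api/server.py (not part of the patch): the extraction endpoints now take TDS identifiers as query parameters instead of an uploaded file. A minimal sketch of the intended client flow, assuming the service is reachable at http://localhost:8000, that the ids are real TDS ids, and that create_job returns the job id under an "id" key:

import time

import requests

BASE_URL = "http://localhost:8000"  # assumption: local deployment of this API
ARTIFACT_ID = "some-artifact-id"  # assumption: TDS artifact whose file is a PDF
MODEL_ID = "some-model-id"  # assumption: TDS model to enrich

# Queue a PDF extraction; the PDF now lives in TDS, so only its id is sent.
job = requests.post(
    f"{BASE_URL}/pdf_extractions",
    params={"artifact_id": ARTIFACT_ID, "annotate_skema": True, "annotate_mit": True},
).json()

# Poll the job; the /status response is assumed here to be the RQ job status
# as a plain string once the job exists.
while requests.get(f"{BASE_URL}/status/{job['id']}").json() not in ("finished", "failed"):
    time.sleep(2)

# Link the stored extractions to an AMR model.
requests.post(
    f"{BASE_URL}/link_amr",
    params={"artifact_id": ARTIFACT_ID, "model_id": MODEL_ID},
)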
diff --git a/docker-compose.prod.yaml b/docker-compose.prod.yaml
index 7df8f34..2d96235 100644
--- a/docker-compose.prod.yaml
+++ b/docker-compose.prod.yaml
@@ -39,59 +39,60 @@ services:
       - redis
     networks:
       - ta1-extraction-service
-  mit-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: mit-tr
-    networks:
-      - ta1-extraction-service
-  skema-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-tr
-    networks:
-      - ta1-extraction-service
-  integrated-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: integrated-tr
-    networks:
-      - ta1-extraction-service
-  skema-py:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-py
-    networks:
-      - ta1-extraction-service
-  skema-rs:
-    extends:
-      # Changed to port 8085
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-rs
-    networks:
-      - ta1-extraction-service
-  graphdb:
-    extends:
-      # Changed to port 7688
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: graphdb
-    networks:
-      - ta1-extraction-service
-  eq2mml:
-    extends:
-      # Changed to port 8011
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: eq2mml
-    networks:
-      - ta1-extraction-service
-  mathjax:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: mathjax
-    networks:
-      - ta1-extraction-service
 
 volumes:
   mg_lib:
   mg_log:
   mg_etc:
+
+  # mit-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: mit-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # integrated-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: integrated-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-py:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-py
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-rs:
+  #   extends:
+  #     # Changed to port 8085
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-rs
+  #   networks:
+  #     - ta1-extraction-service
+  # graphdb:
+  #   extends:
+  #     # Changed to port 7688
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: graphdb
+  #   networks:
+  #     - ta1-extraction-service
+  # eq2mml:
+  #   extends:
+  #     # Changed to port 8011
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: eq2mml
+  #   networks:
+  #     - ta1-extraction-service
+  # mathjax:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: mathjax
+  #   networks:
+  #     - ta1-extraction-service
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 47f2f70..59c288f 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -43,54 +43,55 @@ services:
     networks:
       - ta1-extraction-service
       - data-api
-  mit-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: mit-tr
-    networks:
-      - ta1-extraction-service
-  skema-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-tr
-    networks:
-      - ta1-extraction-service
-  skema-py:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-py
-    networks:
-      - ta1-extraction-service
-  skema-unified:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-unified
-  skema-rs:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-rs
-    networks:
-      - ta1-extraction-service
-  graphdb:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: graphdb
-    networks:
-      - ta1-extraction-service
-  eq2mml:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: eq2mml
-    networks:
-      - ta1-extraction-service
-  mathjax:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: mathjax
-    networks:
-      - ta1-extraction-service
 
 volumes:
   mg_lib:
   mg_log:
   mg_etc:
+
+  # mit-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: mit-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-py:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-py
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-unified:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-unified
+  # skema-rs:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-rs
+  #   networks:
+  #     - ta1-extraction-service
+  # graphdb:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: graphdb
+  #   networks:
+  #     - ta1-extraction-service
+  # eq2mml:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: eq2mml
+  #   networks:
+  #     - ta1-extraction-service
+  # mathjax:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: mathjax
+  #   networks:
+  #     - ta1-extraction-service
diff --git a/workers/operations.py b/workers/operations.py
index e0e013e..180b82d 100644
--- a/workers/operations.py
+++ b/workers/operations.py
@@ -6,7 +6,7 @@
 import requests
 import pandas
 
-from utils import put_amr_to_tds, put_artifact_to_tds
+from utils import put_amr_to_tds, put_artifact_extraction_to_tds, get_artifact_from_tds
 
 TDS_API = os.getenv("TDS_URL")
 SKEMA_API = os.getenv("SKEMA_RS_URL")
@@ -46,23 +46,34 @@ def pdf_extractions(*args, **kwargs):
     # Get options
-    text_content = kwargs.get("text_content")
+    artifact_id = kwargs.get("artifact_id")
     annotate_skema = kwargs.get("annotate_skema")
     annotate_mit = kwargs.get("annotate_mit")
-    bytes_obj = kwargs.get("bytes_obj")
-    filename = kwargs.get("filename")
     name = kwargs.get("name")
     description = kwargs.get("description")
 
+    artifact_json, downloaded_artifact = get_artifact_from_tds(
+        artifact_id=artifact_id
+    )  # Assumes downloaded artifact is PDF, doesn't type check
+    filename = artifact_json.get("file_names")[0]
+
     # Try to feed text to the unified service
-    unified_text_reading_url = f"{UNIFIED_API}/text-reading/integrated-text-extractions?annotate_skema={annotate_skema}&annotate_mit={annotate_mit}"
+    unified_text_reading_url = f"{UNIFIED_API}/text-reading/integrated-pdf-extractions?annotate_skema={annotate_skema}&annotate_mit={annotate_mit}"
 
     headers = {"Content-Type": "application/json"}
 
-    put_payload = {"texts": [text_content]}
+    put_payload = {
+        "pdfs": [
+            (
+                "extractions.json",
+                downloaded_artifact,
+                "application/json",
+            )
+        ]
+    }
 
     try:
         response = requests.post(
             unified_text_reading_url,
-            data=json.dumps(put_payload, default=str),
+            files=put_payload,
             headers=headers,
         )
         extraction_json = response.json()
@@ -77,28 +88,42 @@
             "status_code": 500,
             "extraction": None,
             "artifact_id": None,
+            "error": "Extractions did not complete, extractions values were null.",
         }
 
-    artifact_response = put_artifact_to_tds(
-        bytes_obj=bytes_obj,
-        name=name,
-        description=description,
+    artifact_response = put_artifact_extraction_to_tds(
+        name=name if name is not None else artifact_json.get("name"),
+        description=description
+        if description is not None
+        else artifact_json.get("description"),
         filename=filename,
         extractions=extraction_json,
     )
 
-    response = {
-        "status_code": response.status_code,
-        "extraction": extraction_json,
-        "artifact_id": artifact_response.get("artifact_id"),
-    }
+    if artifact_response.get("status") == 200:
+        response = {
+            "extraction_status_code": response.status_code,
+            "extraction": extraction_json,
+            "tds_status_code": artifact_response.get("status"),
+            "error": None,
+        }
+    else:
+        response = {
+            "extraction_status_code": response.status_code,
+            "extraction": extraction_json,
+            "tds_status_code": artifact_response.get("status"),
+            "error": "PUT extraction metadata to TDS failed, please check TDS API logs.",
+        }
 
     return response
 
 
-def data_profiling(dataset_id, document_text):
+def data_profiling(*args, **kwargs):
     openai_key = os.getenv("OPENAI_API_KEY")
 
+    dataset_id = kwargs.get("dataset_id")
+    document_text = kwargs.get("document_text")
+
     tds_datasets_url = f"{TDS_API}/datasets"
 
     dataset = requests.get(tds_datasets_url, data={"id": dataset_id})
@@ -170,3 +195,53 @@
     resp = requests.post(f"{TDS_API}/datasets", json=dataset)
     dataset_id = resp.json()["id"]
     resp.json()
+
+
+def link_amr(*args, **kwargs):
+    artifact_id = kwargs.get("artifact_id")
+    model_id = kwargs.get("model_id")
+
+    artifact_json, downloaded_artifact = get_artifact_from_tds(artifact_id=artifact_id)
+
+    tds_models_url = f"{TDS_API}/models"
+
+    model = requests.get(tds_models_url, data={"model_id": model_id})
+    model_json = model.json()
+
+    model_amr = model_json.get("model")
+
+    files = {
+        "amr_file": (
+            "amr.json",
+            io.BytesIO(json.dumps(model_amr, ensure_ascii=False).encode("utf-8")),
+            "application/json",
+        ),
+        "text_extractions_file": (
+            "extractions.json",
+            downloaded_artifact,
+            "application/json",
+        ),
+    }
+
+    params = {"amr_type": "petrinet"}
+
+    skema_amr_linking_url = f"{UNIFIED_API}/metal/link_amr"
+
+    response = requests.post(skema_amr_linking_url, files=files, params=params)
+
+    if response.status_code == 200:
+        enriched_amr = response.json()
+
+        model_json["model"] = enriched_amr
+        model_id = model_json.get("id")
+
+        new_model_payload = model_json
+
+        update_response = requests.put(
+            f"{tds_models_url}/{model_id}", data=new_model_payload
+        )
+
+        return {
+            "status": update_response.status_code,
+            "message": "Model enriched and updated in TDS",
+        }
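Reviewer note on workers/operations.py (not part of the patch): every operation now shares the (*args, **kwargs) signature, so a queue worker can dispatch any job generically from the operation_name and options recorded by create_job. A rough sketch of that dispatch contract; create_job itself is not shown in this diff, so the RQ-style call below is an assumption:

from importlib import import_module


def run_operation(operation_name: str, options: dict):
    # Resolve e.g. "operations.link_amr" to its function and call it with the
    # options dict as keyword arguments, matching the kwargs.get(...) reads above.
    module_name, func_name = operation_name.rsplit(".", 1)
    func = getattr(import_module(module_name), func_name)
    return func(**options)


# e.g. run_operation("operations.link_amr", {"artifact_id": "a1", "model_id": "m1"})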
"extractions.json", + downloaded_artifact, + "application/json", + ) + ] + } try: response = requests.post( unified_text_reading_url, - data=json.dumps(put_payload, default=str), + files=put_payload, headers=headers, ) extraction_json = response.json() @@ -77,28 +88,42 @@ def pdf_extractions(*args, **kwargs): "status_code": 500, "extraction": None, "artifact_id": None, + "error": "Extractions did not complete, extractions values were null.", } - artifact_response = put_artifact_to_tds( - bytes_obj=bytes_obj, - name=name, - description=description, + artifact_response = put_artifact_extraction_to_tds( + name=name if name is not None else artifact_json.get("name"), + description=description + if description is not None + else artifact_json.get("description"), filename=filename, extractions=extraction_json, ) - response = { - "status_code": response.status_code, - "extraction": extraction_json, - "artifact_id": artifact_response.get("artifact_id"), - } + if artifact_response.get("status") == 200: + response = { + "extraction_status_code": response.status_code, + "extraction": extraction_json, + "tds_status_code": artifact_response.get("status"), + "error": None, + } + else: + response = { + "extraction_status_code": response.status_code, + "extraction": extraction_json, + "tds_status_code": artifact_response.get("status"), + "error": "PUT extraction metadata to TDS failed, please check TDS api logs.", + } return response -def data_profiling(dataset_id, document_text): +def data_profiling(*args, **kwargs): openai_key = os.getenv("OPENAI_API_KEY") + dataset_id = kwargs.get("dataset_id") + document_text = kwargs.get("document_text") + tds_datasets_url = f"{TDS_API}/datasets" dataset = requests.get(tds_datasets_url, data={"id": dataset_id}) @@ -170,3 +195,53 @@ def data_profiling(dataset_id, document_text): resp = requests.post(f"{TDS_API}/datasets", json=dataset) dataset_id = resp.json()["id"] resp.json() + + +def link_amr(*args, **kwargs): + artifact_id = kwargs.get("artifact_id") + model_id = kwargs.get("model_id") + + artifact_json, downloaded_artifact = get_artifact_from_tds(artifact_id=artifact_id) + + tds_models_url = f"{TDS_API}/models" + + model = requests.get(tds_models_url, data={"model_id": model_id}) + model_json = model.json() + + model_amr = model_json.get("model") + + files = { + "amr_file": ( + "amr.json", + io.BytesIO(json.dumps(model_amr, ensure_ascii=False).encode("utf-8")), + "application/json", + ), + "text_extractions_file": ( + "extractions.json", + downloaded_artifact, + "application/json", + ), + } + + params = {"amr_type": "petrinet"} + + skema_amr_linking_url = f"{UNIFIED_API}/metal/link_amr" + + response = requests.post(skema_amr_linking_url, files=files, params=params) + + if response.status_code == 200: + enriched_amr = response.json() + + model_json["model"] = enriched_amr + model_id = model_json.get("id") + + new_model_payload = model_json + + update_response = requests.put( + f"{tds_models_url}/{model_id}", data=new_model_payload + ) + + return { + "status": update_response.status_code, + "message": "Model enriched and updated in TDS", + } diff --git a/workers/utils.py b/workers/utils.py index 978f470..49b984c 100644 --- a/workers/utils.py +++ b/workers/utils.py @@ -40,8 +40,8 @@ def put_amr_to_tds(amr_payload): return {"model_id": model_id, "configuration_id": config_id} -def put_artifact_to_tds( - bytes_obj, name, description, filename, extractions +def put_artifact_extraction_to_tds( + name, description, filename, extractions ): # TODO change to get artifact 
     headers = {"Content-Type": "application/json"}
 
@@ -55,22 +55,36 @@
 
     # Create TDS artifact
     tds_artifact = TDS_API + "/artifacts"
-    artifact_response = requests.post(
+    artifact_response = requests.put(
         tds_artifact, data=json.dumps(artifact_payload, default=str), headers=headers
     )
-    artifact_id = artifact_response.json().get("id")
+    artifact_put_status = artifact_response.status_code
 
-    # Get presigned URL
-    tds_presign_url = (
-        TDS_API + f"/artifacts/{artifact_id}/upload-url?filename={filename}"
-    )
-    tds_presign_response = requests.get(tds_presign_url)
-    presigned_url = tds_presign_response.json().get("url")
+    return {"status": artifact_put_status}
+
+
+def get_artifact_from_tds(artifact_id):
+    tds_artifacts_url = f"{TDS_API}/artifacts/{artifact_id}"
+
+    artifact = requests.get(tds_artifacts_url)
+    artifact_json = artifact.json()
+
+    filename = artifact_json.get("file_names")[
+        0
+    ]  # Assumes only one file will be present for now.
+
+    download_url = f"{TDS_API}/artifacts/{artifact_id}/download-url?artifact_id={artifact_id}&filename={filename}"
+    artifact_download_url = requests.get(download_url)
+
+    presigned_download = artifact_download_url.json().get("url")
+
+    print(presigned_download)
+    sys.stdout.flush()
 
-    upload_resp = requests.put(presigned_url, data=bytes_obj)
+    downloaded_artifact = requests.get(artifact_download_url.json().get("url"))
 
-    if upload_resp.status_code == 200:
-        print("File upload completed")
+    print(f"ARTIFACT RETRIEVAL STATUS: {downloaded_artifact.status_code}")
+    sys.stdout.flush()
 
-    return {"artifact_id": artifact_id}
+    return artifact_json, downloaded_artifact.content
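Reviewer note on workers/utils.py (not part of the patch): get_artifact_from_tds returns the artifact's TDS metadata together with the raw bytes of its first file, while put_artifact_extraction_to_tds now only PUTs extraction metadata and no longer uploads a file. A hedged usage sketch, with placeholder ids and payloads:

from utils import get_artifact_from_tds, put_artifact_extraction_to_tds

# Fetch a PDF artifact from TDS: metadata JSON plus the downloaded file bytes.
artifact_json, pdf_bytes = get_artifact_from_tds("some-artifact-id")  # placeholder id
filename = artifact_json.get("file_names")[0]

# Store extraction metadata back against that artifact.
result = put_artifact_extraction_to_tds(
    name=artifact_json.get("name"),
    description=artifact_json.get("description"),
    filename=filename,
    extractions={"outputs": []},  # placeholder extraction payload
)
assert result["status"] == 200  # helper returns {"status": <TDS response code>}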