From 4bbcb4e909ab66651b6eb203fa6c51f840a88ca8 Mon Sep 17 00:00:00 2001
From: Powell
Date: Sat, 15 Jul 2023 20:21:43 -0600
Subject: [PATCH] Added requested changes for renaming, added amr_linking
 endpoint, changes to pdf extractions endpoint.

---
 .gitignore                             |   3 +
 Makefile                               |   2 +-
 api.env.sample                         |   9 +-
 api/__pycache__/server.cpython-310.pyc | Bin 3224 -> 0 bytes
 api/server.py                          |  50 +++++-------
 docker-compose.prod.yaml               | 103 +++++++++++------------
 docker-compose.yaml                    |  93 ++++++++++-----------
 workers/operations.py                  | 109 +++++++++++++++++++++----
 workers/utils.py                       |  42 ++++++----
 9 files changed, 249 insertions(+), 162 deletions(-)
 delete mode 100644 api/__pycache__/server.cpython-310.pyc

diff --git a/.gitignore b/.gitignore
index 738f01d..1da5499 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,6 @@
 .env
 api.env
 *.env
+
+# Python
+__pycache__
diff --git a/Makefile b/Makefile
index d44679e..7d1e4eb 100644
--- a/Makefile
+++ b/Makefile
@@ -43,5 +43,5 @@ restart:docker-compose.yaml
 
 # No internal docker connection to TDS
 .PHONY:up-prod
 up-prod:docker-compose.yaml
-	$(DOCKER_COMPOSE) compose up -f docker-compose.prod.yaml -d
+	$(DOCKER_COMPOSE) compose -f docker-compose.prod.yaml up -d
diff --git a/api.env.sample b/api.env.sample
index ce76db5..a74b151 100644
--- a/api.env.sample
+++ b/api.env.sample
@@ -1,7 +1,6 @@
 REDIS_HOST=redis
 REDIS_PORT=6379
-TDS_URL=http://data-service-api:8000
-SKEMA_RS_URL=http://skema-rs:8080
-TA1_UNIFIED_URL=http://skema-unified:8000
-INTEGRATED_TR_URL=http://integrated-tr:7778
-MIT_TR_URL=http://mit-tr:8000
\ No newline at end of file
+TDS_URL=http://data-service.staging.terarium.ai
+SKEMA_RS_URL=http://skema-rs.staging.terarium.ai
+TA1_UNIFIED_URL=http://skema-unified.staging.terarium.ai
+MIT_TR_URL=http://mit-tr.staging.terarium.ai
\ No newline at end of file
diff --git a/api/__pycache__/server.cpython-310.pyc b/api/__pycache__/server.cpython-310.pyc
deleted file mode 100644
index f76f33331ef359a784e7d3dda77099c53c0ff0ae..0000000000000000000000000000000000000000
Binary files a/api/__pycache__/server.cpython-310.pyc and /dev/null differ
diff --git a/api/server.py b/api/server.py
--- a/api/server.py
+++ b/api/server.py
 app = build_api()
 
-@app.get("/status/{simulation_id}")
-def get_status(simulation_id: str):
+@app.get("/status/{extraction_job_id}")
+def get_status(extraction_job_id: str):
     """
-    Retrieve the status of a simulation
+    Retrieve the status of an extraction job
     """
     from utils import fetch_job_status
 
-    status, result = fetch_job_status(simulation_id)
+    status, result = fetch_job_status(extraction_job_id)
     if not isinstance(status, str):
         return status
 
@@ -70,7 +70,7 @@ def mathml_to_amr(payload: List[str], model: str = "petrinet"):
 
 @app.post("/pdf_extractions")
 async def pdf_extractions(
-    pdf: UploadFile = File(...),
+    artifact_id: str,
     annotate_skema: bool = True,
     annotate_mit: bool = True,
     name: str = None,
@@ -84,36 +84,12 @@
 
     from utils import create_job
 
-    # Create a file-like object from the file content
-    filename = pdf.filename
-    pdf_file = io.BytesIO(await pdf.read())
-
-    if filename.split(".")[-1] == "pdf":
-        # Open the PDF file and extract text content
-        pdf_reader = pypdf.PdfReader(pdf_file)
-        num_pages = len(pdf_reader.pages)
-
-        text_content = ""
-        for page_number in range(num_pages):
-            page = pdf_reader.pages[page_number]
-            text_content += page.extract_text()
-    else:
-        # Open the TXT file and extract text content
-        with pdf_file as pdf:
-            text_content = ""
-            for page in pdf:
-                text_content += page.decode("utf-8")
-
     operation_name = "operations.pdf_extractions"
 
-    # text_content = text_content[: len(text_content) // 2]
-
     options = {
-        "text_content": text_content,
+        "artifact_id": artifact_id,
         "annotate_skema": annotate_skema,
         "annotate_mit": annotate_mit,
-        "bytes_obj": pdf_file.seek(0),
-        "filename": filename,
-        "name": filename.split(".")[0] if name is None else name,
+        "name": name,
         "description": description,
     }
 
@@ -136,3 +112,19 @@ def profile_dataset(dataset_id, document_text):
     resp = create_job(operation_name=operation_name, options=options)
 
     return resp
+
+
+@app.post("/link_amr")
+def link_amr(artifact_id: str, model_id: str):
+    from utils import create_job
+
+    operation_name = "operations.link_amr"
+
+    options = {
+        "artifact_id": artifact_id,
+        "model_id": model_id,
+    }
+
+    resp = create_job(operation_name=operation_name, options=options)
+
+    return resp
diff --git a/docker-compose.prod.yaml b/docker-compose.prod.yaml
index 7df8f34..2d96235 100644
--- a/docker-compose.prod.yaml
+++ b/docker-compose.prod.yaml
@@ -39,59 +39,60 @@ services:
       - redis
     networks:
       - ta1-extraction-service
-  mit-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: mit-tr
-    networks:
-      - ta1-extraction-service
-  skema-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-tr
-    networks:
-      - ta1-extraction-service
-  integrated-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: integrated-tr
-    networks:
-      - ta1-extraction-service
-  skema-py:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-py
-    networks:
-      - ta1-extraction-service
-  skema-rs:
-    extends:
-      # Changed to port 8085
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-rs
-    networks:
-      - ta1-extraction-service
-  graphdb:
-    extends:
-      # Changed to port 7688
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: graphdb
-    networks:
-      - ta1-extraction-service
-  eq2mml:
-    extends:
-      # Changed to port 8011
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: eq2mml
-    networks:
-      - ta1-extraction-service
-  mathjax:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: mathjax
-    networks:
-      - ta1-extraction-service
 
 volumes:
   mg_lib:
   mg_log:
   mg_etc:
+
+  # mit-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: mit-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # integrated-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: integrated-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-py:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-py
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-rs:
+  #   extends:
+  #     # Changed to port 8085
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-rs
+  #   networks:
+  #     - ta1-extraction-service
+  # graphdb:
+  #   extends:
+  #     # Changed to port 7688
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: graphdb
+  #   networks:
+  #     - ta1-extraction-service
+  # eq2mml:
+  #   extends:
+  #     # Changed to port 8011
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: eq2mml
+  #   networks:
+  #     - ta1-extraction-service
+  # mathjax:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: mathjax
+  #   networks:
+  #     - ta1-extraction-service
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 47f2f70..59c288f 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -43,54 +43,55 @@ services:
     networks:
       - ta1-extraction-service
       - data-api
-  mit-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: mit-tr
-    networks:
-      - ta1-extraction-service
-  skema-tr:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-tr
-    networks:
-      - ta1-extraction-service
-  skema-py:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-py
-    networks:
-      - ta1-extraction-service
-  skema-unified:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-unified
-  skema-rs:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: skema-rs
-    networks:
-      - ta1-extraction-service
-  graphdb:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: graphdb
-    networks:
-      - ta1-extraction-service
-  eq2mml:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: eq2mml
-    networks:
-      - ta1-extraction-service
-  mathjax:
-    extends:
-      file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
-      service: mathjax
-    networks:
-      - ta1-extraction-service
 
 volumes:
   mg_lib:
   mg_log:
   mg_etc:
+
+  # mit-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: mit-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-tr:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-tr
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-py:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-py
+  #   networks:
+  #     - ta1-extraction-service
+  # skema-unified:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-unified
+  # skema-rs:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: skema-rs
+  #   networks:
+  #     - ta1-extraction-service
+  # graphdb:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: graphdb
+  #   networks:
+  #     - ta1-extraction-service
+  # eq2mml:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: eq2mml
+  #   networks:
+  #     - ta1-extraction-service
+  # mathjax:
+  #   extends:
+  #     file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml
+  #     service: mathjax
+  #   networks:
+  #     - ta1-extraction-service
diff --git a/workers/operations.py b/workers/operations.py
index e0e013e..180b82d 100644
--- a/workers/operations.py
+++ b/workers/operations.py
@@ -6,7 +6,7 @@
 import requests
 import pandas
 
-from utils import put_amr_to_tds, put_artifact_to_tds
+from utils import put_amr_to_tds, put_artifact_extraction_to_tds, get_artifact_from_tds
 
 TDS_API = os.getenv("TDS_URL")
 SKEMA_API = os.getenv("SKEMA_RS_URL")
@@ -46,23 +46,28 @@ def put_mathml_to_skema(*args, **kwargs):
 
 def pdf_extractions(*args, **kwargs):
     # Get options
-    text_content = kwargs.get("text_content")
+    artifact_id = kwargs.get("artifact_id")
     annotate_skema = kwargs.get("annotate_skema")
     annotate_mit = kwargs.get("annotate_mit")
-    bytes_obj = kwargs.get("bytes_obj")
-    filename = kwargs.get("filename")
     name = kwargs.get("name")
     description = kwargs.get("description")
 
+    artifact_json, downloaded_artifact = get_artifact_from_tds(
+        artifact_id=artifact_id
+    )  # Assumes downloaded artifact is a PDF, doesn't type check
+    filename = artifact_json.get("file_names")[0]
+
     # Try to feed text to the unified service
-    unified_text_reading_url = f"{UNIFIED_API}/text-reading/integrated-text-extractions?annotate_skema={annotate_skema}&annotate_mit={annotate_mit}"
+    unified_text_reading_url = f"{UNIFIED_API}/text-reading/integrated-pdf-extractions?annotate_skema={annotate_skema}&annotate_mit={annotate_mit}"
-    headers = {"Content-Type": "application/json"}
-    put_payload = {"texts": [text_content]}
+    # requests builds the multipart boundary itself, so no Content-Type header
+    # is set manually; the "pdfs" field repeats once per uploaded file.
+    put_payload = [
+        ("pdfs", (filename, downloaded_artifact, "application/pdf")),
+    ]
 
     try:
         response = requests.post(
             unified_text_reading_url,
-            data=json.dumps(put_payload, default=str),
-            headers=headers,
+            files=put_payload,
         )
         extraction_json = response.json()
@@ -77,28 +82,42 @@
             "status_code": 500,
             "extraction": None,
             "artifact_id": None,
+            "error": "Extractions did not complete; extraction values were null.",
         }
 
-    artifact_response = put_artifact_to_tds(
-        bytes_obj=bytes_obj,
-        name=name,
-        description=description,
+    artifact_response = put_artifact_extraction_to_tds(
+        name=name if name is not None else artifact_json.get("name"),
+        description=description
+        if description is not None
+        else artifact_json.get("description"),
         filename=filename,
         extractions=extraction_json,
     )
 
-    response = {
-        "status_code": response.status_code,
-        "extraction": extraction_json,
-        "artifact_id": artifact_response.get("artifact_id"),
-    }
+    if artifact_response.get("status") == 200:
+        response = {
+            "extraction_status_code": response.status_code,
+            "extraction": extraction_json,
+            "tds_status_code": artifact_response.get("status"),
+            "error": None,
+        }
+    else:
+        response = {
+            "extraction_status_code": response.status_code,
+            "extraction": extraction_json,
+            "tds_status_code": artifact_response.get("status"),
+            "error": "PUT extraction metadata to TDS failed, please check TDS API logs.",
+        }
 
     return response
 
 
-def data_profiling(dataset_id, document_text):
+def data_profiling(*args, **kwargs):
     openai_key = os.getenv("OPENAI_API_KEY")
 
+    dataset_id = kwargs.get("dataset_id")
+    document_text = kwargs.get("document_text")
+
     tds_datasets_url = f"{TDS_API}/datasets"
 
     dataset = requests.get(tds_datasets_url, data={"id": dataset_id})
@@ -170,3 +189,56 @@ def data_profiling(dataset_id, document_text):
     resp = requests.post(f"{TDS_API}/datasets", json=dataset)
     dataset_id = resp.json()["id"]
     resp.json()
+
+
+def link_amr(*args, **kwargs):
+    artifact_id = kwargs.get("artifact_id")
+    model_id = kwargs.get("model_id")
+
+    artifact_json, downloaded_artifact = get_artifact_from_tds(artifact_id=artifact_id)
+
+    tds_models_url = f"{TDS_API}/models"
+
+    model = requests.get(f"{tds_models_url}/{model_id}")
+    model_json = model.json()
+
+    model_amr = model_json.get("model")
+
+    files = {
+        "amr_file": (
+            "amr.json",
+            io.BytesIO(json.dumps(model_amr, ensure_ascii=False).encode("utf-8")),
+            "application/json",
+        ),
+        "text_extractions_file": (
+            "extractions.json",
+            downloaded_artifact,
+            "application/json",
+        ),
+    }
+
+    params = {"amr_type": "petrinet"}
+
+    skema_amr_linking_url = f"{UNIFIED_API}/metal/link_amr"
+
+    response = requests.post(skema_amr_linking_url, files=files, params=params)
+
+    if response.status_code == 200:
+        enriched_amr = response.json()
+
+        model_json["model"] = enriched_amr
+        model_id = model_json.get("id")
+
+        update_response = requests.put(
+            f"{tds_models_url}/{model_id}", json=model_json
+        )
+
+        return {
+            "status": update_response.status_code,
+            "message": "Model enriched and updated in TDS",
+        }
+
+    return {
+        "status": response.status_code,
+        "message": "Model enrichment failed, please check SKEMA logs",
+    }
diff --git a/workers/utils.py b/workers/utils.py
index 978f470..49b984c 100644
--- a/workers/utils.py
+++ b/workers/utils.py
@@ -40,8 +40,8 @@ def put_amr_to_tds(amr_payload):
     return {"model_id": model_id, "configuration_id": config_id}
 
 
-def put_artifact_to_tds(
-    bytes_obj, name, description, filename, extractions
+def put_artifact_extraction_to_tds(
+    name, description, filename, extractions
 ):  # TODO change to get artifact from TDS via filename and artifact id maybe
     headers = {"Content-Type": "application/json"}
 
@@ -55,22 +55,36 @@
     # Create TDS artifact
     tds_artifact = TDS_API + "/artifacts"
 
-    artifact_response = requests.post(
+    artifact_response = requests.put(
         tds_artifact, data=json.dumps(artifact_payload, default=str), headers=headers
     )
 
-    artifact_id = artifact_response.json().get("id")
+    artifact_put_status = artifact_response.status_code
 
-    # Get presigned URL
-    tds_presign_url = (
-        TDS_API + f"/artifacts/{artifact_id}/upload-url?filename={filename}"
-    )
-    tds_presign_response = requests.get(tds_presign_url)
-    presigned_url = tds_presign_response.json().get("url")
+    return {"status": artifact_put_status}
+
+
+def get_artifact_from_tds(artifact_id):
+    tds_artifacts_url = f"{TDS_API}/artifacts/{artifact_id}"
+
+    artifact = requests.get(tds_artifacts_url)
+    artifact_json = artifact.json()
+
+    filename = artifact_json.get("file_names")[
+        0
+    ]  # Assumes only one file will be present for now.
+
+    download_url = f"{TDS_API}/artifacts/{artifact_id}/download-url?artifact_id={artifact_id}&filename={filename}"
+    artifact_download_url = requests.get(download_url)
+
+    presigned_download = artifact_download_url.json().get("url")
+
+    print(presigned_download)
+    sys.stdout.flush()
 
-    upload_resp = requests.put(presigned_url, data=bytes_obj)
+    downloaded_artifact = requests.get(presigned_download)
 
-    if upload_resp.status_code == 200:
-        print("File upload completed")
+    print(f"ARTIFACT RETRIEVAL STATUS:{downloaded_artifact.status_code}")
+    sys.stdout.flush()
 
-    return {"artifact_id": artifact_id}
+    return artifact_json, downloaded_artifact.content
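---

Usage sketch for the reworked endpoints (illustrative, not part of the patch):
the snippet queues a PDF extraction for an artifact already registered in TDS,
kicks off AMR linking, and polls the renamed status route. The host/port and
the job-id field in the create_job response are assumptions.

    import requests

    BASE = "http://localhost:8000"  # assumed address of the extraction API

    # FastAPI exposes the bare artifact_id / model_id parameters as query params.
    extraction_job = requests.post(
        f"{BASE}/pdf_extractions",
        params={
            "artifact_id": "<artifact-id>",
            "annotate_skema": True,
            "annotate_mit": True,
        },
    ).json()

    linking_job = requests.post(
        f"{BASE}/link_amr",
        params={"artifact_id": "<artifact-id>", "model_id": "<model-id>"},
    ).json()

    # Poll the renamed route; assumes create_job returns the job id under "id".
    print(requests.get(f"{BASE}/status/{extraction_job['id']}").json())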
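The multipart shape in pdf_extractions matters because requests generates the
multipart/form-data boundary itself when files= is used; a manually supplied
Content-Type header would clobber it. A minimal sketch of the call, assuming
the staging SKEMA URL from api.env.sample and a local PDF:

    import requests

    UNIFIED_API = "http://skema-unified.staging.terarium.ai"

    with open("paper.pdf", "rb") as f:
        resp = requests.post(
            f"{UNIFIED_API}/text-reading/integrated-pdf-extractions",
            params={"annotate_skema": True, "annotate_mit": True},
            # (field, (filename, bytes, content_type)) pairs; repeat the
            # "pdfs" field to upload several files in one request.
            files=[("pdfs", ("paper.pdf", f.read(), "application/pdf"))],
        )
    print(resp.status_code)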
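get_artifact_from_tds is a two-step fetch: ask TDS for a presigned URL, then
pull the file bytes from that URL. Condensed to its essentials (TDS_URL as in
api.env.sample; error handling omitted):

    import os

    import requests

    TDS_API = os.getenv("TDS_URL", "http://data-service.staging.terarium.ai")

    def fetch_artifact_bytes(artifact_id: str):
        # Artifact metadata, including the file_names list.
        meta = requests.get(f"{TDS_API}/artifacts/{artifact_id}").json()
        filename = meta["file_names"][0]  # first file only, as in the patch

        # TDS hands back a presigned URL; the bytes come from that URL directly.
        presigned = requests.get(
            f"{TDS_API}/artifacts/{artifact_id}/download-url",
            params={"artifact_id": artifact_id, "filename": filename},
        ).json()["url"]
        return meta, requests.get(presigned).content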
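The SKEMA /metal/link_amr call in workers/operations.py bundles two JSON files
into one multipart request: the model's AMR and the artifact's text
extractions. Reduced to the request shape (payloads illustrative):

    import io
    import json

    import requests

    UNIFIED_API = "http://skema-unified.staging.terarium.ai"

    amr = {"model": {}}      # AMR JSON pulled from the TDS model
    extractions = b"{}"      # extraction bytes downloaded from the TDS artifact

    files = {
        "amr_file": (
            "amr.json",
            io.BytesIO(json.dumps(amr, ensure_ascii=False).encode("utf-8")),
            "application/json",
        ),
        "text_extractions_file": ("extractions.json", extractions, "application/json"),
    }

    resp = requests.post(
        f"{UNIFIED_API}/metal/link_amr",
        params={"amr_type": "petrinet"},
        files=files,
    )
    enriched_amr = resp.json()  # linked AMR, to be PUT back onto the TDS model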