diff --git a/api.env.sample b/api.env.sample index fd20eda..ce76db5 100644 --- a/api.env.sample +++ b/api.env.sample @@ -3,4 +3,5 @@ REDIS_PORT=6379 TDS_URL=http://data-service-api:8000 SKEMA_RS_URL=http://skema-rs:8080 TA1_UNIFIED_URL=http://skema-unified:8000 -INTEGRATED_TR_URL=http://integrated-tr:7778 \ No newline at end of file +INTEGRATED_TR_URL=http://integrated-tr:7778 +MIT_TR_URL=http://mit-tr:8000 \ No newline at end of file diff --git a/api/__pycache__/server.cpython-310.pyc b/api/__pycache__/server.cpython-310.pyc index 03fed3e..f76f333 100644 Binary files a/api/__pycache__/server.cpython-310.pyc and b/api/__pycache__/server.cpython-310.pyc differ diff --git a/api/server.py b/api/server.py index 43834a5..e5eb42d 100644 --- a/api/server.py +++ b/api/server.py @@ -105,6 +105,8 @@ async def pdf_extractions( text_content += page.decode("utf-8") operation_name = "operations.pdf_extractions" + + # text_content = text_content[: len(text_content) // 2] options = { "text_content": text_content, "annotate_skema": annotate_skema, @@ -118,3 +120,19 @@ async def pdf_extractions( resp = create_job(operation_name=operation_name, options=options) return resp + + +@app.post("/profile_dataset") +def profile_dataset(dataset_id, document_text): + from utils import create_job + + operation_name = "operations.data_profiling" + + options = { + "dataset_id": dataset_id, + "document_text": document_text, + } + + resp = create_job(operation_name=operation_name, options=options) + + return resp diff --git a/docker-compose.yaml b/docker-compose.yaml index 39b6434..47f2f70 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -55,12 +55,6 @@ services: service: skema-tr networks: - ta1-extraction-service - integrated-tr: - extends: - file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml - service: integrated-tr - networks: - - ta1-extraction-service skema-py: extends: file: ./askem-ta1-dockervm/end-to-end-rest/docker-compose.yml diff --git a/poetry.lock b/poetry.lock index d5f3107..38f4c85 100644 --- a/poetry.lock +++ b/poetry.lock @@ -94,6 +94,55 @@ category = "main" optional = false python-versions = ">=3.5" +[[package]] +name = "numpy" +version = "1.25.1" +description = "Fundamental package for array computing in Python" +category = "main" +optional = false +python-versions = ">=3.9" + +[[package]] +name = "pandas" +version = "2.0.3" +description = "Powerful data structures for data analysis, time series, and statistics" +category = "main" +optional = false +python-versions = ">=3.8" + +[package.dependencies] +numpy = [ + {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, + {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.1" + +[package.extras] +all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] +aws = ["s3fs (>=2021.08.0)"] +clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] +compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] +computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] +feather = ["pyarrow (>=7.0.0)"] +fss = ["fsspec (>=2021.07.0)"] +gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] +hdf5 = ["tables (>=3.6.1)"] +html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] +mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] +parquet = ["pyarrow (>=7.0.0)"] +performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] +plot = ["matplotlib (>=3.6.1)"] +postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] +spss = ["pyreadstat (>=1.1.2)"] +sql-other = ["SQLAlchemy (>=1.4.16)"] +test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.6.3)"] + [[package]] name = "pydantic" version = "2.0.2" @@ -139,6 +188,17 @@ docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] full = ["Pillow", "PyCryptodome"] image = ["Pillow"] +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +category = "main" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" + +[package.dependencies] +six = ">=1.5" + [[package]] name = "python-multipart" version = "0.0.6" @@ -150,6 +210,14 @@ python-versions = ">=3.7" [package.extras] dev = ["atomicwrites (==1.2.1)", "attrs (==19.2.0)", "coverage (==6.5.0)", "hatch", "invoke (==1.7.3)", "more-itertools (==4.3.0)", "pbr (==4.3.0)", "pluggy (==1.0.0)", "py (==1.11.0)", "pytest (==7.2.0)", "pytest-cov (==4.0.0)", "pytest-timeout (==2.1.0)", "pyyaml (==5.1)"] +[[package]] +name = "pytz" +version = "2023.3" +description = "World timezone definitions, modern and historical" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "redis" version = "4.6.0" @@ -177,6 +245,14 @@ python-versions = ">=3.6" click = ">=5.0.0" redis = ">=4.0.0" +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" + [[package]] name = "sniffio" version = "1.3.0" @@ -208,6 +284,14 @@ category = "main" optional = false python-versions = ">=3.7" +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +category = "main" +optional = false +python-versions = ">=2" + [[package]] name = "uvicorn" version = "0.22.0" @@ -229,7 +313,7 @@ api = ["uvicorn", "fastapi", "pypdf"] [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "e914d9dc04ecae98945f7f6ebddcb8aaf3320b6e2f3a183dff3d5fecb3f63998" +content-hash = "bb25e2106398102bf97440511698e37ae68ef17f61d584e52a6fc5e8ed490a7b" [metadata.files] annotated-types = [ @@ -268,6 +352,60 @@ idna = [ {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, ] +numpy = [ + {file = "numpy-1.25.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d339465dff3eb33c701430bcb9c325b60354698340229e1dff97745e6b3efa"}, + {file = "numpy-1.25.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d736b75c3f2cb96843a5c7f8d8ccc414768d34b0a75f466c05f3a739b406f10b"}, + {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4a90725800caeaa160732d6b31f3f843ebd45d6b5f3eec9e8cc287e30f2805bf"}, + {file = "numpy-1.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c6c9261d21e617c6dc5eacba35cb68ec36bb72adcff0dee63f8fbc899362588"}, + {file = "numpy-1.25.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0def91f8af6ec4bb94c370e38c575855bf1d0be8a8fbfba42ef9c073faf2cf19"}, + {file = "numpy-1.25.1-cp310-cp310-win32.whl", hash = "sha256:fd67b306320dcadea700a8f79b9e671e607f8696e98ec255915c0c6d6b818503"}, + {file = "numpy-1.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:c1516db588987450b85595586605742879e50dcce923e8973f79529651545b57"}, + {file = "numpy-1.25.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6b82655dd8efeea69dbf85d00fca40013d7f503212bc5259056244961268b66e"}, + {file = "numpy-1.25.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e8f6049c4878cb16960fbbfb22105e49d13d752d4d8371b55110941fb3b17800"}, + {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:41a56b70e8139884eccb2f733c2f7378af06c82304959e174f8e7370af112e09"}, + {file = "numpy-1.25.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5154b1a25ec796b1aee12ac1b22f414f94752c5f94832f14d8d6c9ac40bcca6"}, + {file = "numpy-1.25.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38eb6548bb91c421261b4805dc44def9ca1a6eef6444ce35ad1669c0f1a3fc5d"}, + {file = "numpy-1.25.1-cp311-cp311-win32.whl", hash = "sha256:791f409064d0a69dd20579345d852c59822c6aa087f23b07b1b4e28ff5880fcb"}, + {file = "numpy-1.25.1-cp311-cp311-win_amd64.whl", hash = "sha256:c40571fe966393b212689aa17e32ed905924120737194b5d5c1b20b9ed0fb171"}, + {file = "numpy-1.25.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3d7abcdd85aea3e6cdddb59af2350c7ab1ed764397f8eec97a038ad244d2d105"}, + {file = "numpy-1.25.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:1a180429394f81c7933634ae49b37b472d343cccb5bb0c4a575ac8bbc433722f"}, + {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d412c1697c3853c6fc3cb9751b4915859c7afe6a277c2bf00acf287d56c4e625"}, + {file = "numpy-1.25.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:20e1266411120a4f16fad8efa8e0454d21d00b8c7cee5b5ccad7565d95eb42dd"}, + {file = "numpy-1.25.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:f76aebc3358ade9eacf9bc2bb8ae589863a4f911611694103af05346637df1b7"}, + {file = "numpy-1.25.1-cp39-cp39-win32.whl", hash = "sha256:247d3ffdd7775bdf191f848be8d49100495114c82c2bd134e8d5d075fb386a1c"}, + {file = "numpy-1.25.1-cp39-cp39-win_amd64.whl", hash = "sha256:1d5d3c68e443c90b38fdf8ef40e60e2538a27548b39b12b73132456847f4b631"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:35a9527c977b924042170a0887de727cd84ff179e478481404c5dc66b4170009"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d3fe3dd0506a28493d82dc3cf254be8cd0d26f4008a417385cbf1ae95b54004"}, + {file = "numpy-1.25.1-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:012097b5b0d00a11070e8f2e261128c44157a8689f7dedcf35576e525893f4fe"}, + {file = "numpy-1.25.1.tar.gz", hash = "sha256:9a3a9f3a61480cc086117b426a8bd86869c213fc4072e606f01c4e4b66eb92bf"}, +] +pandas = [ + {file = "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, + {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, + {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, + {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, + {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, + {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, + {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, + {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, + {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, + {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, + {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, + {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, + {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, + {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, + {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, + {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, + {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, + {file = "pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, +] pydantic = [ {file = "pydantic-2.0.2-py3-none-any.whl", hash = "sha256:f5581e0c79b2ec2fa25a9d30d766629811cdda022107fa73d022ab5578873ae3"}, {file = "pydantic-2.0.2.tar.gz", hash = "sha256:b802f5245b8576315fe619e5989fd083448fa1258638ef9dac301ca60878396d"}, @@ -379,10 +517,18 @@ pypdf = [ {file = "pypdf-3.12.1-py3-none-any.whl", hash = "sha256:74aa287c83e9aad2ce4a3627458dad729e39b5deae52175fe9f97bfffdde41bc"}, {file = "pypdf-3.12.1.tar.gz", hash = "sha256:68bf9e089caaab356518410168df9ed90f0a6109e29adac168449d4054fa0094"}, ] +python-dateutil = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] python-multipart = [ {file = "python_multipart-0.0.6-py3-none-any.whl", hash = "sha256:ee698bab5ef148b0a760751c261902cd096e57e10558e11aca17646b74ee1c18"}, {file = "python_multipart-0.0.6.tar.gz", hash = "sha256:e9925a80bb668529f1b67c7fdb0a5dacdd7cbfc6fb0bff3ea443fe22bdd62132"}, ] +pytz = [ + {file = "pytz-2023.3-py2.py3-none-any.whl", hash = "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"}, + {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"}, +] redis = [ {file = "redis-4.6.0-py3-none-any.whl", hash = "sha256:e2b03db868160ee4591de3cb90d40ebb50a90dd302138775937f6a42b7ed183c"}, {file = "redis-4.6.0.tar.gz", hash = "sha256:585dc516b9eb042a619ef0a39c3d7d55fe81bdb4df09a52c9cdde0d07bf1aa7d"}, @@ -391,6 +537,10 @@ rq = [ {file = "rq-1.15.1-py2.py3-none-any.whl", hash = "sha256:6e243d8d9c4af4686ded4b01b25ea1ff4bac4fc260b02638fbe9c8c17b004bd1"}, {file = "rq-1.15.1.tar.gz", hash = "sha256:1f49f4ac1a084044bb8e95b3f305c0bf17e55618b08c18e0b60c080f12d6f008"}, ] +six = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] sniffio = [ {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, @@ -403,6 +553,10 @@ typing-extensions = [ {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, ] +tzdata = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] uvicorn = [ {file = "uvicorn-0.22.0-py3-none-any.whl", hash = "sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996"}, {file = "uvicorn-0.22.0.tar.gz", hash = "sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8"}, diff --git a/pyproject.toml b/pyproject.toml index fa2d1d1..926cf51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,8 @@ authors = ["Powell Fendley"] readme = "README.md" packages = [{include = "extraction_service"}] + + [tool.poetry.dependencies] python = "^3.9" redis = "^4.6.0" @@ -17,9 +19,10 @@ python-multipart = "^0.0.6" uvicorn = { version = "^0.22.0", optional = true } fastapi = { version = "^0.100.0", optional = true } pypdf = { version = "^3.12.0", optional = true } +pandas = { version = "^2.0.3", optional = true } [tool.poetry.extras] -api = ["uvicorn", "fastapi", "pypdf"] +api = ["uvicorn", "fastapi", "pypdf", "pandas"] [build-system] requires = ["poetry-core"] diff --git a/workers/operations.py b/workers/operations.py index d4b9dbe..4d77e4c 100644 --- a/workers/operations.py +++ b/workers/operations.py @@ -4,12 +4,14 @@ import urllib import sys import requests +import pandas from utils import put_amr_to_tds, put_artifact_to_tds TDS_API = os.getenv("TDS_URL") SKEMA_API = os.getenv("SKEMA_RS_URL") UNIFIED_API = os.getenv("TA1_UNIFIED_URL") +MIT_API = os.getenv("MIT_TR_URL") # Worker jobs for TA1 services @@ -53,9 +55,7 @@ def pdf_extractions(*args, **kwargs): description = kwargs.get("description") # Try to feed text to the unified service - unified_text_reading_url = ( - UNIFIED_API - ) = f"/text-reading/integrated-text-extractions?annotate_skema={annotate_skema}&annotate_mit={annotate_mit}" + unified_text_reading_url = f"{UNIFIED_API}/text-reading/integrated-text-extractions?annotate_skema={annotate_skema}&annotate_mit={annotate_mit}" headers = {"Content-Type": "application/json"} put_payload = {"texts": [text_content]} @@ -70,17 +70,14 @@ def pdf_extractions(*args, **kwargs): if extraction_json.get("outputs", {"data": None}).get("data", None) is None: raise ValueError - except ValueError: - # Extractions were null from unified service, try integrated service directly. - text_reading_url = ( - os.getenv("INTEGRATED_TR_URL") - + f"/integrated_text_extractions/?annotate_skema={annotate_skema}&annotate_mit={annotate_mit}" - ) + extraction_json = extraction_json.get("outputs").get("data") - response = requests.post( - text_reading_url, data=json.dumps(put_payload, default=str), headers=headers - ) - extraction_json = response.json() + except ValueError: + return { + "status_code": 500, + "extraction": None, + "artifact_id": None, + } artifact_response = put_artifact_to_tds( bytes_obj=bytes_obj, @@ -90,10 +87,95 @@ def pdf_extractions(*args, **kwargs): extractions=extraction_json, ) - response = response = { + response = { "status_code": response.status_code, "extraction": extraction_json, "artifact_id": artifact_response.get("artifact_id"), } return response + + +def data_profiling(dataset_id, document_text): + openai_key = os.getenv("OPENAI_API_KEY") + + tds_datasets_url = f"{TDS_API}/datasets" + + dataset = requests.get(tds_datasets_url, data={"id": dataset_id}) + dataset_json = dataset.json() + + dataframes = [] + for filename in dataset_json.get("filenames", []): + gen_download_url = f"{TDS_API}/datasets/{dataset_id}/download-url?dataset_id={dataset_id}&filename={filename}" + dataset_download_url = requests.get(gen_download_url) + + downloaded_dataset = requests.get(dataset_download_url) + + dataframe = pandas.read_csv(downloaded_dataset.content) + dataframes.append(dataframe) + + final_df = pandas.merge(dataframes) + + ###################################################### + # Now we do the actual profiling! + ###################################################### + + # Here we perform our first call to the MIT service + mit_url = MIT_API + + csv_string = final_df.to_csv() + + resp = requests.post( + url=f"{mit_url}/annotation/link_dataset_col_to_dkg", + params={"csv_str": csv_string, "doc": document_text, "gpt_key": openai_key}, + ) + mit_groundings = resp.json() + + # here we perform our 2nd call to the MIT service + resp = requests.post( + url=f"{mit_url}/annotation/upload_file_extract/?gpt_key={openai_key}", + files={"file": csv_string}, + ) + resp.json() + mit_annotations = {a["name"]: a for a in resp.json()} + + ####################################### + # processing the results from MIT into the format + # expected by TDS + ####################################### + + columns = [] + for c in final_df.columns: + annotations = mit_annotations.get(c, {}).get("text_annotations", []) + # Skip any single empty strings that are sometimes returned and drop extra items that are sometimes included (usually the string 'class') + groundings = { + g[0]: g[1] + for g in mit_groundings.get(c, None).get("dkg_groundings", None) + if g and isinstance(g, list) + } + col = { + "name": c, + "data_type": "float", + "description": annotations[0].strip(), + "annotations": [], + "metadata": {}, + "grounding": { + "identifiers": groundings, + }, + } + columns.append(col) + + dataset["columns"] = columns + + dataset["metadata"] = { + "document_textuments": [ + { + "url": "https://github.com/reichlab/covid19-forecast-hub/blob/master/data-truth/README.md", + "title": "README: Ground truth data for the COVID-19 Forecast Hub", + } + ] + } + + resp = requests.post(f"{TDS_API}/datasets", json=dataset) + dataset_id = resp.json()["id"] + resp.json() diff --git a/workers/utils.py b/workers/utils.py index 40c6c23..978f470 100644 --- a/workers/utils.py +++ b/workers/utils.py @@ -50,7 +50,7 @@ def put_artifact_to_tds( "name": name, "description": description, "file_names": [filename], - "metadata": extractions, + "metadata": extractions[0], } # Create TDS artifact