Skip to content

Commit

Permalink
Merge branch 'main' into remove-extra-group
Browse files Browse the repository at this point in the history
  • Loading branch information
fivegrant committed Sep 7, 2023
2 parents ced0c67 + 6d92a70 commit 193a5eb
Show file tree
Hide file tree
Showing 7 changed files with 237 additions and 135 deletions.
38 changes: 19 additions & 19 deletions api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,15 @@ def code_to_amr(


@app.post("/pdf_to_text")
def pdf_to_text(artifact_id: str, redis=Depends(get_redis)) -> ExtractionJob:
"""Run text extractions over pdfs and stores the text as metadata on the artifact
def pdf_to_text(document_id: str, redis=Depends(get_redis)) -> ExtractionJob:
    """Run text extractions over pdfs and store the text on the document
Args:
`artifact_id`: the id of the artifact to process
`document_id`: the id of the document to process
"""
operation_name = "operations.pdf_to_text"

options = {"artifact_id": artifact_id}
options = {"document_id": document_id}

resp = create_job(operation_name=operation_name, options=options, redis=redis)

Expand All @@ -140,7 +140,7 @@ def pdf_to_text(artifact_id: str, redis=Depends(get_redis)) -> ExtractionJob:

@app.post("/pdf_extractions")
async def pdf_extractions(
artifact_id: str,
document_id: str,
annotate_skema: bool = True,
annotate_mit: bool = True,
name: str = None,
Expand All @@ -156,7 +156,7 @@ async def pdf_extractions(

# text_content = text_content[: len(text_content) // 2]
options = {
"artifact_id": artifact_id,
"document_id": document_id,
"annotate_skema": annotate_skema,
"annotate_mit": annotate_mit,
"name": name,
Expand All @@ -170,23 +170,23 @@ async def pdf_extractions(

@app.post("/profile_dataset/{dataset_id}")
def profile_dataset(
dataset_id: str, artifact_id: Optional[str] = None, redis=Depends(get_redis)
dataset_id: str, document_id: Optional[str] = None, redis=Depends(get_redis)
) -> ExtractionJob:
"""Profile dataset with MIT's profiling service. This optionally accepts an `artifact_id` which
is expected to be some user uploaded document which has had its text extracted and stored to
`metadata.text`.
    """Profile dataset with MIT's profiling service. This optionally accepts a `document_id` which
is expected to be some user uploaded document which has had its text extracted and stored as
the `text` element on the document.
> NOTE: if nothing is found within `metadata.text` of the artifact then it is ignored.
> NOTE: if nothing is found within `text` of the document then it is ignored.
Args:
dataset_id: the id of the dataset to profile
artifact_id [optional]: the id of the artifact (paper/document) associated with the dataset.
document_id [optional]: the id of the document (paper/resource) associated with the dataset.
"""
operation_name = "operations.data_card"

options = {
"dataset_id": dataset_id,
"artifact_id": artifact_id,
"document_id": document_id,
}

resp = create_job(operation_name=operation_name, options=options, redis=redis)
Expand All @@ -195,8 +195,8 @@ def profile_dataset(


@app.post("/profile_model/{model_id}")
def profile_model(model_id: str, paper_artifact_id: str, redis=Depends(get_redis)) -> ExtractionJob:
"""Profile model with MIT's profiling service. This takes in a paper and code artifact
def profile_model(model_id: str, document_id: str, redis=Depends(get_redis)) -> ExtractionJob:
"""Profile model with MIT's profiling service. This takes in a paper and code document
and updates a model (AMR) with the profiled metadata card. It requires that the paper
has been extracted with `/pdf_to_text` and the code has been converted to an AMR
with `/code_to_amr`
Expand All @@ -205,25 +205,25 @@ def profile_model(model_id: str, paper_artifact_id: str, redis=Depends(get_redis
Args:
model_id: the id of the model to profile
paper_artifact_id: the id of the paper artifact
        document_id: the id of the paper document
"""
operation_name = "operations.model_card"

options = {"model_id": model_id, "paper_artifact_id": paper_artifact_id}
options = {"model_id": model_id, "paper_document_id": document_id}

resp = create_job(operation_name=operation_name, options=options, redis=redis)

return resp


@app.post("/link_amr")
def link_amr(artifact_id: str, model_id: str, redis=Depends(get_redis)) -> ExtractionJob:
def link_amr(document_id: str, model_id: str, redis=Depends(get_redis)) -> ExtractionJob:
raise HTTPException(status_code=501, detail="Endpoint is under development")

operation_name = "operations.link_amr"

options = {
"artifact_id": artifact_id,
"document_id": document_id,
"model_id": model_id,
}

Expand Down
1 change: 1 addition & 0 deletions env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ TA1_UNIFIED_URL=http://skema-unified.staging.terarium.ai
MIT_TR_URL=http://mit-tr.staging.terarium.ai
LOG_LEVEL=INFO
MOCK_TA1=True
MOCK_TDS=True
OPENAI_API_KEY=foo
2 changes: 2 additions & 0 deletions lib/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

class Settings(BaseSettings):
MOCK_TA1: bool = True
MOCK_TDS: bool = True
REDIS_HOST: str = "redis.knowledge-middleware"
REDIS_PORT: int = 6379
TA1_UNIFIED_URL: str = "http://ta1:5"
Expand All @@ -11,4 +12,5 @@ class Settings(BaseSettings):
OPENAI_API_KEY: str = "foo"
LOG_LEVEL: str = "INFO"


settings = Settings()
21 changes: 15 additions & 6 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,26 +115,35 @@ def upload(filename, content):
def gen_tds_artifact(context_dir, http_mock, file_storage):
# Mock the TDS artifact
counter = count()
def generate(code=False):
def generate(code=False, **extra_params):
if code:
_type = "code"
else:
_type = "artifacts"
_type = "documents"
artifact = {
"id": f"{_type}-{next(counter)}",
"name": _type,
"description": f"test {_type}",
"timestamp": "2023-07-17T19:11:43",
"metadata": {},
"username": "n/a",
}
if code:
artifact["filename"] = "code.py"
artifact["language"] = "python"
else:
artifact["file_names"]: []
artifact_url = f"{settings.TDS_URL}/{_type}/{artifact['id']}"
http_mock.get(artifact_url, json=artifact)
http_mock.put(artifact_url)
artifact["file_names"] = []

# Override any defaults or extend with provided extra params
artifact.update(extra_params)

if settings.MOCK_TDS:
artifact_url = f"{settings.TDS_URL}/{_type}/{artifact['id']}"
http_mock.get(artifact_url, json=artifact)
http_mock.put(artifact_url)
else:
result = requests.post(f"{settings.TDS_URL}/{_type}", json=artifact)

return artifact
return generate

Expand Down
98 changes: 69 additions & 29 deletions tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,26 @@
params = get_parameterizations()

@pytest.mark.parametrize("resource", params["pdf_extraction"])
def test_pdf_extraction(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_pdf_extraction(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
text_json = json.load(open(f"{context_dir}/text.json"))
text = ""
for d in text_json:
text += f"{d['content']}\n"
tds_artifact = gen_tds_artifact()
tds_artifact["file_names"] = ["paper.pdf"]
tds_artifact["metadata"] = {"text": text}
tds_artifact = gen_tds_artifact(
id=f"test_pdf_extractions_{resource}",
file_names=["paper.pdf"],
text=text,
)
file_storage.upload("paper.pdf", "TEST TEXT")
document_id = tds_artifact["id"]

if settings.MOCK_TA1:
extractions = json.load(open(f"{context_dir}/extractions.json"))
http_mock.post(f"{settings.TA1_UNIFIED_URL}/text-reading/integrated-text-extractions?annotate_skema=True&annotate_mit=True", json=extractions)

query_params = {
"artifact_id": tds_artifact["id"],
"document_id": document_id,
"annotate_skema": True,
"annotate_mit": True,
"name": None,
Expand All @@ -56,14 +59,16 @@ def test_pdf_extraction(context_dir, http_mock, client, worker, gen_tds_artifact


@pytest.mark.parametrize("resource", params["pdf_to_text"])
def test_pdf_to_text(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_pdf_to_text(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
tds_artifact = gen_tds_artifact()
tds_artifact["file_names"] = ["paper.pdf"]
tds_artifact = gen_tds_artifact(
id=f"test_pdf_to_text_{resource}",
file_names=["paper.pdf"]
)
file_storage.upload("paper.pdf", "TEST TEXT")

query_params = {
"artifact_id": tds_artifact["id"],
"document_id": tds_artifact["id"],
}

if settings.MOCK_TA1:
Expand All @@ -88,10 +93,14 @@ def test_pdf_to_text(context_dir, http_mock, client, worker, gen_tds_artifact, f


@pytest.mark.parametrize("resource", params["code_to_amr"])
def test_code_to_amr(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_code_to_amr(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
code = open(f"{context_dir}/code.py").read()
tds_code = gen_tds_artifact(code=True)
tds_code = gen_tds_artifact(
code=True,
id=f"test_code_to_amr_{resource}",
file_names=["code.py"]
)
tds_code["file_names"] = ["code.py"]
file_storage.upload("code.py", code)

Expand All @@ -101,9 +110,10 @@ def test_code_to_amr(context_dir, http_mock, client, worker, gen_tds_artifact, f
"description": "test description",
}

http_mock.post(f"{settings.TDS_URL}/provenance", json={})
http_mock.post(f"{settings.TDS_URL}/models", json={"id": "test"})
http_mock.post(f"{settings.TDS_URL}/model_configurations", json={"id": "test"})
if settings.MOCK_TDS:
http_mock.post(f"{settings.TDS_URL}/provenance", json={})
http_mock.post(f"{settings.TDS_URL}/models", json={"id": "test"})
http_mock.post(f"{settings.TDS_URL}/model_configurations", json={"id": "test"})
if settings.MOCK_TA1:
amr = json.load(open(f"{context_dir}/amr.json"))
http_mock.post(f"{settings.TA1_UNIFIED_URL}/workflows/code/snippets-to-pn-amr", json=amr)
Expand All @@ -122,6 +132,7 @@ def test_code_to_amr(context_dir, http_mock, client, worker, gen_tds_artifact, f
status_response = client.get(f"/status/{job_id}")

job = Job.fetch(job_id, connection=worker.connection)
print(job)
amr_instance = AMR(job.result["amr"])

#### ASSERT ####
Expand Down Expand Up @@ -190,16 +201,18 @@ def test_equations_to_amr(context_dir, http_mock, client, worker, file_storage):


@pytest.mark.parametrize("resource", params["profile_dataset"])
def test_profile_dataset(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_profile_dataset(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
CHAR_LIMIT = 250
text_json = json.load(open(f"{context_dir}/text.json"))
text = ""
for d in text_json:
text += f"{d['content']}\n"
tds_artifact = gen_tds_artifact()
tds_artifact["file_names"] = ["paper.pdf"]
tds_artifact["metadata"] = {"text": text[:CHAR_LIMIT]}
tds_artifact = gen_tds_artifact(
id=f"test_profile_dataset_{resource}",
file_names=["paper.pdf"],
metadata={"text": text[:CHAR_LIMIT]},
)
query_params = {
"artifact_id": tds_artifact["id"],
}
Expand Down Expand Up @@ -240,35 +253,62 @@ def test_profile_dataset(context_dir, http_mock, client, worker, gen_tds_artifac


@pytest.mark.parametrize("resource", params["profile_model"])
def test_profile_model(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_profile_model(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
text_json = json.load(open(f"{context_dir}/text.json"))
text = ""
for d in text_json:
text += f"{d['content']}\n"
text_artifact = gen_tds_artifact()
text_artifact["file_names"] = ["paper.pdf"]
text_artifact["metadata"] = {"text": text}
document = gen_tds_artifact(
id=f"test_profile_model_document_{resource}",
file_names=["paper.pdf"],
metadata={},
text=text,
)
file_storage.upload("paper.pdf", "TEST TEXT")

code = open(f"{context_dir}/code.py").read()
code_artifact = gen_tds_artifact()
code_artifact["file_names"] = ["code.py"]
code_artifact = gen_tds_artifact(
id=f"test_profile_model_code_{resource}",
code=True,
file_names=["code.py"]

)
file_storage.upload("code.py", code)

model_id = "test_profile_model"
amr = json.load(open(f"{context_dir}/amr.json"))
http_mock.post(f"{settings.TDS_URL}/provenance/search?search_type=models_from_code", json={"result": [text_artifact["id"]]})
http_mock.get(f"{settings.TDS_URL}/models/{text_artifact['id']}", json={"id":text_artifact["id"], "model": amr})
http_mock.put(f"{settings.TDS_URL}/models/{text_artifact['id']}", json={"id": text_artifact["id"]})
if settings.MOCK_TDS:
http_mock.post(f"{settings.TDS_URL}/provenance/search?search_type=models_from_code", json={"result": [code_artifact["id"]]})
http_mock.get(f"{settings.TDS_URL}/models/{model_id}", json={"id": model_id, "model": amr})
http_mock.put(f"{settings.TDS_URL}/models/{model_id}", json={"id": model_id})
else:
amr["id"] = model_id
requests.post(f"{settings.TDS_URL}/models", json=amr)
requests.post(
f"{settings.TDS_URL}/provenance",
json={
"timestamp": "2023-09-05T17:41:18.187841",
"relation_type": "EXTRACTED_FROM",
"left": model_id,
"left_type": "Model",
"right": code_artifact["id"],
"right_type": "Code",
}
)

if settings.MOCK_TA1:
model_card = json.load(open(f"{context_dir}/model_card.json"))
http_mock.post(f"{settings.MIT_TR_URL}/cards/get_model_card", json=model_card)

query_params = {"paper_artifact_id": text_artifact["id"]}
    query_params = {
        "document_id": document["id"],
    }

#### ACT ####
response = client.post(
f"/profile_model/{text_artifact['id']}",
f"/profile_model/{model_id}",
params=query_params,
headers={"Content-Type": "application/json"},
)
Expand Down
Loading

0 comments on commit 193a5eb

Please sign in to comment.