Skip to content

Commit

Permalink
Merge branch 'main' into remove-extra-group
Browse files Browse the repository at this point in the history
  • Loading branch information
fivegrant committed Sep 7, 2023
2 parents ced0c67 + 6d92a70 commit 193a5eb
Show file tree
Hide file tree
Showing 7 changed files with 237 additions and 135 deletions.
38 changes: 19 additions & 19 deletions api/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,15 @@ def code_to_amr(


@app.post("/pdf_to_text")
def pdf_to_text(artifact_id: str, redis=Depends(get_redis)) -> ExtractionJob:
"""Run text extractions over pdfs and stores the text as metadata on the artifact
def pdf_to_text(document_id: str, redis=Depends(get_redis)) -> ExtractionJob:
    """Run text extractions over pdfs and store the text on the document
Args:
`artifact_id`: the id of the artifact to process
`document_id`: the id of the document to process
"""
operation_name = "operations.pdf_to_text"

options = {"artifact_id": artifact_id}
options = {"document_id": document_id}

resp = create_job(operation_name=operation_name, options=options, redis=redis)

Expand All @@ -140,7 +140,7 @@ def pdf_to_text(artifact_id: str, redis=Depends(get_redis)) -> ExtractionJob:

@app.post("/pdf_extractions")
async def pdf_extractions(
artifact_id: str,
document_id: str,
annotate_skema: bool = True,
annotate_mit: bool = True,
name: str = None,
Expand All @@ -156,7 +156,7 @@ async def pdf_extractions(

# text_content = text_content[: len(text_content) // 2]
options = {
"artifact_id": artifact_id,
"document_id": document_id,
"annotate_skema": annotate_skema,
"annotate_mit": annotate_mit,
"name": name,
Expand All @@ -170,23 +170,23 @@ async def pdf_extractions(

@app.post("/profile_dataset/{dataset_id}")
def profile_dataset(
dataset_id: str, artifact_id: Optional[str] = None, redis=Depends(get_redis)
dataset_id: str, document_id: Optional[str] = None, redis=Depends(get_redis)
) -> ExtractionJob:
"""Profile dataset with MIT's profiling service. This optionally accepts an `artifact_id` which
is expected to be some user uploaded document which has had its text extracted and stored to
`metadata.text`.
    """Profile dataset with MIT's profiling service. This optionally accepts a `document_id` which
is expected to be some user uploaded document which has had its text extracted and stored as
the `text` element on the document.
> NOTE: if nothing is found within `metadata.text` of the artifact then it is ignored.
> NOTE: if nothing is found within `text` of the document then it is ignored.
Args:
dataset_id: the id of the dataset to profile
artifact_id [optional]: the id of the artifact (paper/document) associated with the dataset.
document_id [optional]: the id of the document (paper/resource) associated with the dataset.
"""
operation_name = "operations.data_card"

options = {
"dataset_id": dataset_id,
"artifact_id": artifact_id,
"document_id": document_id,
}

resp = create_job(operation_name=operation_name, options=options, redis=redis)
Expand All @@ -195,8 +195,8 @@ def profile_dataset(


@app.post("/profile_model/{model_id}")
def profile_model(model_id: str, paper_artifact_id: str, redis=Depends(get_redis)) -> ExtractionJob:
"""Profile model with MIT's profiling service. This takes in a paper and code artifact
def profile_model(model_id: str, document_id: str, redis=Depends(get_redis)) -> ExtractionJob:
"""Profile model with MIT's profiling service. This takes in a paper and code document
and updates a model (AMR) with the profiled metadata card. It requires that the paper
has been extracted with `/pdf_to_text` and the code has been converted to an AMR
with `/code_to_amr`
Expand All @@ -205,25 +205,25 @@ def profile_model(model_id: str, paper_artifact_id: str, redis=Depends(get_redis
Args:
model_id: the id of the model to profile
paper_artifact_id: the id of the paper artifact
        document_id: the id of the paper document
"""
operation_name = "operations.model_card"

options = {"model_id": model_id, "paper_artifact_id": paper_artifact_id}
options = {"model_id": model_id, "paper_document_id": document_id}

resp = create_job(operation_name=operation_name, options=options, redis=redis)

return resp


@app.post("/link_amr")
def link_amr(artifact_id: str, model_id: str, redis=Depends(get_redis)) -> ExtractionJob:
def link_amr(document_id: str, model_id: str, redis=Depends(get_redis)) -> ExtractionJob:
raise HTTPException(status_code=501, detail="Endpoint is under development")

operation_name = "operations.link_amr"

options = {
"artifact_id": artifact_id,
"document_id": document_id,
"model_id": model_id,
}

Expand Down
1 change: 1 addition & 0 deletions env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ TA1_UNIFIED_URL=http://skema-unified.staging.terarium.ai
MIT_TR_URL=http://mit-tr.staging.terarium.ai
LOG_LEVEL=INFO
MOCK_TA1=True
MOCK_TDS=True
OPENAI_API_KEY=foo
2 changes: 2 additions & 0 deletions lib/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

class Settings(BaseSettings):
MOCK_TA1: bool = True
MOCK_TDS: bool = True
REDIS_HOST: str = "redis.knowledge-middleware"
REDIS_PORT: int = 6379
TA1_UNIFIED_URL: str = "http://ta1:5"
Expand All @@ -11,4 +12,5 @@ class Settings(BaseSettings):
OPENAI_API_KEY: str = "foo"
LOG_LEVEL: str = "INFO"


settings = Settings()
21 changes: 15 additions & 6 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,26 +115,35 @@ def upload(filename, content):
def gen_tds_artifact(context_dir, http_mock, file_storage):
# Mock the TDS artifact
counter = count()
def generate(code=False):
def generate(code=False, **extra_params):
if code:
_type = "code"
else:
_type = "artifacts"
_type = "documents"
artifact = {
"id": f"{_type}-{next(counter)}",
"name": _type,
"description": f"test {_type}",
"timestamp": "2023-07-17T19:11:43",
"metadata": {},
"username": "n/a",
}
if code:
artifact["filename"] = "code.py"
artifact["language"] = "python"
else:
artifact["file_names"]: []
artifact_url = f"{settings.TDS_URL}/{_type}/{artifact['id']}"
http_mock.get(artifact_url, json=artifact)
http_mock.put(artifact_url)
artifact["file_names"] = []

# Override any defaults or extend with provided extra params
artifact.update(extra_params)

if settings.MOCK_TDS:
artifact_url = f"{settings.TDS_URL}/{_type}/{artifact['id']}"
http_mock.get(artifact_url, json=artifact)
http_mock.put(artifact_url)
else:
result = requests.post(f"{settings.TDS_URL}/{_type}", json=artifact)

return artifact
return generate

Expand Down
98 changes: 69 additions & 29 deletions tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,26 @@
params = get_parameterizations()

@pytest.mark.parametrize("resource", params["pdf_extraction"])
def test_pdf_extraction(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_pdf_extraction(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
text_json = json.load(open(f"{context_dir}/text.json"))
text = ""
for d in text_json:
text += f"{d['content']}\n"
tds_artifact = gen_tds_artifact()
tds_artifact["file_names"] = ["paper.pdf"]
tds_artifact["metadata"] = {"text": text}
tds_artifact = gen_tds_artifact(
id=f"test_pdf_extractions_{resource}",
file_names=["paper.pdf"],
text=text,
)
file_storage.upload("paper.pdf", "TEST TEXT")
document_id = tds_artifact["id"]

if settings.MOCK_TA1:
extractions = json.load(open(f"{context_dir}/extractions.json"))
http_mock.post(f"{settings.TA1_UNIFIED_URL}/text-reading/integrated-text-extractions?annotate_skema=True&annotate_mit=True", json=extractions)

query_params = {
"artifact_id": tds_artifact["id"],
"document_id": document_id,
"annotate_skema": True,
"annotate_mit": True,
"name": None,
Expand All @@ -56,14 +59,16 @@ def test_pdf_extraction(context_dir, http_mock, client, worker, gen_tds_artifact


@pytest.mark.parametrize("resource", params["pdf_to_text"])
def test_pdf_to_text(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_pdf_to_text(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
tds_artifact = gen_tds_artifact()
tds_artifact["file_names"] = ["paper.pdf"]
tds_artifact = gen_tds_artifact(
id=f"test_pdf_to_text_{resource}",
file_names=["paper.pdf"]
)
file_storage.upload("paper.pdf", "TEST TEXT")

query_params = {
"artifact_id": tds_artifact["id"],
"document_id": tds_artifact["id"],
}

if settings.MOCK_TA1:
Expand All @@ -88,10 +93,14 @@ def test_pdf_to_text(context_dir, http_mock, client, worker, gen_tds_artifact, f


@pytest.mark.parametrize("resource", params["code_to_amr"])
def test_code_to_amr(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_code_to_amr(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
code = open(f"{context_dir}/code.py").read()
tds_code = gen_tds_artifact(code=True)
tds_code = gen_tds_artifact(
code=True,
id=f"test_code_to_amr_{resource}",
file_names=["code.py"]
)
tds_code["file_names"] = ["code.py"]
file_storage.upload("code.py", code)

Expand All @@ -101,9 +110,10 @@ def test_code_to_amr(context_dir, http_mock, client, worker, gen_tds_artifact, f
"description": "test description",
}

http_mock.post(f"{settings.TDS_URL}/provenance", json={})
http_mock.post(f"{settings.TDS_URL}/models", json={"id": "test"})
http_mock.post(f"{settings.TDS_URL}/model_configurations", json={"id": "test"})
if settings.MOCK_TDS:
http_mock.post(f"{settings.TDS_URL}/provenance", json={})
http_mock.post(f"{settings.TDS_URL}/models", json={"id": "test"})
http_mock.post(f"{settings.TDS_URL}/model_configurations", json={"id": "test"})
if settings.MOCK_TA1:
amr = json.load(open(f"{context_dir}/amr.json"))
http_mock.post(f"{settings.TA1_UNIFIED_URL}/workflows/code/snippets-to-pn-amr", json=amr)
Expand All @@ -122,6 +132,7 @@ def test_code_to_amr(context_dir, http_mock, client, worker, gen_tds_artifact, f
status_response = client.get(f"/status/{job_id}")

job = Job.fetch(job_id, connection=worker.connection)
print(job)
amr_instance = AMR(job.result["amr"])

#### ASSERT ####
Expand Down Expand Up @@ -190,16 +201,18 @@ def test_equations_to_amr(context_dir, http_mock, client, worker, file_storage):


@pytest.mark.parametrize("resource", params["profile_dataset"])
def test_profile_dataset(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_profile_dataset(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
CHAR_LIMIT = 250
text_json = json.load(open(f"{context_dir}/text.json"))
text = ""
for d in text_json:
text += f"{d['content']}\n"
tds_artifact = gen_tds_artifact()
tds_artifact["file_names"] = ["paper.pdf"]
tds_artifact["metadata"] = {"text": text[:CHAR_LIMIT]}
tds_artifact = gen_tds_artifact(
id=f"test_profile_dataset_{resource}",
file_names=["paper.pdf"],
metadata={"text": text[:CHAR_LIMIT]},
)
query_params = {
"artifact_id": tds_artifact["id"],
}
Expand Down Expand Up @@ -240,35 +253,62 @@ def test_profile_dataset(context_dir, http_mock, client, worker, gen_tds_artifac


@pytest.mark.parametrize("resource", params["profile_model"])
def test_profile_model(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage):
def test_profile_model(context_dir, http_mock, client, worker, gen_tds_artifact, file_storage, resource):
#### ARRANGE ####
text_json = json.load(open(f"{context_dir}/text.json"))
text = ""
for d in text_json:
text += f"{d['content']}\n"
text_artifact = gen_tds_artifact()
text_artifact["file_names"] = ["paper.pdf"]
text_artifact["metadata"] = {"text": text}
document = gen_tds_artifact(
id=f"test_profile_model_document_{resource}",
file_names=["paper.pdf"],
metadata={},
text=text,
)
file_storage.upload("paper.pdf", "TEST TEXT")

code = open(f"{context_dir}/code.py").read()
code_artifact = gen_tds_artifact()
code_artifact["file_names"] = ["code.py"]
code_artifact = gen_tds_artifact(
id=f"test_profile_model_code_{resource}",
code=True,
file_names=["code.py"]

)
file_storage.upload("code.py", code)

model_id = "test_profile_model"
amr = json.load(open(f"{context_dir}/amr.json"))
http_mock.post(f"{settings.TDS_URL}/provenance/search?search_type=models_from_code", json={"result": [text_artifact["id"]]})
http_mock.get(f"{settings.TDS_URL}/models/{text_artifact['id']}", json={"id":text_artifact["id"], "model": amr})
http_mock.put(f"{settings.TDS_URL}/models/{text_artifact['id']}", json={"id": text_artifact["id"]})
if settings.MOCK_TDS:
http_mock.post(f"{settings.TDS_URL}/provenance/search?search_type=models_from_code", json={"result": [code_artifact["id"]]})
http_mock.get(f"{settings.TDS_URL}/models/{model_id}", json={"id": model_id, "model": amr})
http_mock.put(f"{settings.TDS_URL}/models/{model_id}", json={"id": model_id})
else:
amr["id"] = model_id
requests.post(f"{settings.TDS_URL}/models", json=amr)
requests.post(
f"{settings.TDS_URL}/provenance",
json={
"timestamp": "2023-09-05T17:41:18.187841",
"relation_type": "EXTRACTED_FROM",
"left": model_id,
"left_type": "Model",
"right": code_artifact["id"],
"right_type": "Code",
}
)

if settings.MOCK_TA1:
model_card = json.load(open(f"{context_dir}/model_card.json"))
http_mock.post(f"{settings.MIT_TR_URL}/cards/get_model_card", json=model_card)

query_params = {"paper_artifact_id": text_artifact["id"]}
    query_params = {
        "document_id": document["id"],
    }

#### ACT ####
response = client.post(
f"/profile_model/{text_artifact['id']}",
f"/profile_model/{model_id}",
params=query_params,
headers={"Content-Type": "application/json"},
)
Expand Down
Loading

0 comments on commit 193a5eb

Please sign in to comment.