API: refactor and fix get_result(wait=True)

The previous `GET /dispatches/{dispatch_id}` endpoint was trying to do too much. Its responsibilities are now separated into two endpoints:

* `GET /dispatches`: bulk query dispatch summaries (including status) with options to filter by `dispatch_id`, sort chronologically, and also limit the output to status only.
* `GET /dispatches/{dispatch_id}`: download the manifest.

To achieve the desired behavior of `get_result(id, wait=True)`, the client:

1. Polls the dispatch status by querying the first endpoint.
2. Downloads the manifest after the dispatch has reached a final status.

The server no longer returns 503 errors when the dispatch is not yet "ready". A 503 status code is not entirely accurate here because it is intended to convey temporary service unavailability resulting from server overload or rate limiting. However, the fact that the workflow is still running does not indicate any fault of the server.

These changes will allow `get_result(dispatch_id, wait=True)` to wait as long as required instead of erroring out after some time.

Supporting improvements:

* DAL: Add sorting and pagination to Controller.
* DAL: Improve bulk get when retrieving only some columns. Directly select the specified columns instead of retrieving the whole ORM entities and deferring column loading using `load_only`.
AgnostiqHQ · Jun 14, 2024 · 71ea26f · 71ea26f
1 parent 215d8d3
commit 71ea26f
Show file tree

Hide file tree

Showing 11 changed files with 347 additions and 204 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -39,6 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Improved handling of Covalent version mismatches between client and
   executor environments
+- `get_result(wait=True)` will wait as long as needed
 
 ### Removed
 

diff --git a/covalent/_results_manager/results_manager.py b/covalent/_results_manager/results_manager.py
@@ -19,12 +19,11 @@
 
 import contextlib
 import os
+import time
 from pathlib import Path
-from typing import Dict, List, Optional
+from typing import List, Optional
 
 from furl import furl
-from requests.adapters import HTTPAdapter
-from urllib3.util import Retry
 
 from .._api.apiclient import CovalentAPIClient
 from .._serialize.common import load_asset
@@ -40,9 +39,9 @@
 from .._shared_files.exceptions import MissingLatticeRecordError
 from .._shared_files.schemas.asset import AssetSchema
 from .._shared_files.schemas.result import ResultSchema
+from .._shared_files.util_classes import RESULT_STATUS, Status
 from .._shared_files.utils import copy_file_locally, format_server_url
 from .result import Result
-from .wait import EXTREME
 
 app_log = logger.app_log
 log_stack_info = logger.log_stack_info
@@ -139,12 +138,20 @@ def cancel(dispatch_id: str, task_ids: List[int] = None, dispatcher_addr: str =
 # Multi-part
 
 
+def _query_dispatch_status(dispatch_id: str, api_client: CovalentAPIClient):
+    endpoint = "/api/v2/dispatches"
+    resp = api_client.get(endpoint, params={"dispatch_id": dispatch_id, "status_only": True})
+    resp.raise_for_status()
+    dispatches = resp.json()["dispatches"]
+    if len(dispatches) == 0:
+        raise MissingLatticeRecordError
+
+    return dispatches[0]["status"]
+
+
 def _get_result_export_from_dispatcher(
-    dispatch_id: str,
-    wait: bool = False,
-    status_only: bool = False,
-    dispatcher_addr: str = None,
-) -> Dict:
+    dispatch_id: str, api_client: CovalentAPIClient
+) -> ResultSchema:
     """
     Internal function to get the results of a dispatch from the server without checking if it is ready to read.
 
@@ -161,24 +168,21 @@ def _get_result_export_from_dispatcher(
         MissingLatticeRecordError: If the result is not found.
     """
 
-    if dispatcher_addr is None:
-        dispatcher_addr = format_server_url()
+    # if dispatcher_addr is None:
+    #     dispatcher_addr = format_server_url()
 
-    retries = int(EXTREME) if wait else 5
+    # retries = int(EXTREME) if wait else 5
 
-    adapter = HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=1))
-    api_client = CovalentAPIClient(dispatcher_addr, adapter=adapter, auto_raise=False)
+    # adapter = HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=1))
+    # api_client = CovalentAPIClient(dispatcher_addr, adapter=adapter, auto_raise=False)
 
     endpoint = f"/api/v2/dispatches/{dispatch_id}"
-    response = api_client.get(
-        endpoint,
-        params={"wait": wait, "status_only": status_only},
-    )
+    response = api_client.get(endpoint)
     if response.status_code == 404:
         raise MissingLatticeRecordError
     response.raise_for_status()
     export = response.json()
-    return export
+    return ResultSchema.model_validate(export)
 
 
 # Function to download default assets
@@ -346,11 +350,17 @@ def from_dispatch_id(
         wait: bool = False,
         dispatcher_addr: str = None,
     ) -> "ResultManager":
-        export = _get_result_export_from_dispatcher(
-            dispatch_id, wait, status_only=False, dispatcher_addr=dispatcher_addr
-        )
+        if dispatcher_addr is None:
+            dispatcher_addr = format_server_url()
 
-        manifest = ResultSchema.model_validate(export["result_export"])
+        api_client = CovalentAPIClient(dispatcher_addr)
+        if wait:
+            status = Status(_query_dispatch_status(dispatch_id, api_client))
+            while not RESULT_STATUS.is_terminal(status):
+                time.sleep(1)
+                status = Status(_query_dispatch_status(dispatch_id, api_client))
+
+        manifest = _get_result_export_from_dispatcher(dispatch_id, api_client)
 
         # sort the nodes
         manifest.lattice.transport_graph.nodes.sort(key=lambda x: x.id)
@@ -408,14 +418,15 @@ def _get_result_multistage(
 
     """
 
+    if dispatcher_addr is None:
+        dispatcher_addr = format_server_url()
+
+    api_client = CovalentAPIClient(dispatcher_addr)
     try:
         if status_only:
-            return _get_result_export_from_dispatcher(
-                dispatch_id=dispatch_id,
-                wait=wait,
-                status_only=status_only,
-                dispatcher_addr=dispatcher_addr,
-            )
+            status = _query_dispatch_status(dispatch_id, api_client)
+            return {"id": dispatch_id, "status": status}
+
         rm = get_result_manager(dispatch_id, results_dir, wait, dispatcher_addr)
         _get_default_assets(rm)
 
@@ -496,23 +507,14 @@ def get_result(
         The Result object from the Covalent server
 
     """
-    max_attempts = int(os.getenv("COVALENT_GET_RESULT_RETRIES", 10))
-    num_attempts = 0
-    while num_attempts < max_attempts:
-        try:
-            return _get_result_multistage(
-                dispatch_id=dispatch_id,
-                wait=wait,
-                dispatcher_addr=dispatcher_addr,
-                status_only=status_only,
-                results_dir=results_dir,
-                workflow_output=workflow_output,
-                intermediate_outputs=intermediate_outputs,
-                sublattice_results=sublattice_results,
-                qelectron_db=qelectron_db,
-            )
-
-        except RecursionError as re:
-            app_log.error(re)
-            num_attempts += 1
-    raise RuntimeError("Timed out waiting for result. Please retry or check dispatch.")
+    return _get_result_multistage(
+        dispatch_id=dispatch_id,
+        wait=wait,
+        dispatcher_addr=dispatcher_addr,
+        status_only=status_only,
+        results_dir=results_dir,
+        workflow_output=workflow_output,
+        intermediate_outputs=intermediate_outputs,
+        sublattice_results=sublattice_results,
+        qelectron_db=qelectron_db,
+    )
diff --git a/covalent/triggers/base.py b/covalent/triggers/base.py
@@ -15,8 +15,6 @@
 # limitations under the License.
 
 
-import asyncio
-import json
 from abc import abstractmethod
 
 import requests
@@ -108,17 +106,12 @@ def _get_status(self) -> Status:
         """
 
         if self.use_internal_funcs:
-            from covalent_dispatcher._service.app import export_result
+            from covalent_dispatcher._service.app import get_dispatches_bulk
 
-            response = asyncio.run_coroutine_threadsafe(
-                export_result(self.lattice_dispatch_id, status_only=True),
-                self.event_loop,
-            ).result()
-
-            if isinstance(response, dict):
-                return response["status"]
-
-            return json.loads(response.body.decode()).get("status")
+            response = get_dispatches_bulk(
+                dispatch_id=[self.lattice_dispatch_id], status_only=True
+            )
+            return response.dispatches[0].status
 
         from .. import get_result
 

diff --git a/covalent_dispatcher/_dal/controller.py b/covalent_dispatcher/_dal/controller.py
@@ -17,10 +17,12 @@
 
 from __future__ import annotations
 
-from typing import Generic, Type, TypeVar
+from typing import Generic, List, Optional, Sequence, Type, TypeVar, Union
 
 from sqlalchemy import select, update
-from sqlalchemy.orm import Session, load_only
+from sqlalchemy.engine import Row
+from sqlalchemy.orm import Session
+from sqlalchemy.sql.expression import Select, desc
 
 from .._db import models
 
@@ -50,11 +52,16 @@ def get(
         cls,
         session: Session,
         *,
+        stmt: Optional[Select] = None,
         fields: list,
         equality_filters: dict,
         membership_filters: dict,
         for_update: bool = False,
-    ):
+        sort_fields: List[str] = [],
+        reverse: bool = True,
+        offset: int = 0,
+        max_items: Optional[int] = None,
+    ) -> Union[Sequence[Row], Sequence[T]]:
         """Bulk ORM-enabled SELECT.
 
         Args:
@@ -64,19 +71,40 @@ def get(
             membership_filters: Dict{field_name: value_list}
             for_update: Whether to lock the selected rows
 
+        Returns:
+            A list of SQLAlchemy Rows or whole ORM entities depending
+        on whether only a subset of fields is specified.
+
         """
-        stmt = select(cls.model)
+        if stmt is None:
+            if len(fields) > 0:
+                entities = [getattr(cls.model, attr) for attr in fields]
+                stmt = select(*entities)
+            else:
+                stmt = select(cls.model)
+
         for attr, val in equality_filters.items():
             stmt = stmt.where(getattr(cls.model, attr) == val)
         for attr, vals in membership_filters.items():
             stmt = stmt.where(getattr(cls.model, attr).in_(vals))
-        if len(fields) > 0:
-            attrs = [getattr(cls.model, f) for f in fields]
-            stmt = stmt.options(load_only(*attrs))
         if for_update:
             stmt = stmt.with_for_update()
-
-        return session.scalars(stmt).all()
+        for attr in sort_fields:
+            if reverse:
+                stmt = stmt.order_by(desc(getattr(cls.model, attr)))
+            else:
+                stmt = stmt.order_by(getattr(cls.model, attr))
+
+        stmt = stmt.offset(offset)
+        if max_items:
+            stmt = stmt.limit(max_items)
+
+        if len(fields) == 0:
+            # Return whole ORM entities
+            return session.scalars(stmt).all()
+        else:
+            # Return a named tuple containing the selected cols
+            return session.execute(stmt).all()
 
     @classmethod
     def get_by_primary_key(

diff --git a/covalent_dispatcher/_dal/result.py b/covalent_dispatcher/_dal/result.py
@@ -21,6 +21,7 @@
 from datetime import datetime
 from typing import Any, Dict, List
 
+from sqlalchemy import select
 from sqlalchemy.orm import Session
 
 from covalent._shared_files import logger
@@ -45,6 +46,41 @@
 class ResultMeta(Record[models.Lattice]):
     model = models.Lattice
 
+    @classmethod
+    def get_toplevel_dispatches(
+        cls,
+        session: Session,
+        *,
+        fields: list,
+        equality_filters: dict,
+        membership_filters: dict,
+        for_update: bool = False,
+        sort_fields: List[str] = [],
+        reverse: bool = True,
+        offset: int = 0,
+        max_items: int = 10,
+    ):
+        if len(fields) > 0:
+            entities = [getattr(cls.model, attr) for attr in fields]
+            stmt = select(*entities)
+        else:
+            stmt = select(cls.model)
+
+        stmt = stmt.where(models.Lattice.root_dispatch_id == models.Lattice.dispatch_id)
+
+        return cls.get(
+            session=session,
+            stmt=stmt,
+            fields=fields,
+            equality_filters=equality_filters,
+            membership_filters=membership_filters,
+            for_update=for_update,
+            sort_fields=sort_fields,
+            reverse=reverse,
+            offset=offset,
+            max_items=max_items,
+        )
+
 
 class ResultAsset(Record[models.LatticeAsset]):
     model = models.LatticeAsset
@@ -175,7 +211,7 @@ def _update_dispatch(
             with self.session() as session:
                 electron_rec = Electron.get_db_records(
                     session,
-                    keys={"id", "parent_lattice_id"},
+                    keys=ELECTRON_KEYS,
                     equality_filters={"id": self._electron_id},
                     membership_filters={},
                 )[0]
@@ -343,7 +379,7 @@ def _get_incomplete_nodes(self):
             A dictionary {"failed": [node_ids], "cancelled": [node_ids]}
         """
         with self.session() as session:
-            query_keys = {"parent_lattice_id", "node_id", "name", "status"}
+            query_keys = {"id", "parent_lattice_id", "node_id", "name", "status"}
             records = Electron.get_db_records(
                 session,
                 keys=query_keys,