From c8b6918ff97cc5d348715aa55f38886936f82fbb Mon Sep 17 00:00:00 2001 From: Chris Carlon Date: Tue, 26 Nov 2024 21:45:20 +0000 Subject: [PATCH] feat(ckan explorer): added in get orgs list functionality - needed slightly different implementation for london datastore as they don't seem to have the organization_list endpoint [2024-11-26] --- HerdingCats/endpoints/api_endpoints.py | 6 ++-- HerdingCats/explorer/cat_explore.py | 48 ++++++++++++++++++++++---- 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/HerdingCats/endpoints/api_endpoints.py b/HerdingCats/endpoints/api_endpoints.py index 1d5c0f8..c73c1b2 100644 --- a/HerdingCats/endpoints/api_endpoints.py +++ b/HerdingCats/endpoints/api_endpoints.py @@ -4,6 +4,7 @@ class CkanApiPathsDocs: PACKAGE_LIST = "https://docs.ckan.org/en/2.11/api/index.html#ckan.logic.action.get.package_list" PACKAGE_SEARCH = "https://docs.ckan.org/en/2.11/api/index.html#ckan.logic.action.get.package_search" + CURRENT_PACKAGE_LIST_WITH_RESOURCES = "https://docs.ckan.org/en/2.11/api/index.html#ckan.logic.action.get.current_package_list_with_resources" # Need to add the rest !! @@ -15,6 +16,7 @@ class CkanApiPaths: CURRENT_PACKAGE_LIST_WITH_RESOURCES = BASE_PATH.format( "current_package_list_with_resources" ) + ORGANIZATION_LIST = BASE_PATH.format("organization_list") # Add more paths as needed... @@ -22,8 +24,8 @@ class CkanDataCatalogues(Enum): LONDON_DATA_STORE = "https://data.london.gov.uk" UK_GOV = "https://data.gov.uk" SUBAK = "https://data.subak.org" - HUMANITARIAN = "https://data.humdata.org" - AFRICA = "https://open.africa" + HUMANITARIAN_DATA_STORE = "https://data.humdata.org" + OPEN_AFRICA = "https://open.africa" # CANADA_GOV = "https://search.open.canada.ca/opendata" NEED TO LOOK INTO THIS ONE MORE # NORTHERN_DATA_MILL = "https://datamillnorth.org" NEED TO LOOK INTO THIS ONE MORE # Add more catalogues as needed... 
diff --git a/HerdingCats/explorer/cat_explore.py b/HerdingCats/explorer/cat_explore.py index f415a17..df5c7b0 100644 --- a/HerdingCats/explorer/cat_explore.py +++ b/HerdingCats/explorer/cat_explore.py @@ -1,11 +1,10 @@ -from numpy._core.multiarray import empty import requests import pandas as pd import polars as pl import duckdb import json -from typing import Any, Dict, Optional, Union, Literal, List +from typing import Any, Dict, Optional, Union, Literal, List, Tuple from loguru import logger from urllib.parse import urlencode @@ -29,9 +28,14 @@ def __init__(self, cat_session: CatSession): CkanCatSession # Example usage... + import HerdingCats as hc + + def main(): + with hc.CatSession(hc.CkanDataCatalogues.LONDON_DATA_STORE) as session: + explore = hc.CkanCatExplorer(session) + if __name__ == "__main__": - with CatSession("data.london.gov.uk") as session: - explore = CkanCatExplorer(session) + main() """ self.cat_session = cat_session @@ -42,7 +46,7 @@ def check_site_health(self) -> None: """ Make sure the Ckan endpoints are healthy and reachable - This calls the Ckan package_list endpoint to check if site is reacheable. + This calls the Ckan package_list endpoint to check if site is still reacheable. # Example usage... if __name__ == "__main__": @@ -361,6 +365,38 @@ def package_list_dataframe_extra( logger.error(f"Failed to search datasets: {e}") raise CatExplorerError(f"Failed to search datasets: {str(e)}")
+    def get_organisation_list(self) -> Tuple[int, list]:
+        """
+        Return the number of organisations in the catalogue and the list itself.
+
+        Falls back to the unique 'maintainer' values of the package list when the
+        organization_list call fails - useful, but less accurate than real orgs.
+        Maintainers keep the order in which they first appear in the response.
+        Re-raises the underlying exception if the fallback fails as well.
+        """
+        url = self.cat_session.base_url + CkanApiPaths.ORGANIZATION_LIST
+        try:
+            response = self.cat_session.session.get(url)
+            response.raise_for_status()
+            organisations = response.json()["result"]
+            return len(organisations), organisations
+        except Exception as e:  # Exception already covers requests.RequestException
+            logger.warning(f"Primary organisation search method failed - attempting secondary method that fetches 'maintainers' only - this may still be useful but not as accurate: {e}")
+
+        # Secondary method using package endpoint
+        package_url = self.cat_session.base_url + CkanApiPaths.CURRENT_PACKAGE_LIST_WITH_RESOURCES
+        try:
+            package_response = self.cat_session.session.get(package_url)
+            package_response.raise_for_status()
+            packages = package_response.json()["result"]
+            # De-duplicate via dict keys (not a set) so the returned order is stable
+            names = (entry.get("maintainer") for entry in packages)
+            maintainers = list(dict.fromkeys(name for name in names if name))
+            return len(maintainers), maintainers
+        except Exception as e:
+            logger.error(f"Both organization list methods failed: {e}")
+            raise
+
 # ---------------------------- # Show catalogue freshness # ---------------------------- @@ -374,7 +410,7 @@ def catalogue_freshness(self): It currently uses metadata_modified at the dataset level - not resource level. """ logger.warning( - "This method might not work for all catalogues, and will return 0s. It currently only works for the London Datastore. We are working on improving this" + "This method DOES NOT work for all catalogues, and will return 0s. It currently only works for the London Datastore. We are working on improving and fixing this." ) url = (