From 840dfb8c2cb10db31c28fdb714955c5acb5fd95e Mon Sep 17 00:00:00 2001 From: christophercarlon Date: Sat, 28 Sep 2024 15:05:25 +0100 Subject: [PATCH] Added in more opensoft data catalogues --- HerdingCats/__init__.py | 6 +- HerdingCats/endpoints/api_endpoints.py | 11 ++- HerdingCats/explorer/cat_explore.py | 102 ++++++++++++++++++++++++- HerdingCats/session/cat_session.py | 1 + README.md | 2 + 5 files changed, 117 insertions(+), 5 deletions(-) diff --git a/HerdingCats/__init__.py b/HerdingCats/__init__.py index 4d66071..e20037e 100644 --- a/HerdingCats/__init__.py +++ b/HerdingCats/__init__.py @@ -1,7 +1,8 @@ from .data_loader.data_loader import CkanCatResourceLoader -from .explorer.cat_explore import CkanCatExplorer +from .explorer.cat_explore import CkanCatExplorer, OpenDataSoftCatExplorer from .session.cat_session import CatSession from .errors.cats_errors import CatSessionError, CatExplorerError +from .endpoints.api_endpoints import CkanApiPaths, OpenDataSoftDataCatalogues __all__ = [ "CkanCatResourceLoader", @@ -9,6 +10,9 @@ "CatSession", "CatSessionError", "CatExplorerError", + "CkanApiPaths", + "OpenDataSoftDataCatalogues", + "OpenDataSoftCatExplorer", ] __version__ = "0.1.3" diff --git a/HerdingCats/endpoints/api_endpoints.py b/HerdingCats/endpoints/api_endpoints.py index 9708f79..404a4f1 100644 --- a/HerdingCats/endpoints/api_endpoints.py +++ b/HerdingCats/endpoints/api_endpoints.py @@ -32,15 +32,22 @@ class CkanDataCatalogues(Enum): class OpenDataSoftDataCatalogues(Enum): UK_POWER_NETWORKS = "https://ukpowernetworks.opendatasoft.com" - UKPN = "https://ukpowernetworks.opendatasoft.com" INFRABEL = "https://opendata.infrabel.be" + PARIS = "https://opendata.paris.fr" + TOULOUSE = "https://data.toulouse-metropole.fr" # Add more catalogues as needed... # OPEN DATASOFT class OpenDataSoftApiPaths: + # Normal base paths... 
BASE_PATH = "/api/v2/catalog/{}" - SHOW_DATASETS = BASE_PATH.format("/datasets") + SHOW_DATASETS = BASE_PATH.format("datasets") + + # Alternative base paths... + BASE_PATH_2 = "/api/explore/v2.0/catalog/{}" + SHOW_DATASETS_2 = BASE_PATH_2.format("datasets") + # Add more paths as needed... diff --git a/HerdingCats/explorer/cat_explore.py b/HerdingCats/explorer/cat_explore.py index a7d38ca..678b7bf 100644 --- a/HerdingCats/explorer/cat_explore.py +++ b/HerdingCats/explorer/cat_explore.py @@ -8,16 +8,17 @@ from loguru import logger from urllib.parse import urlencode -from ..endpoints.api_endpoints import CkanApiPaths +from ..endpoints.api_endpoints import CkanApiPaths, OpenDataSoftApiPaths from ..errors.cats_errors import CatExplorerError from ..session.cat_session import CatSession # FIND THE DATA YOU WANT / NEED / ISOLATE PACKAGES AND RESOURCES +# For Ckan Catalogues Only class CkanCatExplorer: def __init__(self, cat_session: CatSession): """ - Takes in a CkanCatSession + Takes in a CatSession Allows user to start exploring data catalogue programatically @@ -844,3 +845,100 @@ def _duckdb_explore( print("First few elements of input data:") print(json.dumps(data[:2], indent=2)) return pd.DataFrame() + + +# FIND THE DATA YOU WANT / NEED / ISOLATE PACKAGES AND RESOURCES +# For Open Datasoft Catalogues Only +class OpenDataSoftCatExplorer: + def __init__(self, cat_session: CatSession): + """ + Takes in a CatSession + + Allows user to start exploring data catalogue programmatically + + Make sure you pass a valid CatSession in + + Args: + CatSession + + # Example usage... 
+ if __name__ == "__main__": + with CatSession("ukpowernetworks.opendatasoft.com") as session: + explore = OpenDataSoftCatExplorer(session) + """ + self.cat_session = cat_session + + def fetch_all_datasets(self) -> dict | None: + urls = [ + self.cat_session.base_url + OpenDataSoftApiPaths.SHOW_DATASETS, + self.cat_session.base_url + OpenDataSoftApiPaths.SHOW_DATASETS_2, + ] + dataset_dict = {} + total_count = 0 + + for url in urls: + offset = 0 + limit = 100 + + try: + while True: + params = {"offset": offset, "limit": limit} + response = self.cat_session.session.get(url, params=params) + + if response.status_code == 400 and url == urls[0]: + logger.warning( + "SHOW_DATASETS endpoint returned 400 status. Trying SHOW_DATASETS_2." + ) + break # Break the inner loop to try the next URL + + response.raise_for_status() + result = response.json() + + for dataset_info in result.get("datasets", []): + if ( + "dataset" in dataset_info + and "metas" in dataset_info["dataset"] + and "default" in dataset_info["dataset"]["metas"] + and "title" in dataset_info["dataset"]["metas"]["default"] + and "dataset_id" in dataset_info["dataset"] + ): + title = dataset_info["dataset"]["metas"]["default"]["title"] + dataset_id = dataset_info["dataset"]["dataset_id"] + dataset_dict[title] = dataset_id + + # Update total_count if available + if "total_count" in result: + total_count = result["total_count"] + + # Check if we've reached the end of the datasets + if len(result.get("datasets", [])) < limit: + break + offset += limit + + # If we've successfully retrieved datasets, no need to try the second URL + if dataset_dict: + break + + except requests.RequestException as e: + if url == urls[-1]: + logger.error(f"Failed to fetch datasets: {e}") + raise CatExplorerError(f"Failed to fetch datasets: {str(e)}") + else: + logger.warning( + f"Failed to fetch datasets from {url}: {e}. Trying next URL." 
+ ) + + if dataset_dict: + returned_count = len(dataset_dict) + if returned_count == total_count: + logger.success( + f"total_count = {total_count} AND returned_count = {returned_count}" + ) + else: + logger.warning( + f"Mismatch in counts: total_count = {total_count}, returned_count = {returned_count}" + ) + return dataset_dict + else: + logger.warning("No datasets were retrieved.") + return None diff --git a/HerdingCats/session/cat_session.py b/HerdingCats/session/cat_session.py index 77cfcbe..8aa8f83 100644 --- a/HerdingCats/session/cat_session.py +++ b/HerdingCats/session/cat_session.py @@ -1,4 +1,5 @@ import requests + from typing import Union from loguru import logger from urllib.parse import urlparse diff --git a/README.md b/README.md index b033a3f..a08f600 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,8 @@ This will improve and speed up how people: | Humanitarian Data Exchange | https://data.humdata.org | CKAN | | UK Power Networks | https://ukpowernetworks.opendatasoft.com | Open Datasoft | | Infrabel | https://opendata.infrabel.be | Open Datasoft | +| Paris | https://opendata.paris.fr | Open Datasoft | +| Toulouse | https://data.toulouse-metropole.fr | Open Datasoft | **TBC** | Catalogue Name | Website | Catalogue API Endpoint Definition |