From 3d96544c12440ff43657f031ce735b70a946e8ee Mon Sep 17 00:00:00 2001
From: Chris Carlon
Date: Mon, 2 Dec 2024 23:50:22 +0000
Subject: [PATCH] style: small refactors [2024-12-02]

---
 HerdingCats/endpoints/api_endpoints.py |  9 ++--
 HerdingCats/errors/cats_errors.py      |  2 +-
 HerdingCats/explorer/cat_explore.py    | 58 +++++++++++++++++---------
 3 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/HerdingCats/endpoints/api_endpoints.py b/HerdingCats/endpoints/api_endpoints.py
index d2aed23..0071ac9 100644
--- a/HerdingCats/endpoints/api_endpoints.py
+++ b/HerdingCats/endpoints/api_endpoints.py
@@ -8,7 +8,6 @@ class CkanApiPathsDocs:
     ORGANIZATION_LIST = "https://docs.ckan.org/en/2.10/api/index.html#ckan.logic.action.get.organization_list"
     # Need to add the rest....
 
-
 class CkanApiPaths:
     BASE_PATH = "/api/3/action/{}"
     PACKAGE_LIST = BASE_PATH.format("package_list")
@@ -20,7 +19,6 @@ class CkanApiPaths:
     ORGANIZATION_LIST = BASE_PATH.format("organization_list")
     # Add more paths as needed...
 
-
 class CkanDataCatalogues(Enum):
     LONDON_DATA_STORE = "https://data.london.gov.uk"
     UK_GOV = "https://data.gov.uk"
@@ -31,7 +29,6 @@ class CkanDataCatalogues(Enum):
     # NORTHERN_DATA_MILL = "https://datamillnorth.org" NEED TO LOOK INTO THIS ONE MORE
     # Add more catalogues as needed...
 
-
 # OPEN DATASOFT
 class OpenDataSoftDataCatalogues(Enum):
     UK_POWER_NETWORKS = "https://ukpowernetworks.opendatasoft.com"
@@ -57,19 +54,19 @@ class OpenDataSoftApiPaths:
     SHOW_DATASET_EXPORTS_2 = BASE_PATH_2.format("datasets/{}/exports")
     # Add more paths as needed...
 
-
 # DCAT TBC
 class DcatApiPaths:
     BASE_PATH = "/api/feed/dcat-ap/2.1.1.json"
     # Add more paths as needed...
 
+# BESPOKE DATA GOUV FR
+class FrenchGouvApiDocs:
+    DATASET_DOCS = "https://guides.data.gouv.fr/guide-data.gouv.fr/api-1/reference/datasets"
 
-# Bespoke Data.Gouv.Fr (France)
 class FrenchGouvApiPaths:
     BASE_PATH = "/api/1/{}"
     SHOW_DATASETS = BASE_PATH.format("datasets")
     SHOW_DATASETS_BY_ID = BASE_PATH.format("datasets/{}")
 
-
 class FrenchGouvCatalogue(Enum):
     GOUV_FR = "https://www.data.gouv.fr"

diff --git a/HerdingCats/errors/cats_errors.py b/HerdingCats/errors/cats_errors.py
index e9aee59..709fe8b 100644
--- a/HerdingCats/errors/cats_errors.py
+++ b/HerdingCats/errors/cats_errors.py
@@ -21,7 +21,7 @@ def __init__(self, message: str, original_error: Optional[Exception] = None) ->
 
         # Build the error message with color
         error_msg = (
-            f"{self.RED}OpenDataSoftExplorer Error: {message}{self.RESET}"
+            f"{self.RED}OpenDataSoftExplorer Error 🐈‍⬛: {message}{self.RESET}"
         )
 
         if original_error:
diff --git a/HerdingCats/explorer/cat_explore.py b/HerdingCats/explorer/cat_explore.py
index 631d6df..d4abad9 100644
--- a/HerdingCats/explorer/cat_explore.py
+++ b/HerdingCats/explorer/cat_explore.py
@@ -1215,7 +1215,7 @@ def main():
     # ----------------------------
     def check_health_check(self) -> None:
         """
-        TBC
+        Check the health of the French government's open data catalogue endpoint
         """
 
         url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS
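A quick aside on how the pieces above fit together: the constants in api_endpoints.py are path templates that get joined onto a catalogue base URL at request time, exactly as the context line above does with self.cat_session.base_url. A minimal sketch of that composition, using plain requests rather than the library's cat_session wrapper (the "data" envelope is the response shape the explorer code below also relies on):

    import requests

    from HerdingCats.endpoints.api_endpoints import (
        FrenchGouvApiPaths,
        FrenchGouvCatalogue,
    )

    # FrenchGouvCatalogue.GOUV_FR.value is "https://www.data.gouv.fr" and
    # FrenchGouvApiPaths.SHOW_DATASETS is "/api/1/datasets"
    url = FrenchGouvCatalogue.GOUV_FR.value + FrenchGouvApiPaths.SHOW_DATASETS

    response = requests.get(url, params={"page": 1, "page_size": 1}, timeout=30)
    response.raise_for_status()
    print(response.json()["data"][0]["id"])  # id of the first dataset returned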
@@ -1240,41 +1240,59 @@ def check_health_check(self) -> None:
     def get_all_datasets(self) -> dict:
         """
         Paginates through all datasets in the French Government data catalogue
-        and creates a dictionary of acronyms and IDs.
-
+        and creates a dictionary of acronyms and IDs using streaming.
+
         Returns:
             dict: Dictionary with dataset IDs as keys and acronyms as values
         """
         datasets = {}
         page = 1
+        page_size = 100
         base_url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS
 
         while True:
             try:
-                # Make request with pagination
-                params = {'page': page}
-                response = self.cat_session.session.get(base_url, params=params)
-
+                params = {
+                    'page': page,
+                    'page_size': page_size
+                }
+                # Stream the response instead of loading the whole body at once
+                response = self.cat_session.session.get(base_url, params=params, stream=True)
                 if response.status_code != 200:
                     logger.error(f"Failed to fetch page {page} with status code {response.status_code}")
                     break
 
-                data = response.json()
+                # Scan the streamed body line by line (assumes the API returns
+                # pretty-printed JSON with one field per line)
+                next_page_exists = False
+                for line in response.iter_lines():
+                    if not line:
+                        continue
 
-                # Process datasets on current page
-                for dataset in data['data']:
-                    dataset_id = dataset.get('id', '')
-                    # Handle null or empty acronyms by setting to empty string
-                    acronym = dataset.get('acronym') if dataset.get('acronym') else ''
-                    datasets[dataset_id] = acronym
+                    decoded_line = line.decode('utf-8')
 
-                # Check if we've reached the last page
-                if not data.get('next_page'):
+                    # Check for dataset entries
+                    if '"id":' in decoded_line and '"acronym":' in decoded_line:
+                        # Extract just what we need using string operations
+                        id_start = decoded_line.find('"id": "') + 7  # len('"id": "') == 7
+                        id_end = decoded_line.find('"', id_start)
+                        dataset_id = decoded_line[id_start:id_end]
+
+                        acronym_start = decoded_line.find('"acronym": "') + 12  # len('"acronym": "') == 12
+                        if acronym_start > 11:  # found; a failed find() gives -1 + 12 == 11
+                            acronym_end = decoded_line.find('"', acronym_start)
+                            acronym = decoded_line[acronym_start:acronym_end]
+                        else:
+                            acronym = ''
+
+                        datasets[dataset_id] = acronym
+
+                    # Check for next_page
+                    elif '"next_page":' in decoded_line:
+                        next_page_exists = 'null' not in decoded_line
+
+                if not next_page_exists:
                     break
 
                 page += 1
-
-                # Optional: Log progress every 10 pages
                 if page % 10 == 0:
                     logger.info(f"Processed {page} pages ({len(datasets)} datasets)")
@@ -1387,8 +1405,8 @@ def get_datasets_by_identifiers(self, identifiers: list) -> dict:
                 if dataset:
                     results[identifier] = dataset
 
-                # Optional: Add a small delay to avoid overwhelming the API
-                time.sleep(0.1)
+                # Add a small delay to avoid overwhelming the API
+                time.sleep(0.5)
 
             except Exception as e:
                 logger.error(f"Error processing identifier {identifier}: {str(e)}")
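For reference, the non-streaming logic that the get_all_datasets hunk replaces can be written as a standalone helper. This is a sketch under stated assumptions, not part of the patch: it uses plain requests instead of the library's cat_session object, and it assumes the data.gouv.fr envelope the removed code also relied on, {"data": [...], "next_page": <url or null>}:

    import requests

    def fetch_all_dataset_acronyms(base_url: str = "https://www.data.gouv.fr") -> dict:
        """Collect {dataset_id: acronym} from the paginated datasets endpoint."""
        datasets = {}
        url = base_url + "/api/1/datasets"
        params = {"page": 1, "page_size": 100}
        while url:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            payload = response.json()
            for dataset in payload["data"]:
                # Null or missing acronyms collapse to an empty string,
                # mirroring the behaviour of the removed code
                datasets[dataset["id"]] = dataset.get("acronym") or ""
            # Assumes next_page is a full URL, or null on the last page
            url = payload.get("next_page")
            params = None  # the next_page URL already carries its query string
        return datasets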