From 3d96544c12440ff43657f031ce735b70a946e8ee Mon Sep 17 00:00:00 2001
From: Chris Carlon
Date: Mon, 2 Dec 2024 23:50:22 +0000
Subject: [PATCH] style: small refactors [2024-12-02]

---
 HerdingCats/endpoints/api_endpoints.py |  9 ++--
 HerdingCats/errors/cats_errors.py      |  2 +-
 HerdingCats/explorer/cat_explore.py    | 58 +++++++++++++++++---------
 3 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/HerdingCats/endpoints/api_endpoints.py b/HerdingCats/endpoints/api_endpoints.py
index d2aed23..0071ac9 100644
--- a/HerdingCats/endpoints/api_endpoints.py
+++ b/HerdingCats/endpoints/api_endpoints.py
@@ -8,7 +8,6 @@ class CkanApiPathsDocs:
     ORGANIZATION_LIST = "https://docs.ckan.org/en/2.10/api/index.html#ckan.logic.action.get.organization_list"
     # Need to add the rest....
 
-
 class CkanApiPaths:
     BASE_PATH = "/api/3/action/{}"
     PACKAGE_LIST = BASE_PATH.format("package_list")
@@ -20,7 +19,6 @@ class CkanApiPaths:
     ORGANIZATION_LIST = BASE_PATH.format("organization_list")
     # Add more paths as needed...
 
-
 class CkanDataCatalogues(Enum):
     LONDON_DATA_STORE = "https://data.london.gov.uk"
     UK_GOV = "https://data.gov.uk"
@@ -31,7 +29,6 @@ class CkanDataCatalogues(Enum):
     # NORTHERN_DATA_MILL = "https://datamillnorth.org" NEED TO LOOK INTO THIS ONE MORE
     # Add more catalogues as needed...
 
-
 # OPEN DATASOFT
 class OpenDataSoftDataCatalogues(Enum):
     UK_POWER_NETWORKS = "https://ukpowernetworks.opendatasoft.com"
@@ -57,19 +54,19 @@ class OpenDataSoftApiPaths:
     SHOW_DATASET_EXPORTS_2 = BASE_PATH_2.format("datasets/{}/exports")
     # Add more paths as needed...
 
-
 # DCAT TBC
 class DcatApiPaths:
     BASE_PATH = "/api/feed/dcat-ap/2.1.1.json"
     # Add more paths as needed...
 
+# BESPOKE DATA GOUV FR
+class FrenchGouvApiDocs:
+    DATASET_DOCS = "https://guides.data.gouv.fr/guide-data.gouv.fr/api-1/reference/datasets"
 
-# Bespoke Data.Gouv.Fr (France)
 class FrenchGouvApiPaths:
     BASE_PATH = "/api/1/{}"
     SHOW_DATASETS = BASE_PATH.format("datasets")
     SHOW_DATASETS_BY_ID = BASE_PATH.format("datasets/{}")
 
-
 class FrenchGouvCatalogue(Enum):
     GOUV_FR = "https://www.data.gouv.fr"

diff --git a/HerdingCats/errors/cats_errors.py b/HerdingCats/errors/cats_errors.py
index e9aee59..709fe8b 100644
--- a/HerdingCats/errors/cats_errors.py
+++ b/HerdingCats/errors/cats_errors.py
@@ -21,7 +21,7 @@ def __init__(self, message: str, original_error: Optional[Exception] = None) ->
 
         # Build the error message with color
         error_msg = (
-            f"{self.RED}OpenDataSoftExplorer Error: {message}{self.RESET}"
+            f"{self.RED}OpenDataSoftExplorer Error 🐈‍⬛: {message}{self.RESET}"
         )
 
         if original_error:
diff --git a/HerdingCats/explorer/cat_explore.py b/HerdingCats/explorer/cat_explore.py
index 631d6df..d4abad9 100644
--- a/HerdingCats/explorer/cat_explore.py
+++ b/HerdingCats/explorer/cat_explore.py
@@ -1215,7 +1215,7 @@ def main():
     # ----------------------------
     def check_health_check(self) -> None:
         """
-        TBC
+        Check the health of the French government's open data catalogue endpoint
         """
 
         url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS
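A quick aside on how the pieces above fit together: the constants in api_endpoints.py are path templates that get joined onto a catalogue base URL at request time, exactly as the context line above does with self.cat_session.base_url. A minimal sketch of that composition, using plain requests rather than the library's cat_session wrapper (the "data" envelope is the response shape the explorer code below also relies on):

    import requests

    from HerdingCats.endpoints.api_endpoints import (
        FrenchGouvApiPaths,
        FrenchGouvCatalogue,
    )

    # FrenchGouvCatalogue.GOUV_FR.value is "https://www.data.gouv.fr" and
    # FrenchGouvApiPaths.SHOW_DATASETS is "/api/1/datasets"
    url = FrenchGouvCatalogue.GOUV_FR.value + FrenchGouvApiPaths.SHOW_DATASETS

    response = requests.get(url, params={"page": 1, "page_size": 1}, timeout=30)
    response.raise_for_status()
    print(response.json()["data"][0]["id"])  # id of the first dataset returned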
@@ -1240,41 +1240,59 @@ def check_health_check(self) -> None:
     def get_all_datasets(self) -> dict:
         """
         Paginates through all datasets in the French Government data catalogue
-        and creates a dictionary of acronyms and IDs.
-
+        and creates a dictionary of acronyms and IDs using streaming.
+
         Returns:
             dict: Dictionary with dataset IDs as keys and acronyms as values
         """
         datasets = {}
         page = 1
+        page_size = 100
         base_url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS
 
         while True:
             try:
-                # Make request with pagination
-                params = {'page': page}
-                response = self.cat_session.session.get(base_url, params=params)
-
+                params = {
+                    'page': page,
+                    'page_size': page_size
+                }
+                # Stream the response instead of loading the whole body at once
+                response = self.cat_session.session.get(base_url, params=params, stream=True)
                 if response.status_code != 200:
                     logger.error(f"Failed to fetch page {page} with status code {response.status_code}")
                     break
 
-                data = response.json()
+                # Scan the streamed body line by line (assumes the API returns
+                # pretty-printed JSON with one field per line)
+                next_page_exists = False
+                for line in response.iter_lines():
+                    if not line:
+                        continue
 
-                # Process datasets on current page
-                for dataset in data['data']:
-                    dataset_id = dataset.get('id', '')
-                    # Handle null or empty acronyms by setting to empty string
-                    acronym = dataset.get('acronym') if dataset.get('acronym') else ''
-                    datasets[dataset_id] = acronym
+                    decoded_line = line.decode('utf-8')
 
-                # Check if we've reached the last page
-                if not data.get('next_page'):
+                    # Check for dataset entries
+                    if '"id":' in decoded_line and '"acronym":' in decoded_line:
+                        # Extract just what we need using string operations
+                        id_start = decoded_line.find('"id": "') + 7  # len('"id": "') == 7
+                        id_end = decoded_line.find('"', id_start)
+                        dataset_id = decoded_line[id_start:id_end]
+
+                        acronym_start = decoded_line.find('"acronym": "') + 12  # len('"acronym": "') == 12
+                        if acronym_start > 11:  # found; a failed find() gives -1 + 12 == 11
+                            acronym_end = decoded_line.find('"', acronym_start)
+                            acronym = decoded_line[acronym_start:acronym_end]
+                        else:
+                            acronym = ''
+
+                        datasets[dataset_id] = acronym
+
+                    # Check for next_page
+                    elif '"next_page":' in decoded_line:
+                        next_page_exists = 'null' not in decoded_line
+
+                if not next_page_exists:
                     break
 
                 page += 1
-
-                # Optional: Log progress every 10 pages
                 if page % 10 == 0:
                     logger.info(f"Processed {page} pages ({len(datasets)} datasets)")
@@ -1387,8 +1405,8 @@ def get_datasets_by_identifiers(self, identifiers: list) -> dict:
                 if dataset:
                     results[identifier] = dataset
 
-                # Optional: Add a small delay to avoid overwhelming the API
-                time.sleep(0.1)
+                # Add a small delay to avoid overwhelming the API
+                time.sleep(0.5)
 
             except Exception as e:
                 logger.error(f"Error processing identifier {identifier}: {str(e)}")
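For reference, the non-streaming logic that the get_all_datasets hunk replaces can be written as a standalone helper. This is a sketch under stated assumptions, not part of the patch: it uses plain requests instead of the library's cat_session object, and it assumes the data.gouv.fr envelope the removed code also relied on, {"data": [...], "next_page": <url or null>}:

    import requests

    def fetch_all_dataset_acronyms(base_url: str = "https://www.data.gouv.fr") -> dict:
        """Collect {dataset_id: acronym} from the paginated datasets endpoint."""
        datasets = {}
        url = base_url + "/api/1/datasets"
        params = {"page": 1, "page_size": 100}
        while url:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
            payload = response.json()
            for dataset in payload["data"]:
                # Null or missing acronyms collapse to an empty string,
                # mirroring the behaviour of the removed code
                datasets[dataset["id"]] = dataset.get("acronym") or ""
            # Assumes next_page is a full URL, or null on the last page
            url = payload.get("next_page")
            params = None  # the next_page URL already carries its query string
        return datasets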