style: small refactors [2024-12-02]
CHRISCARLON committed Dec 2, 2024
1 parent ec508e3 commit 3d96544
Showing 3 changed files with 42 additions and 27 deletions.
9 changes: 3 additions & 6 deletions HerdingCats/endpoints/api_endpoints.py
@@ -8,7 +8,6 @@ class CkanApiPathsDocs:
ORGANIZATION_LIST = "https://docs.ckan.org/en/2.10/api/index.html#ckan.logic.action.get.organization_list"
# Need to add the rest....


class CkanApiPaths:
BASE_PATH = "/api/3/action/{}"
PACKAGE_LIST = BASE_PATH.format("package_list")
@@ -20,7 +19,6 @@ class CkanApiPaths:
ORGANIZATION_LIST = BASE_PATH.format("organization_list")
# Add more paths as needed...


class CkanDataCatalogues(Enum):
LONDON_DATA_STORE = "https://data.london.gov.uk"
UK_GOV = "https://data.gov.uk"
@@ -31,7 +29,6 @@ class CkanDataCatalogues(Enum):
# NORTHERN_DATA_MILL = "https://datamillnorth.org" NEED TO LOOK INTO THIS ONE MORE
# Add more catalogues as needed...


# OPEN DATASOFT
class OpenDataSoftDataCatalogues(Enum):
UK_POWER_NETWORKS = "https://ukpowernetworks.opendatasoft.com"
@@ -57,19 +54,19 @@ class OpenDataSoftApiPaths:
SHOW_DATASET_EXPORTS_2 = BASE_PATH_2.format("datasets/{}/exports")
# Add more paths as needed...


# DCAT TBC
class DcatApiPaths:
BASE_PATH = "/api/feed/dcat-ap/2.1.1.json"
# Add more paths as needed...

# BESPOKE DATA GOUV FR
class FrenchGouvApiDocs:
DATASET_DOCS = "https://guides.data.gouv.fr/guide-data.gouv.fr/api-1/reference/datasets"

# Bespoke Data.Gouv.Fr (France)
class FrenchGouvApiPaths:
BASE_PATH = "/api/1/{}"
SHOW_DATASETS = BASE_PATH.format("datasets")
SHOW_DATASETS_BY_ID = BASE_PATH.format("datasets/{}")


class FrenchGouvCatalogue(Enum):
GOUV_FR = "https://www.data.gouv.fr"
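As a side note, the path classes in this file are plain format-string templates. A minimal sketch of how a caller might turn FrenchGouvApiPaths.SHOW_DATASETS_BY_ID into a full request URL is shown below; the requests call and the example identifier are illustrative assumptions, not part of this commit.

import requests

# Illustrative only: combine the catalogue base URL with a path template.
BASE_URL = "https://www.data.gouv.fr"  # FrenchGouvCatalogue.GOUV_FR value

# BASE_PATH.format("datasets/{}") yields "/api/1/datasets/{}", so a second
# .format() call fills in the dataset identifier.
SHOW_DATASETS_BY_ID = "/api/1/{}".format("datasets/{}")
dataset_id = "example-dataset-id"  # hypothetical placeholder

url = BASE_URL + SHOW_DATASETS_BY_ID.format(dataset_id)
response = requests.get(url, timeout=10)
print(response.status_code)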
2 changes: 1 addition & 1 deletion HerdingCats/errors/cats_errors.py
@@ -21,7 +21,7 @@ def __init__(self, message: str, original_error: Optional[Exception] = None) ->

# Build the error message with color
error_msg = (
f"{self.RED}OpenDataSoftExplorer Error: {message}{self.RESET}"
f"{self.RED}OpenDataSoftExplorer Error 🐈‍⬛: {message}{self.RESET}"
)

if original_error:
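For context on the change above, a self-contained sketch of an exception class built in this style follows. The class name, ANSI codes, and constructor body are inferred from the visible hunk and should be treated as assumptions; only the coloured message line is taken from this commit.

from typing import Optional

class OpenDataSoftExplorerError(Exception):
    """Sketch of a colour-coded catalogue error (details assumed, not verbatim)."""

    RED = "\033[91m"    # assumed ANSI colour codes
    RESET = "\033[0m"

    def __init__(self, message: str, original_error: Optional[Exception] = None) -> None:
        # Build the error message with colour, as in the hunk above
        error_msg = f"{self.RED}OpenDataSoftExplorer Error 🐈‍⬛: {message}{self.RESET}"
        if original_error:
            error_msg += f"\nCaused by: {original_error!r}"
        super().__init__(error_msg)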
58 changes: 38 additions & 20 deletions HerdingCats/explorer/cat_explore.py
@@ -1215,7 +1215,7 @@ def main():
# ----------------------------
def check_health_check(self) -> None:
"""
TBC
Check the health of the French government's open data catalogue endpoint
"""

url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS
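The rest of this method is collapsed in the diff view. A rough, self-contained sketch of what a health check against SHOW_DATASETS might look like is given below; the logging messages and status handling are assumptions, not the method's actual body.

import logging
import requests

logger = logging.getLogger(__name__)

def check_health(base_url: str) -> None:
    """Sketch: ping the datasets endpoint and report whether the catalogue responds."""
    url = base_url + "/api/1/datasets"  # FrenchGouvApiPaths.SHOW_DATASETS
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        logger.info("Health check passed: data.gouv.fr responded")
    else:
        logger.error(f"Health check failed with status code {response.status_code}")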
@@ -1240,41 +1240,59 @@ def check_health_check(self) -> None:
def get_all_datasets(self) -> dict:
"""
Paginates through all datasets in the French Government data catalogue
and creates a dictionary of acronyms and IDs.
and creates a dictionary of acronyms and IDs using streaming.
Returns:
dict: Dictionary with dataset IDs as keys and acronyms as values
"""
datasets = {}
page = 1
page_size = 100
base_url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS

while True:
try:
# Make request with pagination
params = {'page': page}
response = self.cat_session.session.get(base_url, params=params)

params = {
'page': page,
'page_size': page_size
}
# Stream the response
response = self.cat_session.session.get(base_url, params=params, stream=True)
if response.status_code != 200:
logger.error(f"Failed to fetch page {page} with status code {response.status_code}")
break

data = response.json()
# Process the streaming response
next_page_exists = False
for line in response.iter_lines():
if not line:
continue

# Process datasets on current page
for dataset in data['data']:
dataset_id = dataset.get('id', '')
# Handle null or empty acronyms by setting to empty string
acronym = dataset.get('acronym') if dataset.get('acronym') else ''
datasets[dataset_id] = acronym
decoded_line = line.decode('utf-8')

# Check if we've reached the last page
if not data.get('next_page'):
# Check for dataset entries
if '"id":' in decoded_line and '"acronym":' in decoded_line:
# Extract just what we need using string operations
id_start = decoded_line.find('"id": "') + 7
id_end = decoded_line.find('"', id_start)
dataset_id = decoded_line[id_start:id_end]

acronym_start = decoded_line.find('"acronym": "') + 12  # 12 = len('"acronym": "'), so this lands on the value itself
if acronym_start > 11: # find() located an acronym field
acronym_end = decoded_line.find('"', acronym_start)
acronym = decoded_line[acronym_start:acronym_end]
else:
acronym = ''

datasets[dataset_id] = acronym

# Check for next_page
elif '"next_page":' in decoded_line:
next_page_exists = 'null' not in decoded_line

if not next_page_exists:
break

page += 1

# Optional: Log progress every 10 pages
if page % 10 == 0:
logger.info(f"Processed {page} pages ({len(datasets)} datasets)")

@@ -1387,8 +1405,8 @@ def get_datasets_by_identifiers(self, identifiers: list) -> dict:
if dataset:
results[identifier] = dataset

# Optional: Add a small delay to avoid overwhelming the API
time.sleep(0.1)
# Add a small delay to avoid overwhelming the API
time.sleep(0.5)

except Exception as e:
logger.error(f"Error processing identifier {identifier}: {str(e)}")
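The sleep change above throttles successive identifier lookups. As a rough illustration of that pattern outside the class (the fetch_one callable and the helper name are hypothetical placeholders, not part of the library):

import time
import logging

logger = logging.getLogger(__name__)

def fetch_many(fetch_one, identifiers: list, delay: float = 0.5) -> dict:
    """Sketch: look up each identifier in turn, pausing between requests."""
    results = {}
    for identifier in identifiers:
        try:
            dataset = fetch_one(identifier)
            if dataset:
                results[identifier] = dataset
            # Add a small delay to avoid overwhelming the API
            time.sleep(delay)
        except Exception as e:
            logger.error(f"Error processing identifier {identifier}: {str(e)}")
    return results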
