Skip to content

Commit

Permalink
Added in more opensoft data catalogues
Browse files Browse the repository at this point in the history
  • Loading branch information
CHRISCARLON committed Sep 28, 2024
1 parent 9bac8dc commit 840dfb8
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 5 deletions.
6 changes: 5 additions & 1 deletion HerdingCats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
from .data_loader.data_loader import CkanCatResourceLoader
from .explorer.cat_explore import CkanCatExplorer
from .explorer.cat_explore import CkanCatExplorer, OpenDataSoftCatExplorer
from .session.cat_session import CatSession
from .errors.cats_errors import CatSessionError, CatExplorerError
from .endpoints.api_endpoints import CkanApiPaths, OpenDataSoftDataCatalogues

# Public names re-exported at the package top level; this is the surface
# clients get via `from HerdingCats import *`.
__all__ = [
    "CkanCatResourceLoader",
    "CkanCatExplorer",
    "CatSession",
    "CatSessionError",
    "CatExplorerError",
    "CkanApiPaths",
    "OpenDataSoftDataCatalogues",
    "OpenDataSoftCatExplorer",
]

# Package version string; bump on each release.
__version__ = "0.1.3"
11 changes: 9 additions & 2 deletions HerdingCats/endpoints/api_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,22 @@ class CkanDataCatalogues(Enum):

class OpenDataSoftDataCatalogues(Enum):
    """Base URLs of the supported Open Datasoft data catalogues."""

    UK_POWER_NETWORKS = "https://ukpowernetworks.opendatasoft.com"
    # NOTE(review): identical value to UK_POWER_NETWORKS, so Enum treats UKPN
    # as an alias of that member, not a distinct entry — confirm the duplicate
    # is intentional (it looks like a leftover from a rename in this commit).
    UKPN = "https://ukpowernetworks.opendatasoft.com"
    INFRABEL = "https://opendata.infrabel.be"
    PARIS = "https://opendata.paris.fr"
    TOULOUSE = "https://data.toulouse-metropole.fr"
    # Add more catalogues as needed...


# OPEN DATASOFT
class OpenDataSoftApiPaths:
    """URL path templates for the Open Datasoft catalogue APIs.

    Two API generations are exposed: the v2 catalog API (primary) and the
    explore v2.0 API (fallback when the primary returns HTTP 400).
    """

    # Normal base paths...
    BASE_PATH = "/api/v2/catalog/{}"
    # Format with "datasets" (no leading slash): the template's "{}" already
    # follows a "/" — "/datasets" would produce ".../catalog//datasets".
    SHOW_DATASETS = BASE_PATH.format("datasets")

    # Alternative base paths...
    BASE_PATH_2 = "/api/explore/v2.0/catalog/{}"
    SHOW_DATASETS_2 = BASE_PATH_2.format("datasets")

    # Add more paths as needed...


Expand Down
102 changes: 100 additions & 2 deletions HerdingCats/explorer/cat_explore.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,17 @@
from loguru import logger
from urllib.parse import urlencode

from ..endpoints.api_endpoints import CkanApiPaths
from ..endpoints.api_endpoints import CkanApiPaths, OpenDataSoftApiPaths
from ..errors.cats_errors import CatExplorerError
from ..session.cat_session import CatSession


# FIND THE DATA YOU WANT / NEED / ISOLATE PACKAGES AND RESOURCES
# For Ckan Catalogues Only
class CkanCatExplorer:
def __init__(self, cat_session: CatSession):
"""
Takes in a CkanCatSession
Takes in a CatSession
Allows user to start exploring data catalogue programatically
Expand Down Expand Up @@ -844,3 +845,100 @@ def _duckdb_explore(
print("First few elements of input data:")
print(json.dumps(data[:2], indent=2))
return pd.DataFrame()


# FIND THE DATA YOU WANT / NEED / ISOLATE PACKAGES AND RESOURCES
# For Open Datasoft Catalogues Only
class OpenDataSoftCatExplorer:
    def __init__(self, cat_session: CatSession):
        """
        Takes in a CatSession

        Allows user to start exploring data catalogue programmatically

        Make sure you pass a valid CatSession in

        Args:
            CatSession

        # Example usage...
        if __name__ == "__main__":
            with CatSession("ukpowernetworks.opendatasoft.com") as session:
                explore = OpenDataSoftCatExplorer(session)
        """
        self.cat_session = cat_session

    def fetch_all_datasets(self) -> dict | None:
        """
        Fetch all datasets visible in the catalogue.

        Pages through the Open Datasoft datasets endpoint 100 records at a
        time, trying the v2 catalog API first and falling back to the
        explore v2.0 API when the first returns HTTP 400 or a request error.

        Returns:
            dict mapping dataset title -> dataset_id, or None when nothing
            was retrieved.

        Raises:
            CatExplorerError: when the last endpoint fails with a request error.
        """
        urls = [
            self.cat_session.base_url + OpenDataSoftApiPaths.SHOW_DATASETS,
            self.cat_session.base_url + OpenDataSoftApiPaths.SHOW_DATASETS_2,
        ]
        dataset_dict = {}
        total_count = 0

        for url in urls:
            offset = 0
            limit = 100

            try:
                while True:
                    params = {"offset": offset, "limit": limit}
                    response = self.cat_session.session.get(url, params=params)

                    if response.status_code == 400 and url == urls[0]:
                        logger.warning(
                            "SHOW_DATASETS endpoint returned 400 status. Trying SHOW_DATASETS_2."
                        )
                        break  # Break the inner loop to try the next URL

                    response.raise_for_status()
                    result = response.json()

                    for dataset_info in result.get("datasets", []):
                        # Only keep records that carry both a default title and
                        # a dataset_id; skip malformed entries.
                        if (
                            "dataset" in dataset_info
                            and "metas" in dataset_info["dataset"]
                            and "default" in dataset_info["dataset"]["metas"]
                            and "title" in dataset_info["dataset"]["metas"]["default"]
                            and "dataset_id" in dataset_info["dataset"]
                        ):
                            title = dataset_info["dataset"]["metas"]["default"]["title"]
                            dataset_id = dataset_info["dataset"]["dataset_id"]
                            dataset_dict[title] = dataset_id

                    # Update total_count if available
                    if "total_count" in result:
                        total_count = result["total_count"]

                    # A short page means we've reached the end of the datasets
                    if len(result.get("datasets", [])) < limit:
                        break
                    offset += limit

                # If we've successfully retrieved datasets, no need to try the second URL
                if dataset_dict:
                    break

            except requests.RequestException as e:
                if url == urls[-1]:
                    logger.error(f"Failed to fetch datasets: {e}")
                    # Chain the original exception so the root cause survives
                    raise CatExplorerError(f"Failed to fetch datasets: {str(e)}") from e
                else:
                    logger.warning(
                        f"Failed to fetch datasets from {url}: {e}. Trying next URL."
                    )

        if dataset_dict:
            returned_count = len(dataset_dict)
            # NOTE(review): titles are used as dict keys, so duplicate titles
            # would also make returned_count fall below total_count.
            if returned_count == total_count:
                logger.success(
                    f"total_count = {total_count} AND returned_count = {returned_count}"
                )
            else:
                logger.warning(
                    f"Mismatch in counts: total_count = {total_count}, returned_count = {returned_count}"
                )
            return dataset_dict
        else:
            logger.warning("No datasets were retrieved.")
            return None
1 change: 1 addition & 0 deletions HerdingCats/session/cat_session.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import requests

from typing import Union
from loguru import logger
from urllib.parse import urlparse
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ This will improve and speed up how people:
| Humanitarian Data Exchange | https://data.humdata.org | CKAN |
| UK Power Networks | https://ukpowernetworks.opendatasoft.com | Open Datasoft |
| Infrabel | https://opendata.infrabel.be | Open Datasoft |
| Paris | https://opendata.paris.fr | Open Datasoft |
| Toulouse | https://data.toulouse-metropole.fr | Open Datasoft |

**TBC**
| Catalogue Name | Website | Catalogue API Endpoint Definition |
Expand Down

0 comments on commit 840dfb8

Please sign in to comment.