Skip to content

Commit

Permalink
feat(catalogues): added in more support for open data catalogues [2024-11-27]
Browse files Browse the repository at this point in the history
  • Loading branch information
CHRISCARLON committed Nov 27, 2024
1 parent 610dae1 commit 13bf83b
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 13 deletions.
4 changes: 2 additions & 2 deletions HerdingCats/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .data_loader.data_loader import CkanCatResourceLoader
from .data_loader.data_loader import CkanCatResourceLoader, OpenDataSoftResourceLoader
from .explorer.cat_explore import CkanCatExplorer, OpenDataSoftCatExplorer
from .session.cat_session import CatSession
from .errors.cats_errors import CatSessionError, CatExplorerError
from .errors.cats_errors import CatSessionError, CatExplorerError, OpenDataSoftExplorerError
from .endpoints.api_endpoints import CkanDataCatalogues, OpenDataSoftDataCatalogues, CkanApiPathsDocs

__all__ = [
Expand Down
68 changes: 67 additions & 1 deletion HerdingCats/data_loader/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
import pyarrow.parquet as pq
import uuid

from ..errors.cats_errors import OpenDataSoftExplorerError
from io import BytesIO
from typing import Optional, Literal, List
from typing import Optional, Literal, List, Dict
from loguru import logger
from botocore.exceptions import ClientError

Expand Down Expand Up @@ -399,3 +400,68 @@ def aws_s3_data_loader(
except requests.RequestException as e:
logger.error(f"Error fetching data from URL: {e}")
return

class OpenDataSoftResourceLoader:
    """Loads OpenDataSoft catalogue export resources into DataFrames."""

    def __init__(self) -> None:
        pass

    def polars_data_loader(
        self, resource_data: Optional[List[Dict]], format_type: Literal["parquet"], api_key: Optional[str] = None
    ) -> pl.DataFrame:
        """
        Load data from a resource URL into a Polars DataFrame.

        Args:
            resource_data: List of dictionaries containing resource information
                ('format' and 'download_url' keys, as produced by
                OpenDataSoftCatExplorer.show_dataset_export_options_dict).
            format_type: Expected format type (currently only 'parquet' is supported).
            api_key: Optional API key for authentication with OpenDataSoft.

        Returns:
            Polars DataFrame with the downloaded data.

        Raises:
            OpenDataSoftExplorerError: If resource data is missing, the download
                or parse fails, or no parquet-format resource is found.

        Example:
            with hc.CatSession(hc.OpenDataSoftDataCatalogues.UK_POWER_NETWORKS) as session:
                explore = hc.OpenDataSoftCatExplorer(session)
                data_loader = hc.OpenDataSoftResourceLoader()
                data = explore.show_dataset_export_options_dict("ukpn-smart-meter-installation-volumes")
                pl_df = data_loader.polars_data_loader(data, "parquet", "api_key")
        """
        if not resource_data:
            raise OpenDataSoftExplorerError("No resource data provided")

        headers = {'Accept': 'application/parquet'}
        if api_key:
            headers['Authorization'] = f'apikey {api_key}'

        for resource in resource_data:
            if resource.get('format', '').lower() != 'parquet':
                continue
            url = resource.get('download_url')
            if not url:
                continue
            try:
                # Timeout prevents hanging forever on an unresponsive endpoint.
                response = requests.get(url, headers=headers, timeout=60)
                response.raise_for_status()
                df = pl.read_parquet(BytesIO(response.content))
            except Exception as e:
                # Wrap download/parse failures only; the empty-DataFrame check
                # below lives OUTSIDE this try so its dedicated error message is
                # not swallowed and re-wrapped as a download failure.
                raise OpenDataSoftExplorerError("Failed to download resource", e)

            # An empty frame without an API key usually means the dataset
            # requires authentication rather than being genuinely empty.
            if df.height == 0 and not api_key:
                raise OpenDataSoftExplorerError(
                    "Received empty DataFrame. This likely means an API key is required for this dataset. "
                    "Please provide an API key and try again. You can usually do this by creating an account with the datastore you are trying to access"
                )
            return df

        raise OpenDataSoftExplorerError("No parquet format resource found")
5 changes: 5 additions & 0 deletions HerdingCats/endpoints/api_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,21 @@ class OpenDataSoftDataCatalogues(Enum):
INFRABEL = "https://opendata.infrabel.be"
PARIS = "https://opendata.paris.fr"
TOULOUSE = "https://data.toulouse-metropole.fr"
ELIA_BELGIAN_ENERGY = "https://opendata.elia.be"
# Add more catalogues as needed...

class OpenDataSoftApiPaths:
    """Relative URL paths for the OpenDataSoft catalogue APIs.

    Two families of endpoints are exposed: the standard v2 catalog API and the
    alternative 'explore' v2.0 API that some deployments use instead.
    """

    # Standard v2 catalog API...
    BASE_PATH = "/api/v2/catalog/{}"
    SHOW_DATASETS = "/api/v2/catalog/datasets"
    SHOW_DATASET_INFO = "/api/v2/catalog/datasets/{}"
    SHOW_DATASET_EXPORTS = "/api/v2/catalog/datasets/{}/exports"

    # Alternative explore v2.0 API...
    BASE_PATH_2 = "/api/explore/v2.0/catalog/{}"
    SHOW_DATASETS_2 = "/api/explore/v2.0/catalog/datasets"
    SHOW_DATASET_INFO_2 = "/api/explore/v2.0/catalog/datasets/{}"
    SHOW_DATASET_EXPORTS_2 = "/api/explore/v2.0/catalog/datasets/{}/exports"
    # Add more paths as needed...


Expand Down
31 changes: 31 additions & 0 deletions HerdingCats/errors/cats_errors.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,36 @@
from typing import Optional

class CatSessionError(Exception):
    """Raised for failures relating to a catalogue session."""

class CatExplorerError(Exception):
    """Raised for failures relating to catalogue exploration."""

class OpenDataSoftExplorerError(Exception):
    """
    Custom exception class for OpenDataSoft Explorer errors with colored output using ANSI codes.
    """

    # ANSI escape codes for colors
    RED = '\033[91m'
    YELLOW = '\033[93m'
    RESET = '\033[0m'

    def __init__(self, message: str, original_error: Optional[Exception] = None) -> None:
        self.message = message
        self.original_error = original_error

        # Assemble the coloured message: red headline, optional yellow cause.
        parts = [f"{self.RED}OpenDataSoftExplorer Error: {message}{self.RESET}"]
        if original_error:
            parts.append(
                f"\n{self.YELLOW}Original error: {str(original_error)}{self.RESET}"
            )

        super().__init__("".join(parts))

    def __str__(self) -> str:
        return self.args[0]
58 changes: 48 additions & 10 deletions HerdingCats/explorer/cat_explore.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ def package_list_dictionary(self) -> dict:
dictionary_data = {item: item for item in dictionary_prep}
return dictionary_data
except requests.RequestException as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

def package_list_dataframe(
Expand Down Expand Up @@ -216,7 +215,6 @@ def package_list_dataframe(
raise ValueError(f"Unsupported DataFrame type: {df_type}")

except (requests.RequestException, Exception) as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

def package_list_dictionary_extra(self):
Expand Down Expand Up @@ -257,7 +255,6 @@ def package_list_dictionary_extra(self):
]
return dictionary_data
except requests.RequestException as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")
return

Expand Down Expand Up @@ -360,7 +357,6 @@ def package_list_dataframe_extra(
raise ValueError(f"Unsupported DataFrame type: {df_type}")

except (requests.RequestException, Exception) as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

def get_organisation_list(self) -> Tuple[int, list]:
Expand Down Expand Up @@ -458,7 +454,6 @@ def catalogue_freshness(self):
return df

except requests.RequestException as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

# ----------------------------
Expand Down Expand Up @@ -500,7 +495,6 @@ def package_show_info_json(self, package_name: Union[str, dict, Any]) -> List[Di
return self._extract_resource_data(result_data)

except requests.RequestException as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

# ----------------------------
Expand Down Expand Up @@ -542,7 +536,6 @@ def main():
data = response.json()
return data["result"]
except requests.RequestException as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

def package_search_condense_json_unpacked(
Expand Down Expand Up @@ -609,7 +602,6 @@ def main():
)

except requests.RequestException as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

def package_search_condense_dataframe_packed(
Expand Down Expand Up @@ -705,7 +697,6 @@ def package_search_condense_dataframe_packed(
return pd.DataFrame(extracted_data)

except requests.RequestException as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

def package_search_condense_dataframe_unpacked(
Expand Down Expand Up @@ -812,7 +803,6 @@ def package_search_condense_dataframe_unpacked(
return self._create_pandas_dataframe(extracted_data)

except requests.RequestException as e:
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

# ----------------------------
Expand Down Expand Up @@ -1104,3 +1094,51 @@ def fetch_all_datasets(self) -> dict | None:
else:
logger.warning("No datasets were retrieved.")
return None

def show_dataset_info_dict(self, dataset_id):
    """
    Fetch metadata for a single dataset, trying both API path variants.

    Args:
        dataset_id: The catalogue identifier of the dataset.

    Returns:
        The dataset info as a dict parsed from the JSON response.

    Raises:
        CatExplorerError: If neither endpoint variant returns the dataset.
    """
    urls = [
        self.cat_session.base_url + OpenDataSoftApiPaths.SHOW_DATASET_INFO.format(dataset_id),
        # Fall back to the alternative explore-API path; previously this was a
        # duplicate of the first URL, so the fallback never actually differed.
        self.cat_session.base_url + OpenDataSoftApiPaths.SHOW_DATASET_INFO_2.format(dataset_id),
    ]
    last_error = None  # holds the most recent RequestException (was wrongly initialised as a list)
    for url in urls:
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.RequestException as e:
            last_error = e
            continue
    error_msg = f"\033[91mFailed to fetch dataset: {str(last_error)}. Are you sure this dataset exists? Check again.\033[0m"
    raise CatExplorerError(error_msg)

def show_dataset_export_options_dict(self, dataset_id):
    """
    List the export formats available for a dataset, trying both API path variants.

    Args:
        dataset_id: The catalogue identifier of the dataset.

    Returns:
        A list of dicts, each with 'format' and 'download_url' keys.

    Raises:
        CatExplorerError: If neither endpoint variant returns the dataset.
    """
    urls = [
        self.cat_session.base_url + OpenDataSoftApiPaths.SHOW_DATASET_EXPORTS.format(dataset_id),
        self.cat_session.base_url + OpenDataSoftApiPaths.SHOW_DATASET_EXPORTS_2.format(dataset_id),
    ]
    last_error = None  # holds the most recent RequestException (was wrongly initialised as a list)
    for url in urls:
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            data = response.json()
        except requests.RequestException as e:
            last_error = e
            continue

        # Every link except the self-reference is a downloadable export format.
        # .get guards against a 200 response without a 'links' key, which would
        # previously raise an uncaught KeyError.
        return [
            {'format': link['rel'], 'download_url': link['href']}
            for link in data.get('links', [])
            if link['rel'] != 'self'
        ]

    error_msg = f"\033[91mFailed to fetch dataset: {str(last_error)}. Are you sure this dataset exists? Check again.\033[0m"
    raise CatExplorerError(error_msg)

0 comments on commit 13bf83b

Please sign in to comment.