Skip to content

Commit

Permalink
feat(ckan explorer): added in get orgs list functionality - needed sl…
Browse files Browse the repository at this point in the history
…ightly different implementation for the London Datastore, as it doesn't seem to have the organization_list endpoint [2024-11-26]
  • Loading branch information
CHRISCARLON committed Nov 26, 2024
1 parent c87c435 commit c8b6918
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 8 deletions.
6 changes: 4 additions & 2 deletions HerdingCats/endpoints/api_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
class CkanApiPathsDocs:
    """Links to CKAN 2.11 API reference entries, one constant per action."""

    # Reference entry for the ``package_list`` action.
    PACKAGE_LIST = "https://docs.ckan.org/en/2.11/api/index.html#ckan.logic.action.get.package_list"
    # Reference entry for the ``package_search`` action.
    PACKAGE_SEARCH = "https://docs.ckan.org/en/2.11/api/index.html#ckan.logic.action.get.package_search"
    # Reference entry for the ``current_package_list_with_resources`` action.
    CURRENT_PACKAGE_LIST_WITH_RESOURCES = "https://docs.ckan.org/en/2.11/api/index.html#ckan.logic.action.get.current_package_list_with_resources"
    # TODO: add documentation links for the remaining endpoints.


Expand All @@ -15,15 +16,16 @@ class CkanApiPaths:
CURRENT_PACKAGE_LIST_WITH_RESOURCES = BASE_PATH.format(
"current_package_list_with_resources"
)
ORGANIZATION_LIST = BASE_PATH.format("organization_list")
# Add more paths as needed...


class CkanDataCatalogues(Enum):
LONDON_DATA_STORE = "https://data.london.gov.uk"
UK_GOV = "https://data.gov.uk"
SUBAK = "https://data.subak.org"
HUMANITARIAN = "https://data.humdata.org"
AFRICA = "https://open.africa"
HUMANITARIAN_DATA_STORE = "https://data.humdata.org"
OPEN_AFRICA = "https://open.africa"
# CANADA_GOV = "https://search.open.canada.ca/opendata" NEED TO LOOK INTO THIS ONE MORE
# NORTHERN_DATA_MILL = "https://datamillnorth.org" NEED TO LOOK INTO THIS ONE MORE
# Add more catalogues as needed...
Expand Down
48 changes: 42 additions & 6 deletions HerdingCats/explorer/cat_explore.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from numpy._core.multiarray import empty
import requests
import pandas as pd
import polars as pl
import duckdb
import json

from typing import Any, Dict, Optional, Union, Literal, List
from typing import Any, Dict, Optional, Union, Literal, List, Tuple
from loguru import logger
from urllib.parse import urlencode

Expand All @@ -29,9 +28,14 @@ def __init__(self, cat_session: CatSession):
CkanCatSession
# Example usage...
import HerdingCats as hc
def main():
with hc.CatSession(hc.CkanDataCatalogues.LONDON_DATA_STORE) as session:
explore = hc.CkanCatExplorer(session)
if __name__ == "__main__":
with CatSession("data.london.gov.uk") as session:
explore = CkanCatExplorer(session)
main()
"""
self.cat_session = cat_session

Expand All @@ -42,7 +46,7 @@ def check_site_health(self) -> None:
"""
Make sure the Ckan endpoints are healthy and reachable
This calls the Ckan package_list endpoint to check if site is reachable.
This calls the Ckan package_list endpoint to check if site is still reachable.
# Example usage...
if __name__ == "__main__":
Expand Down Expand Up @@ -361,6 +365,38 @@ def package_list_dataframe_extra(
logger.error(f"Failed to search datasets: {e}")
raise CatExplorerError(f"Failed to search datasets: {str(e)}")

def get_organisation_list(self) -> Tuple[int, list]:
    """
    Return how many organisations the catalogue has, plus the organisations.

    The ``organization_list`` endpoint is tried first and the list found
    under the response's ``"result"`` key is returned as-is.

    If that attempt fails for any reason (request error, bad status code,
    unexpected payload), the method falls back to the
    ``current_package_list_with_resources`` endpoint and returns the
    distinct, non-empty ``maintainer`` values of the packages it lists,
    in the order they are first seen. This is only an approximation of
    the organisation list.

    Returns:
        A ``(count, items)`` tuple where ``count == len(items)``.

    Raises:
        Exception: whatever error the fallback request/parsing raised,
            re-raised unchanged when both methods fail.
    """
    url = self.cat_session.base_url + CkanApiPaths.ORGANIZATION_LIST
    try:
        response = self.cat_session.session.get(url)
        response.raise_for_status()
        organisations = response.json()["result"]
        return len(organisations), organisations
    # Deliberately broad: any failure of the primary method (including
    # requests errors, which are Exception subclasses) triggers the fallback.
    except Exception as primary_error:
        logger.warning(f"Primary organisation search method failed - attempting secondary method that fetches 'maintainers' only - this may still be useful but not as accurate: {primary_error}")

        # The fallback stays nested inside this handler so that, if it also
        # fails, the re-raised error keeps the primary failure as its context.
        try:
            # Secondary method using package endpoint
            package_url = self.cat_session.base_url + CkanApiPaths.CURRENT_PACKAGE_LIST_WITH_RESOURCES
            package_response = self.cat_session.session.get(package_url)
            package_response.raise_for_status()
            packages = package_response.json()["result"]

            # De-duplicate the non-empty maintainer values. dict.fromkeys
            # keeps first-seen order, so the result is deterministic
            # (a plain set would return them in arbitrary order).
            names = (entry.get("maintainer") for entry in packages)
            maintainers = list(dict.fromkeys(name for name in names if name))
            return len(maintainers), maintainers
        except Exception as fallback_error:
            logger.error(f"Both organization list methods failed: {fallback_error}")
            raise

# ----------------------------
# Show catalogue freshness
# ----------------------------
Expand All @@ -374,7 +410,7 @@ def catalogue_freshness(self):
It currently uses metadata_modified at the dataset level - not resource level.
"""
logger.warning(
"This method might not work for all catalogues, and will return 0s. It currently only works for the London Datastore. We are working on improving this"
"This method DOES NOT work for all catalogues, and will return 0s. It currently only works for the London Datastore. We are working on improving and fixing this."
)

url = (
Expand Down

0 comments on commit c8b6918

Please sign in to comment.