diff --git a/cvmfsscraper/__init__.py b/cvmfsscraper/__init__.py index 8ed1981..a317ce9 100644 --- a/cvmfsscraper/__init__.py +++ b/cvmfsscraper/__init__.py @@ -1,10 +1,42 @@ """Core of the cvmfsscraper package.""" +import logging from concurrent.futures import ThreadPoolExecutor, as_completed from typing import List +import structlog + from cvmfsscraper.server import CVMFSServer, Stratum0Server, Stratum1Server +structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.StackInfoRenderer(), + structlog.dev.set_exc_info, + structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S", utc=False), + structlog.processors.JSONRenderer(), + ], + context_class=dict, + logger_factory=structlog.stdlib.LoggerFactory(), # Ensure compatibility + cache_logger_on_first_use=True, +) + + +def set_log_level(level: int) -> None: + """Set the log level for the library. + + This function allows the consumer of the library to set the desired log level. + + :param level: The log level to set. This should be a value from the logging module, + such as logging.INFO, logging.DEBUG, etc. 
+ """ + logging.basicConfig(level=level, format="%(message)s") + structlog.configure( + wrapper_class=structlog.stdlib.BoundLogger, + cache_logger_on_first_use=True, + ) + def scrape_server( dns_name: str, diff --git a/cvmfsscraper/exceptions.py b/cvmfsscraper/exceptions.py index e1876f9..57eed5e 100644 --- a/cvmfsscraper/exceptions.py +++ b/cvmfsscraper/exceptions.py @@ -2,6 +2,10 @@ from typing import Any +import structlog + +log = structlog.getLogger(__name__) + class CVMFSScraperBaseException(Exception): """Base exception for cvmfsscraper.""" @@ -12,6 +16,15 @@ def __init__( """Initialize the exception.""" self.message = message self.original_exception = original_excption + + log.debug( + "Exception raised", + exception=self.__class__.__name__, + message=message, + original_exception=original_excption, + args=args, + ) + super().__init__(message, *args) diff --git a/cvmfsscraper/http_get_models.py b/cvmfsscraper/http_get_models.py index 1f523ea..98b402b 100644 --- a/cvmfsscraper/http_get_models.py +++ b/cvmfsscraper/http_get_models.py @@ -4,10 +4,13 @@ from enum import Enum from typing import Any, Dict, List, Optional, Type, Union +import structlog from pydantic import BaseModel, Field, field_validator, model_validator from cvmfsscraper.exceptions import CVMFSValidationError +log = structlog.getLogger(__name__) + def hex_field(min_length: int, max_length: int, alias: str): """Create a Field for hexadecimal strings with a specified length range. 
@@ -24,6 +27,16 @@ def hex_field(min_length: int, max_length: int, alias: str): class CVMFSBaseModel(BaseModel): """Base model for CVMFS models.""" + def __init__(self, **kwargs: Any) -> None: + """Initialize the model.""" + log.debug( + "Initializing pydantic model", + model=self.__class__.__name__, + kwargs=kwargs, + ) + + super().__init__(**kwargs) + class RepositoryOrReplica(BaseModel): """Model for a repository or replica.""" diff --git a/cvmfsscraper/main.py b/cvmfsscraper/main.py index 6c71d3d..14e7256 100644 --- a/cvmfsscraper/main.py +++ b/cvmfsscraper/main.py @@ -2,11 +2,15 @@ from typing import Any, Dict, List +import structlog + from cvmfsscraper import scrape as scrape_proper from cvmfsscraper import scrape_server as scrape_server_proper from cvmfsscraper.server import CVMFSServer from cvmfsscraper.tools import deprecated +deplog = structlog.getLogger("deprecation") + def scrape(*args: Any, **kwargs: Dict[str, Any]) -> List[CVMFSServer]: """Legacy API support for cvmfsscraper.""" @@ -14,6 +18,15 @@ def scrape(*args: Any, **kwargs: Dict[str, Any]) -> List[CVMFSServer]: "cvmfsserver.main.scrape", "cvmfsserver.scrape", ) + deplog.warning( + "Deprecated API used", + deprecated="cvmfsserver.main.scrape", + replacement="cvmfsserver.scrape", + message=( + "cvmfsserver.main.scrape is deprecated and will be removed in a future release. " + "Please use cvmfsserver.scrape instead." + ), + ) return scrape_proper(*args, **kwargs) @@ -23,4 +36,13 @@ def scrape_server(*args: Any, **kwargs: Dict[str, Any]) -> CVMFSServer: "cvmfsserver.main.scrape_server", "cvmfsserver.scrape_server", ) + deplog.warning( + "Deprecated API used", + deprecated="cvmfsserver.main.scrape_server", + replacement="cvmfsserver.scrape_server", + message=( + "cvmfsserver.main.scrape_server is deprecated and will be removed in a future release. " + "Please use cvmfsserver.scrape_server instead." 
+ ), + ) return scrape_server_proper(*args, **kwargs) diff --git a/cvmfsscraper/repository.py b/cvmfsscraper/repository.py index 1446c46..60fa3cd 100644 --- a/cvmfsscraper/repository.py +++ b/cvmfsscraper/repository.py @@ -1,12 +1,15 @@ """A CVMFS repository.""" from typing import Dict +import structlog + from cvmfsscraper.http_get_models import ( Endpoints, GetCVMFSPublished, GetCVMFSStatusJSON, ) -from cvmfsscraper.tools import warn + +log = structlog.getLogger(__name__) class Repository: @@ -57,6 +60,8 @@ def __init__(self, server: object, name: str, url: str): self.fetch_errors = [] + log.debug("Initializing repository", server=server.name, name=name, url=url) + self.scrape() def __str__(self) -> str: @@ -65,18 +70,33 @@ def __str__(self) -> str: def scrape(self) -> None: """Scrape the repository.""" + log.debug( + "Scraping repository", server=self.server, name=self.name, url=self.path + ) try: cvmfspublished = self.fetch_cvmfspublished() self.parse_cvmfspublished(cvmfspublished) except Exception as exc: - warn("CVMFSpublished", exc) + log.warning( + "Scrape error", + exc=exc, + server=self.server, + name=self.name, + url=self.path, + ) self.fetch_errors.append({"path": self.path, "error": exc}) try: repo = self.fetch_repository() self.parse_status_json(repo) except Exception as exc: - warn("Repository", exc) + log.warning( + "Scrape error", + exc=exc, + server=self.server, + name=self.name, + url=self.path, + ) self.fetch_errors.append({"path": self.path, "error": exc}) def attribute_mapping(self) -> Dict[str, str]: diff --git a/cvmfsscraper/server.py b/cvmfsscraper/server.py index c402565..416ae2d 100644 --- a/cvmfsscraper/server.py +++ b/cvmfsscraper/server.py @@ -4,6 +4,8 @@ from typing import Dict, List from urllib import error, request +import structlog + from cvmfsscraper.constants import GeoAPIStatus from cvmfsscraper.http_get_models import ( EndpointClassesType, @@ -14,7 +16,9 @@ RepositoryOrReplica, ) from cvmfsscraper.repository import Repository -from 
cvmfsscraper.tools import GEOAPI_SERVERS, warn +from cvmfsscraper.tools import GEOAPI_SERVERS + +log = structlog.getLogger(__name__) class CVMFSServer: @@ -60,6 +64,14 @@ def __init__( self.fetch_errors = [] + log.info( + "Initializing server", + server=server, + repos=repos, + ignore_repos=ignore_repos, + scrape_on_init=scrape_on_init, + ) + if scrape_on_init: self.scrape() @@ -73,6 +85,8 @@ def url(self) -> str: def scrape(self) -> None: """Scrape the server.""" + log.info("Scraping server", server=self.name) + self.populate_repositories() if not self.fetch_errors: @@ -106,6 +120,7 @@ def populate_repositories(self) -> None: If the server is down, the list will be empty. """ + log.info("Populating repositories", server=self.name) try: repodata = self.fetch_repositories_json() @@ -118,7 +133,11 @@ def populate_repositories(self) -> None: self._is_down = False except Exception as e: # pragma: no cover - warn(f"Populate repository: {self.name}", e) + log.error( + "Populate repository failure", + exc=e, + server=self.name, + ) self.fetch_errors.append({"path": self.name, "error": e}) def process_repositories_json( @@ -185,7 +204,11 @@ def check_geoapi_status(self) -> GeoAPIStatus: else: return GeoAPIStatus.LOCATION_ERROR except Exception as e: # pragma: no cover - warn("GEOAPI failure", e) + log.error( + "GeoAPI failure", + exc=e, + server=self.name, + ) return GeoAPIStatus.NO_RESPONSE def fetch_repositories_json(self) -> GetCVMFSRepositoriesJSON: @@ -244,19 +267,42 @@ def fetch_endpoint( if not isinstance(endpoint, Endpoints): # type: ignore raise TypeError("endpoint must be an Endpoints enum value") + log.debug( + "Fetching endpoint", server=self.name, endpoint=endpoint.name, repo=repo + ) + geoapi_str = ",".join(geoapi_servers) formatted_path = endpoint.path.format(repo=repo, geoapi_str=geoapi_str) url = f"{self.url()}/cvmfs/{formatted_path}" timeout_seconds = 5 try: + log.info("Fetching url", url=url) content = request.urlopen(url, timeout=timeout_seconds) if 
endpoint in [Endpoints.REPOSITORIES_JSON, Endpoints.CVMFS_STATUS_JSON]: + log.debug( + "Fetched JSON endpoint", + server=self.name, + endpoint=endpoint.name, + repo=repo, + ) content = json.loads(content.read()) elif endpoint == Endpoints.CVMFS_PUBLISHED: + log.debug( + "Fetched .cvmfspublished", + server=self.name, + endpoint=endpoint.name, + repo=repo, + ) content = GetCVMFSPublished.parse_blob(content.read()) elif endpoint == Endpoints.GEOAPI: + log.debug( + "Fetched geoapi", + server=self.name, + endpoint=endpoint.name, + repo=repo, + ) indices = [int(x) for x in content.read().decode().split(",")] content = { "host_indices": indices, @@ -266,7 +312,14 @@ def fetch_endpoint( return endpoint.model_class(**content) except error.URLError as e: - warn(f"fetch_endpoint: {url}", e) + log.error( + "Fetch endpoint failure", + exc=e, + server=self.name, + endpoint=endpoint.name, + repo=repo, + url=url, + ) raise e from e diff --git a/cvmfsscraper/tools.py b/cvmfsscraper/tools.py index ac66ead..4b7852f 100644 --- a/cvmfsscraper/tools.py +++ b/cvmfsscraper/tools.py @@ -6,6 +6,10 @@ import urllib.request from typing import Any +import structlog + +log = structlog.getLogger(__name__) + GEOAPI_SERVERS = [ "cvmfs-s1fnal.opensciencegrid.org", "cvmfs-stratum-one.cern.ch", @@ -25,6 +29,8 @@ def deprecated(old: str, new: str) -> None: def fetch_absolute(obj: object, url: str) -> str: """Fetch an absolute URL, handle exceptions.""" + log.info("Fetching", url=url) + timeout_seconds = 5 try: content = urllib.request.urlopen(url, timeout=timeout_seconds).read() @@ -34,7 +40,7 @@ def fetch_absolute(obj: object, url: str) -> str: return content except Exception as e: - warn(f"fetch_absolute: {url}", e) + log.warning("Fetch absolute", url=url, exc=e) obj.fetch_errors.append({"path": url, "error": e}) return diff --git a/poetry.lock b/poetry.lock index dcc7346..50353cd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -540,6 +540,23 @@ files = [ {file = "ruff-0.1.14.tar.gz", hash = 
"sha256:ad3f8088b2dfd884820289a06ab718cde7d38b94972212cc4ba90d5fbc9955f3"}, ] +[[package]] +name = "structlog" +version = "24.1.0" +description = "Structured Logging for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "structlog-24.1.0-py3-none-any.whl", hash = "sha256:3f6efe7d25fab6e86f277713c218044669906537bb717c1807a09d46bca0714d"}, + {file = "structlog-24.1.0.tar.gz", hash = "sha256:41a09886e4d55df25bdcb9b5c9674bccfab723ff43e0a86a1b7b236be8e57b16"}, +] + +[package.extras] +dev = ["structlog[tests,typing]"] +docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-mermaid", "sphinxext-opengraph", "twisted"] +tests = ["freezegun (>=0.2.8)", "pretend", "pytest (>=6.0)", "pytest-asyncio (>=0.17)", "simplejson"] +typing = ["mypy (>=1.4)", "rich", "twisted"] + [[package]] name = "tomli" version = "2.0.1" @@ -612,4 +629,4 @@ test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "4a35316972d141ffc73ca2076896042a1cadb92bdd08987e2de410448eabeb07" +content-hash = "8c458c2ef413fe3f018d0abe90159a6787d3530cec0cb919246ff2448f426922" diff --git a/pyproject.toml b/pyproject.toml index 66ecf98..8817ecc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ packages = [{ include = "cvmfsscraper" }, { include = "scripts/*" }] [tool.poetry.dependencies] python = "^3.8" pydantic = "*" +structlog = "^24" [tool.poetry.group.dev.dependencies] ruff = "*"