[pre-commit.ci] pre-commit autoupdate #1329

Merged · 3 commits · Oct 14, 2024
8 changes: 4 additions & 4 deletions .pre-commit-config.yaml
@@ -7,28 +7,28 @@ files: |
   )/.*\.py$
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.14
+    rev: v0.6.9
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix, --no-cache]
       - id: ruff-format
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.8.0
+    rev: v1.11.2
     hooks:
       - id: mypy
         additional_dependencies:
           - types-requests
           - types-python-dateutil
   - repo: https://github.com/python-jsonschema/check-jsonschema
-    rev: 0.27.3
+    rev: 0.29.4
     hooks:
       - id: check-github-workflows
         files: '^github/workflows/.*\.ya?ml$'
         types: ["yaml"]
       - id: check-dependabot
         files: '^\.github/dependabot\.ya?ml$'
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: check-added-large-files
         files: ".*"
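These are the routine revision bumps that pre-commit.ci opens automatically; the same diff can be reproduced locally. A minimal sketch, assuming the `pre-commit` package is installed and the repository root is the current working directory:

```python
# Minimal sketch: reproduce pre-commit.ci's revision bumps locally.
# Assumes `pre-commit` is installed and we run from the repo root.
import subprocess

# Rewrite every `rev:` in .pre-commit-config.yaml to the latest tag.
subprocess.run(["pre-commit", "autoupdate"], check=True)

# Re-run all hooks over the whole tree to surface any new lint errors.
subprocess.run(["pre-commit", "run", "--all-files"], check=True)
```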
19 changes: 8 additions & 11 deletions openml/_api_calls.py
@@ -351,7 +351,7 @@ def __is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: str | None
     return md5_checksum == md5_checksum_download


-def _send_request(  # noqa: C901
+def _send_request(  # noqa: C901, PLR0912
     request_method: str,
     url: str,
     data: DATA_TYPE,
@@ -387,18 +387,15 @@ def _send_request(  # noqa: C901
             # -- Check if encoding is not UTF-8 perhaps
             if __is_checksum_equal(response.content, md5_checksum):
                 raise OpenMLHashException(
-                    "Checksum of downloaded file is unequal to the expected checksum {}"
-                    "because the text encoding is not UTF-8 when downloading {}. "
-                    "There might be a sever-sided issue with the file, "
-                    "see: https://github.com/openml/openml-python/issues/1180.".format(
-                        md5_checksum,
-                        url,
-                    ),
+                    f"Checksum of downloaded file is unequal to the expected checksum"
+                    f"{md5_checksum} because the text encoding is not UTF-8 when "
+                    f"downloading {url}. There might be a sever-sided issue with the file, "
+                    "see: https://github.com/openml/openml-python/issues/1180.",
                 )

             raise OpenMLHashException(
-                "Checksum of downloaded file is unequal to the expected checksum {} "
-                "when downloading {}.".format(md5_checksum, url),
+                f"Checksum of downloaded file is unequal to the expected checksum "
+                f"{md5_checksum} when downloading {url}.",
             )

     return response
@@ -464,7 +461,7 @@ def __parse_server_exception(
         server_exception = xmltodict.parse(response.text)
     except xml.parsers.expat.ExpatError as e:
         raise e
-    except Exception as e:  # noqa: BLE001
+    except Exception as e:
         # OpenML has a sophisticated error system
         # where information about failures is provided. try to parse this
         raise OpenMLServerError(
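For context, the messages rewritten in this file are raised when the MD5 of a downloaded payload disagrees with the checksum the server advertised. A rough sketch of the comparison behind `__is_checksum_equal` (the signature mirrors the hunk above; the real helper in `openml/_api_calls.py` may differ in detail):

```python
# Hedged sketch of the checksum comparison, not the exact implementation.
from __future__ import annotations

import hashlib


def is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: str | None) -> bool:
    if md5_checksum is None:
        return True  # nothing to verify against
    md5_checksum_download = hashlib.md5(downloaded_file_binary).hexdigest()
    return md5_checksum == md5_checksum_download
```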
3 changes: 2 additions & 1 deletion openml/cli.py
@@ -1,4 +1,5 @@
-""""Command Line Interface for `openml` to configure its settings."""
+"""Command Line Interface for `openml` to configure its settings."""
+
 from __future__ import annotations

 import argparse
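The fix in this file drops a stray fourth quote. With `""""`, the module docstring itself begins with a literal `"` character, which is easy to miss:

```python
# Four quotes open a triple-quoted string whose content starts with `"`.
s = """"Command Line Interface for `openml` to configure its settings."""
assert s.startswith('"')  # stray leading quote in the docstring

t = """Command Line Interface for `openml` to configure its settings."""
assert not t.startswith('"')  # fixed
```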
8 changes: 4 additions & 4 deletions openml/config.py
@@ -278,17 +278,17 @@ def _setup(config: _Config | None = None) -> None:
         _root_cache_directory.mkdir(exist_ok=True, parents=True)
     except PermissionError:
         openml_logger.warning(
-            "No permission to create openml cache directory at %s! This can result in "
-            "OpenML-Python not working properly." % _root_cache_directory,
+            f"No permission to create openml cache directory at {_root_cache_directory}!"
+            " This can result in OpenML-Python not working properly.",
         )

     if cache_exists:
         _create_log_handlers()
     else:
         _create_log_handlers(create_file_handler=False)
         openml_logger.warning(
-            "No permission to create OpenML directory at %s! This can result in OpenML-Python "
-            "not working properly." % config_dir,
+            f"No permission to create OpenML directory at {config_dir}! This can result in "
+            " OpenML-Python not working properly.",
         )
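A side note on this conversion: both the old `%`-interpolation and the new f-strings format the message eagerly, which is harmless for these one-off warnings. For completeness, the three styles side by side (the path is illustrative):

```python
import logging

logger = logging.getLogger("openml")
config_dir = "/home/user/.config/openml"  # illustrative path

# Before: %-interpolation applied before logging ever sees the string.
logger.warning("No permission to create OpenML directory at %s!" % config_dir)

# After (this PR): an f-string, still eager but easier to read.
logger.warning(f"No permission to create OpenML directory at {config_dir}!")

# Lazy alternative: formatting is deferred until the record is emitted,
# which only matters on hot paths or for suppressed log levels.
logger.warning("No permission to create OpenML directory at %s!", config_dir)
```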
8 changes: 4 additions & 4 deletions openml/datasets/dataset.py
@@ -156,14 +156,14 @@ def find_invalid_characters(string: str, pattern: str) -> str:
         )

     if dataset_id is None:
-        pattern = "^[\x00-\x7F]*$"
+        pattern = "^[\x00-\x7f]*$"
         if description and not re.match(pattern, description):
             # not basiclatin (XSD complains)
             invalid_characters = find_invalid_characters(description, pattern)
             raise ValueError(
                 f"Invalid symbols {invalid_characters} in description: {description}",
             )
-        pattern = "^[\x00-\x7F]*$"
+        pattern = "^[\x00-\x7f]*$"
         if citation and not re.match(pattern, citation):
             # not basiclatin (XSD complains)
             invalid_characters = find_invalid_characters(citation, pattern)
@@ -574,7 +574,7 @@ def _parse_data_from_file(self, data_file: Path) -> tuple[list[str], list[bool],
     def _parse_data_from_pq(self, data_file: Path) -> tuple[list[str], list[bool], pd.DataFrame]:
         try:
             data = pd.read_parquet(data_file)
-        except Exception as e:  # noqa: BLE001
+        except Exception as e:
             raise Exception(f"File: {data_file}") from e
         categorical = [data[c].dtype.name == "category" for c in data.columns]
         attribute_names = list(data.columns)
@@ -816,7 +816,7 @@ def get_data(  # noqa: C901, PLR0912, PLR0915
             to_exclude.extend(self.ignore_attribute)

         if len(to_exclude) > 0:
-            logger.info("Going to remove the following attributes: %s" % to_exclude)
+            logger.info(f"Going to remove the following attributes: {to_exclude}")
             keep = np.array([column not in to_exclude for column in attribute_names])
             data = data.loc[:, keep] if isinstance(data, pd.DataFrame) else data[:, keep]
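The `\x7F` → `\x7f` edits are pure normalization: ruff lowercases hex escapes, and both patterns match exactly the same characters. A small sketch of the basic-Latin validation this hunk touches (toy string; `BASIC_LATIN` is an illustrative name):

```python
import re

# Same character class as "^[\x00-\x7F]*$"; only the escape casing differs.
BASIC_LATIN = "^[\x00-\x7f]*$"

description = "Café dataset"  # 'é' (U+00E9) falls outside basic Latin
if not re.match(BASIC_LATIN, description):
    invalid_characters = {c for c in description if not c.isascii()}
    raise ValueError(f"Invalid symbols {invalid_characters} in description: {description}")
```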
31 changes: 11 additions & 20 deletions openml/datasets/functions.py
@@ -6,6 +6,7 @@
 import warnings
 from collections import OrderedDict
 from pathlib import Path
+from pyexpat import ExpatError
 from typing import TYPE_CHECKING, Any, overload
 from typing_extensions import Literal

@@ -15,7 +16,6 @@
 import pandas as pd
 import urllib3
 import xmltodict
-from pyexpat import ExpatError
 from scipy.sparse import coo_matrix

 import openml._api_calls
@@ -85,8 +85,7 @@ def list_datasets(
     *,
     output_format: Literal["dataframe"],
     **kwargs: Any,
-) -> pd.DataFrame:
-    ...
+) -> pd.DataFrame: ...


 @overload
@@ -98,8 +97,7 @@ def list_datasets(
     tag: str | None,
     output_format: Literal["dataframe"],
     **kwargs: Any,
-) -> pd.DataFrame:
-    ...
+) -> pd.DataFrame: ...


 @overload
@@ -111,8 +109,7 @@ def list_datasets(
     tag: str | None = ...,
     output_format: Literal["dict"] = "dict",
     **kwargs: Any,
-) -> pd.DataFrame:
-    ...
+) -> pd.DataFrame: ...


 def list_datasets(
@@ -207,17 +204,15 @@ def _list_datasets(
     data_id: list | None = ...,
     output_format: Literal["dict"] = "dict",
     **kwargs: Any,
-) -> dict:
-    ...
+) -> dict: ...


 @overload
 def _list_datasets(
     data_id: list | None = ...,
     output_format: Literal["dataframe"] = "dataframe",
     **kwargs: Any,
-) -> pd.DataFrame:
-    ...
+) -> pd.DataFrame: ...


 def _list_datasets(
@@ -256,18 +251,16 @@ def _list_datasets(
     for operator, value in kwargs.items():
         api_call += f"/{operator}/{value}"
     if data_id is not None:
-        api_call += "/data_id/%s" % ",".join([str(int(i)) for i in data_id])
+        api_call += "/data_id/{}".format(",".join([str(int(i)) for i in data_id]))
     return __list_datasets(api_call=api_call, output_format=output_format)


 @overload
-def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict:
-    ...
+def __list_datasets(api_call: str, output_format: Literal["dict"] = "dict") -> dict: ...


 @overload
-def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame:
-    ...
+def __list_datasets(api_call: str, output_format: Literal["dataframe"]) -> pd.DataFrame: ...


 def __list_datasets(
@@ -785,10 +778,8 @@ def create_dataset(  # noqa: C901, PLR0912, PLR0915
     if not is_row_id_an_attribute:
         raise ValueError(
             "'row_id_attribute' should be one of the data attribute. "
-            " Got '{}' while candidates are {}.".format(
-                row_id_attribute,
-                [attr[0] for attr in attributes_],
-            ),
+            f" Got '{row_id_attribute}' while candidates are"
+            f" {[attr[0] for attr in attributes_]}.",
         )

     if isinstance(data, pd.DataFrame):
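The recurring `...` moves in this file are ruff-format's convention of keeping a bare `...` body on the signature line of `@overload` stubs. A toy illustration (not openml's actual API):

```python
from typing import Literal, overload


@overload
def load(output_format: Literal["dict"]) -> dict: ...
@overload
def load(output_format: Literal["list"]) -> list: ...


def load(output_format: str) -> dict | list:
    # Single implementation backing both overload signatures.
    return {} if output_format == "dict" else []
```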
22 changes: 10 additions & 12 deletions openml/evaluations/functions.py
@@ -32,8 +32,7 @@ def list_evaluations(
     per_fold: bool | None = ...,
     sort_order: str | None = ...,
     output_format: Literal["dict", "object"] = "dict",
-) -> dict:
-    ...
+) -> dict: ...


 @overload
@@ -51,8 +50,7 @@ def list_evaluations(
     per_fold: bool | None = ...,
     sort_order: str | None = ...,
     output_format: Literal["dataframe"] = ...,
-) -> pd.DataFrame:
-    ...
+) -> pd.DataFrame: ...


 def list_evaluations(
@@ -204,24 +202,24 @@ def _list_evaluations(
     -------
     dict of objects, or dataframe
     """
-    api_call = "evaluation/list/function/%s" % function
+    api_call = f"evaluation/list/function/{function}"
     if kwargs is not None:
         for operator, value in kwargs.items():
             api_call += f"/{operator}/{value}"
     if tasks is not None:
-        api_call += "/task/%s" % ",".join([str(int(i)) for i in tasks])
+        api_call += "/task/{}".format(",".join([str(int(i)) for i in tasks]))
     if setups is not None:
-        api_call += "/setup/%s" % ",".join([str(int(i)) for i in setups])
+        api_call += "/setup/{}".format(",".join([str(int(i)) for i in setups]))
     if flows is not None:
-        api_call += "/flow/%s" % ",".join([str(int(i)) for i in flows])
+        api_call += "/flow/{}".format(",".join([str(int(i)) for i in flows]))
     if runs is not None:
-        api_call += "/run/%s" % ",".join([str(int(i)) for i in runs])
+        api_call += "/run/{}".format(",".join([str(int(i)) for i in runs]))
     if uploaders is not None:
-        api_call += "/uploader/%s" % ",".join([str(int(i)) for i in uploaders])
+        api_call += "/uploader/{}".format(",".join([str(int(i)) for i in uploaders]))
     if study is not None:
         api_call += "/study/%d" % study
     if sort_order is not None:
-        api_call += "/sort_order/%s" % sort_order
+        api_call += f"/sort_order/{sort_order}"

     return __list_evaluations(api_call, output_format=output_format)

@@ -236,7 +234,7 @@ def __list_evaluations(
     # Minimalistic check if the XML is useful
     if "oml:evaluations" not in evals_dict:
         raise ValueError(
-            "Error in return XML, does not contain " '"oml:evaluations": %s' % str(evals_dict),
+            "Error in return XML, does not contain " f'"oml:evaluations": {evals_dict!s}',
         )

     assert isinstance(evals_dict["oml:evaluations"]["oml:evaluation"], list), type(
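The `%`-to-`.format` and f-string conversions in this file all feed one pattern: assembling a REST path segment by segment, with id lists joined as comma-separated integers. A self-contained illustration with toy values:

```python
# Toy values; the real ids come from the caller's filters.
function = "predictive_accuracy"
tasks = [1, 2, 3]

api_call = f"evaluation/list/function/{function}"
if tasks is not None:
    api_call += "/task/{}".format(",".join(str(int(i)) for i in tasks))

assert api_call == "evaluation/list/function/predictive_accuracy/task/1,2,3"
```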