diff --git a/.github/workflows/add-3plicense-warning.yml b/.github/workflows/add-3plicense-warning.yml index 4d6ea0aef..1e377d57c 100644 --- a/.github/workflows/add-3plicense-warning.yml +++ b/.github/workflows/add-3plicense-warning.yml @@ -3,7 +3,7 @@ name: "Add 3P License Warning to PR" on: pull_request: paths: - - setup.py + - pyproject.toml # Cancel in progress workflows on pull_requests. # https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value @@ -23,9 +23,9 @@ jobs: steps: - run: | BODY_MSG=$(cat << EOF - ⚠️ This PR changed **setup.py** file. ⚠️ - - PR Creator must update 📃 THIRD_PARTY_LICENSES.txt, if any 📚 library added/removed in **setup.py**. - - PR Approver must confirm 📃 THIRD_PARTY_LICENSES.txt updated, if any 📚 library added/removed in **setup.py**. + ⚠️ This PR changed **pyproject.toml** file. ⚠️ + - PR Creator must update 📃 THIRD_PARTY_LICENSES.txt, if any 📚 library added/removed in **pyproject.toml**. + - PR Approver must confirm 📃 THIRD_PARTY_LICENSES.txt updated, if any 📚 library added/removed in **pyproject.toml**. EOF ) echo "BODY_MSG<> $GITHUB_ENV diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml index 071250038..3c628f772 100644 --- a/.github/workflows/publish-to-pypi.yml +++ b/.github/workflows/publish-to-pypi.yml @@ -16,15 +16,15 @@ jobs: python-version: "3.x" - name: Build distribution 📦 run: | - pip install wheel + pip install build make dist - name: Validate run: | pip install dist/*.whl python -c "import ads;" -## To run publish to test PyPI secret with token needs to be added, +## To run publish to test PyPI a secret with token needs to be added, ## this one GH_ADS_TESTPYPI_TOKEN - removed after initial test. -## Project name also needed to be updated in setup.py - setup(name="test_oracle_ads", ...), +## Project name also needed to be updated in pyproject.toml - name = "test_oracle_ads" in [project] section ## regular name is occupied by former developer and can't be used for testing # - name: Publish distribution 📦 to Test PyPI # env: diff --git a/.github/workflows/run-unittests-default_setup.yml b/.github/workflows/run-unittests-default_setup.yml index 6084f6ea6..da6ba5c80 100644 --- a/.github/workflows/run-unittests-default_setup.yml +++ b/.github/workflows/run-unittests-default_setup.yml @@ -9,8 +9,7 @@ on: - develop paths: - "ads/**" - - "!ads/ads_version.json" - - setup.py + - pyproject.toml - "**requirements.txt" - .github/workflows/run-unittests.yml - .github/workflows/run-unittests-default_setup.yml diff --git a/.github/workflows/run-unittests.yml b/.github/workflows/run-unittests.yml index aa5ae9e1f..c5ca97368 100644 --- a/.github/workflows/run-unittests.yml +++ b/.github/workflows/run-unittests.yml @@ -9,8 +9,7 @@ on: - develop paths: - "ads/**" - - "!ads/ads_version.json" - - setup.py + - pyproject.toml - "**requirements.txt" - .github/workflows/run-unittests.yml - .github/workflows/run-unittests-default_setup.yml @@ -39,9 +38,9 @@ jobs: fail-fast: false matrix: python-version: ["3.8", "3.9", "3.10"] - test-path: ["tests/unitary", "tests/unitary/with_extras/model"] + test-path: ["tests/unitary/with_extras tests/unitary/default_setup", "tests/unitary/with_extras/model"] include: - - test-path: "tests/unitary" + - test-path: "tests/unitary/with_extras tests/unitary/default_setup" ignore-path: "--ignore tests/unitary/with_extras/model --ignore tests/unitary/with_extras/feature_store" name: "unitary" - test-path: "tests/unitary/with_extras/model" @@ -148,15 +147,15 @@ 
jobs: run: | set -x # print commands that are executed - # Prepare default cov body text + # Prepare default cov body text COV_BODY_INTRO="📌 Overall coverage:\n\n" echo COV_BODY="$COV_BODY_INTRO No success to gather report. 😿" >> $GITHUB_ENV # Prepare file paths to .coverage files # Filenames taken from job.test last step with name - "Save coverage files" - FILE_UNITARY="cov-reports-unitary/.coverage"; [[ ! -f $FILE_UNITARY ]] && FILE_UNITARY="" + FILE_UNITARY="cov-reports-unitary/.coverage"; [[ ! -f $FILE_UNITARY ]] && FILE_UNITARY="" FILE_MODEL="cov-reports-model/.coverage"; [[ ! -f $FILE_MODEL ]] && FILE_MODEL="" - + # Combine coverage files pip install coverage coverage combine $FILE_UNITARY $FILE_MODEL @@ -166,7 +165,7 @@ jobs: # Calculate overall coverage and update body message COV=$(grep -E 'pc_cov' htmlcov/index.html | cut -d'>' -f 2 | cut -d'%' -f 1) - if [[ ! -z $COV ]]; then + if [[ ! -z $COV ]]; then if [[ $COV -lt 50 ]]; then COLOR=red; elif [[ $COV -lt 80 ]]; then COLOR=yellow; else COLOR=green; fi echo COV_BODY="$COV_BODY_INTRO ![Coverage-$COV%](https://img.shields.io/badge/coverage-$COV%25-$COLOR)" >> $GITHUB_ENV fi @@ -176,23 +175,23 @@ jobs: run: | set -x # print commands that are executed - # Prepare default diff body text + # Prepare default diff body text DIFF_BODY_INTRO="📌 Cov diff with **${{ env.COMPARE_BRANCH }}**:\n\n" echo DIFF_BODY="$DIFF_BODY_INTRO No success to gather report. 😿" >> $GITHUB_ENV # Prepare file paths to coverage xml files # Filenames taken from job.test last step with name - "Save coverage files" - FILE1="cov-reports-unitary/coverage.xml"; [[ ! -f $FILE1 ]] && FILE1="" + FILE1="cov-reports-unitary/coverage.xml"; [[ ! -f $FILE1 ]] && FILE1="" FILE2="cov-reports-model/coverage.xml"; [[ ! -f $FILE2 ]] && FILE2="" echo "FILE1=$FILE1" >> $GITHUB_ENV echo "FILE2=$FILE2" >> $GITHUB_ENV # Calculate coverage diff and update body message pip install diff_cover - diff-cover $FILE1 $FILE2 --compare-branch=origin/${{ env.COMPARE_BRANCH }} + diff-cover $FILE1 $FILE2 --compare-branch=origin/${{ env.COMPARE_BRANCH }} DIFF=$(diff-cover $FILE1 $FILE2 \ --compare-branch=origin/${{ env.COMPARE_BRANCH }} | grep Coverage: | cut -d' ' -f 2 | cut -d'%' -f 1) - if [[ -z $DIFF ]]; then + if [[ -z $DIFF ]]; then DIFF_INFO=$(diff-cover $FILE1 $FILE2 \ --compare-branch=origin/${{ env.COMPARE_BRANCH }} | grep "No lines"); echo DIFF_BODY="$DIFF_BODY_INTRO $DIFF_INFO">> $GITHUB_ENV diff --git a/MANIFEST.in b/MANIFEST.in index f03e3df04..00307141d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,5 @@ include LICENSE.txt include THIRD_PARTY_LICENSES.txt -include ads/ads_version.json include ads/common/*.json include ads/feature_engineering/*.json include ads/templates/*.jinja2 @@ -15,6 +14,7 @@ include ads/opctl/conda/manifest_template.yaml include ads/opctl/conda/config.yaml include ads/opctl/config/diagnostics/**/*.yaml include ads/opctl/templates/*.jinja2 +include pyproject.toml global-exclude tests/** global-exclude notebooks/** exclude tests/** @@ -23,4 +23,4 @@ exclude build/lib/tests/** exclude build/lib/notebooks/** exclude benchmark/** include ads/ads -include ads/model/common/*.* \ No newline at end of file +include ads/model/common/*.* diff --git a/Makefile b/Makefile index 049288f79..e8726f7ed 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ RELEASE_BRANCH := release/ads DOCS_RELEASE_BRANCH := release CLONE_DIR := /tmp/advanced-ds DOCS_CLONE_DIR := /tmp/ads-docs -COPY_INVENTORY := setup.py CONTRIBUTING.md LICENSE.txt MANIFEST.in README-development.md 
README.md SECURITY.md THIRD_PARTY_LICENSES.txt +COPY_INVENTORY := CONTRIBUTING.md LICENSE.txt MANIFEST.in README-development.md README.md SECURITY.md THIRD_PARTY_LICENSES.txt prepare-release-branch: clean @git checkout master @@ -13,31 +13,32 @@ prepare-release-branch: clean prepare-ads: @echo "Started advanced-ds clone at $$(date)" @git clone ssh://git@bitbucket.oci.oraclecorp.com:7999/odsc/advanced-ds.git --branch $(RELEASE_BRANCH) --depth 1 $(CLONE_DIR) - @echo "Finished cloning at $$(date)" + @echo "Finished cloning at $$(date)" cp -r $(CLONE_DIR)/ads . $(foreach var,$(COPY_INVENTORY),cp $(CLONE_DIR)/$(var) .;) -prepare-docs: +prepare-docs: @echo "Started ads_docs clone at $$(date)" @git clone ssh://git@bitbucket.oci.oraclecorp.com:7999/odsc/ads_docs.git --branch $(DOCS_RELEASE_BRANCH) --depth 1 $(DOCS_CLONE_DIR) - @echo "Finished cloning at $$(date)" + @echo "Finished cloning at $$(date)" cp -r $(DOCS_CLONE_DIR)/source docs/ && cp $(DOCS_CLONE_DIR)/requirements.txt docs prepare: prepare-release-branch prepare-ads prepare-docs push: clean - @bash -c 'if [[ $$(git branch | grep \*) == "* release/$(RELEASE_VERSION)" ]];then echo "Version matching current branch"; else echo "Set proper value to RELEASE_VERSION";exit 1 ; fi' + @bash -c 'if [[ $$(git branch | grep \*) == "* release/$(RELEASE_VERSION)" ]];then echo "Version matching current branch"; else echo "Set proper value to RELEASE_VERSION";exit 1 ; fi' @git add . @git commit -m "Release version: $(RELEASE_VERSION)" @git push --set-upstream origin release/$(RELEASE_VERSION) dist: clean - @python3 setup.py sdist bdist_wheel + @python3 -m build publish: dist @twine upload dist/* clean: + @echo "Cleaning - removing dist, *.pyc, Thumbs.db and other files" @rm -rf dist build oracle_ads.egg-info @find ./ -name '*.pyc' -exec rm -f {} \; @find ./ -name 'Thumbs.db' -exec rm -f {} \; diff --git a/README-development.md b/README-development.md index 99d35555d..d09515bef 100644 --- a/README-development.md +++ b/README-development.md @@ -58,7 +58,7 @@ Open the destination folder where you want to clone ADS library, and install dep python3 -m pip install -e . ``` -To which packages were installed and their version numbers, run: +To view which packages were installed and their version numbers, run: ```bash python3 -m pip freeze @@ -71,10 +71,11 @@ You should also set up configuration files, see the [SDK and CLI Configuration F ### Step 5: Versioning and generation the wheel -Use `ads_version.json` for versioning. The ADS SDK is packaged as a wheel. To generate the wheel, you can run: +Bump the versions in `pyproject.toml`. The ADS SDK using [build](https://pypa-build.readthedocs.io/en/stable/index.html) as build frontend. To generate sdist and wheel, you can run: ```bash - python3 setup.py sdist bdist_wheel + pip install build + python3 -m build ``` This wheel can then be installed using `pip`. @@ -85,7 +86,7 @@ The SDK uses pytest as its test framework. ### Running default setup tests -Default setup tests for testing ADS SDK without extra dependencies, specified in setup.py. +Default setup tests for testing ADS SDK without extra dependencies, specified in `pyproject.toml` in `[project.optional-dependencies]`. ```bash # Update your environment with tests dependencies diff --git a/README.md b/README.md index df714acd4..e56452084 100644 --- a/README.md +++ b/README.md @@ -29,12 +29,6 @@ You have various options when installing ADS. ### Installing extras libraries -The `all-optional` module will install all optional dependencies. 
Note the single quotes around installation of extra libraries. - -```bash - python3 -m pip install 'oracle-ads[all-optional]' -``` - To work with gradient boosting models, install the `boosted` module. This module includes XGBoost and LightGBM model classes. ```bash @@ -107,6 +101,8 @@ Install the `viz` module to include libraries for visualization tasks. Some of t python3 -m pip install 'oracle-ads[viz]' ``` +See `pyproject.toml` file `[project.optional-dependencies]` section for full list of modules and its list of extra libraries. + **Note** Multiple extra dependencies can be installed together. For example: diff --git a/SECURITY.md b/SECURITY.md index fb2384138..2ca81027f 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -21,7 +21,7 @@ security features are welcome on GitHub Issues. Security updates will be released on a regular cadence. Many of our projects will typically release security fixes in conjunction with the -[Oracle Critical Patch Update][3] program. Additional +Oracle Critical Patch Update program. Additional information, including past advisories, is available on our [security alerts][4] page. diff --git a/ads/__init__.py b/ads/__init__.py index 65e06fc51..8a2faf209 100644 --- a/ads/__init__.py +++ b/ads/__init__.py @@ -6,15 +6,18 @@ from __future__ import print_function, division, absolute_import import os +import sys import logging import json from typing import Callable, Dict, Optional, Union -__version__ = "" -with open( - os.path.join(os.path.dirname(os.path.abspath(__file__)), "ads_version.json") -) as version_file: - __version__ = json.load(version_file)["version"] +# https://packaging.python.org/en/latest/guides/single-sourcing-package-version/#single-sourcing-the-package-version +if sys.version_info >= (3, 8): + from importlib import metadata +else: + import importlib_metadata as metadata + +__version__ = metadata.version("oracle_ads") import oci import matplotlib.font_manager # causes matplotlib to regenerate its fonts @@ -138,7 +141,8 @@ def hello(): oci v{oci.__version__} ocifs v{ocifs.__version__} -""") +""" + ) configure_plotting() diff --git a/ads/ads_version.json b/ads/ads_version.json deleted file mode 100644 index ad9f8034e..000000000 --- a/ads/ads_version.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "version": "2.8.8" -} diff --git a/ads/cli.py b/ads/cli.py index de0bd2418..2025790c4 100644 --- a/ads/cli.py +++ b/ads/cli.py @@ -1,10 +1,13 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. 
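> Aside on the version handling above: the `ads/__init__.py` hunk (and the `ads/cli.py` hunk that follows) replaces the `ads_version.json` lookup with package metadata. A minimal sketch of that single-sourcing pattern, with a hypothetical fallback for the case where the distribution is not installed (the fallback is not part of the patch):

```python
# Illustrative sketch: reading the installed distribution's version the same way
# the patched ads/__init__.py does. The distribution name ("oracle_ads") is looked
# up in package metadata, not the import package name ("ads").
import sys

if sys.version_info >= (3, 8):
    from importlib import metadata
else:  # Python 3.7 needs the importlib_metadata backport
    import importlib_metadata as metadata

try:
    version = metadata.version("oracle_ads")
except metadata.PackageNotFoundError:
    # Not installed (e.g. a raw source checkout without `pip install -e .`),
    # so no package metadata is available.
    version = "unknown"

print(version)
```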
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -from ads.common import logger + import traceback +import sys + +from ads.common import logger try: import click @@ -22,11 +25,13 @@ logger.debug(traceback.format_exc()) exit() +# https://packaging.python.org/en/latest/guides/single-sourcing-package-version/#single-sourcing-the-package-version +if sys.version_info >= (3, 8): + from importlib import metadata +else: + import importlib_metadata as metadata -with open( - os.path.join(os.path.dirname(os.path.abspath(__file__)), "ads_version.json") -) as version_file: - ADS_VERSION = json.load(version_file)["version"] +ADS_VERSION = metadata.version("oracle_ads") @click.group() diff --git a/ads/common/auth.py b/ads/common/auth.py index 7fc9de2ba..e3aec78e6 100644 --- a/ads/common/auth.py +++ b/ads/common/auth.py @@ -889,11 +889,12 @@ def _read_security_token_file(self, security_token_file: str) -> str: str: Security token string. """ - if not os.path.isfile(security_token_file): + expanded_path = os.path.expanduser(security_token_file) + if not os.path.isfile(expanded_path): raise ValueError("Invalid `security_token_file`. Specify a valid path.") try: token = None - with open(security_token_file, 'r') as f: + with open(expanded_path, 'r') as f: token = f.read() return token except: diff --git a/ads/common/model_export_util.py b/ads/common/model_export_util.py index cb9ed815e..09e708bcf 100644 --- a/ads/common/model_export_util.py +++ b/ads/common/model_export_util.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2020, 2022 Oracle and/or its affiliates. +# Copyright (c) 2020, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import warnings @@ -261,7 +261,7 @@ def prepare_generic_model( progress.update("Updating requirements.txt") if fn_artifact_files_included: - # fdk removed from dependency list in setup.py (fn deployments deprecated) + # fdk removed from dependency list in pyproject.toml (fn deployments deprecated) # before we request versions we want to check if fdk installed by user # and provide support in error message, if not installed try: @@ -449,7 +449,6 @@ def _sklearn_to_onnx(model=None, target_dir=None, X=None, y=None, **kwargs): def _automl_to_pkl(model=None, target_dir=None, **kwargs): - with open(os.path.join(target_dir, "model.pkl"), "wb") as outfile: cloudpickle.dump(model, outfile) @@ -484,7 +483,6 @@ def _lightgbm_to_onnx(model=None, target_dir=None, X=None, y=None, **kwargs): options={"nocl": [True, False], "zipmap": [True, False]}, ) elif lightgbm.sklearn.LGBMRegressor in model_est_types: - from onnxmltools.convert.lightgbm.operator_converters.LightGbm import ( convert_lightgbm, ) @@ -527,7 +525,6 @@ def _xgboost_to_onnx(model=None, target_dir=None, X=None, y=None, **kwargs): else: model_est_types = [type(model.est)] if xgboost.sklearn.XGBClassifier in model_est_types: - from onnxmltools.convert.xgboost.operator_converters.XGBoost import ( convert_xgboost, ) diff --git a/ads/common/utils.py b/ads/common/utils.py index 12242ec7c..49afcaada 100644 --- a/ads/common/utils.py +++ b/ads/common/utils.py @@ -53,6 +53,9 @@ from ads.dataset.progress import DummyProgressBar, TqdmProgressBar from . 
import auth as authutil +from oci import object_storage +from ads.common.oci_client import OCIClientFactory +from ads.common.object_storage_details import ObjectStorageDetails # For Model / Model Artifact libraries lib_translator = {"sklearn": "scikit-learn"} @@ -100,6 +103,9 @@ # declare custom exception class +# The number of worker processes to use in parallel for uploading individual parts of a multipart upload. +DEFAULT_PARALLEL_PROCESS_COUNT = 9 + class FileOverwriteError(Exception): # pragma: no cover pass @@ -1599,3 +1605,103 @@ def is_path_exists(uri: str, auth: Optional[Dict] = None) -> bool: if fsspec.filesystem(path_scheme, **storage_options).exists(uri): return True return False + + +def upload_to_os( + src_uri: str, + dst_uri: str, + auth: dict = None, + parallel_process_count: int = DEFAULT_PARALLEL_PROCESS_COUNT, + progressbar_description: str = "Uploading `{src_uri}` to `{dst_uri}`.", + force_overwrite: bool = False, +): + """Utilizes `oci.object_storage.Uploadmanager` to upload file to Object Storage. + + Parameters + ---------- + src_uri: str + The path to the file to upload. This should be local path. + dst_uri: str + Object Storage path, eg. `oci://my-bucket@my-tenancy/prefix``. + auth: (Dict, optional) Defaults to None. + default_signer() + parallel_process_count: (int, optional) Defaults to 3. + The number of worker processes to use in parallel for uploading individual + parts of a multipart upload. + progressbar_description: (str, optional) Defaults to `"Uploading `{src_uri}` to `{dst_uri}`"`. + Prefix for the progressbar. + force_overwrite: (bool, optional). Defaults to False. + Whether to overwrite existing files or not. + + Returns + ------- + Response: oci.response.Response + The response from multipart commit operation or the put operation. + + Raise + ----- + ValueError + When the given `dst_uri` is not a valid Object Storage path. + FileNotFoundError + When the given `src_uri` does not exist. + RuntimeError + When upload operation fails. + """ + if not os.path.exists(src_uri): + raise FileNotFoundError(f"The give src_uri: {src_uri} does not exist.") + + if not ObjectStorageDetails.is_oci_path( + dst_uri + ) or not ObjectStorageDetails.is_valid_uri(dst_uri): + raise ValueError( + f"The given dst_uri:{dst_uri} is not a valid Object Storage path." + ) + + auth = auth or authutil.default_signer() + + if not force_overwrite and is_path_exists(dst_uri): + raise FileExistsError( + f"The `{dst_uri}` exists. Please use a new file name or " + "set force_overwrite to True if you wish to overwrite." 
+ ) + + upload_manager = object_storage.UploadManager( + object_storage_client=OCIClientFactory(**auth).object_storage, + parallel_process_count=parallel_process_count, + allow_multipart_uploads=True, + allow_parallel_uploads=True, + ) + + file_size = os.path.getsize(src_uri) + with open(src_uri, "rb") as fs: + with tqdm( + total=file_size, + unit="B", + unit_scale=True, + unit_divisor=1024, + position=0, + leave=False, + file=sys.stdout, + desc=progressbar_description, + ) as pbar: + + def progress_callback(progress): + pbar.update(progress) + + bucket_details = ObjectStorageDetails.from_path(dst_uri) + response = upload_manager.upload_stream( + namespace_name=bucket_details.namespace, + bucket_name=bucket_details.bucket, + object_name=bucket_details.filepath, + stream_ref=fs, + progress_callback=progress_callback, + ) + + if response.status == 200: + print(f"{src_uri} has been successfully uploaded to {dst_uri}.") + else: + raise RuntimeError( + f"Failed to upload {src_uri}. Response code is {response.status}" + ) + + return response diff --git a/ads/dataset/helper.py b/ads/dataset/helper.py index 9a245dadb..777a6eb39 100644 --- a/ads/dataset/helper.py +++ b/ads/dataset/helper.py @@ -314,7 +314,6 @@ def _get_dtype_from_error(e): error_string = str(e) if "mismatched dtypes" in error_string.lower(): - # For the mismatched dtypes error, dask either returns a error message containing the dtype argument # to specify, or the found and expected dtypes in a table format, depending on what stage # the type inferencing fails. The below logic supports building the dtype dictionary for both cases @@ -732,8 +731,8 @@ def down_sample(df, target): """ dfs = [] target_value_counts = df[target].value_counts() - min_key = min(target_value_counts.iteritems(), key=lambda k: k[1]) - for key, value in target_value_counts.iteritems(): + min_key = min(target_value_counts.items(), key=lambda k: k[1]) + for key, value in target_value_counts.items(): if key != min_key[0]: dfs.append( df[df[target] == key].sample(frac=1 - ((value - min_key[1]) / value)) @@ -835,6 +834,7 @@ def _log_yscale_not_set(): "`yscale` parameter is not set. Valid values are `'linear'`, `'log'`, `'symlog'`." 
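> Aside on the new `upload_to_os` helper added to `ads/common/utils.py` above: a minimal usage sketch, assuming a local file and a writable Object Storage bucket (bucket, namespace and paths below are placeholders):

```python
# Usage sketch for upload_to_os; parameters mirror the definition above.
from ads.common import auth as authutil
from ads.common.utils import upload_to_os

response = upload_to_os(
    src_uri="/tmp/model_artifact.zip",                                 # local file to upload
    dst_uri="oci://my-bucket@my-namespace/prefix/model_artifact.zip",  # Object Storage target
    auth=authutil.default_signer(),                                    # or api_keys()/resource_principal()
    parallel_process_count=9,                                          # workers for the multipart upload
    force_overwrite=True,                                              # overwrite an existing object
)
print(response.status)
```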
) + def infer_target_type(target, target_series, discover_target_type=True): # if type discovery is turned off, infer type from pandas dtype if discover_target_type: @@ -845,6 +845,7 @@ def infer_target_type(target, target_series, discover_target_type=True): target_type = get_feature_type(target, target_series) return target_type + def get_target_type(target, sampled_df, **init_kwargs): discover_target_type = init_kwargs.get("type_discovery", True) if target in init_kwargs.get("types", {}): @@ -852,6 +853,7 @@ def get_target_type(target, sampled_df, **init_kwargs): discover_target_type = False return infer_target_type(target, sampled_df[target], discover_target_type) + def get_dataset( df: pd.DataFrame, sampled_df: pd.DataFrame, @@ -860,12 +862,12 @@ def get_dataset( shape: Tuple[int, int], positive_class=None, **init_kwargs, -): +): from ads.dataset.classification_dataset import ( - BinaryClassificationDataset, - BinaryTextClassificationDataset, - MultiClassClassificationDataset, - MultiClassTextClassificationDataset + BinaryClassificationDataset, + BinaryTextClassificationDataset, + MultiClassClassificationDataset, + MultiClassTextClassificationDataset, ) from ads.dataset.forecasting_dataset import ForecastingDataset from ads.dataset.regression_dataset import RegressionDataset @@ -874,9 +876,7 @@ def get_dataset( logger.warning( "It is not recommended to use an empty column as the target variable." ) - raise ValueError( - f"We do not support using empty columns as the chosen target" - ) + raise ValueError(f"We do not support using empty columns as the chosen target") if utils.is_same_class(target_type, ContinuousTypedFeature): return RegressionDataset( df=df, @@ -899,9 +899,9 @@ def get_dataset( ) # Adding ordinal typed feature, but ultimately we should rethink how we want to model this type - elif utils.is_same_class(target_type, CategoricalTypedFeature) or utils.is_same_class( - target_type, OrdinalTypedFeature - ): + elif utils.is_same_class( + target_type, CategoricalTypedFeature + ) or utils.is_same_class(target_type, OrdinalTypedFeature): if target_type.meta_data["internal"]["unique"] == 2: if is_text_data(sampled_df, target): return BinaryTextClassificationDataset( @@ -946,17 +946,13 @@ def get_dataset( or "text" in target_type["type"] or "text" in target ): - raise ValueError( - f"The column {target} cannot be used as the target column." - ) + raise ValueError(f"The column {target} cannot be used as the target column.") elif ( utils.is_same_class(target_type, GISTypedFeature) or "coord" in target_type["type"] or "coord" in target ): - raise ValueError( - f"The column {target} cannot be used as the target column." - ) + raise ValueError(f"The column {target} cannot be used as the target column.") # This is to catch constant columns that are boolean. 
Added as a fix for pd.isnull(), and datasets with a # binary target, but only data on one instance elif target_type["low_level_type"] == "bool": @@ -974,6 +970,7 @@ def get_dataset( f"For example, types = {{{target}: 'category'}}" ) + def open( source, target=None, @@ -1074,9 +1071,7 @@ def open( progress.update("Opening data") path = ElaboratedPath(source, format=format, **kwargs) reader_fn = ( - get_format_reader(path=path, **kwargs) - if reader_fn is None - else reader_fn + get_format_reader(path=path, **kwargs) if reader_fn is None else reader_fn ) df = load_dataset(path=path, reader_fn=reader_fn, **kwargs) name = path.name @@ -1108,6 +1103,7 @@ def open( ), ) + def build_dataset( df: pd.DataFrame, shape: Tuple[int, int], @@ -1149,9 +1145,7 @@ def build_dataset( discover_target_type = False # if type discovery is turned off, infer type from pandas dtype - target_type = infer_target_type( - target, sampled_df[target], discover_target_type - ) + target_type = infer_target_type(target, sampled_df[target], discover_target_type) result = get_dataset( df=df, @@ -1168,6 +1162,7 @@ def build_dataset( ) return result + class CustomFormatReaders: @staticmethod def read_tsv(path: str, **kwargs) -> pd.DataFrame: @@ -1352,7 +1347,6 @@ def read_xml(path: str, **kwargs) -> pd.DataFrame: import xml.etree.cElementTree as et def get_children(df, node, parent, i): - for name in node.attrib.keys(): df.at[i, parent + name] = node.attrib[name] for child in list(node): @@ -1374,6 +1368,7 @@ def get_children(df, node, parent, i): last_i = i return ret_df + reader_fns = { "csv": pd.read_csv, "tsv": CustomFormatReaders.read_tsv, @@ -1399,6 +1394,7 @@ def get_children(df, node, parent, i): "xml": CustomFormatReaders.read_xml, } + def validate_kwargs(func: Callable, kwargs): valid_params = inspect.signature(func).parameters if "kwargs" in valid_params: @@ -1406,6 +1402,7 @@ def validate_kwargs(func: Callable, kwargs): else: return {k: v for k, v in kwargs.items() if k in valid_params} + def get_format_reader(path: ElaboratedPath, **kwargs) -> Callable: format_key = path.format try: @@ -1420,6 +1417,7 @@ def get_format_reader(path: ElaboratedPath, **kwargs) -> Callable: return reader_fn + def load_dataset(path: ElaboratedPath, reader_fn: Callable, **kwargs) -> pd.DataFrame: dfs = [] for filename in path.paths: diff --git a/ads/dataset/label_encoder.py b/ads/dataset/label_encoder.py index bc9860d57..e558fb076 100644 --- a/ads/dataset/label_encoder.py +++ b/ads/dataset/label_encoder.py @@ -1,27 +1,55 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2020, 2022 Oracle and/or its affiliates. +# Copyright (c) 2020, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import bisect -from collections import defaultdict +import numpy as np +from collections import defaultdict from sklearn.base import TransformerMixin from sklearn.preprocessing import LabelEncoder class DataFrameLabelEncoder(TransformerMixin): """ - Label encoder for pandas.dataframe. dask.dataframe.core.DataFrame + Label encoder for `pandas.DataFrame` and `dask.dataframe.core.DataFrame`. + + Attributes + ---------- + label_encoders : defaultdict + Holds the label encoder for each column. 
+ + Examples + -------- + >>> import pandas as pd + >>> from ads.dataset.label_encoder import DataFrameLabelEncoder + + >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]}) + >>> le = DataFrameLabelEncoder() + >>> le.fit_transform(X=df) + """ def __init__(self): + """Initialize an instance of DataFrameLabelEncoder.""" self.label_encoders = defaultdict(LabelEncoder) - def fit(self, X): + def fit(self, X: "pandas.DataFrame"): """ - Fits a DataFrameLAbelEncoder. + Fits a DataFrameLabelEncoder. + + Parameters + ---------- + X : pandas.DataFrame + Target values. + + Returns + ------- + self : returns an instance of self. + Fitted label encoder. + """ for column in X.columns: if X[column].dtype.name in ["object", "category"]: @@ -33,12 +61,24 @@ def fit(self, X): for class_ in self.label_encoders[column].classes_.tolist() ] bisect.insort_left(label_encoder_classes_, "unknown") + label_encoder_classes_ = np.asarray(label_encoder_classes_) self.label_encoders[column].classes_ = label_encoder_classes_ return self - def transform(self, X): + def transform(self, X: "pandas.DataFrame"): """ - Transforms a dataset using the DataFrameLAbelEncoder. + Transforms a dataset using the DataFrameLabelEncoder. + + Parameters + ---------- + X : pandas.DataFrame + Target values. + + Returns + ------- + pandas.DataFrame + Labels as normalized encodings. + """ categorical_columns = list(self.label_encoders.keys()) if len(categorical_columns) == 0: diff --git a/ads/dataset/recommendation_transformer.py b/ads/dataset/recommendation_transformer.py index 94bd69912..158b2f2b1 100644 --- a/ads/dataset/recommendation_transformer.py +++ b/ads/dataset/recommendation_transformer.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- -# Copyright (c) 2020, 2022 Oracle and/or its affiliates. +# Copyright (c) 2020, 2023 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ from __future__ import print_function, absolute_import @@ -131,7 +131,6 @@ def _get_recommendations(self, df): self.feature_metadata_[self.target_] = self.target_type_ for column in df.columns.values[df.isnull().any()]: - # filter out columns that were discovered as constant or primary key columns in the previous step, # as they would get dropped before imputation if ( @@ -246,10 +245,10 @@ def _get_recommendations(self, df): if not self.is_balanced and self.fix_imbalance: target_value_counts = df[self.target_].value_counts() minority_class_len = min( - target_value_counts.iteritems(), key=lambda k: k[1] + target_value_counts.items(), key=lambda k: k[1] )[1] majority_class_len = max( - target_value_counts.iteritems(), key=lambda k: k[1] + target_value_counts.items(), key=lambda k: k[1] )[1] minor_majority_ratio = minority_class_len / majority_class_len diff --git a/ads/evaluations/evaluation_plot.py b/ads/evaluations/evaluation_plot.py index 13a9f88a3..fb89edaee 100644 --- a/ads/evaluations/evaluation_plot.py +++ b/ads/evaluations/evaluation_plot.py @@ -447,7 +447,7 @@ def _lift_and_gain_chart(cls, ax, evaluation): @classmethod def _lift_chart(cls, ax, evaluation): - for mod_name, col in evaluation.iteritems(): + for mod_name, col in evaluation.items(): if col["y_score"] is not None: ax.plot( col["percentages"][1:], @@ -476,7 +476,7 @@ def _lift_chart(cls, ax, evaluation): @classmethod def _gain_chart(cls, ax, evaluation): - for mod_name, col in evaluation.iteritems(): + for mod_name, col in evaluation.items(): if col["y_score"] is not None: ax.plot( col["percentages"], @@ -517,7 +517,7 @@ def _pr_curve(cls, axs, evaluation): ax.axis("off") return if cls.prob_type == "_bin": - for mod_name, col in evaluation.iteritems(): + for mod_name, col in evaluation.items(): if col["y_score"] is not None: ax.plot( col["recall_values"], @@ -589,7 +589,7 @@ def _roc_curve(cls, axs, evaluation): ax.axis("off") return if cls.prob_type == "_bin": - for mod_name, col in evaluation.iteritems(): + for mod_name, col in evaluation.items(): if col["y_score"] is not None: ax.plot( col["false_positive_rate"], @@ -803,7 +803,6 @@ def _pretty_scatter( label=None, plot_kwargs=None, ): - if plot_kwargs is None: plot_kwargs = {} ax.scatter(x, y, s=s, label=label, marker="o", alpha=alpha, **plot_kwargs) diff --git a/ads/feature_engineering/accessor/dataframe_accessor.py b/ads/feature_engineering/accessor/dataframe_accessor.py index e2618e1fa..92ec9babd 100644 --- a/ads/feature_engineering/accessor/dataframe_accessor.py +++ b/ads/feature_engineering/accessor/dataframe_accessor.py @@ -218,7 +218,7 @@ def feature_type_description(self) -> pd.DataFrame: for col in self._obj: series_feature_type_df = self._obj[col].ads.feature_type_description series_feature_type_df.insert(0, "Column", col) - result_df = result_df.append(series_feature_type_df) + result_df = pd.concat([result_df, series_feature_type_df]) result_df.reset_index(drop=True, inplace=True) return result_df diff --git a/ads/feature_engineering/accessor/mixin/correlation.py b/ads/feature_engineering/accessor/mixin/correlation.py index 593d9391f..4bf7864c5 100644 --- a/ads/feature_engineering/accessor/mixin/correlation.py +++ b/ads/feature_engineering/accessor/mixin/correlation.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. 
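> Aside on the recurring `iteritems()` → `items()` replacements above (in `down_sample`, the recommendation transformer and the evaluation plots): `Series.iteritems()` was deprecated in pandas 1.5 and removed in pandas 2.0, and `Series.items()` is the drop-in replacement. A tiny sketch of the pattern used in `down_sample`:

```python
import pandas as pd

target_value_counts = pd.Series(["a", "a", "a", "b"]).value_counts()

# Old (removed in pandas 2.0): target_value_counts.iteritems()
min_key = min(target_value_counts.items(), key=lambda k: k[1])
for key, value in target_value_counts.items():
    print(key, value, key == min_key[0])
```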
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ from __future__ import print_function, absolute_import @@ -68,7 +68,7 @@ def _list_to_dataframe( correlation_matrix = correlation_matrix.loc[:, correlation_matrix.index] if normal_form: data = [] - for (col1, col2), corr in correlation_matrix.stack().iteritems(): + for (col1, col2), corr in correlation_matrix.stack().items(): data.append([col1, col2, round(corr, 4)]) return pd.DataFrame(data, columns=["Column 1", "Column 2", "Value"]) else: @@ -161,6 +161,6 @@ def cont_vs_cont(df: pd.DataFrame, normal_form: bool = True) -> pd.DataFrame: if not normal_form: return df.corr(method="pearson") data = [] - for (col1, col2), corr in df.corr(method="pearson").stack().iteritems(): + for (col1, col2), corr in df.corr(method="pearson").stack().items(): data.append([col1, col2, round(corr, 4)]) return pd.DataFrame(data, columns=["Column 1", "Column 2", "Value"]) diff --git a/ads/feature_engineering/accessor/mixin/eda_mixin.py b/ads/feature_engineering/accessor/mixin/eda_mixin.py index 5b9414036..927b3e94c 100644 --- a/ads/feature_engineering/accessor/mixin/eda_mixin.py +++ b/ads/feature_engineering/accessor/mixin/eda_mixin.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -262,5 +262,5 @@ def warning(self) -> pd.DataFrame: warning_df = self._obj[col].ads.warning() if warning_df is not None: warning_df.insert(0, "Column", col) - result_df = result_df.append(warning_df) + result_df = pd.concat([result_df, warning_df]) return result_df.reset_index(drop=True) diff --git a/ads/feature_engineering/accessor/mixin/eda_mixin_series.py b/ads/feature_engineering/accessor/mixin/eda_mixin_series.py index 72900ffff..88c980354 100644 --- a/ads/feature_engineering/accessor/mixin/eda_mixin_series.py +++ b/ads/feature_engineering/accessor/mixin/eda_mixin_series.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -80,6 +80,6 @@ def warning(self) -> pd.DataFrame: warning_df = feature_type.warning(self._obj) if warning_df is not None: warning_df.insert(0, "Feature Type", feature_type.name) - result_df = result_df.append(warning_df) + result_df = pd.concat([result_df, warning_df]) result_df.reset_index(drop=True, inplace=True) return result_df diff --git a/ads/feature_engineering/accessor/mixin/feature_types_mixin.py b/ads/feature_engineering/accessor/mixin/feature_types_mixin.py index 30152b387..243ac8fc7 100644 --- a/ads/feature_engineering/accessor/mixin/feature_types_mixin.py +++ b/ads/feature_engineering/accessor/mixin/feature_types_mixin.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. 
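> Aside on the repeated `result_df.append(...)` → `pd.concat([result_df, ...])` changes in the accessor mixins above (and in the similar hunks that follow): `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0. A short sketch of the replacement, using made-up warning rows:

```python
import pandas as pd

result_df = pd.DataFrame({"Column": ["age"], "Warning": ["missing values"]})
warning_df = pd.DataFrame({"Column": ["income"], "Warning": ["skewed distribution"]})

# Old (removed in pandas 2.0): result_df = result_df.append(warning_df)
result_df = pd.concat([result_df, warning_df])
result_df.reset_index(drop=True, inplace=True)
print(result_df)
```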
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -115,14 +115,14 @@ def warning_registered(self) -> pd.DataFrame: for col in self._obj.columns: feature_type_df = self._obj[col].ads.warning_registered() feature_type_df.insert(0, "Column", col) - result_df = result_df.append(feature_type_df) + result_df = pd.concat([result_df, feature_type_df]) else: result_df = pd.DataFrame((), columns=common_columns) for feature_type in self._feature_type: feature_type_df = feature_type.warning.registered() feature_type_df.insert(0, "Feature Type", feature_type.name) feature_type_df = feature_type_df.rename(columns={"Name": "Warning"}) - result_df = result_df.append(feature_type_df) + result_df = pd.concat([result_df, feature_type_df]) result_df.reset_index(drop=True, inplace=True) return result_df @@ -155,14 +155,14 @@ def validator_registered(self) -> pd.DataFrame: for col in self._obj.columns: feature_type_df = self._obj[col].ads.validator_registered() feature_type_df.insert(0, "Column", col) - result_df = result_df.append(feature_type_df) + result_df = pd.concat([result_df, feature_type_df]) else: result_df = pd.DataFrame((), columns=common_columns) for feature_type in self._feature_type: feature_type_df = feature_type.validator.registered() feature_type_df.insert(0, "Feature Type", feature_type.name) feature_type_df = feature_type_df.rename(columns={"Name": "Validator"}) - result_df = result_df.append(feature_type_df) + result_df = pd.concat([result_df, feature_type_df]) result_df.reset_index(drop=True, inplace=True) return result_df diff --git a/ads/feature_engineering/feature_type/creditcard.py b/ads/feature_engineering/feature_type/creditcard.py index 44e040cb9..63bd42ecf 100644 --- a/ads/feature_engineering/feature_type/creditcard.py +++ b/ads/feature_engineering/feature_type/creditcard.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -198,6 +198,7 @@ def feature_stat(x: pd.Series): df_stat = _count_unique_missing(x) card_types = x.apply(assign_issuer) value_counts = card_types.value_counts() + value_counts.rename("creditcard", inplace=True) value_counts.index = [ "count_" + cardtype for cardtype in list(value_counts.index) ] diff --git a/ads/feature_engineering/feature_type/handler/feature_validator.py b/ads/feature_engineering/feature_type/handler/feature_validator.py index 778558e62..6dff154a1 100644 --- a/ads/feature_engineering/feature_type/handler/feature_validator.py +++ b/ads/feature_engineering/feature_type/handler/feature_validator.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -261,7 +261,7 @@ def registered(self) -> pd.DataFrame: for key, feature_validator in self._validators.items(): feature_validators_df = feature_validator.registered() feature_validators_df.insert(0, "Validator", key) - result_df = result_df.append(feature_validators_df) + result_df = pd.concat([result_df, feature_validators_df]) result_df.reset_index(drop=True, inplace=True) return result_df diff --git a/ads/feature_engineering/feature_type/handler/feature_warning.py b/ads/feature_engineering/feature_type/handler/feature_warning.py index cee189e69..08caa809a 100644 --- a/ads/feature_engineering/feature_type/handler/feature_warning.py +++ b/ads/feature_engineering/feature_type/handler/feature_warning.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -280,7 +280,7 @@ def _process(self) -> pd.DataFrame: f"Details: '{name}' should return a DataFrame " f"with columns: {expected_columns}." ) - result_df = result_df.append(handler_result) + result_df = pd.concat([result_df, handler_result]) result_df.reset_index(drop=True, inplace=True) return result_df diff --git a/ads/feature_engineering/feature_type_manager.py b/ads/feature_engineering/feature_type_manager.py index c32c2601b..77ddd56ac 100644 --- a/ads/feature_engineering/feature_type_manager.py +++ b/ads/feature_engineering/feature_type_manager.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*-- -# Copyright (c) 2021, 2022 Oracle and/or its affiliates. +# Copyright (c) 2021, 2023 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ """ @@ -371,7 +371,7 @@ def warning_registered(cls) -> pd.DataFrame: feature_type_df = feature_type.warning.registered() feature_type_df.insert(0, "Feature Type", feature_type.name) feature_type_df = feature_type_df.rename(columns={"Name": "Warning"}) - result_df = result_df.append(feature_type_df) + result_df = pd.concat([result_df, feature_type_df]) result_df.reset_index(drop=True, inplace=True) return result_df @@ -401,6 +401,6 @@ def validator_registered(cls) -> pd.DataFrame: feature_type_df = feature_type.validator.registered() feature_type_df.insert(0, "Feature Type", feature_type.name) feature_type_df = feature_type_df.rename(columns={"Name": "Validator"}) - result_df = result_df.append(feature_type_df) + result_df = pd.concat([result_df, feature_type_df]) result_df.reset_index(drop=True, inplace=True) return result_df diff --git a/ads/hpo/ads_search_space.py b/ads/hpo/ads_search_space.py index 3e45637be..e9b2de9c0 100644 --- a/ads/hpo/ads_search_space.py +++ b/ads/hpo/ads_search_space.py @@ -121,7 +121,7 @@ def __init__(self, strategy): def suggest_space(self, **kwargs): space = { "alpha": LogUniformDistribution(10**-4, 10**-1), - "penalty": CategoricalDistribution(["l1", "l2", "none"]), + "penalty": CategoricalDistribution(["l1", "l2", None]), } if self.strategy != "perfunctory": space.update( @@ -144,7 +144,6 @@ def __init__(self, strategy): super(SVCSearchSpace, self).__init__(strategy) def suggest_space(self, **kwargs): - space = { "C": LogUniformDistribution(10**-4, 10**-1), "max_iter": CategoricalDistribution([1000]), @@ -172,7 +171,6 @@ def __init__(self, strategy): super(LinearSVCSearchSpace, self).__init__(strategy) def suggest_space(self, **kwargs): - space = { "C": LogUniformDistribution(10**-4, 10**-1), "dual": CategoricalDistribution([False]), @@ -197,7 +195,6 @@ def __init__(self, strategy): super(LinearSVRSearchSpace, self).__init__(strategy) def suggest_space(self, **kwargs): - space = {"C": LogUniformDistribution(10**-4, 10**-1)} if self.strategy != "perfunctory": @@ -217,7 +214,6 @@ def __init__(self, strategy): super(DecisionTreeClassifierSearchSpace, self).__init__(strategy) def suggest_space(self, **kwargs): - space = { "max_depth": IntUniformDistribution(1, 5), "min_impurity_decrease": UniformDistribution(0, 0.05), @@ -241,7 +237,6 @@ def __init__(self, strategy): super(DecisionTreeRegressorSearchSpace, self).__init__(strategy) def suggest_space(self, **kwargs): - space = { "max_depth": IntUniformDistribution(1, 5), "min_impurity_decrease": UniformDistribution(0, 0.05), @@ -252,7 +247,11 @@ def suggest_space(self, **kwargs): space.update( { "criterion": CategoricalDistribution( - ["mse", "friedman_mse", "mae"] + [ + "squared_error", + "friedman_mse", + "absolute_error", + ] ), "min_samples_leaf": IntUniformDistribution(2, 500), } @@ -335,7 +334,6 @@ def __init__(self, strategy): super(ExtraTreesClassifierSearchSpace, self).__init__(strategy) def suggest_space(self, **kwargs): - space = { "n_estimators": IntUniformDistribution(50, 250), "max_depth": IntUniformDistribution(1, 5), @@ -374,7 +372,6 @@ def __init__(self, strategy): super(GradientBoostingRegressorSearchSpace, self).__init__(strategy) def suggest_space(self, **kwargs): - space = { "max_depth": IntUniformDistribution(1, 5), "max_features": CategoricalDistribution(["sqrt", "log2"]), diff --git a/ads/hpo/objective.py b/ads/hpo/objective.py index 5f6aad4be..1e455ee0c 100644 --- a/ads/hpo/objective.py 
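> Aside on the HPO search-space updates above: they track scikit-learn renames, replacing the string `"none"` with Python `None` for the `penalty` hyperparameter and the `DecisionTreeRegressor` criteria `"mse"`/`"mae"` with `"squared_error"`/`"absolute_error"`. A small sketch, assuming scikit-learn >= 1.2 where the old aliases were removed:

```python
# Old criterion names ("mse", "mae") are rejected by recent scikit-learn releases;
# the renamed values used in the updated search space work across 1.x.
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=50, n_features=3, random_state=0)

model = DecisionTreeRegressor(criterion="squared_error", max_depth=3)
model.fit(X, y)
print(model.criterion)
```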
+++ b/ads/hpo/objective.py @@ -184,7 +184,6 @@ def _cross_validate_with_pruning( for step in range(self.max_iter): for i, (train, test) in enumerate(self.cv.split(X, y, groups=self.groups)): - out = self._partial_fit_and_score( X, y, estimators[i], train, test, partial_fit_params ) @@ -201,7 +200,6 @@ def _cross_validate_with_pruning( trial.report(intermediate_value, step=step) if trial.should_prune(): - self._store_scores(trial, scores, self.scoring_name) raise optuna.TrialPruned(f"trial was pruned at iteration {step}.") diff --git a/ads/hpo/search_cv.py b/ads/hpo/search_cv.py index 7721f1935..c7f10147b 100644 --- a/ads/hpo/search_cv.py +++ b/ads/hpo/search_cv.py @@ -67,7 +67,7 @@ class State(Enum): COMPLETED = auto() -class InvalidStateTransition(Exception): # pragma: no cover +class InvalidStateTransition(Exception): # pragma: no cover """ `Invalid State Transition` is raised when an invalid transition request is made, such as calling halt without a running process. @@ -76,7 +76,7 @@ class InvalidStateTransition(Exception): # pragma: no cover pass -class ExitCriterionError(Exception): # pragma: no cover +class ExitCriterionError(Exception): # pragma: no cover """ `ExitCriterionError` is raised when an attempt is made to check exit status for a different exit type than the tuner was initialized with. For example, if an HPO study has an exit criteria based @@ -87,14 +87,14 @@ class ExitCriterionError(Exception): # pragma: no cover pass -class DuplicatedStudyError(Exception): # pragma: no cover +class DuplicatedStudyError(Exception): # pragma: no cover """ `DuplicatedStudyError` is raised when a new tuner process is created with a study name that already exists in storage. """ -class NoRestartError(Exception): # pragma: no cover +class NoRestartError(Exception): # pragma: no cover """ `NoRestartError` is raised when an attempt is made to check how many seconds have transpired since the HPO process was last resumed from a halt. 
This can happen if the process has been terminated @@ -497,7 +497,6 @@ def _get_param_distributions(self, strategy): return param_distributions def _check_search_space(self, param_distributions): - validate_search_space(self.model.get_params().keys(), param_distributions) def _check_is_fitted(self): @@ -1044,7 +1043,7 @@ def _extract_estimator(self): def _extract_scoring_name(self): if isinstance(self.scoring, str): return self.scoring - if self._scorer.__class__.__name__ != "function": + if not callable(self._scorer): return ( self._scorer if isinstance(self._scorer, str) diff --git a/ads/jobs/builders/runtimes/pytorch_runtime.py b/ads/jobs/builders/runtimes/pytorch_runtime.py index 941c300cd..6d69a5218 100644 --- a/ads/jobs/builders/runtimes/pytorch_runtime.py +++ b/ads/jobs/builders/runtimes/pytorch_runtime.py @@ -205,7 +205,8 @@ def run(self, dsc_job, **kwargs): if not envs: envs = {} # Huggingface accelerate requires machine rank - envs["RANK"] = str(i) + # Here we use NODE_RANK to store the machine rank + envs["NODE_RANK"] = str(i) envs["WORLD_SIZE"] = str(replicas) if main_run: envs["MAIN_JOB_RUN_OCID"] = main_run.id diff --git a/ads/jobs/templates/driver_pytorch.py b/ads/jobs/templates/driver_pytorch.py index 332442e7c..a9a029898 100644 --- a/ads/jobs/templates/driver_pytorch.py +++ b/ads/jobs/templates/driver_pytorch.py @@ -694,7 +694,7 @@ def __init__(self, code_dir: str = driver_utils.DEFAULT_CODE_DIR) -> None: # --multi_gpu will be set automatically if there is more than 1 GPU # self.multi_gpu = bool(self.node_count > 1 or self.gpu_count > 1) self.num_machines = self.node_count - self.machine_rank = os.environ["RANK"] + self.machine_rank = os.environ["NODE_RANK"] # Total number of processes across all nodes # Here we assume all nodes are having the same shape self.num_processes = (self.gpu_count if self.gpu_count else 1) * self.node_count diff --git a/ads/jobs/templates/driver_utils.py b/ads/jobs/templates/driver_utils.py index 401b12e5b..74f4beb52 100644 --- a/ads/jobs/templates/driver_utils.py +++ b/ads/jobs/templates/driver_utils.py @@ -276,7 +276,7 @@ def copy_inputs(mappings: dict = None): return for src, dest in mappings.items(): - logger.debug("Copying %s to %s", src, dest) + logger.debug("Copying %s to %s", src, os.path.abspath(dest)) # Create the dest dir if one does not exist. if str(dest).endswith("/"): dest_dir = dest @@ -439,6 +439,10 @@ def install_pip_packages(self, packages: str = None): packages = os.environ.get(CONST_ENV_PIP_PKG) if not packages: return self + # The package requirement may contain special character like '>'. + # Here we wrap each package requirement with single quote to make sure they can be installed correctly + package_list = shlex.split(packages) + packages = " ".join([f"'{package}'" for package in package_list]) self.run_command( f"pip install {packages}", conda_prefix=self.conda_prefix, check=True ) diff --git a/ads/model/artifact_uploader.py b/ads/model/artifact_uploader.py index b40840708..260761d34 100644 --- a/ads/model/artifact_uploader.py +++ b/ads/model/artifact_uploader.py @@ -94,6 +94,8 @@ def _upload(self): class SmallArtifactUploader(ArtifactUploader): + """The class helper to upload small model artifacts.""" + PROGRESS_STEPS_COUNT = 1 def _upload(self): @@ -104,6 +106,39 @@ def _upload(self): class LargeArtifactUploader(ArtifactUploader): + """ + The class helper to upload large model artifacts. + + Attributes + ---------- + artifact_path: str + The model artifact location. 
+ artifact_zip_path: str + The uri of the zip of model artifact. + auth: dict + The default authetication is set using `ads.set_auth` API. + If you need to override the default, use the `ads.common.auth.api_keys` or + `ads.common.auth.resource_principal` to create appropriate authentication signer + and kwargs required to instantiate IdentityClient object. + bucket_uri: str + The OCI Object Storage URI where model artifacts will be copied to. + The `bucket_uri` is only necessary for uploading large artifacts which + size is greater than 2GB. Example: `oci://@/prefix/`. + dsc_model: OCIDataScienceModel + The data scince model instance. + overwrite_existing_artifact: bool + Overwrite target bucket artifact if exists. + progress: TqdmProgressBar + An instance of the TqdmProgressBar. + region: str + The destination Object Storage bucket region. + By default the value will be extracted from the `OCI_REGION_METADATA` environment variables. + remove_existing_artifact: bool + Wether artifacts uploaded to object storage bucket need to be removed or not. + upload_manager: UploadManager + The uploadManager simplifies interaction with the Object Storage service. + """ + PROGRESS_STEPS_COUNT = 4 def __init__( @@ -115,6 +150,7 @@ def __init__( region: Optional[str] = None, overwrite_existing_artifact: Optional[bool] = True, remove_existing_artifact: Optional[bool] = True, + parallel_process_count: int = utils.DEFAULT_PARALLEL_PROCESS_COUNT, ): """Initializes `LargeArtifactUploader` instance. @@ -139,7 +175,9 @@ def __init__( overwrite_existing_artifact: (bool, optional). Defaults to `True`. Overwrite target bucket artifact if exists. remove_existing_artifact: (bool, optional). Defaults to `True`. - Wether artifacts uploaded to object storage bucket need to be removed or not. + Whether artifacts uploaded to object storage bucket need to be removed or not. + parallel_process_count: (int, optional). + The number of worker processes to use in parallel for uploading individual parts of a multipart upload. """ if not bucket_uri: raise ValueError("The `bucket_uri` must be provided.") @@ -150,36 +188,45 @@ def __init__( self.bucket_uri = bucket_uri self.overwrite_existing_artifact = overwrite_existing_artifact self.remove_existing_artifact = remove_existing_artifact + self._parallel_process_count = parallel_process_count def _upload(self): """Uploads model artifacts to the model catalog.""" self.progress.update("Copying model artifact to the Object Storage bucket") - try: - bucket_uri = self.bucket_uri - bucket_uri_file_name = os.path.basename(bucket_uri) + bucket_uri = self.bucket_uri + bucket_uri_file_name = os.path.basename(bucket_uri) - if not bucket_uri_file_name: - bucket_uri = os.path.join(bucket_uri, f"{self.dsc_model.id}.zip") - elif not bucket_uri.lower().endswith(".zip"): - bucket_uri = f"{bucket_uri}.zip" + if not bucket_uri_file_name: + bucket_uri = os.path.join(bucket_uri, f"{self.dsc_model.id}.zip") + elif not bucket_uri.lower().endswith(".zip"): + bucket_uri = f"{bucket_uri}.zip" - bucket_file_name = utils.copy_file( - self.artifact_zip_path, - bucket_uri, - force_overwrite=self.overwrite_existing_artifact, - auth=self.auth, - progressbar_description="Copying model artifact to the Object Storage bucket", - ) - except FileExistsError: + if not self.overwrite_existing_artifact and utils.is_path_exists( + uri=bucket_uri, auth=self.auth + ): raise FileExistsError( - f"The `{self.bucket_uri}` exists. Please use a new file name or " + f"The bucket_uri=`{self.bucket_uri}` exists. 
Please use a new file name or " "set `overwrite_existing_artifact` to `True` if you wish to overwrite." ) + + try: + utils.upload_to_os( + src_uri=self.artifact_zip_path, + dst_uri=bucket_uri, + auth=self.auth, + parallel_process_count=self._parallel_process_count, + force_overwrite=self.overwrite_existing_artifact, + progressbar_description="Copying model artifact to the Object Storage bucket.", + ) + except Exception as ex: + raise RuntimeError( + f"Failed to upload model artifact to the given Object Storage path `{self.bucket_uri}`." + f"See Exception: {ex}" + ) + self.progress.update("Exporting model artifact to the model catalog") - self.dsc_model.export_model_artifact( - bucket_uri=bucket_file_name, region=self.region - ) + self.dsc_model.export_model_artifact(bucket_uri=bucket_uri, region=self.region) if self.remove_existing_artifact: self.progress.update( diff --git a/ads/model/datascience_model.py b/ads/model/datascience_model.py index 4a5cdc120..8bbf6d0da 100644 --- a/ads/model/datascience_model.py +++ b/ads/model/datascience_model.py @@ -35,7 +35,7 @@ _MAX_ARTIFACT_SIZE_IN_BYTES = 2147483648 # 2GB -class ModelArtifactSizeError(Exception): # pragma: no cover +class ModelArtifactSizeError(Exception): # pragma: no cover def __init__(self, max_artifact_size: str): super().__init__( f"The model artifacts size is greater than `{max_artifact_size}`. " @@ -562,6 +562,8 @@ def create(self, **kwargs) -> "DataScienceModel": and kwargs required to instantiate IdentityClient object. timeout: (int, optional). Defaults to 10 seconds. The connection timeout in seconds for the client. + parallel_process_count: (int, optional). + The number of worker processes to use in parallel for uploading individual parts of a multipart upload. Returns ------- @@ -607,6 +609,7 @@ def create(self, **kwargs) -> "DataScienceModel": region=kwargs.pop("region", None), auth=kwargs.pop("auth", None), timeout=kwargs.pop("timeout", None), + parallel_process_count=kwargs.pop("parallel_process_count", None), ) # Sync up model @@ -623,6 +626,7 @@ def upload_artifact( overwrite_existing_artifact: Optional[bool] = True, remove_existing_artifact: Optional[bool] = True, timeout: Optional[int] = None, + parallel_process_count: int = utils.DEFAULT_PARALLEL_PROCESS_COUNT, ) -> None: """Uploads model artifacts to the model catalog. @@ -646,6 +650,8 @@ def upload_artifact( Wether artifacts uploaded to object storage bucket need to be removed or not. timeout: (int, optional). Defaults to 10 seconds. The connection timeout in seconds for the client. + parallel_process_count: (int, optional) + The number of worker processes to use in parallel for uploading individual parts of a multipart upload. """ # Upload artifact to the model catalog if not self.artifact: @@ -676,6 +682,7 @@ def upload_artifact( bucket_uri=bucket_uri, overwrite_existing_artifact=overwrite_existing_artifact, remove_existing_artifact=remove_existing_artifact, + parallel_process_count=parallel_process_count, ) else: artifact_uploader = SmallArtifactUploader( diff --git a/ads/model/deployment/model_deployment.py b/ads/model/deployment/model_deployment.py index 41a493f09..1b6e1c3d0 100644 --- a/ads/model/deployment/model_deployment.py +++ b/ads/model/deployment/model_deployment.py @@ -1304,7 +1304,8 @@ def from_id(cls, id: str) -> "ModelDeployment": ModelDeployment The ModelDeployment instance (self). 
""" - return cls()._update_from_oci_model(OCIDataScienceModelDeployment.from_id(id)) + oci_model = OCIDataScienceModelDeployment.from_id(id) + return cls(properties=oci_model)._update_from_oci_model(oci_model) @classmethod def from_dict(cls, obj_dict: Dict) -> "ModelDeployment": @@ -1503,7 +1504,9 @@ def _build_model_deployment_details(self) -> CreateModelDeploymentDetails: **create_model_deployment_details ).to_oci_model(CreateModelDeploymentDetails) - def _update_model_deployment_details(self, **kwargs) -> UpdateModelDeploymentDetails: + def _update_model_deployment_details( + self, **kwargs + ) -> UpdateModelDeploymentDetails: """Builds UpdateModelDeploymentDetails from model deployment instance. Returns @@ -1527,7 +1530,7 @@ def _update_model_deployment_details(self, **kwargs) -> UpdateModelDeploymentDet return OCIDataScienceModelDeployment( **update_model_deployment_details ).to_oci_model(UpdateModelDeploymentDetails) - + def _update_spec(self, **kwargs) -> "ModelDeployment": """Updates model deployment specs from kwargs. @@ -1542,7 +1545,7 @@ def _update_spec(self, **kwargs) -> "ModelDeployment": Model deployment freeform tags defined_tags: (dict) Model deployment defined tags - + Additional kwargs arguments. Can be any attribute that `ads.model.deployment.ModelDeploymentCondaRuntime`, `ads.model.deployment.ModelDeploymentContainerRuntime` and `ads.model.deployment.ModelDeploymentInfrastructure` accepts. @@ -1559,12 +1562,12 @@ def _update_spec(self, **kwargs) -> "ModelDeployment": specs = { "self": self._spec, "runtime": self.runtime._spec, - "infrastructure": self.infrastructure._spec + "infrastructure": self.infrastructure._spec, } sub_set = { self.infrastructure.CONST_ACCESS_LOG, self.infrastructure.CONST_PREDICT_LOG, - self.infrastructure.CONST_SHAPE_CONFIG_DETAILS + self.infrastructure.CONST_SHAPE_CONFIG_DETAILS, } for spec_value in specs.values(): for key in spec_value: @@ -1572,7 +1575,9 @@ def _update_spec(self, **kwargs) -> "ModelDeployment": if key in sub_set: for sub_key in converted_specs[key]: converted_sub_key = ads_utils.snake_to_camel(sub_key) - spec_value[key][converted_sub_key] = converted_specs[key][sub_key] + spec_value[key][converted_sub_key] = converted_specs[key][ + sub_key + ] else: spec_value[key] = copy.deepcopy(converted_specs[key]) self = ( @@ -1616,14 +1621,14 @@ def _build_model_deployment_configuration_details(self) -> Dict: infrastructure.CONST_MEMORY_IN_GBS: infrastructure.shape_config_details.get( "memory_in_gbs", None ) - or infrastructure.shape_config_details.get( - "memoryInGBs", None - ) + or infrastructure.shape_config_details.get("memoryInGBs", None) or DEFAULT_MEMORY_IN_GBS, } if infrastructure.subnet_id: - instance_configuration[infrastructure.CONST_SUBNET_ID] = infrastructure.subnet_id + instance_configuration[ + infrastructure.CONST_SUBNET_ID + ] = infrastructure.subnet_id scaling_policy = { infrastructure.CONST_POLICY_TYPE: "FIXED_SIZE", @@ -1638,13 +1643,11 @@ def _build_model_deployment_configuration_details(self) -> Dict: model_id = runtime.model_uri if not model_id.startswith("ocid"): - from ads.model.datascience_model import DataScienceModel - + dsc_model = DataScienceModel( name=self.display_name, - compartment_id=self.infrastructure.compartment_id - or COMPARTMENT_OCID, + compartment_id=self.infrastructure.compartment_id or COMPARTMENT_OCID, project_id=self.infrastructure.project_id or PROJECT_OCID, artifact=runtime.model_uri, ).create( @@ -1653,7 +1656,7 @@ def _build_model_deployment_configuration_details(self) -> Dict: 
region=runtime.region, overwrite_existing_artifact=runtime.overwrite_existing_artifact, remove_existing_artifact=runtime.remove_existing_artifact, - timeout=runtime.timeout + timeout=runtime.timeout, ) model_id = dsc_model.id diff --git a/ads/model/generic_model.py b/ads/model/generic_model.py index 913d09c06..eaa3e8dc1 100644 --- a/ads/model/generic_model.py +++ b/ads/model/generic_model.py @@ -29,6 +29,7 @@ JOB_RUN_OCID, NB_SESSION_COMPARTMENT_OCID, NB_SESSION_OCID, + PIPELINE_RUN_COMPARTMENT_OCID, PROJECT_OCID, ) from ads.evaluations import EvaluatorMixin @@ -92,7 +93,11 @@ from ads.model.transformer.onnx_transformer import ONNXTransformer _TRAINING_RESOURCE_ID = JOB_RUN_OCID or NB_SESSION_OCID -_COMPARTMENT_OCID = NB_SESSION_COMPARTMENT_OCID or JOB_RUN_COMPARTMENT_OCID +_COMPARTMENT_OCID = ( + NB_SESSION_COMPARTMENT_OCID + or JOB_RUN_COMPARTMENT_OCID + or PIPELINE_RUN_COMPARTMENT_OCID +) MODEL_DEPLOYMENT_INSTANCE_SHAPE = "VM.Standard.E4.Flex" MODEL_DEPLOYMENT_INSTANCE_OCPUS = 1 @@ -137,7 +142,7 @@ class DataScienceModelType(str, metaclass=ExtendedEnumMeta): MODEL = "datasciencemodel" -class NotActiveDeploymentError(Exception): # pragma: no cover +class NotActiveDeploymentError(Exception): # pragma: no cover def __init__(self, state: str): msg = ( "To perform a prediction the deployed model needs to be in an active state. " @@ -146,15 +151,15 @@ def __init__(self, state: str): super().__init__(msg) -class SerializeModelNotImplementedError(NotImplementedError): # pragma: no cover +class SerializeModelNotImplementedError(NotImplementedError): # pragma: no cover pass -class SerializeInputNotImplementedError(NotImplementedError): # pragma: no cover +class SerializeInputNotImplementedError(NotImplementedError): # pragma: no cover pass -class RuntimeInfoInconsistencyError(Exception): # pragma: no cover +class RuntimeInfoInconsistencyError(Exception): # pragma: no cover pass @@ -1328,7 +1333,7 @@ def from_model_artifact( If `model_file_name` not provided. """ if ( - cls._PREFIX is not "spark" + cls._PREFIX != "spark" and artifact_dir and ObjectStorageDetails.is_oci_path(artifact_dir) ): @@ -1435,7 +1440,7 @@ def from_model_catalog( An instance of GenericModel class. """ if ( - cls._PREFIX is not "spark" + cls._PREFIX != "spark" and artifact_dir and ObjectStorageDetails.is_oci_path(artifact_dir) ): @@ -1557,7 +1562,7 @@ def from_model_deployment( An instance of GenericModel class. """ if ( - cls._PREFIX is not "spark" + cls._PREFIX != "spark" and artifact_dir and ObjectStorageDetails.is_oci_path(artifact_dir) ): @@ -1654,7 +1659,7 @@ def update_deployment( Model deployment freeform tags defined_tags: (dict) Model deployment defined tags - + Additional kwargs arguments. Can be any attribute that `ads.model.deployment.ModelDeploymentCondaRuntime`, `ads.model.deployment.ModelDeploymentContainerRuntime` and `ads.model.deployment.ModelDeploymentInfrastructure` accepts. @@ -1827,6 +1832,7 @@ def save( model_version_set: Optional[Union[str, ModelVersionSet]] = None, version_label: Optional[str] = None, featurestore_dataset=None, + parallel_process_count: int = utils.DEFAULT_PARALLEL_PROCESS_COUNT, **kwargs, ) -> str: """Saves model artifacts to the model catalog. @@ -1860,6 +1866,8 @@ def save( The model version lebel. featurestore_dataset: (Dataset, optional). The feature store dataset + parallel_process_count: (int, optional) + The number of worker processes to use in parallel for uploading individual parts of a multipart upload. kwargs: project_id: (str, optional). Project OCID. 
If not specified, the value will be taken either @@ -1884,6 +1892,17 @@ def save( ------- str The model id. + + Examples + -------- + Example for saving large model artifacts (>2GB): + >>> model.save( + ... bucket_uri="oci://my-bucket@my-tenancy/", + ... overwrite_existing_artifact=True, + ... remove_existing_artifact=True, + ... parallel_process_count=9, + ... ) + """ # Set default display_name if not specified - randomly generated easy to remember name generated if not display_name: @@ -1964,6 +1984,7 @@ def save( bucket_uri=bucket_uri, overwrite_existing_artifact=overwrite_existing_artifact, remove_existing_artifact=remove_existing_artifact, + parallel_process_count=parallel_process_count, **kwargs, ) @@ -2133,45 +2154,45 @@ def deploy( existing_infrastructure = self.model_deployment.infrastructure existing_runtime = self.model_deployment.runtime property_dict = ModelProperties( - compartment_id = existing_infrastructure.compartment_id + compartment_id=existing_infrastructure.compartment_id or self.properties.compartment_id or _COMPARTMENT_OCID, - project_id = existing_infrastructure.project_id + project_id=existing_infrastructure.project_id or self.properties.project_id or PROJECT_OCID, - deployment_instance_shape = existing_infrastructure.shape_name + deployment_instance_shape=existing_infrastructure.shape_name or self.properties.deployment_instance_shape or MODEL_DEPLOYMENT_INSTANCE_SHAPE, - deployment_instance_count = existing_infrastructure.replica + deployment_instance_count=existing_infrastructure.replica or self.properties.deployment_instance_count or MODEL_DEPLOYMENT_INSTANCE_COUNT, - deployment_bandwidth_mbps = existing_infrastructure.bandwidth_mbps + deployment_bandwidth_mbps=existing_infrastructure.bandwidth_mbps or self.properties.deployment_bandwidth_mbps or MODEL_DEPLOYMENT_BANDWIDTH_MBPS, - deployment_ocpus = existing_infrastructure.shape_config_details.get( + deployment_ocpus=existing_infrastructure.shape_config_details.get( "ocpus", None ) or self.properties.deployment_ocpus or MODEL_DEPLOYMENT_INSTANCE_OCPUS, - deployment_memory_in_gbs = existing_infrastructure.shape_config_details.get( + deployment_memory_in_gbs=existing_infrastructure.shape_config_details.get( "memoryInGBs", None ) or self.properties.deployment_memory_in_gbs or MODEL_DEPLOYMENT_INSTANCE_MEMORY_IN_GBS, - deployment_log_group_id = existing_infrastructure.log_group_id + deployment_log_group_id=existing_infrastructure.log_group_id or self.properties.deployment_log_group_id, - deployment_access_log_id = existing_infrastructure.access_log.get( + deployment_access_log_id=existing_infrastructure.access_log.get( "log_id", None ) or self.properties.deployment_access_log_id, - deployment_predict_log_id = existing_infrastructure.predict_log.get( + deployment_predict_log_id=existing_infrastructure.predict_log.get( "log_id", None ) or self.properties.deployment_predict_log_id, - deployment_image = existing_runtime.image + deployment_image=getattr(existing_runtime, "image", None) or self.properties.deployment_image, - deployment_instance_subnet_id = existing_infrastructure.subnet_id - or self.properties.deployment_instance_subnet_id + deployment_instance_subnet_id=existing_infrastructure.subnet_id + or self.properties.deployment_instance_subnet_id, ).to_dict() property_dict.update(override_properties) @@ -2245,17 +2266,18 @@ def deploy( runtime = None if self.properties.deployment_image: - image_digest = ( - kwargs.pop("image_digest", None) or existing_runtime.image_digest +
image_digest = kwargs.pop("image_digest", None) or getattr( + existing_runtime, "image_digest", None + ) + cmd = kwargs.pop("cmd", []) or getattr(existing_runtime, "cmd", []) + entrypoint = kwargs.pop("entrypoint", []) or getattr( + existing_runtime, "entrypoint", [] ) - cmd = kwargs.pop("cmd", []) or existing_runtime.cmd - entrypoint = kwargs.pop("entrypoint", []) or existing_runtime.entrypoint - server_port = ( - kwargs.pop("server_port", None) or existing_runtime.server_port + server_port = kwargs.pop("server_port", None) or getattr( + existing_runtime, "server_port", None ) - health_check_port = ( - kwargs.pop("health_check_port", None) - or existing_runtime.health_check_port + health_check_port = kwargs.pop("health_check_port", None) or getattr( + existing_runtime, "health_check_port", None ) runtime = ( ModelDeploymentContainerRuntime() @@ -2863,6 +2885,7 @@ def upload_artifact( uri: str, auth: Optional[Dict] = None, force_overwrite: Optional[bool] = False, + parallel_process_count: int = utils.DEFAULT_PARALLEL_PROCESS_COUNT, ) -> None: """Uploads model artifacts to the provided `uri`. The artifacts will be zipped before uploading. @@ -2882,6 +2905,8 @@ def upload_artifact( authentication signer and kwargs required to instantiate IdentityClient object. force_overwrite: bool Overwrite target_dir if exists. + parallel_process_count: (int, optional) + The number of worker processes to use in parallel for uploading individual parts of a multipart upload. """ if not uri: raise ValueError("The `uri` must be provided.") @@ -2896,19 +2921,34 @@ def upload_artifact( uri = os.path.join(uri, f"{self.model_id}.zip") tmp_artifact_zip_path = None + progressbar_description = f"Uploading an artifact ZIP archive to {uri}." try: # Zip artifacts tmp_artifact_zip_path = zip_artifact(self.artifact_dir) # Upload artifacts to the provided destination - utils.copy_file( - uri_src=tmp_artifact_zip_path, - uri_dst=uri, - auth=auth, - force_overwrite=force_overwrite, - progressbar_description=f"Uploading an artifact ZIP archive to the {uri}", + if ObjectStorageDetails.is_oci_path( + uri + ) and ObjectStorageDetails.is_valid_uri(uri): + utils.upload_to_os( + src_uri=tmp_artifact_zip_path, + dst_uri=uri, + auth=auth, + parallel_process_count=parallel_process_count, + progressbar_description=progressbar_description, + ) + else: + utils.copy_file( + uri_src=tmp_artifact_zip_path, + uri_dst=uri, + auth=auth, + force_overwrite=force_overwrite, + progressbar_description=progressbar_description, + ) + except Exception as ex: + raise RuntimeError( + f"Failed to upload model artifact to the given Object Storage path `{uri}`." + f"See Exception: {ex}" ) - except Exception: - raise finally: if tmp_artifact_zip_path: os.remove(tmp_artifact_zip_path) diff --git a/ads/model/serde/model_serializer.py b/ads/model/serde/model_serializer.py index 96845dc09..f8147ca27 100644 --- a/ads/model/serde/model_serializer.py +++ b/ads/model/serde/model_serializer.py @@ -532,7 +532,7 @@ def _generate_initial_types(self, X_sample: Any) -> List: # the input types of all the columns one by one. 
auto_generated_initial_types = [] - for i, col in X_sample.iteritems(): + for i, col in X_sample.items(): if is_numeric_dtype(col.dtypes): auto_generated_initial_types.append( ( @@ -586,7 +586,7 @@ def is_either_numerical_or_string_dataframe(data: pd.DataFrame) -> bool: return isinstance(data, pd.DataFrame) and all( [ is_numeric_dtype(col.dtypes) or is_string_dtype(col.dtypes) - for _, col in data.iteritems() + for _, col in data.items() ] ) diff --git a/ads/model/service/oci_datascience_model.py b/ads/model/service/oci_datascience_model.py index 865bbf108..ec5c3e4ce 100644 --- a/ads/model/service/oci_datascience_model.py +++ b/ads/model/service/oci_datascience_model.py @@ -38,19 +38,19 @@ ) -class ModelProvenanceNotFoundError(Exception): # pragma: no cover +class ModelProvenanceNotFoundError(Exception): # pragma: no cover pass -class ModelArtifactNotFoundError(Exception): # pragma: no cover +class ModelArtifactNotFoundError(Exception): # pragma: no cover pass -class ModelNotSavedError(Exception): # pragma: no cover +class ModelNotSavedError(Exception): # pragma: no cover pass -class ModelWithActiveDeploymentError(Exception): # pragma: no cover +class ModelWithActiveDeploymentError(Exception): # pragma: no cover pass @@ -410,7 +410,7 @@ def export_model_artifact(self, bucket_uri: str, region: str = None): # Show progress of exporting model artifacts self._wait_for_work_request( work_request_id=work_request_id, - num_steps=3, + num_steps=2, ) @check_for_model_id( @@ -596,3 +596,7 @@ def _wait_for_work_request(self, work_request_id: str, num_steps: int = 3) -> No ) else: break + + while i < num_steps: + progress.update() + i += 1 diff --git a/ads/opctl/cli.py b/ads/opctl/cli.py index 91f807056..9b43ba0c3 100644 --- a/ads/opctl/cli.py +++ b/ads/opctl/cli.py @@ -192,10 +192,10 @@ def init_vscode(**kwargs): "--oci-config", help="oci config file", required=False, - default=None, + default=authutil.DEFAULT_LOCATION, ), click.option( - "--oci-profile", help="oci config profile", required=False, default=None + "--oci-profile", help="oci config profile", required=False, default=authutil.DEFAULT_PROFILE ), click.option( "--conf-file", @@ -393,7 +393,11 @@ def run(file, **kwargs): if os.path.exists(file): auth = {} if kwargs["auth"]: - auth = authutil.create_signer(kwargs["auth"]) + auth = authutil.create_signer( + auth_type=kwargs["auth"], + oci_config_location=kwargs["oci_config"], + profile=kwargs["oci_profile"] + ) else: auth = authutil.default_signer() diff --git a/ads/templates/score_onnx.jinja2 b/ads/templates/score_onnx.jinja2 index 5212ddfb2..fded33300 100644 --- a/ads/templates/score_onnx.jinja2 +++ b/ads/templates/score_onnx.jinja2 @@ -86,7 +86,7 @@ def predict(data, model=load_model()): input_data = {} model_inputs = model.get_inputs() i = 0 - for _, col in X.iteritems(): + for _, col in X.items(): if isinstance(col, pd.Series): col_val = [[item] for item in col.values.tolist()] elif isinstance(col, np.array): diff --git a/ads/templates/score_onnx_new.jinja2 b/ads/templates/score_onnx_new.jinja2 index e01219d0e..aed2dbc65 100644 --- a/ads/templates/score_onnx_new.jinja2 +++ b/ads/templates/score_onnx_new.jinja2 @@ -152,7 +152,7 @@ def predict(data, model=load_model()): input_data = {} model_inputs = model.get_inputs() i = 0 - for _, col in X.iteritems(): + for _, col in X.items(): if isinstance(col, pd.Series): col_val = [[item] for item in col.values.tolist()] elif isinstance(col, np.array): diff --git a/ads/type_discovery/typed_feature.py b/ads/type_discovery/typed_feature.py index 
a565a00b5..a650c797e 100644 --- a/ads/type_discovery/typed_feature.py +++ b/ads/type_discovery/typed_feature.py @@ -349,7 +349,7 @@ def sub_vectorization( unigrams = { k: int(v) for k, v in dict( - zip(v1.get_feature_names(), np.asarray(X1.sum(axis=0)).ravel()) + zip(v1.get_feature_names_out(), np.asarray(X1.sum(axis=0)).ravel()) ).items() } @@ -366,7 +366,7 @@ def sub_vectorization( bigrams = { k: int(v) for k, v in dict( - zip(v2.get_feature_names(), np.asarray(X2.sum(axis=0)).ravel()) + zip(v2.get_feature_names_out(), np.asarray(X2.sum(axis=0)).ravel()) ).items() } @@ -404,7 +404,6 @@ def vectorization(feature_name, series, mean_document_length): @staticmethod def build(name, series, is_cjk, is_html): - internal = {"cjk": is_cjk, "html": is_html} if is_cjk: diff --git a/build_spec.yaml b/build_spec.yaml deleted file mode 100644 index c267d2abe..000000000 --- a/build_spec.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2023, 2022, Oracle and/or its affiliates. - -version: 0.1 -component: build -timeoutInSeconds: 1000 -shell: bash - -steps: - - type: Command - name: "compress the repo" - command: | - tar -cvzf ${OCI_WORKSPACE_DIR}/repo.tgz ./ -outputArtifacts: - - name: artifact - type: BINARY - location: ${OCI_WORKSPACE_DIR}/repo.tgz diff --git a/dev-requirements.txt b/dev-requirements.txt index b4cabdc43..2662845e7 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,6 +1,5 @@ -r test-requirements.txt --e ".[opctl]" --e ".[all-optional]" +-e ".[bds,data,geo,huggingface,notebook,onnx,opctl,optuna,spark,tensorflow,text,torch,viz]" arff category_encoders dask @@ -12,7 +11,5 @@ nltk pdfplumber py4j pyarrow -pyspark -setuptools tables xlrd>=1.2.0 diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index bf75b5af7..04e1b9465 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,6 +2,15 @@ Release Notes ============= +2.8.9 +----- +Release date: September 5, 2023 + +* Upgraded the ``scikit-learn`` dependency to ``>=1.0``. +* Upgraded the ``pandas`` dependency to ``>1.2.1,<2.1`` to allow you to use ADS with pandas 2.0. +* Fixed the "Attribute not found" error when ``deploy()`` is called twice in ``GenericModel``. +* Fixed the fetch of the security token when a relative path for the ``security_token_file`` is provided (used in session token-based authentication). + 2.8.8 ----- Release date: July 27, 2023 diff --git a/docs/source/user_guide/cli/quickstart.rst b/docs/source/user_guide/cli/quickstart.rst index c3e5481b5..86b30c589 100644 --- a/docs/source/user_guide/cli/quickstart.rst +++ b/docs/source/user_guide/cli/quickstart.rst @@ -57,55 +57,49 @@ Installing the ``oracle-ads`` base package Installing extras libraries +++++++++++++++++++++++++++ -The ``all-optional`` module will install all optional dependencies. - -.. code-block:: bash - - $ python3 -m pip install oracle-ads[all-optional] - To work with gradient boosting models, install the ``boosted`` module. This module includes XGBoost and LightGBM model classes. .. code-block:: bash $ python3 -m pip install oracle-ads[boosted] -For big data use cases using Oracle Big Data Service (BDS), install the ``bds`` module. It includes the following libraries: `ibis-framework[impala]`, `hdfs[kerberos]` and `sqlalchemy`. +For big data use cases using Oracle Big Data Service (BDS), install the ``bds`` module. It includes the following libraries: ``ibis-framework[impala]``, ``hdfs[kerberos]`` and ``sqlalchemy``. ..
code-block:: bash $ python3 -m pip install oracle-ads[bds] -To work with a broad set of data formats (for example, Excel, Avro, etc.) install the ``data`` module. It includes the following libraries: `fastavro`, `openpyxl`, `pandavro`, `asteval`, `datefinder`, `htmllistparse`, and `sqlalchemy`. +To work with a broad set of data formats (for example, Excel, Avro, etc.) install the ``data`` module. It includes the following libraries: ``fastavro``, ``openpyxl``, ``pandavro``, ``asteval``, ``datefinder``, ``htmllistparse``, and ``sqlalchemy``. .. code-block:: bash $ python3 -m pip install oracle-ads[data] -To work with geospatial data install the ``geo`` module. It includes the `geopandas` and libraries from the `viz` module. +To work with geospatial data install the ``geo`` module. It includes the ``geopandas`` and libraries from the ``viz`` module. .. code-block:: bash $ python3 -m pip install oracle-ads[geo] -Install the ``notebook`` module to use ADS within the Oracle Cloud Infrastructure Data Science service `Notebook Session `_. This module installs `ipywidgets` and `ipython` libraries. +Install the ``notebook`` module to use ADS within the Oracle Cloud Infrastructure Data Science service `Notebook Session `_. This module installs ``ipywidgets`` and ``ipython`` libraries. .. code-block:: bash $ python3 -m pip install oracle-ads[notebook] -To work with ONNX-compatible run times and libraries designed to maximize performance and model portability, install the ``onnx`` module. It includes the following libraries, `onnx`, `onnxruntime`, `onnxmltools`, `skl2onnx`, `xgboost`, `lightgbm` and libraries from the `viz` module. +To work with ONNX-compatible run times and libraries designed to maximize performance and model portability, install the ``onnx`` module. It includes the following libraries, ``onnx``, ``onnxruntime``, ``onnxmltools``, ``skl2onnx``, ``xgboost``, ``lightgbm`` and libraries from the ``viz`` module. .. code-block:: bash $ python3 -m pip install oracle-ads[onnx] -For infrastructure tasks, install the ``opctl`` module. It includes the following libraries, `oci-cli`, `docker`, `conda-pack`, `nbconvert`, `nbformat`, and `inflection`. +For infrastructure tasks, install the ``opctl`` module. It includes the following libraries, ``oci-cli``, ``docker``, ``conda-pack``, ``nbconvert``, ``nbformat``, and ``inflection``. .. code-block:: bash $ python3 -m pip install oracle-ads[opctl] -For hyperparameter optimization tasks install the ``optuna`` module. It includes the `optuna` and libraries from the `viz` module. +For hyperparameter optimization tasks install the ``optuna`` module. It includes the ``optuna`` and libraries from the ``viz`` module. .. code-block:: bash @@ -117,30 +111,32 @@ For Spark tasks install the ``spark`` module. $ python3 -m pip install oracle-ads[spark] -Install the ``tensorflow`` module to include `tensorflow` and libraries from the ``viz`` module. +Install the ``tensorflow`` module to include ``tensorflow`` and libraries from the ``viz`` module. .. code-block:: bash $ python3 -m pip install oracle-ads[tensorflow] -For text related tasks, install the ``text`` module. This will include the `wordcloud`, `spacy` libraries. +For text related tasks, install the ``text`` module. This will include the ``wordcloud``, ``spacy`` libraries. .. code-block:: bash $ python3 -m pip install oracle-ads[text] -Install the ``torch`` module to include `pytorch` and libraries from the ``viz`` module. 
+Install the ``torch`` module to include ``pytorch`` and libraries from the ``viz`` module. .. code-block:: bash $ python3 -m pip install oracle-ads[torch] -Install the ``viz`` module to include libraries for visualization tasks. Some of the key packages are `bokeh`, `folium`, `seaborn` and related packages. +Install the ``viz`` module to include libraries for visualization tasks. Some of the key packages are ``bokeh``, ``folium``, ``seaborn`` and related packages. .. code-block:: bash $ python3 -m pip install oracle-ads[viz] +See the ``[project.optional-dependencies]`` section of the ``pyproject.toml`` file for the full list of modules and their extra libraries. + **Note** Multiple extra dependencies can be installed together. For example: @@ -148,5 +144,3 @@ Multiple extra dependencies can be installed together. For example: .. code-block:: bash $ python3 -m pip install oracle-ads[notebook,viz,text] - - diff --git a/docs/source/user_guide/jobs/data_science_job.rst b/docs/source/user_guide/jobs/data_science_job.rst index c15d23dda..4c9c0333d 100644 --- a/docs/source/user_guide/jobs/data_science_job.rst +++ b/docs/source/user_guide/jobs/data_science_job.rst @@ -35,6 +35,7 @@ is available on `Data Science AI Sample GitHub Repository `_, `DeepSpeed `_, or `Accelerate `_, you can run them using OCI Data Science Jobs with zero code change. For multi-node training, ADS will launch multiple job runs, each corresponding to one node. + +See `Distributed Data Parallel in PyTorch `_ for a series of tutorials on PyTorch distributed training. + +.. admonition:: Prerequisite + :class: note + + You need oracle-ads\>=2.8.8 to create a job with :py:class:`~ads.jobs.PyTorchDistributedRuntime`. + + You also need to specify a conda environment with PyTorch\>=1.10 and oracle-ads\>=2.6.8 for the job. See the :ref:`Conda Environment ` for details on specifying the conda environment for a job. + + We recommend using the ``pytorch20_p39_gpu_v1`` service conda environment and adding additional packages as needed. + + You need to specify a subnet ID and allow ingress traffic within the subnet. + + +Torchrun Example +================ + +Here is an example to train a GPT model using the source code directly from the official PyTorch Examples Github repository. See the `Training "Real-World" models with DDP `_ tutorial for a walkthrough of the source code. + +.. include:: ../jobs/tabs/pytorch_ddp_torchrun.rst + +.. include:: ../jobs/tabs/run_job.rst + + +Source Code +=========== + +The source code location can be specified as a Git repository, a local path, or a remote URI supported by +`fsspec `_. + +You can use the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_git` method to specify the source code ``url`` on a Git repository. You can optionally specify the ``branch`` or ``commit`` for checking out the source code. + +For a public repository, we recommend the "http://" or "https://" URL. +Authentication may be required for the SSH URL even if the repository is public. + +To use a private repository, you must first save an SSH key to +`OCI Vault `_ as a secret, +and provide the ``secret_ocid`` when calling :py:meth:`~ads.jobs.GitPythonRuntime.with_source`. +For more information about creating and using secrets, +see `Managing Secret with Vault `_. +For a repository on GitHub, you can set up a +`GitHub Deploy Key `_ as the secret. + +.. admonition:: Git Version for Private Repository + :class: note + + Git version of 2.3+ is required to use a private repository.
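As a minimal sketch of the ``with_git`` usage described above (it only reuses the repository URL, commit, conda slug, and command from the torchrun example included on this page; infrastructure and any other wiring are omitted):

.. code-block:: python

    from ads.jobs import PyTorchDistributedRuntime

    # Point the runtime at a public Git repository and pin a specific commit.
    runtime = (
        PyTorchDistributedRuntime()
        .with_service_conda("pytorch20_p39_gpu_v1")
        .with_git(
            url="https://github.com/pytorch/examples.git",
            commit="d91085d2181bf6342ac7dafbeee6fc0a1f64dcec",
        )
        # torchrun arguments such as --nnodes and --nproc_per_node are added by ADS.
        .with_command("torchrun distributed/minGPT-ddp/mingpt/main.py")
        .with_replica(2)
    )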
+ +Alternatively, you can use the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_source` method to specify the source code as a local path or a remote URI supported by +`fsspec `_. +For example, you can specify files on OCI object storage using a URI like +``oci://bucket@namespace/path/to/prefix``. ADS will use the authentication method configured by +:py:meth:`ads.set_auth()` to fetch the files and upload them as the job artifact. The source code can be a single file, a compressed file/archive (zip/tar), or a folder. + +Working Directory +================= + +The default working directory depends on how the source code is specified. +* When the source code is specified as a Git repository URL, the default working directory is the root of the Git repository. +* When the source code is a single file (script), the default working directory is the directory containing the file. +* When the source code is specified as a local or remote directory, the default working directory is the directory containing the source code directory. + +The working directory of your workload can be configured by :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_working_dir`. See :ref:`Python Runtime Working Directory ` for more details. + +Input Data +========== + +You can specify the input (training) data for the job using the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_inputs` method, which takes a dictionary mapping the "source" to the "destination". The "source" can be an OCI object storage URI, or an HTTP or FTP URL. The "destination" is the local path in a job run. If the "destination" is specified as a relative path, it will be relative to the working directory. + +Outputs +======= + +You can specify the output data to be copied to object storage by using the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_output` method. +It allows you to specify the output path ``output_path`` +in the job run and a remote URI (``output_uri``). +Files in the ``output_path`` are copied to the remote output URI after the job run finishes successfully. +Note that the ``output_path`` should be a path relative to the working directory. + +An OCI object storage location can be specified in the format of ``oci://bucket_name@namespace/path/to/dir``. +Please make sure you configure the IAM policy to allow the job run dynamic group to use object storage. + +Number of nodes +=============== + +The :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_replica` method helps you specify the number of nodes for the training job. + +Command +======= + +The command to start your workload is specified by using the :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_command` method. + +For ``torchrun``, ADS will set ``--nnodes``, ``--nproc_per_node``, ``--rdzv_backend`` and ``--rdzv_endpoint`` automatically. You do not need to specify them in the command unless you would like to override the values. The default ``rdzv_backend`` will be ``c10d``. The default port for ``rdzv_endpoint`` is 29400. + +If your workload uses DeepSpeed, you also need to set ``use_deepspeed`` to ``True`` when specifying the command. For DeepSpeed, ADS will generate the hostfile automatically and set up the SSH configurations. + +For ``accelerate launch``, you can add your config YAML to the source code and specify it using the ``--config_file`` argument. In your config, please use ``LOCAL_MACHINE`` as the compute environment. The same config file will be used by all nodes in a multi-node workload.
ADS will set ``--num_processes``, ``--num_machines``, ``--machine_rank``, ``--main_process_ip`` and ``--main_process_port`` automatically. For these arguments, ADS will override the values from your config YAML. If you would like to use your own values, you need to specify them as command arguments. The default ``main_process_port`` is 29400. + +Additional dependencies +======================= + +The :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_dependency` method helps you to specify additional dependencies to be installed into the conda environment before starting your workload. +* ``pip_req`` specifies the path of the ``requirements.txt`` file in your source code. +* ``pip_pkg`` specifies the packages to be installed as a string. + +Python Paths +============ + +The working directory is added to the Python paths automatically. +You can call :py:meth:`~ads.jobs.PyTorchDistributedRuntime.with_python_path` to add additional python paths as needed. +The paths should be relative paths from the working directory. + diff --git a/docs/source/user_guide/jobs/tabs/llama2_full.rst b/docs/source/user_guide/jobs/tabs/llama2_full.rst new file mode 100644 index 000000000..270f9386e --- /dev/null +++ b/docs/source/user_guide/jobs/tabs/llama2_full.rst @@ -0,0 +1,128 @@ +.. tabs:: + + .. code-tab:: python + :caption: Python + + from ads.jobs import Job, DataScienceJob, PyTorchDistributedRuntime + + job = ( + Job(name="LLAMA2-Fine-Tuning") + .with_infrastructure( + DataScienceJob() + .with_log_group_id("") + .with_log_id("") + .with_compartment_id("") + .with_project_id("") + .with_subnet_id("") + .with_shape_name("VM.GPU.A10.1") + .with_block_storage_size(256) + ) + .with_runtime( + PyTorchDistributedRuntime() + # Specify the service conda environment by slug name. + .with_service_conda("pytorch20_p39_gpu_v1") + .with_git( + url="https://github.com/facebookresearch/llama-recipes.git", + commit="03faba661f079ee1ecaeb66deaa6bdec920a7bab" + ) + .with_dependency( + pip_pkg=" ".join([ + "'accelerate>=0.21.0'", + "appdirs", + "loralib", + "bitsandbytes==0.39.1", + "black", + "'black[jupyter]'", + "datasets", + "fire", + "'git+https://github.com/huggingface/peft.git'", + "'transformers>=4.31.0'", + "sentencepiece", + "py7zr", + "scipy", + "optimum" + ]) + ) + .with_output("/home/datascience/outputs", "oci://bucket@namespace/outputs/$JOB_RUN_OCID") + .with_command(" ".join([ + "torchrun llama_finetuning.py", + "--enable_fsdp", + "--pure_bf16", + "--batch_size_training 1", + "--micro_batch_size 1", + "--model_name $MODEL_NAME", + "--dist_checkpoint_root_folder /home/datascience/outputs", + "--dist_checkpoint_folder fine-tuned" + ])) + .with_replica(2) + .with_environment_variable( + MODEL_NAME="meta-llama/Llama-2-7b-hf", + HUGGING_FACE_HUB_TOKEN="", + LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/conda/lib", + ) + ) + ) + + .. 
code-tab:: yaml + :caption: YAML + + kind: job + apiVersion: v1.0 + spec: + name: LLAMA2-Fine-Tuning + infrastructure: + kind: infrastructure + spec: + blockStorageSize: 256 + compartmentId: "" + logGroupId: "" + logId: "" + projectId: "" + subnetId: "" + shapeName: VM.GPU.A10.2 + type: dataScienceJob + runtime: + kind: runtime + type: pyTorchDistributed + spec: + git: + url: https://github.com/facebookresearch/llama-recipes.git + commit: 03faba661f079ee1ecaeb66deaa6bdec920a7bab + command: >- + torchrun llama_finetuning.py + --enable_fsdp + --pure_bf16 + --batch_size_training 1 + --micro_batch_size 1 + --model_name $MODEL_NAME + --dist_checkpoint_root_folder /home/datascience/outputs + --dist_checkpoint_folder fine-tuned + replicas: 2 + conda: + type: service + slug: pytorch20_p39_gpu_v1 + dependencies: + pipPackages: >- + 'accelerate>=0.21.0' + appdirs + loralib + bitsandbytes==0.39.1 + black + 'black[jupyter]' + datasets + fire + 'git+https://github.com/huggingface/peft.git' + 'transformers>=4.31.0' + sentencepiece + py7zr + scipy + optimum + outputDir: /home/datascience/outputs + outputUri: oci://bucket@namespace/outputs/$JOB_RUN_OCID + env: + - name: MODEL_NAME + value: meta-llama/Llama-2-7b-hf + - name: HUGGING_FACE_HUB_TOKEN + value: "" + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib:/usr/local/nvidia/lib64:/opt/conda/lib diff --git a/docs/source/user_guide/jobs/tabs/pytorch_ddp_torchrun.rst b/docs/source/user_guide/jobs/tabs/pytorch_ddp_torchrun.rst new file mode 100644 index 000000000..9966559bb --- /dev/null +++ b/docs/source/user_guide/jobs/tabs/pytorch_ddp_torchrun.rst @@ -0,0 +1,79 @@ +.. tabs:: + + .. code-tab:: python + :caption: Python + + from ads.jobs import Job, DataScienceJob, PyTorchDistributedRuntime + + job = ( + Job(name="PyTorch DDP Job") + .with_infrastructure( + DataScienceJob() + # Configure logging for getting the job run outputs. + .with_log_group_id("") + # Log resource will be auto-generated if log ID is not specified. + .with_log_id("") + # If you are in an OCI data science notebook session, + # the following configurations are not required. + # Configurations from the notebook session will be used as defaults. + .with_compartment_id("") + .with_project_id("") + .with_subnet_id("") + .with_shape_name("VM.GPU.A10.1") + # Minimum/Default block storage size is 50 (GB). + .with_block_storage_size(50) + ) + .with_runtime( + PyTorchDistributedRuntime() + # Specify the service conda environment by slug name. + .with_service_conda("pytorch20_p39_gpu_v1") + .with_git(url="https://github.com/pytorch/examples.git", commit="d91085d2181bf6342ac7dafbeee6fc0a1f64dcec") + .with_dependency("distributed/minGPT-ddp/requirements.txt") + .with_inputs({ + "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt": "data/input.txt" + }) + .with_output("data", "oci://bucket_name@namespace/path/to/dir") + .with_command("torchrun distributed/minGPT-ddp/mingpt/main.py data_config.path=data/input.txt trainer_config.snapshot_path=data/snapshot.pt") + .with_replica(2) + ) + ) + + .. 
code-tab:: yaml + :caption: YAML + + kind: job + apiVersion: v1.0 + spec: + name: PyTorch-MinGPT + infrastructure: + kind: infrastructure + spec: + blockStorageSize: 50 + compartmentId: "{{ compartment_id }}" + logGroupId: "{{ log_group_id }}" + logId: "{{ log_id }}" + projectId: "{{ project_id }}" + subnetId: "{{ subnet_id }}" + shapeName: VM.GPU.A10.1 + type: dataScienceJob + runtime: + kind: runtime + type: pyTorchDistributed + spec: + replicas: 2 + conda: + type: service + slug: pytorch110_p38_gpu_v1 + dependencies: + pipRequirements: distributed/minGPT-ddp/requirements.txt + inputs: + "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt": "data/input.txt" + outputDir: data + outputUri: oci://bucket_name@namespace/path/to/dir + git: + url: https://github.com/pytorch/examples.git + commit: d91085d2181bf6342ac7dafbeee6fc0a1f64dcec + command: >- + torchrun distributed/minGPT-ddp/mingpt/main.py + data_config.path=data/input.txt + trainer_config.snapshot_path=data/snapshot.pt diff --git a/docs/source/user_guide/jobs/tabs/run_job.rst b/docs/source/user_guide/jobs/tabs/run_job.rst new file mode 100644 index 000000000..bce62eb9f --- /dev/null +++ b/docs/source/user_guide/jobs/tabs/run_job.rst @@ -0,0 +1,19 @@ +To create and start running the job: + +.. tabs:: + + .. code-tab:: python + :caption: Python + + # Create the job on OCI Data Science + job.create() + # Start a job run + run = job.run() + # Stream the job run outputs (from the first node) + run.watch() + + .. code-tab:: bash + :caption: YAML + + # Use the following command to start the job run + ads opctl run -f your_job.yaml diff --git a/docs/source/user_guide/jobs/tabs/training_job.rst b/docs/source/user_guide/jobs/tabs/training_job.rst index a9faaf511..5053efaa9 100644 --- a/docs/source/user_guide/jobs/tabs/training_job.rst +++ b/docs/source/user_guide/jobs/tabs/training_job.rst @@ -84,13 +84,3 @@ skipMetadataUpdate: true url: https://github.com/pytorch/examples.git workingDir: word_language_model - - -.. code-block:: python - - # Create the job on OCI Data Science - job.create() - # Start a job run - run = job.run() - # Stream the job run outputs - run.watch() \ No newline at end of file diff --git a/docs/source/user_guide/model_catalog/model_catalog.rst b/docs/source/user_guide/model_catalog/model_catalog.rst index a533c8749..aaebaa87c 100644 --- a/docs/source/user_guide/model_catalog/model_catalog.rst +++ b/docs/source/user_guide/model_catalog/model_catalog.rst @@ -1204,7 +1204,7 @@ If you don't have an Object Storage bucket, create one using the OCI SDK or the Allow service datascience to manage object-family in compartment where ALL {target.bucket.name=''} - Allow service objectstorage to manage object-family in compartment where ALL {target.bucket.name=''} + Allow service objectstorage- to manage object-family in compartment where ALL {target.bucket.name=''} Saving ====== @@ -1545,4 +1545,3 @@ In the next example, the model that was stored in the model catalog as part of t .. 
code-block:: python3 mc.delete_model(mc_model.id) - diff --git a/docs/source/user_guide/model_registration/large_model_artifact.rst b/docs/source/user_guide/model_registration/large_model_artifact.rst index 65a5e5b1f..711a17e66 100644 --- a/docs/source/user_guide/model_registration/large_model_artifact.rst +++ b/docs/source/user_guide/model_registration/large_model_artifact.rst @@ -13,7 +13,7 @@ If you don't have an Object Storage bucket, create one using the OCI SDK or the Allow service datascience to manage object-family in compartment where ALL {target.bucket.name=''} - Allow service objectstorage to manage object-family in compartment where ALL {target.bucket.name=''} + Allow service objectstorage- to manage object-family in compartment where ALL {target.bucket.name=''} See `API documentation <../../ads.model.html#id10>`__ for more details. diff --git a/docs/source/user_guide/model_registration/model_load.rst b/docs/source/user_guide/model_registration/model_load.rst index c063cba28..44fb3876e 100644 --- a/docs/source/user_guide/model_registration/model_load.rst +++ b/docs/source/user_guide/model_registration/model_load.rst @@ -119,7 +119,7 @@ If you don't have an Object Storage bucket, create one using the OCI SDK or the Allow service datascience to manage object-family in compartment where ALL {target.bucket.name=''} - Allow service objectstorage to manage object-family in compartment where ALL {target.bucket.name=''} + Allow service objectstorage- to manage object-family in compartment where ALL {target.bucket.name=''} The following example loads a model using the large model artifact approach. The ``bucket_uri`` has the following syntax: ``oci://@//`` See `API documentation <../../ads.model.html#id4>`__ for more details. @@ -169,4 +169,4 @@ Alternatively the ``.from_id()`` method can be used to load registered or deploy bucket_uri=@/prefix/>, force_overwrite=True, remove_existing_artifact=True, - ) \ No newline at end of file + ) diff --git a/docs/source/user_guide/model_serialization/lightgbmmodel.rst b/docs/source/user_guide/model_serialization/lightgbmmodel.rst index 1f6b4722c..e46c0ff90 100644 --- a/docs/source/user_guide/model_serialization/lightgbmmodel.rst +++ b/docs/source/user_guide/model_serialization/lightgbmmodel.rst @@ -46,7 +46,7 @@ In the following several code snippets you will prepare the data and train Light # Extract numerical columns and categorical columns categorical_cols = [] numerical_cols = [] - for i, col in X.iteritems(): + for i, col in X.items(): if col.dtypes == "object": categorical_cols.append(col.name) else: @@ -235,7 +235,7 @@ Example # Extract numerical columns and categorical columns categorical_cols = [] numerical_cols = [] - for i, col in X.iteritems(): + for i, col in X.items(): if col.dtypes == "object": categorical_cols.append(col.name) else: diff --git a/docs/source/user_guide/model_serialization/sklearnmodel.rst b/docs/source/user_guide/model_serialization/sklearnmodel.rst index 819f2af3f..224d6c366 100644 --- a/docs/source/user_guide/model_serialization/sklearnmodel.rst +++ b/docs/source/user_guide/model_serialization/sklearnmodel.rst @@ -31,7 +31,7 @@ The following steps take your trained ``scikit-learn`` model and deploy it into X = df.drop(columns=["Attrition", "name"]) # Data Preprocessing - for i, col in X.iteritems(): + for i, col in X.items(): col.replace("unknown", "", inplace=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -43,7 +43,7 @@ The following steps take your trained 
``scikit-learn`` model and deploy it into # Extract numerical columns and categorical columns categorical_cols = [] numerical_cols = [] - for i, col in X.iteritems(): + for i, col in X.items(): if col.dtypes == "object": categorical_cols.append(col.name) else: @@ -199,7 +199,7 @@ Examples X = df.drop(columns=["Attrition", "name"]) # Data Preprocessing - for i, col in X.iteritems(): + for i, col in X.items(): col.replace("unknown", "", inplace=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -211,7 +211,7 @@ Examples # Extract numerical columns and categorical columns categorical_cols = [] numerical_cols = [] - for i, col in X.iteritems(): + for i, col in X.items(): if col.dtypes == "object": categorical_cols.append(col.name) else: diff --git a/docs/source/user_guide/model_serialization/xgboostmodel.rst b/docs/source/user_guide/model_serialization/xgboostmodel.rst index d9cf4dd46..28964b1fb 100644 --- a/docs/source/user_guide/model_serialization/xgboostmodel.rst +++ b/docs/source/user_guide/model_serialization/xgboostmodel.rst @@ -48,7 +48,7 @@ In the following several code snippets you will prepare the data and train XGBoo # Extract numerical columns and categorical columns categorical_cols = [] numerical_cols = [] - for i, col in X.iteritems(): + for i, col in X.items(): if col.dtypes == "object": categorical_cols.append(col.name) else: @@ -242,7 +242,7 @@ Example # Extract numerical columns and categorical columns categorical_cols = [] numerical_cols = [] - for i, col in X.iteritems(): + for i, col in X.items(): if col.dtypes == "object": categorical_cols.append(col.name) else: diff --git a/docs/source/user_guide/model_training/index.rst b/docs/source/user_guide/model_training/index.rst index f25dcd47c..15cd19db7 100644 --- a/docs/source/user_guide/model_training/index.rst +++ b/docs/source/user_guide/model_training/index.rst @@ -19,6 +19,7 @@ TensorBoard provides the visualization and the tooling that is needed to watch a ads_tuner training_with_oci + training_llm distributed_training/overview tensorboard/tensorboard model_evaluation/index diff --git a/docs/source/user_guide/model_training/training_llm.rst b/docs/source/user_guide/model_training/training_llm.rst new file mode 100644 index 000000000..16e9ab3d2 --- /dev/null +++ b/docs/source/user_guide/model_training/training_llm.rst @@ -0,0 +1,61 @@ +Training Large Language Model +***************************** + +.. versionadded:: 2.8.8 + +Oracle Cloud Infrastructure (OCI) `Data Science Jobs (Jobs) `_ +provides fully managed infrastructure to enable training large language models at scale. +This page shows an example of fine-tuning the `Llama 2 `_ model. For details on the APIs, see :doc:`../jobs/run_pytorch_ddp`. + +.. admonition:: Distributed Training with OCI Data Science + :class: note + + You need to configure your `networking `_ + and `IAM `_ policies. + We recommend running the training on a private subnet. + In this example, internet access is needed to download the source code and the pre-trained model. + +The `llama-recipes `_ repository contains example code to fine-tune the Llama 2 model. +The example `fine-tuning script `_ supports full parameter fine-tuning +and `Parameter-Efficient Fine-Tuning (PEFT) `_. +With ADS, you can start the training job by taking the source code directly from GitHub. + +Access the Pre-Trained Model +============================ + +To fine-tune the model, you will first need to access the pre-trained model.
+The pre-trained model can be obtained from `Meta `_ +or `HuggingFace `_. +In this example, we will use the `access token `_ +to download the pre-trained model from HuggingFace (by setting the ``HUGGING_FACE_HUB_TOKEN`` environment variable). + +Fine-Tuning the Model +===================== + +You can define the training job with ADS Python APIs or YAML. Here are examples of full parameter fine-tuning of the `7B model `_ using `FSDP `_. + +.. include:: ../jobs/tabs/llama2_full.rst + +You can create and start the job run with an API call or the ADS CLI. + +.. include:: ../jobs/tabs/run_job.rst + +The job run will: + +* Set up the PyTorch conda environment and install additional dependencies. +* Fetch the source code from GitHub and check out the specific commit. +* Run the training script with the specified arguments, which includes downloading the model and dataset. +* Save the outputs to OCI object storage once the training finishes. + +Note that in the training command, there is no need to specify the number of nodes or the number of GPUs. ADS will automatically configure them based on the ``replica`` and ``shape`` you specified. + +The fine-tuning runs on the `samsum `_ dataset by default. You can also `add your custom datasets `_. + +The same training script also supports Parameter-Efficient Fine-Tuning (PEFT). You can change the ``command`` to the following for PEFT with `LoRA `_: + +.. code-block:: bash + + torchrun llama_finetuning.py --enable_fsdp --use_peft --peft_method lora \ + --pure_bf16 --batch_size_training 1 --micro_batch_size 1 \ + --model_name /home/datascience/llama --output_dir /home/datascience/outputs + diff --git a/docs/source/user_guide/model_training/training_with_oci.rst b/docs/source/user_guide/model_training/training_with_oci.rst index 3e6763cec..ca558927f 100644 --- a/docs/source/user_guide/model_training/training_with_oci.rst +++ b/docs/source/user_guide/model_training/training_with_oci.rst @@ -12,6 +12,8 @@ using the source code directly from GitHub. .. include:: ../jobs/tabs/training_job.rst +.. include:: ../jobs/tabs/run_job.rst + The job run will: * Setup the PyTorch conda environment diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..28560e6e0 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,163 @@ +## This file is created and used instead of setup.py for building and installing the ads package. This change is to +## follow the best practice to "not invoke setup.py directly", see the detailed explanation why here: +## https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html. +## Check README-development.md and Makefile for instructions on how to install or build ADS locally.
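## As a quick illustration of how a PEP 517 configuration like this is typically used (the
## commands below are standard packaging tooling, not specific to this file): a local build can
## be produced with `python3 -m pip install build` followed by `python3 -m build`, and an
## editable install with extras looks like `python3 -m pip install -e ".[opctl]"`, where the
## `opctl` extra is one of the [project.optional-dependencies] groups defined below.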
+ +[build-system] +# These are the assumed default build requirements from pip: +# https://pip.pypa.io/en/stable/reference/pip/#pep-517-and-518-support +# PEP 517 – A build-system independent format for source trees - https://peps.python.org/pep-0517/ +requires = ["flit-core >= 3.8"] +build-backend = "flit_core.buildapi" + + +[project] +# Declaring project metadata +# https://packaging.python.org/en/latest/specifications/declaring-project-metadata/ +# PEP 621 – Storing project metadata in pyproject.toml - https://peps.python.org/pep-0621/ +# PEP 518 – Specifying Minimum Build System Requirements for Python Projects https://peps.python.org/pep-0518/ + +# Required +name = "oracle_ads" # the install (PyPI) name; name for local build in [tool.flit.module] section below +version = "2.8.9" + +# Optional +description = "Oracle Accelerated Data Science SDK" +readme = {file = "README.md", content-type = "text/markdown"} +requires-python = ">=3.8" +license = {file = "LICENSE.txt"} +authors = [ + {name = "Oracle Data Science"} +] +keywords = [ + "Oracle Cloud Infrastructure", + "OCI", + "Machine Learning", + "ML", + "Artificial Intelligence", + "AI", + "Data Science", + "Cloud", + "Oracle", +] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "License :: OSI Approved :: Universal Permissive License (UPL)", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] + +# PEP 508 – Dependency specification for Python Software Packages - https://peps.python.org/pep-0508/ +# In dependencies se "; platform_machine == 'aarch64'" to specify ARM underlying platform +# Copied from install_requires list in setup.py, setup.py got removed in favor of this config file +dependencies = [ + "asteval>=0.9.25", + "cerberus>=1.3.4", + "cloudpickle>=1.6.0", + "fsspec>=0.8.7", + "gitpython>=3.1.2", + "jinja2>=2.11.2", + "matplotlib>=3.1.3", + "numpy>=1.19.2", + "oci>=2.104.3", + "ocifs>=1.1.3", + "pandas>1.2.1,<2.1", + "psutil>=5.7.2", + "python_jsonschema_objects>=0.3.13", + "PyYAML>=6", # pyyaml 5.4 is broken with cython 3 + "requests", + "scikit-learn>=1.0", + "tabulate>=0.8.9", + "tqdm>=4.59.0", +] + +[project.optional-dependencies] +# Copied from extras_require list in setup.py, setup.py got removed in favor of this config file +bds = [ + "hdfs[kerberos]", + "ibis-framework[impala]", + "sqlalchemy", +] +boosted = [ + "lightgbm", + "xgboost", +] +data = [ + "datefinder>=0.7.1", + "fastavro>=0.24.2", + "htmllistparse>=0.6.0", + "openpyxl>=3.0.7", + "oracledb>=1.0", + "pandavro>=1.6.0", + "sqlalchemy>=1.4.1, <=1.4.46", +] +geo = [ + "geopandas", + "oracle_ads[viz]", +] +huggingface = [ + "transformers", +] +notebook = [ + "ipython>=7.23.1, <8.0", + "ipywidgets~=7.6.3", +] +onnx = [ + "lightgbm==3.3.1", + "onnx>=1.12.0", + "onnxmltools>=1.10.0", + "onnxruntime>=1.10.0", + "oracle_ads[viz]", + "protobuf<=3.20", + "skl2onnx>=1.10.4", + "tf2onnx", + "xgboost<=1.7", +] +opctl = [ + "conda-pack", + "docker", + "inflection", + "nbconvert", + "nbformat", + "oci-cli", +] +optuna = [ + "optuna==2.9.0", + "oracle_ads[viz]", +] +spark = [ + "pyspark>=3.0.0", +] +tensorflow = [ + "oracle_ads[viz]", + "tensorflow", +] +text = [ + "spacy", + "wordcloud>=1.8.1", +] +torch = [ + "oracle_ads[viz]", + "torch", + "torchvision", +] +viz = [ + "bokeh>=2.3.0, <=2.4.3", + "folium>=0.12.1", + "graphviz<0.17", + "scipy>=1.5.4", + "seaborn>=0.11.0", +] + +[project.urls] +"Github" 
= "https://github.com/oracle/accelerated-data-science" +"Documentation" = "https://accelerated-data-science.readthedocs.io/en/latest/index.html" + +[project.scripts] +ads = "ads.cli:cli" + +[tool.flit.module] +name = "ads" # name for local build and import, see https://flit.pypa.io/en/latest/pyproject_toml.html#module-section diff --git a/setup.py b/setup.py index 379f97f01..c16a79ded 100644 --- a/setup.py +++ b/setup.py @@ -4,179 +4,10 @@ # Copyright (c) 2020, 2022 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import sys -import os -import json -from setuptools import setup, find_packages -from functools import reduce -from pathlib import Path -from setuptools.command.install import install -from setuptools.command.develop import develop - - -install_requires = [ - "asteval>=0.9.25", - "cerberus>=1.3.4", - "cloudpickle>=1.6.0", - "fsspec>=0.8.7", - "jinja2>=2.11.2", - "gitpython>=3.1.2", - "matplotlib>=3.1.3", - "numpy>=1.19.2", - "oci>=2.104.3", - "ocifs>=1.1.3", - "pandas>1.2.1,<1.6", - "python_jsonschema_objects>=0.3.13", - "PyYAML>=6", # pyyaml 5.4 is broken with cython 3 - "requests", - "scikit-learn>=0.23.2,<1.2", - "tabulate>=0.8.9", - "tqdm>=4.59.0", - "psutil>=5.7.2", -] - -extras_require = { - "boosted": [ - "xgboost", - "lightgbm", - ], - "notebook": [ - "ipywidgets~=7.6.3", - "ipython>=7.23.1, <8.0", - ], - "text": ["wordcloud>=1.8.1", "spacy"], - "viz": [ - "bokeh>=2.3.0, <=2.4.3", - "folium>=0.12.1", - "graphviz<0.17", - "scipy>=1.5.4", - "seaborn>=0.11.0", - ], - "data": [ - "fastavro>=0.24.2", - "openpyxl>=3.0.7", - "pandavro>=1.6.0", - "datefinder>=0.7.1", - "htmllistparse>=0.6.0", - "sqlalchemy>=1.4.1, <=1.4.46", - "oracledb>=1.0", - ], - "opctl": [ - "oci-cli", - "docker", - "conda-pack", - "nbconvert", - "nbformat", - "inflection", - ], - "bds": ["ibis-framework[impala]", "hdfs[kerberos]", "sqlalchemy"], - "spark": ["pyspark>=3.0.0", "delta-spark"], - "huggingface": ["transformers"], - "feature-store": [ - "pyspark>=3.0.0", - "delta-spark", - "great-expectations==0.15.39", - "pyarrow", - "plotly" - ], - "mlm_insights": ["mlm_insights==0.1.0.dev1"], -} - -this_directory = Path(__file__).parent - - -def update_extra_with_internal_packages(): - loaded_dep = {} - internal_deps = os.path.join(this_directory, "internal_extra_dependency.json") - print(f"looking for {internal_deps}") - if os.path.exists(internal_deps): - with open(internal_deps) as idf: - loaded_dep = json.load(idf) - print(f"Found: {loaded_dep}") - return loaded_dep - - -extras_require.update(update_extra_with_internal_packages()) - -extras_require["torch"] = extras_require["viz"] + ["torch"] + ["torchvision"] -extras_require["tensorflow"] = extras_require["viz"] + [ - "tensorflow", -] -extras_require["geo"] = extras_require["viz"] + ["geopandas"] -extras_require["onnx"] = extras_require["viz"] + [ - "protobuf<=3.20", - "onnx>=1.12.0", - "onnxruntime>=1.10.0", - "onnxmltools>=1.10.0", - "skl2onnx>=1.10.4", - "tf2onnx", - "xgboost==1.5.1", - "lightgbm==3.3.1", -] -extras_require["optuna"] = extras_require["viz"] + ["optuna==2.9.0"] - -extras_require["complete"] = sorted({v for req in extras_require.values() for v in req}) -extras_require["all-optional"] = reduce( - list.__add__, - [ - extras_require[k] - for k in extras_require - if k not in ["boosted", "opctl", "complete", "mlm_insights"] - ], -) -extras_require["all-public"] = reduce( - list.__add__, - [ - extras_require[k] - for k in extras_require - if k not in 
["all-optional", "complete"] - ], -) -# Only include pytest-runner in setup_requires if we're invoking tests -if {"pytest", "test", "ptr"}.intersection(sys.argv): - setup_requires = ["pytest-runner"] -else: - setup_requires = [] - -ADS_VERSION = "UNKNOWN" -with open( - os.path.join(os.path.dirname(os.path.abspath(__file__)), "ads", "ads_version.json") -) as version_file: - ADS_VERSION = json.load(version_file)["version"] - - -long_description = (this_directory / "README.md").read_text() -setup( - name="oracle_ads", - version=ADS_VERSION, - description="Oracle Accelerated Data Science SDK", - author="Oracle Data Science", - license="Universal Permissive License 1.0", - long_description=long_description, - long_description_content_type="text/markdown", - packages=find_packages(), - url="https://docs.oracle.com/en-us/iaas/tools/ads-sdk/latest/index.html", - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "License :: OSI Approved :: Universal Permissive License (UPL)", - "Operating System :: OS Independent", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], - keywords="Oracle Cloud Infrastructure, OCI, Machine Learning, ML, Artificial Intelligence, AI, Data Science, Cloud, Oracle", - include_package_data=True, - install_requires=install_requires, - python_requires=">=3.8", - setup_requires=setup_requires, - extras_require=extras_require, - tests_require=[ - "pytest", - ], - project_urls={ - "Github": "https://github.com/oracle/accelerated-data-science", - "Documentation": "https://accelerated-data-science.readthedocs.io/en/latest/index.html", - }, - entry_points={"console_scripts": ["ads=ads.cli:cli"]}, -) +### File setup.py obsolete and must not be used. Please update pyproject.toml instead. +### See detailed explanation why here: +### https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html. 
+# PEP 621 – Storing project metadata in pyproject.toml - https://peps.python.org/pep-0621/ +# PEP 518 – Specifying Minimum Build System Requirements for Python Projects https://peps.python.org/pep-0518/ +# PEP 508 – Dependency specification for Python Software Packages - https://peps.python.org/pep-0508/ +# PEP 517 – A build-system independent format for source trees - https://peps.python.org/pep-0517/ diff --git a/tests/integration/jobs/test_dsc_job.py b/tests/integration/jobs/test_dsc_job.py index 61ae273b4..e317de67e 100644 --- a/tests/integration/jobs/test_dsc_job.py +++ b/tests/integration/jobs/test_dsc_job.py @@ -74,7 +74,11 @@ class DSCJobTestCase(unittest.TestCase): @property def default_datascience_job(self): random.seed(self.random_seed) - return DataScienceJob().with_project_id(self.PROJECT_ID) + return ( + DataScienceJob() + .with_project_id(self.PROJECT_ID) + .with_subnet_id(self.SUBNET_ID) + ) @classmethod def setUpClass(cls) -> None: @@ -632,7 +636,7 @@ def assert_infra_before_build(self, infra): self.assertEqual(infra.project_id, self.PROJECT_ID) self.assertEqual(infra.compartment_id, None) self.assertEqual(infra.block_storage_size, None) - self.assertEqual(infra.subnet_id, None) + self.assertEqual(infra.subnet_id, self.SUBNET_ID) def test_build_job_within_notebook(self): job = ( @@ -666,4 +670,4 @@ def test_build_job_outside_notebook(self): self.assertEqual(job.infrastructure.project_id, self.PROJECT_ID) self.assertEqual(job.infrastructure.compartment_id, self.COMPARTMENT_ID) self.assertEqual(job.infrastructure.block_storage_size, 50) - self.assertEqual(job.infrastructure.subnet_id, None) + self.assertEqual(job.infrastructure.subnet_id, self.SUBNET_ID) diff --git a/tests/integration/jobs/test_jobs_managed_egress.py b/tests/integration/jobs/test_jobs_managed_egress.py deleted file mode 100644 index 1533b9757..000000000 --- a/tests/integration/jobs/test_jobs_managed_egress.py +++ /dev/null @@ -1,133 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2023 Oracle and/or its affiliates. 
-# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ - -import copy -import os -from unittest import mock -from ads.jobs import Job, ScriptRuntime -from tests.integration.config import secrets -from tests.integration.jobs.test_dsc_job import DSCJobTestCaseWithCleanUp - - -# This notebook is configured with default networking (managed egress) -NOTEBOOK_WITH_ME = secrets.jobs.NOTEBOOK_WITH_ME -# This notebook is using a subnet -NOTEBOOK_WITH_SUBNET = secrets.jobs.NOTEBOOK_ID - - -class DSCJobManagedEgressTestCase(DSCJobTestCaseWithCleanUp): - @mock.patch.dict(os.environ, NB_SESSION_OCID=NOTEBOOK_WITH_ME) - def test_create_managed_egress_job_within_managed_egress_nb_session(self): - """Tests creating a job using default configurations from notebook with managed egress.""" - expected_infra_spec = { - "displayName": "my_script", - "compartmentId": self.COMPARTMENT_ID, - "jobType": "DEFAULT", - "jobInfrastructureType": "ME_STANDALONE", - "shapeName": "VM.Standard.E3.Flex", - "shapeConfigDetails": {"memoryInGBs": 16, "ocpus": 1}, - "blockStorageSize": 100, - "projectId": self.PROJECT_ID, - } - - expected_runtime_spec = copy.deepcopy(self.DEFAULT_RUNTIME_SPEC) - - # Create a job - job = ( - Job() - .with_infrastructure(self.default_datascience_job) - .with_runtime(ScriptRuntime().with_script(self.SCRIPT_URI)) - .create() - ) - - self.assert_job_creation(job, expected_infra_spec, expected_runtime_spec) - - @mock.patch.dict(os.environ, NB_SESSION_OCID=NOTEBOOK_WITH_SUBNET) - def test_create_managed_egress_job_within_nb_session_using_subnet(self): - """Tests creating a job using managed egress from notebook with a subnet.""" - expected_infra_spec = { - "displayName": "my_script", - "compartmentId": self.COMPARTMENT_ID, - "jobType": "DEFAULT", - "jobInfrastructureType": "ME_STANDALONE", - "shapeName": "VM.Standard.E3.Flex", - "shapeConfigDetails": {"memoryInGBs": 16, "ocpus": 1}, - "blockStorageSize": 100, - "projectId": self.PROJECT_ID, - } - - expected_runtime_spec = copy.deepcopy(self.DEFAULT_RUNTIME_SPEC) - - # Create a job - job = ( - Job() - .with_infrastructure( - self.default_datascience_job.with_job_infrastructure_type( - "ME_STANDALONE" - ) - ) - .with_runtime(ScriptRuntime().with_script(self.SCRIPT_URI)) - .create() - ) - - self.assert_job_creation(job, expected_infra_spec, expected_runtime_spec) - - @mock.patch.dict(os.environ, NB_SESSION_OCID=NOTEBOOK_WITH_SUBNET) - def test_create_job_using_same_subnet_within_nb_session_using_subnet(self): - """Tests creating a job using managed egress from notebook with a subnet.""" - expected_infra_spec = { - "displayName": "my_script", - "compartmentId": self.COMPARTMENT_ID, - "jobType": "DEFAULT", - "jobInfrastructureType": "STANDALONE", - "shapeName": "VM.Standard.E3.Flex", - "shapeConfigDetails": {"memoryInGBs": 16, "ocpus": 1}, - "blockStorageSize": 100, - "projectId": self.PROJECT_ID, - "subnetId": self.SUBNET_ID, - } - - expected_runtime_spec = copy.deepcopy(self.DEFAULT_RUNTIME_SPEC) - - # Create a job - job = ( - Job() - .with_infrastructure(self.default_datascience_job) - .with_runtime(ScriptRuntime().with_script(self.SCRIPT_URI)) - .create() - ) - - self.assert_job_creation(job, expected_infra_spec, expected_runtime_spec) - - @mock.patch.dict(os.environ, NB_SESSION_OCID=NOTEBOOK_WITH_SUBNET) - def test_create_job_using_different_subnet_within_nb_session_using_subnet(self): - """Tests creating a job using managed egress from notebook with a subnet.""" - expected_infra_spec = { - 
"displayName": "my_script", - "compartmentId": self.COMPARTMENT_ID, - "jobType": "DEFAULT", - "jobInfrastructureType": "STANDALONE", - "shapeName": "VM.Standard.E3.Flex", - "shapeConfigDetails": {"memoryInGBs": 16, "ocpus": 1}, - "blockStorageSize": 100, - "projectId": self.PROJECT_ID, - "subnetId": secrets.jobs.SUBNET_ID_DIFF, - } - - expected_runtime_spec = copy.deepcopy(self.DEFAULT_RUNTIME_SPEC) - - # Create a job - job = ( - Job() - .with_infrastructure( - self.default_datascience_job.with_subnet_id( - secrets.jobs.SUBNET_ID_DIFF - ) - ) - .with_runtime(ScriptRuntime().with_script(self.SCRIPT_URI)) - .create() - ) - - self.assert_job_creation(job, expected_infra_spec, expected_runtime_spec) diff --git a/tests/integration/jobs/test_jobs_runs.py b/tests/integration/jobs/test_jobs_runs.py index 6f0ba50fb..8aea62900 100644 --- a/tests/integration/jobs/test_jobs_runs.py +++ b/tests/integration/jobs/test_jobs_runs.py @@ -8,9 +8,11 @@ import fsspec import oci import pytest +import random from tests.integration.config import secrets from tests.integration.jobs.test_dsc_job import DSCJobTestCaseWithCleanUp +from ads.common.auth import default_signer from ads.jobs import ( Job, DataScienceJob, @@ -45,11 +47,11 @@ class DSCJobRunTestCase(DSCJobTestCaseWithCleanUp): ZIP_JOB_ENTRYPOINT = "job_archive/main.py" TEST_OUTPUT_DIR = "output" - TEST_OUTPUT_URI = f"oci://{secrets.jobs.BUCKET_B}@{secrets.common.NAMESPACE}/ads_int_test" - SHAPE_NAME = "VM.Standard2.1" - CUSTOM_CONDA = ( - f"oci://{secrets.jobs.BUCKET_B}@{secrets.common.NAMESPACE}/conda_environments/cpu/flaml/1.0/automl_flaml" + TEST_OUTPUT_URI = ( + f"oci://{secrets.jobs.BUCKET_B}@{secrets.common.NAMESPACE}/ads_int_test" ) + SHAPE_NAME = "VM.Standard2.1" + CUSTOM_CONDA = f"oci://{secrets.jobs.BUCKET_B}@{secrets.common.NAMESPACE}/conda_environments/cpu/flaml/1.0/automl_flaml" TEST_LOGS_FUNCTION = [ "This is a function in a package.", @@ -63,6 +65,14 @@ class DSCJobRunTestCase(DSCJobTestCaseWithCleanUp): "This is a function in a package.", ] + # With shortage of ip addresses in self.SUBNET_ID, + # added pool of subnets with extra 8+8 ip addresses to run tests in parallel: + SUBNET_POOL = { + secrets.jobs.SUBNET_ID_1: 8, # max 8 ip addresses available in SUBNET_ID_1 + secrets.jobs.SUBNET_ID_2: 8, + secrets.jobs.SUBNET_ID: 32, + } + def setUp(self) -> None: self.maxDiff = None return super().setUp() @@ -70,13 +80,38 @@ def setUp(self) -> None: @property def job_run_test_infra(self): """Data Science Job infrastructure with logging and managed egress for testing job runs""" + + # Pick subnet one of SUBNET_ID_1, SUBNET_ID_2, SUBNET_ID from self.SUBNET_POOL with available ip addresses. + # Wait for 4 minutes if no ip addresses in any of 3 subnets, do 5 retries. + max_retry_count = 5 + subnet_id = None + interval = 4 * 60 + core_client = oci.core.VirtualNetworkClient(**default_signer()) + while max_retry_count > 0: + for subnet, ips_limit in random.sample(list(self.SUBNET_POOL.items()), 2): + allocated_ips = core_client.list_private_ips(subnet_id=subnet).data + # Leave 4 extra ip address for later use by jobrun. Leave more extra ips in case tests will fail with + # "All the available IP addresses in the subnet have been allocated." 
+ if len(allocated_ips) < ips_limit - 4: + subnet_id = subnet + break + if subnet_id: + break + else: + max_retry_count -= 1 + time.sleep(interval) + # After all retries and no subnet_id with available ip addresses - using SUBNET_ID_1, subnet_id can't be None + if not subnet_id: + subnet_id = secrets.jobs.SUBNET_ID_1 + return DataScienceJob( compartment_id=self.COMPARTMENT_ID, project_id=self.PROJECT_ID, shape_name=self.SHAPE_NAME, block_storage_size=50, log_id=self.LOG_ID, - job_infrastructure_type="ME_STANDALONE", + job_infrastructure_type="STANDALONE", + subnet_id=subnet_id, ) @staticmethod @@ -214,7 +249,9 @@ def test_run_python_in_zip_using_script_runtime(self): def test_run_script_with_many_logs(self): """Tests running a Python script generating many logs using ScriptRuntime.""" runtime = ScriptRuntime().with_source( - os.path.join(os.path.dirname(__file__), "../fixtures/script_with_many_logs.py") + os.path.join( + os.path.dirname(__file__), "../fixtures/script_with_many_logs.py" + ) ) logs = [f"LOG: {i}" for i in range(2000)] self.create_and_assert_job_run(runtime, logs) @@ -231,6 +268,8 @@ class GitRuntimeJobRunTest(DSCJobRunTestCase): @pytest.mark.skipif(SKIP_TEST_FLAG, reason=SKIP_TEST_REASON) def test_run_git_with_entry_function_and_arguments(self): """Tests running a Python function from Git repo and passing in the arguments.""" + envs = dict(OCI_LOG_LEVEL="DEBUG") + envs.update(self.PROXY_ENVS) runtime = ( GitPythonRuntime() .with_source(secrets.jobs.GITHUB_SOURCE) @@ -243,9 +282,8 @@ def test_run_git_with_entry_function_and_arguments(self): # Keyword argument as a string key='{"key": ["val1", "val2"]}', ) - .with_environment_variable(OCI_LOG_LEVEL="DEBUG") + .with_environment_variable(**envs) ) - infra = self.job_run_test_infra self.create_and_assert_job_run( runtime, [ @@ -258,7 +296,6 @@ def test_run_git_with_entry_function_and_arguments(self): "Job completed.", "Saving metadata to job run...", ], - infra=infra, ) @pytest.mark.skipif(SKIP_TEST_FLAG, reason=SKIP_TEST_REASON) @@ -266,15 +303,16 @@ def test_run_git_with_notebook_entrypoint_and_output_uri(self): """Tests running a notebook from Git repo and saving the outputs to object storage""" output_uri = os.path.join(self.TEST_OUTPUT_URI, "git_notebook") self.remove_objects(output_uri) + envs = dict(OCI_LOG_LEVEL="DEBUG") + envs.update(self.PROXY_ENVS) runtime = ( GitPythonRuntime(skip_metadata_update=True) .with_source(secrets.jobs.GITHUB_SOURCE) .with_entrypoint(path="src/test_notebook.ipynb") .with_output("src", output_uri) .with_service_conda("dbexp_p38_cpu_v1") - .with_environment_variable(OCI_LOG_LEVEL="DEBUG") + .with_environment_variable(**envs) ) - infra = self.job_run_test_infra self.create_and_assert_job_run( runtime, [ @@ -284,7 +322,6 @@ def test_run_git_with_notebook_entrypoint_and_output_uri(self): # The following log will only show up if OCI_LOG_LEVEL is set to DEBUG "Job completed.", ], - infra=infra, ) objects = self.list_objects(output_uri) self.remove_objects(output_uri) @@ -298,15 +335,16 @@ def test_run_git_with_shell_script_entrypoint(self): """Tests running a notebook from Git repo and saving the outputs to object storage""" output_uri = os.path.join(self.TEST_OUTPUT_URI, "git_notebook") self.remove_objects(output_uri) + envs = dict(OCI_LOG_LEVEL="DEBUG") + envs.update(self.PROXY_ENVS) runtime = ( GitPythonRuntime(skip_metadata_update=True) .with_source(secrets.jobs.GITHUB_SOURCE) .with_entrypoint(path="src/conda_list.sh") .with_service_conda("dbexp_p38_cpu_v1") .with_argument("0.5", "+", 0.2, 
equals="0.7") - .with_environment_variable(OCI_LOG_LEVEL="DEBUG") + .with_environment_variable(**envs) ) - infra = self.job_run_test_infra self.create_and_assert_job_run( runtime, [ @@ -314,7 +352,6 @@ def test_run_git_with_shell_script_entrypoint(self): "# packages in environment at /home/datascience/conda/dbexp_p38_cpu_v1:", "Job completed.", ], - infra=infra, ) @pytest.mark.skipif(SKIP_TEST_FLAG, reason=SKIP_TEST_REASON) @@ -344,6 +381,8 @@ def test_run_git_with_http_proxy_and_entry_function(self): @pytest.mark.skipif(SKIP_TEST_FLAG, reason=SKIP_TEST_REASON) def test_run_git_with_ssh_key(self): + envs = dict(OCI_LOG_LEVEL="DEBUG") + envs.update(self.PROXY_ENVS) runtime = ( GitPythonRuntime(skip_metadata_update=True) .with_source( @@ -353,13 +392,11 @@ def test_run_git_with_ssh_key(self): .with_entrypoint(path="src/main.py") .with_python_path("src") .with_custom_conda(self.CUSTOM_CONDA) - .with_environment_variable(OCI_LOG_LEVEL="DEBUG") + .with_environment_variable(**envs) ) - infra = self.job_run_test_infra self.create_and_assert_job_run( runtime, self.TEST_LOGS_SCRIPT, - infra=infra, ) @pytest.mark.skipif(SKIP_TEST_FLAG, reason=SKIP_TEST_REASON) @@ -482,7 +519,6 @@ def test_run_python_with_notebook_entrypoint_and_output_uri(self): .with_output("outputs", output_uri) .with_service_conda("dbexp_p38_cpu_v1") ) - infra = self.job_run_test_infra self.create_and_assert_job_run( runtime, [ @@ -491,7 +527,6 @@ def test_run_python_with_notebook_entrypoint_and_output_uri(self): "This is a function in a module.", "This is a function in a package.", ], - infra=infra, ) objects = self.list_objects(output_uri) self.remove_objects(output_uri) @@ -499,7 +534,9 @@ def test_run_python_with_notebook_entrypoint_and_output_uri(self): class NotebookRuntimeJobRunTest(DSCJobRunTestCase): - NOTEBOOK_PATH = os.path.join(os.path.dirname(__file__), "../fixtures/ads_check.ipynb") + NOTEBOOK_PATH = os.path.join( + os.path.dirname(__file__), "../fixtures/ads_check.ipynb" + ) @pytest.mark.skipif(SKIP_TEST_FLAG, reason=SKIP_TEST_REASON) def test_run_notebook(self): @@ -509,11 +546,9 @@ def test_run_notebook(self): .with_notebook(self.NOTEBOOK_PATH) .with_service_conda("dbexp_p38_cpu_v1") ) - infra = self.job_run_test_infra self.create_and_assert_job_run( runtime, ["2.6.8"], - infra=infra, ) @pytest.mark.skipif(SKIP_TEST_FLAG, reason=SKIP_TEST_REASON) @@ -526,7 +561,6 @@ def test_run_notebook_in_dir(self): .with_service_conda("dbexp_p38_cpu_v1") .with_output(output_uri) ) - infra = self.job_run_test_infra self.remove_objects(output_uri) self.create_and_assert_job_run( runtime, @@ -536,7 +570,6 @@ def test_run_notebook_in_dir(self): "This is a function in a module.", "This is a function in a package.", ], - infra=infra, ) objects = self.list_objects(output_uri) self.remove_objects(output_uri) @@ -558,10 +591,8 @@ def test_run_notebook_in_dir_with_invalid_path(self): .with_source(self.NOTEBOOK_PATH, notebook="test_notebook.ipynb") .with_service_conda("dbexp_p38_cpu_v1") ) - infra = self.job_run_test_infra with self.assertRaises(ValueError): self.create_and_assert_job_run( runtime, [], - infra=infra, ) diff --git a/tests/integration/other/model/scripts/sklearn.py b/tests/integration/other/model/scripts/sklearn.py index cdf824846..748db7df1 100644 --- a/tests/integration/other/model/scripts/sklearn.py +++ b/tests/integration/other/model/scripts/sklearn.py @@ -48,7 +48,7 @@ def bank_data(): df_clas = pd.read_csv(get_test_dataset_path("vor_bank.csv")) y_clas = df_clas["y"] X_clas = df_clas.drop(columns=["y"]) - for i, col 
in X_clas.iteritems(): + for i, col in X_clas.items(): col.replace("unknown", "", inplace=True) (X_train_clas, X_test_clas, y_train_clas, y_test_clas) = train_test_split( X_clas, y_clas, test_size=0.1, random_state=42 @@ -86,7 +86,7 @@ def sklearn_pipeline_with_sklearn_model(): (X_train_clas, X_test_clas, y_train_clas, y_test_clas) = bank_dataset categorical_cols = [] numerical_cols = [] - for i, col in X_train_clas.iteritems(): + for i, col in X_train_clas.items(): if col.dtypes == "object": categorical_cols.append(col.name) else: @@ -137,7 +137,7 @@ def sklearn_pipeline_with_xgboost_model(): (X_train_clas, X_test_clas, y_train_clas, y_test_clas) = bank_dataset categorical_cols = [] numerical_cols = [] - for i, col in X_train_clas.iteritems(): + for i, col in X_train_clas.items(): if col.dtypes == "object": categorical_cols.append(col.name) else: @@ -190,7 +190,7 @@ def sklearn_pipeline_with_lightgbm_model(): (X_train_clas, X_test_clas, y_train_clas, y_test_clas) = bank_dataset categorical_cols = [] numerical_cols = [] - for i, col in X_train_clas.iteritems(): + for i, col in X_train_clas.items(): if col.dtypes == "object": categorical_cols.append(col.name) else: diff --git a/tests/unitary/default_setup/common/test_common_utils.py b/tests/unitary/default_setup/common/test_common_utils.py index a6df31d97..634f844b6 100644 --- a/tests/unitary/default_setup/common/test_common_utils.py +++ b/tests/unitary/default_setup/common/test_common_utils.py @@ -10,7 +10,7 @@ import sys import tempfile from datetime import datetime -from unittest.mock import MagicMock, patch +from unittest.mock import MagicMock, patch, ANY import numpy as np import pandas as pd @@ -29,7 +29,9 @@ folder_size, human_size, remove_file, + upload_to_os, ) +from oci import object_storage DEFAULT_SIGNER_CONF = {"config": {}} @@ -498,3 +500,84 @@ def test_extract_region(self, input_params, expected_result): return_value={"config": {"region": "default_signer_region"}}, ): assert extract_region(input_params["auth"]) == expected_result + + @patch("ads.common.auth.default_signer") + @patch("os.path.exists") + def test_upload_to_os_with_invalid_src_uri( + self, mock_file_exists, mock_default_signer + ): + """Ensures upload_to_os fails when the given `src_uri` does not exist.""" + mock_default_signer.return_value = DEFAULT_SIGNER_CONF + mock_file_exists.return_value = False + with pytest.raises(FileNotFoundError): + upload_to_os(src_uri="fake_uri", dst_uri="fake_uri") + + @patch("ads.common.auth.default_signer") + @patch("os.path.exists") + @patch("ads.common.utils.is_path_exists") + def test_upload_to_os_with_invalid_dst_uri( + self, mock_is_path_exists, mock_file_exists, mock_default_signer + ): + """ + Ensures upload_to_os fails when the given `dst_uri` is invalid. + Ensures upload_to_os fails in case of destination file already exists and + `force_overwrite` flag is not set to True. 
+ """ + mock_default_signer.return_value = DEFAULT_SIGNER_CONF + mock_file_exists.return_value = True + mock_is_path_exists = True + with pytest.raises(ValueError): + upload_to_os(src_uri="fake_uri", dst_uri="This is an invalid oci path.") + + with pytest.raises(FileExistsError): + upload_to_os( + src_uri="fake_uri", + dst_uri="oci://my-bucket@my-tenancy/prefix", + force_overwrite=False, + ) + + @patch("ads.common.oci_client.OCIClientFactory.object_storage") + @patch("ads.common.utils.is_path_exists") + @patch.object(object_storage.UploadManager, "upload_stream") + @patch.object(object_storage.UploadManager, "__init__", return_value=None) + def test_upload_to_os( + self, + mock_init, + mock_upload, + mock_is_path_exists, + mock_client, + ): + """Tests upload_to_os successfully.""" + + class MockResponse: + def __init__(self, status_code): + self.status = status_code + + mock_upload.return_value = MockResponse(200) + dst_namespace = "my-tenancy" + dst_bucket = "my-bucket" + dst_prefix = "prefix" + parallel_process_count = 3 + uri_src = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "./test_files/archive/1.txt" + ) + response = upload_to_os( + src_uri=uri_src, + dst_uri=f"oci://{dst_bucket}@{dst_namespace}/{dst_prefix}", + force_overwrite=True, + parallel_process_count=parallel_process_count, + ) + mock_init.assert_called_with( + object_storage_client=mock_client, + parallel_process_count=parallel_process_count, + allow_multipart_uploads=True, + allow_parallel_uploads=True, + ) + mock_upload.assert_called_with( + namespace_name=dst_namespace, + bucket_name=dst_bucket, + object_name=dst_prefix, + stream_ref=ANY, + progress_callback=ANY, + ) + assert response.status == 200 diff --git a/tests/unitary/default_setup/feature_types/test_feature_domain_schema.py b/tests/unitary/default_setup/feature_types/test_feature_domain_schema.py index b8ba0eec9..17b84fe3e 100644 --- a/tests/unitary/default_setup/feature_types/test_feature_domain_schema.py +++ b/tests/unitary/default_setup/feature_types/test_feature_domain_schema.py @@ -344,7 +344,7 @@ def test_zipcode(self): def test_categorical_bool_ordinal(self, feature_type): self.boolean.ads.feature_type = [feature_type] domain = self.boolean.ads.feature_domain() - assert domain.constraints[0].expression == f"$x in [True, False]" + assert domain.constraints[0].expression == "$x in [True, False]" assert domain.constraints[0].evaluate(x=True) assert domain.constraints[0].evaluate(x=False) diff --git a/tests/unitary/default_setup/jobs/test_jobs_pytorch_ddp.py b/tests/unitary/default_setup/jobs/test_jobs_pytorch_ddp.py index 231aff316..1e6ed9706 100644 --- a/tests/unitary/default_setup/jobs/test_jobs_pytorch_ddp.py +++ b/tests/unitary/default_setup/jobs/test_jobs_pytorch_ddp.py @@ -137,12 +137,12 @@ def test_create_job_runs(self, patched_run, *args): [ { "display_name": "None-0", - "environment_variables": {"RANK": "0", "WORLD_SIZE": "2"}, + "environment_variables": {"NODE_RANK": "0", "WORLD_SIZE": "2"}, }, { "display_name": "None-1", "environment_variables": { - "RANK": "1", + "NODE_RANK": "1", "WORLD_SIZE": "2", "MAIN_JOB_RUN_OCID": test_ocid, }, diff --git a/tests/unitary/default_setup/model/test_artifact_uploader.py b/tests/unitary/default_setup/model/test_artifact_uploader.py index 48a41e7f5..bc9daeabf 100644 --- a/tests/unitary/default_setup/model/test_artifact_uploader.py +++ b/tests/unitary/default_setup/model/test_artifact_uploader.py @@ -13,6 +13,9 @@ import pytest from ads.model.artifact_uploader import LargeArtifactUploader, 
SmallArtifactUploader from ads.model.common.utils import zip_artifact +from ads.common.auth import default_signer +from ads.common.utils import DEFAULT_PARALLEL_PROCESS_COUNT +from oci import object_storage MODEL_OCID = "ocid1.datasciencemodel.oc1.xxx" @@ -60,7 +63,6 @@ def test__init__(self): # Ensures the LargeArtifactUploader can be successfully initialized with patch("os.path.exists", return_value=True): - with pytest.raises(ValueError, match="The `bucket_uri` must be provided."): lg_artifact_uploader = LargeArtifactUploader( dsc_model=self.mock_dsc_model, @@ -71,11 +73,11 @@ def test__init__(self): overwrite_existing_artifact=False, remove_existing_artifact=False, ) - + auth = default_signer() lg_artifact_uploader = LargeArtifactUploader( dsc_model=self.mock_dsc_model, artifact_path="existing_path", - auth=self.mock_auth, + auth=auth, region=self.mock_region, bucket_uri="test_bucket_uri", overwrite_existing_artifact=False, @@ -85,14 +87,17 @@ def test__init__(self): assert lg_artifact_uploader.artifact_path == "existing_path" assert lg_artifact_uploader.artifact_zip_path == None assert lg_artifact_uploader.progress == None - assert lg_artifact_uploader.auth == self.mock_auth + assert lg_artifact_uploader.auth == auth assert lg_artifact_uploader.region == self.mock_region assert lg_artifact_uploader.bucket_uri == "test_bucket_uri" assert lg_artifact_uploader.overwrite_existing_artifact == False assert lg_artifact_uploader.remove_existing_artifact == False + assert ( + lg_artifact_uploader._parallel_process_count + == DEFAULT_PARALLEL_PROCESS_COUNT + ) def test_prepare_artiact_tmp_zip(self): - # Tests case when a folder provided as artifacts location with patch("ads.model.common.utils.zip_artifact") as mock_zip_artifact: mock_zip_artifact.return_value = "test_artifact.zip" @@ -167,50 +172,48 @@ def test_upload_small_artifact(self): mock_remove_artiact_tmp_zip.assert_called() self.mock_dsc_model.create_model_artifact.assert_called() - def test_upload_large_artifact(self): - with tempfile.TemporaryDirectory() as tmp_artifact_dir: - test_bucket_file_name = os.path.join(tmp_artifact_dir, f"{MODEL_OCID}.zip") - # Case when artifact will be created and left in the TMP folder + @patch("ads.common.utils.is_path_exists") + @patch("ads.common.utils.upload_to_os") + def test_upload_large_artifact(self, mock_upload, mock_is_path_exists): + # Case when artifact already exists and overwrite_existing_artifact==True + dest_path = "oci://my-bucket@my-namespace/my-artifact-path" + test_bucket_file_name = os.path.join(dest_path, f"{MODEL_OCID}.zip") + mock_is_path_exists.return_value = True + auth = default_signer() + artifact_uploader = LargeArtifactUploader( + dsc_model=self.mock_dsc_model, + artifact_path=self.mock_artifact_zip_path, + bucket_uri=dest_path + "/", + auth=auth, + region=self.mock_region, + overwrite_existing_artifact=True, + remove_existing_artifact=False, + ) + artifact_uploader.upload() + mock_upload.assert_called_with( + src_uri=self.mock_artifact_zip_path, + dst_uri=test_bucket_file_name, + auth=auth, + parallel_process_count=DEFAULT_PARALLEL_PROCESS_COUNT, + force_overwrite=True, + progressbar_description="Copying model artifact to the Object Storage bucket.", + ) + self.mock_dsc_model.export_model_artifact.assert_called_with( + bucket_uri=test_bucket_file_name, region=self.mock_region + ) + + # Case when artifact already exists and overwrite_existing_artifact==False + with pytest.raises(FileExistsError): artifact_uploader = LargeArtifactUploader( dsc_model=self.mock_dsc_model, - 
artifact_path=self.mock_artifact_path, - bucket_uri=tmp_artifact_dir + "/", - auth=self.mock_auth, + artifact_path=self.mock_artifact_zip_path, + bucket_uri=dest_path + "/", + auth=default_signer(), region=self.mock_region, overwrite_existing_artifact=False, remove_existing_artifact=False, ) artifact_uploader.upload() - self.mock_dsc_model.export_model_artifact.assert_called_with( - bucket_uri=test_bucket_file_name, region=self.mock_region - ) - assert os.path.exists(test_bucket_file_name) - - # Case when artifact already exists and overwrite_existing_artifact==False - with pytest.raises(FileExistsError): - artifact_uploader = LargeArtifactUploader( - dsc_model=self.mock_dsc_model, - artifact_path=self.mock_artifact_path, - bucket_uri=tmp_artifact_dir + "/", - auth=self.mock_auth, - region=self.mock_region, - overwrite_existing_artifact=False, - remove_existing_artifact=False, - ) - artifact_uploader.upload() - - # Case when artifact already exists and overwrite_existing_artifact==True - artifact_uploader = LargeArtifactUploader( - dsc_model=self.mock_dsc_model, - artifact_path=self.mock_artifact_path, - bucket_uri=tmp_artifact_dir + "/", - auth=self.mock_auth, - region=self.mock_region, - overwrite_existing_artifact=True, - remove_existing_artifact=True, - ) - artifact_uploader.upload() - assert not os.path.exists(test_bucket_file_name) def test_zip_artifact_fail(self): with pytest.raises(ValueError, match="The `artifact_dir` must be provided."): diff --git a/tests/unitary/default_setup/model/test_datascience_model.py b/tests/unitary/default_setup/model/test_datascience_model.py index 5c0b4d673..cc357f5e6 100644 --- a/tests/unitary/default_setup/model/test_datascience_model.py +++ b/tests/unitary/default_setup/model/test_datascience_model.py @@ -156,7 +156,6 @@ class TestDataScienceModel: - DEFAULT_PROPERTIES_PAYLOAD = { "compartmentId": DSC_MODEL_PAYLOAD["compartmentId"], "projectId": DSC_MODEL_PAYLOAD["projectId"], @@ -368,6 +367,7 @@ def test_create_success( bucket_uri="test_bucket_uri", overwrite_existing_artifact=False, remove_existing_artifact=False, + parallel_process_count=3, ) mock_oci_dsc_model_create.assert_called() mock_create_model_provenance.assert_called_with( @@ -380,6 +380,7 @@ def test_create_success( region=None, auth=None, timeout=None, + parallel_process_count=3, ) mock_sync.assert_called() assert self.prepare_dict(result.to_dict()["spec"]) == self.prepare_dict( @@ -622,6 +623,7 @@ def test_upload_artifact(self): bucket_uri="test_bucket_uri", overwrite_existing_artifact=False, remove_existing_artifact=False, + parallel_process_count=utils.DEFAULT_PARALLEL_PROCESS_COUNT, ) mock_upload.assert_called() @@ -659,7 +661,6 @@ def test_download_artifact(self): LargeArtifactDownloader, "__init__", return_value=None ) as mock_init: with patch.object(LargeArtifactDownloader, "download") as mock_download: - # If artifact is large and bucket_uri not provided with pytest.raises(ModelArtifactSizeError): self.mock_dsc_model.download_artifact(target_dir="test_target_dir") diff --git a/tests/unitary/default_setup/model/test_model_artifact.py b/tests/unitary/default_setup/model/test_model_artifact.py index f170c4122..0d2e9b6f5 100644 --- a/tests/unitary/default_setup/model/test_model_artifact.py +++ b/tests/unitary/default_setup/model/test_model_artifact.py @@ -143,29 +143,11 @@ def test_prepare_with_schema(self, model, conda_file): os.path.join(conda_file.strpath, "score.py") ), "score.py does not exist" assert os.path.exists( - os.path.join(conda_file.strpath, "schema_input.json") - 
), "schema_input.json does not exist" + os.path.join(conda_file.strpath, "input_schema.json") + ), "input_schema.json does not exist" assert os.path.exists( - os.path.join(conda_file.strpath, "schema_output.json") - ), "schema_output.json does not exist" - - def test_prepare_with_schema(self, model, conda_file): - model.prepare( - conda_file.strpath, - X_sample=self.X_sample, - y_sample=self.y_sample, - force_overwrite=True, - data_science_env=True, - ) - assert os.path.exists( - os.path.join(conda_file.strpath, "score.py") - ), "score.py does not exist" - assert os.path.exists( - os.path.join(conda_file, "input_schema.json") - ), "schema_input.json does not exist" - assert os.path.exists( - os.path.join(conda_file, "output_schema.json") - ), "schema_output.json does not exist" + os.path.join(conda_file.strpath, "output_schema.json") + ), "output_schema.json does not exist" def test_prepare_with_no_schema(self, model, conda_file): with pytest.raises( diff --git a/tests/unitary/default_setup/model/test_oci_datascience_model.py b/tests/unitary/default_setup/model/test_oci_datascience_model.py index 29ab8afd4..43148cc23 100644 --- a/tests/unitary/default_setup/model/test_oci_datascience_model.py +++ b/tests/unitary/default_setup/model/test_oci_datascience_model.py @@ -88,7 +88,6 @@ class TestOCIDataScienceModel: def setup_class(cls): - # Mock delete model response cls.mock_delete_model_response = Response( data=None, status=None, headers=None, request=None @@ -229,7 +228,9 @@ def test_delete_success(self, mock_client): mock_model_deployment.return_value = [ MagicMock(lifecycle_state="ACTIVE", identifier="md_id") ] - with patch("ads.model.deployment.ModelDeployment.from_id") as mock_from_id: + with patch( + "ads.model.deployment.ModelDeployment.from_id" + ) as mock_from_id: with patch.object(OCIDataScienceModel, "sync") as mock_sync: self.mock_model.delete(delete_associated_model_deployment=True) mock_from_id.assert_called_with("md_id") @@ -445,7 +446,7 @@ def test_export_model_artifact( ) mock_wait_for_work_request.assert_called_with( work_request_id="work_request_id", - num_steps=3, + num_steps=2, ) @patch.object(TqdmProgressBar, "update") diff --git a/tests/unitary/with_extras/hpo/test_hpo_search_space.py b/tests/unitary/with_extras/hpo/test_hpo_search_space.py index c2bc9c073..689812c4c 100644 --- a/tests/unitary/with_extras/hpo/test_hpo_search_space.py +++ b/tests/unitary/with_extras/hpo/test_hpo_search_space.py @@ -6,12 +6,10 @@ """Contains tests for ads.hpo.search_space """ -import unittest import lightgbm import pytest import sklearn import xgboost -import sys, mock from ads.hpo.stopping_criterion import * from ads.hpo.distributions import * diff --git a/tests/unitary/with_extras/jobs/test_pytorch_ddp.py b/tests/unitary/with_extras/jobs/test_pytorch_ddp.py index 68c591984..e8bf568e5 100644 --- a/tests/unitary/with_extras/jobs/test_pytorch_ddp.py +++ b/tests/unitary/with_extras/jobs/test_pytorch_ddp.py @@ -108,7 +108,7 @@ def test_install_deps(self, run_command): cmd_list, [ "pip install -r abc/requirements.txt", - "pip install abc==1.0", + "pip install 'abc==1.0'", ], ) @@ -172,7 +172,7 @@ def init_runner(self): driver.CONST_ENV_DEEPSPEED: "1", driver.OCI__WORKER_COUNT: "1", driver.CONST_ENV_LAUNCH_CMD: "accelerate launch train.py --data abc", - "RANK": "0", + "NODE_RANK": "0", }, ) @mock.patch("ads.jobs.templates.driver_pytorch.DeepSpeedRunner.run_deepspeed_host") diff --git a/tests/unitary/with_extras/model/test_generic_model.py b/tests/unitary/with_extras/model/test_generic_model.py 
index f0e1a1f63..7773b8a84 100644 --- a/tests/unitary/with_extras/model/test_generic_model.py +++ b/tests/unitary/with_extras/model/test_generic_model.py @@ -368,6 +368,7 @@ def test_save(self, mock_dsc_model_create, mock__random_display_name): bucket_uri=None, overwrite_existing_artifact=True, remove_existing_artifact=True, + parallel_process_count=utils.DEFAULT_PARALLEL_PROCESS_COUNT, ) def test_save_not_implemented_error(self): @@ -606,7 +607,10 @@ def test_deploy_success(self, mock_deploy): "ocpus": input_dict["deployment_ocpus"], "memory_in_gbs": input_dict["deployment_memory_in_gbs"], } - assert result.infrastructure.subnet_id == input_dict["deployment_instance_subnet_id"] + assert ( + result.infrastructure.subnet_id + == input_dict["deployment_instance_subnet_id"] + ) assert result.runtime.image == input_dict["deployment_image"] assert result.runtime.entrypoint == input_dict["entrypoint"] assert result.runtime.server_port == input_dict["server_port"] @@ -994,9 +998,7 @@ def test_from_model_deployment( compartment_id="test_compartment_id", ) - mock_from_id.assert_called_with( - test_model_deployment_id - ) + mock_from_id.assert_called_with(test_model_deployment_id) mock_from_model_catalog.assert_called_with( model_id=test_model_id, model_file_name="test.pkl", @@ -1049,9 +1051,7 @@ def test_from_model_deployment_fail( remove_existing_artifact=True, compartment_id="test_compartment_id", ) - mock_from_id.assert_called_with( - test_model_deployment_id - ) + mock_from_id.assert_called_with(test_model_deployment_id) @patch.object(ModelDeployment, "update") @patch.object(ModelDeployment, "from_id") @@ -1086,9 +1086,7 @@ def test_update_deployment_class_level( poll_interval=200, ) - mock_from_id.assert_called_with( - test_model_deployment_id - ) + mock_from_id.assert_called_with(test_model_deployment_id) mock_update.assert_called_with( properties=None, diff --git a/tests/unitary/with_extras/model/test_model_artifact.py b/tests/unitary/with_extras/model/test_model_artifact.py deleted file mode 100644 index f8592d2be..000000000 --- a/tests/unitary/with_extras/model/test_model_artifact.py +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) 2022, 2023 Oracle and/or its affiliates. 
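The upload_to_os and LargeArtifactUploader tests above pin down the keyword arguments the uploader is expected to pass to the OCI Object Storage UploadManager. A hedged sketch of that call pattern outside the test harness, mirroring the asserted arguments; the bucket coordinates and file name are placeholders:

    import oci
    from ads.common.auth import default_signer

    # Same client construction pattern the integration tests use for VirtualNetworkClient.
    client = oci.object_storage.ObjectStorageClient(**default_signer())
    manager = oci.object_storage.UploadManager(
        object_storage_client=client,
        parallel_process_count=3,
        allow_multipart_uploads=True,
        allow_parallel_uploads=True,
    )
    with open("model_artifact.zip", "rb") as stream:
        manager.upload_stream(
            namespace_name="my-tenancy",
            bucket_name="my-bucket",
            object_name="prefix/model_artifact.zip",
            stream_ref=stream,
        )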
-# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ - -import os -import pickle -import sys - -import mock -import pytest -import yaml -from ads.common.model import ADSModel -from ads.common.model_artifact import MODEL_ARTIFACT_VERSION -from ads.common.model_export_util import prepare_generic_model -from sklearn import datasets -from sklearn.ensemble import RandomForestClassifier -from sklearn.model_selection import train_test_split - - -class TestModelArtifact: - """Contains test cases for ads/common/model_artifact.py.""" - - compartment_id = "9898989898" # os.environ['NB_SESSION_COMPARTMENT_OCID'] - project_id = "89898989898989" # os.environ["PROJECT_OCID"] - - clf = None - X_sample = None - y_sample = None - - iris = datasets.load_iris(as_frame=True) - X, y = iris["data"], iris["target"] - X, y = iris["data"], iris["target"] - X_train, X_test, y_train, y_test = train_test_split(X, y) - clf = RandomForestClassifier().fit(X_train, y_train) - X_sample = X_train.head(3) - y_sample = y_train.head(3) - - @pytest.fixture(autouse=True) - def conda_file(self, tmpdir_factory): - conda_file = tmpdir_factory.mktemp("conda") - manifest = { - "manifest": { - "pack_path": "pack_path: oci://service-conda-packs@id19sfcrra6z/service_pack/cpu/pyspark/1.0/pyspv10", - "python": "3.6", - "slug": "pyspv10", - "type": "data_science", - "version": "1.0", - "arch_type": "CPU", - "manifest_version": "1.0", - "name": "pyspark", - } - } - with open(os.path.join(conda_file.strpath, "test_manifest.yaml"), "w") as mfile: - yaml.dump(manifest, mfile) - - conda_prefix = os.environ["CONDA_PREFIX"] - os.environ["CONDA_PREFIX"] = conda_file.strpath - yield conda_file - os.environ["CONDA_PREFIX"] = conda_prefix - - @pytest.fixture(autouse=True, scope="module") - def model(self): - # build model - model = ADSModel.from_estimator(self.clf) - return model - - def test_prepare_artifact(self, tmpdir): - path = os.path.join(tmpdir, "model") - os.makedirs(path) - with open(os.path.join(path, "model.pkl"), "wb") as mfile: - pickle.dump(self.clf, mfile) - value = os.environ.pop("CONDA_PREFIX", None) - prepare_generic_model(path, force_overwrite=True, ignore_deployment_error=True) - expected_output = f""" -MODEL_ARTIFACT_VERSION: '{MODEL_ARTIFACT_VERSION}' -MODEL_DEPLOYMENT: - INFERENCE_CONDA_ENV: - INFERENCE_ENV_SLUG: - INFERENCE_ENV_TYPE: 'published' - INFERENCE_ENV_PATH: oci://@//.tar.gz - INFERENCE_PYTHON_VERSION: -""" - assert yaml.load(expected_output, Loader=yaml.FullLoader) == yaml.load( - open(os.path.join(path, "runtime.yaml")).read(), Loader=yaml.FullLoader - ) - if value: - os.environ["CONDA_PREFIX"] = value - - def test_prepare_artifact_conda_info(self, tmpdir): - path = os.path.join(tmpdir, "model") - os.makedirs(path) - with open(os.path.join(path, "model.pkl"), "wb") as mfile: - pickle.dump(self.clf, mfile) - value = os.environ.pop("CONDA_PREFIX", None) - inference_conda_env = "oci://mybucket@mynamespace/test/condapackv1" - inference_python_version = "3.7" - prepare_generic_model( - path, - force_overwrite=True, - ignore_deployment_error=True, - inference_conda_env=inference_conda_env, - inference_python_version=inference_python_version, - ) - expected_output = f""" -MODEL_ARTIFACT_VERSION: '{MODEL_ARTIFACT_VERSION}' -MODEL_DEPLOYMENT: - INFERENCE_CONDA_ENV: - INFERENCE_ENV_SLUG: '' - INFERENCE_ENV_TYPE: 'published' - INFERENCE_ENV_PATH: {inference_conda_env} - INFERENCE_PYTHON_VERSION: '{inference_python_version}' -""" - assert yaml.load(expected_output, 
Loader=yaml.FullLoader) == yaml.load( - open(os.path.join(path, "runtime.yaml")).read(), Loader=yaml.FullLoader - ) - if value: - os.environ["CONDA_PREFIX"] = value - - @pytest.mark.skip(reason="Test case seem to be invalid") - def test_prepare_with_schema_with_exception(self, model, conda_file): - with pytest.raises( - Exception, - match="The inference environment pyspv10 may have undergone changes over the course of development. You can choose to publish the current environment or set data_science_env to True in the prepare api", - ): - model.prepare( - conda_file.strpath, - X_sample=self.X_sample, - y_sample=self.y_sample, - force_overwrite=True, - ) - - def test_prepare_with_schema(self, model, conda_file): - model.prepare( - conda_file.strpath, - X_sample=self.X_sample, - y_sample=self.y_sample, - force_overwrite=True, - data_science_env=True, - ) - assert os.path.exists( - os.path.join(conda_file.strpath, "score.py") - ), "score.py does not exist" - assert os.path.exists( - os.path.join(conda_file.strpath, "schema_input.json") - ), "schema_input.json does not exist" - assert os.path.exists( - os.path.join(conda_file.strpath, "schema_output.json") - ), "schema_output.json does not exist" - - def test_prepare_with_schema(self, model, conda_file): - model.prepare( - conda_file.strpath, - X_sample=self.X_sample, - y_sample=self.y_sample, - force_overwrite=True, - data_science_env=True, - ) - assert os.path.exists( - os.path.join(conda_file.strpath, "score.py") - ), "score.py does not exist" - assert os.path.exists( - os.path.join(conda_file, "input_schema.json") - ), "schema_input.json does not exist" - assert os.path.exists( - os.path.join(conda_file, "output_schema.json") - ), "schema_output.json does not exist" - - def test_prepare_with_no_schema(self, model, conda_file): - with pytest.raises( - AssertionError, - match="You must provide a data sample to infer the input and output data types which are used when converting the the model to an equivalent onnx model. This can be done as an ADSData object with the parameter `data_sample`, or as X and y samples to X_sample and y_sample respectively.", - ): - model.prepare( - conda_file.strpath, force_overwrite=True, data_science_env=True - ) - - def test_script_in_artifact_dir(self, model, conda_file): - model_artifact = model.prepare( - conda_file.strpath, - X_sample=self.X_sample, - y_sample=self.y_sample, - force_overwrite=True, - data_science_env=True, - ) - self._test_predict(model_artifact, model) - - def test_check_featurenames(self, model, conda_file): - names = model.feature_names(self.X_sample) - import numpy as np - - assert np.array_equal(names.values, self.X_sample.columns.values) - - def _test_predict(self, model_artifact, model): - model_artifact.reload() - est_pred = model.predict(self.X_sample) - art_pred = model_artifact.predict(self.X_sample)["prediction"] - # TODO: this line block tests/unitary/test_text_dataset_dataloader.py - # fn_pred = model_artifact.verify({'input': self.X_sample.to_dict()})['prediction'] - assert est_pred is not None - assert all( - est_pred == art_pred - ), "the score.py prediction is not aligned with the estimators prediction" - # assert art_pred == fn_pred, "The func.py script is miss-handling the invoking of score.py (score.py is " \ - # "consistent with the est.predict output). 
" - - def test_prepare_without_force(self, model, conda_file): - with pytest.raises( - ValueError, match="Directory already exists, set force to overwrite" - ): - model.prepare( - conda_file.strpath, - X_sample=self.X_sample, - y_sample=self.y_sample, - data_science_env=True, - ) - - def test_fetch_runtime_schema_with_python_jsonschema_objects_uninstalled( - self, model, conda_file - ): - with mock.patch.dict(sys.modules, {"python_jsonschema_objects": None}): - with pytest.raises(ModuleNotFoundError): - model_artifact = model.prepare( - conda_file.strpath, - X_sample=self.X_sample, - y_sample=self.y_sample, - force_overwrite=True, - data_science_env=True, - ) - model_artifact._generate_runtime_yaml() diff --git a/tests/unitary/with_extras/model/test_model_info_extractor.py b/tests/unitary/with_extras/model/test_model_info_extractor.py index 4ed8afbe5..ff6268f60 100644 --- a/tests/unitary/with_extras/model/test_model_info_extractor.py +++ b/tests/unitary/with_extras/model/test_model_info_extractor.py @@ -164,7 +164,6 @@ def test_generic_lightgbm_model(self): reason="wait for proper testing pipeline for tensorflow related tests" ) def test_generic_keras_model(self): - import tensorflow mnist = tensorflow.keras.datasets.mnist @@ -239,7 +238,6 @@ class any: def test_huggingface_extractors( self, ): - fake_pipeline = FakePipeline("fake", Model()) metadata_taxonomy = ModelInfoExtractorFactory.extract_info(fake_pipeline) assert isinstance(metadata_taxonomy, dict)