Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…ence into feature/forecasting
  • Loading branch information
mrDzurb committed Sep 28, 2023
2 parents 997d27f + fa6f315 commit ac932aa
Show file tree
Hide file tree
Showing 29 changed files with 444 additions and 142 deletions.
1 change: 1 addition & 0 deletions .gitleaks.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ useDefault = true
'''example-password''',
'''this-is-not-the-secret''',
'''<redacted>''',
'''security_token''',
# NVIDIA_GPGKEY_SUM from public documentation:
# https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/centos7/base/Dockerfile
'''d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87'''
Expand Down
2 changes: 1 addition & 1 deletion README-development.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Open the destination folder where you want to clone ADS library, and install dep
python3 -m pip install -e .
```

To which packages were installed and their version numbers, run:
To view which packages were installed and their version numbers, run:

```bash
python3 -m pip freeze
Expand Down
2 changes: 1 addition & 1 deletion SECURITY.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ security features are welcome on GitHub Issues.

Security updates will be released on a regular cadence. Many of our projects
will typically release security fixes in conjunction with the
[Oracle Critical Patch Update][3] program. Additional
Oracle Critical Patch Update program. Additional
information, including past advisories, is available on our [security alerts][4]
page.

Expand Down
14 changes: 9 additions & 5 deletions ads/dataset/sampled_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,13 @@
OptionalDependency,
)

NATURAL_EARTH_DATASET = "naturalearth_lowres"

class PandasDataset(object):
"""
This class provides APIs that can work on a sampled dataset.
"""

@runtime_dependency(module="geopandas", install_from=OptionalDependency.GEO)
def __init__(
self,
sampled_df,
Expand All @@ -67,9 +67,7 @@ def __init__(
self.correlation = None
self.feature_dist_html_dict = {}
self.feature_types = metadata if metadata is not None else {}
self.world = geopandas.read_file(
geopandas.datasets.get_path("naturalearth_lowres")
)
self.world = None

self.numeric_columns = self.sampled_df.select_dtypes(
utils.numeric_pandas_dtypes()
Expand Down Expand Up @@ -562,7 +560,7 @@ def plot_gis_scatter(self, lon="longitude", lat="latitude", ax=None):
),
)
world = geopandas.read_file(
geopandas.datasets.get_path("naturalearth_lowres")
geopandas.datasets.get_path(NATURAL_EARTH_DATASET)
)
ax1 = world.plot(ax=ax, color="lightgrey", linewidth=0.5, edgecolor="white")
gdf.plot(ax=ax1, color="blue", markersize=10)
Expand Down Expand Up @@ -706,6 +704,12 @@ def _visualize_feature_distribution(self, html_widget):
gdf = geopandas.GeoDataFrame(
df, geometry=geopandas.points_from_xy(df["lon"], df["lat"])
)

if not self.world:
self.world = geopandas.read_file(
geopandas.datasets.get_path(NATURAL_EARTH_DATASET)
)

self.world.plot(
ax=ax, color="lightgrey", linewidth=0.5, edgecolor="white"
)
Expand Down
116 changes: 69 additions & 47 deletions ads/model/artifact_uploader.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import Dict, Optional

from ads.common import utils
from ads.common.object_storage_details import ObjectStorageDetails
from ads.model.common import utils as model_utils
from ads.model.service.oci_datascience_model import OCIDataScienceModel

Expand All @@ -29,7 +30,10 @@ def __init__(self, dsc_model: OCIDataScienceModel, artifact_path: str):
artifact_path: str
The model artifact location.
"""
if not os.path.exists(artifact_path):
if not (
ObjectStorageDetails.is_oci_path(artifact_path)
or os.path.exists(artifact_path)
):
raise ValueError(f"The `{artifact_path}` does not exist")

self.dsc_model = dsc_model
Expand All @@ -45,7 +49,7 @@ def upload(self):
) as progress:
self.progress = progress
self.progress.update("Preparing model artifacts ZIP archive.")
self._prepare_artiact_tmp_zip()
self._prepare_artifact_tmp_zip()
self.progress.update("Uploading model artifacts.")
self._upload()
self.progress.update(
Expand All @@ -55,22 +59,19 @@ def upload(self):
except Exception:
raise
finally:
self._remove_artiact_tmp_zip()
self._remove_artifact_tmp_zip()

def _prepare_artiact_tmp_zip(self) -> str:
def _prepare_artifact_tmp_zip(self) -> str:
"""Prepares model artifacts ZIP archive.
Parameters
----------
progress: (TqdmProgressBar, optional). Defaults to `None`.
The progress indicator.
Returns
-------
str
Path to the model artifact ZIP archive.
"""
if os.path.isfile(self.artifact_path) and self.artifact_path.lower().endswith(
if ObjectStorageDetails.is_oci_path(self.artifact_path):
self.artifact_zip_path = self.artifact_path
elif os.path.isfile(self.artifact_path) and self.artifact_path.lower().endswith(
".zip"
):
self.artifact_zip_path = self.artifact_path
Expand All @@ -80,7 +81,7 @@ def _prepare_artiact_tmp_zip(self) -> str:
)
return self.artifact_zip_path

def _remove_artiact_tmp_zip(self):
def _remove_artifact_tmp_zip(self):
"""Removes temporary created artifact zip archive."""
if (
self.artifact_zip_path
Expand Down Expand Up @@ -112,7 +113,10 @@ class LargeArtifactUploader(ArtifactUploader):
Attributes
----------
artifact_path: str
The model artifact location.
The model artifact location. Possible values are:
- object storage path to zip archive. Example: `oci://<bucket_name>@<namespace>/prefix/mymodel.zip`.
- local path to zip archive. Example: `./mymodel.zip`.
- local path to folder with artifacts. Example: `./mymodel`.
artifact_zip_path: str
The uri of the zip of model artifact.
auth: dict
Expand All @@ -124,6 +128,8 @@ class LargeArtifactUploader(ArtifactUploader):
The OCI Object Storage URI where model artifacts will be copied to.
The `bucket_uri` is only necessary for uploading large artifacts which
size is greater than 2GB. Example: `oci://<bucket_name>@<namespace>/prefix/`.
.. versionadded:: 2.8.10
If artifact_path is object storage path to a zip archive, bucket_uri will be ignored.
dsc_model: OCIDataScienceModel
The data science model instance.
overwrite_existing_artifact: bool
Expand All @@ -145,7 +151,7 @@ def __init__(
self,
dsc_model: OCIDataScienceModel,
artifact_path: str,
bucket_uri: str,
bucket_uri: str = None,
auth: Optional[Dict] = None,
region: Optional[str] = None,
overwrite_existing_artifact: Optional[bool] = True,
Expand All @@ -159,11 +165,16 @@ def __init__(
dsc_model: OCIDataScienceModel
The data science model instance.
artifact_path: str
The model artifact location.
bucket_uri: str
The model artifact location. Possible values are:
- object storage path to zip archive. Example: `oci://<bucket_name>@<namespace>/prefix/mymodel.zip`.
- local path to zip archive. Example: `./mymodel.zip`.
- local path to folder with artifacts. Example: `./mymodel`.
bucket_uri: (str, optional). Defaults to `None`.
The OCI Object Storage URI where model artifacts will be copied to.
The `bucket_uri` is only necessary for uploading large artifacts which
The `bucket_uri` is only necessary for uploading large artifacts from local which
size is greater than 2GB. Example: `oci://<bucket_name>@<namespace>/prefix/`.
.. versionadded:: 2.8.10
If `artifact_path` is object storage path to a zip archive, `bucket_uri` will be ignored.
auth: (Dict, optional). Defaults to `None`.
The default authentication is set using `ads.set_auth` API.
If you need to override the default, use the `ads.common.auth.api_keys` or
Expand All @@ -179,11 +190,22 @@ def __init__(
parallel_process_count: (int, optional).
The number of worker processes to use in parallel for uploading individual parts of a multipart upload.
"""
self.auth = auth or dsc_model.auth
if ObjectStorageDetails.is_oci_path(artifact_path):
if not artifact_path.endswith(".zip"):
raise ValueError(
f"The `artifact_path={artifact_path}` is invalid."
"The remote path for model artifact should be a zip archive, "
"e.g. `oci://<bucket_name>@<namespace>/prefix/mymodel.zip`."
)
if not utils.is_path_exists(uri=artifact_path, auth=self.auth):
raise ValueError(f"The `{artifact_path}` does not exist.")
bucket_uri = artifact_path

if not bucket_uri:
raise ValueError("The `bucket_uri` must be provided.")

super().__init__(dsc_model=dsc_model, artifact_path=artifact_path)
self.auth = auth or dsc_model.auth
self.region = region or utils.extract_region(self.auth)
self.bucket_uri = bucket_uri
self.overwrite_existing_artifact = overwrite_existing_artifact
Expand All @@ -192,38 +214,38 @@ def __init__(

def _upload(self):
"""Uploads model artifacts to the model catalog."""
self.progress.update("Copying model artifact to the Object Storage bucket")

bucket_uri = self.bucket_uri
bucket_uri_file_name = os.path.basename(bucket_uri)

if not bucket_uri_file_name:
bucket_uri = os.path.join(bucket_uri, f"{self.dsc_model.id}.zip")
elif not bucket_uri.lower().endswith(".zip"):
bucket_uri = f"{bucket_uri}.zip"

if not self.overwrite_existing_artifact and utils.is_path_exists(
uri=bucket_uri, auth=self.auth
):
raise FileExistsError(
f"The bucket_uri=`{self.bucket_uri}` exists. Please use a new file name or "
"set `overwrite_existing_artifact` to `True` if you wish to overwrite."
)
self.progress.update("Copying model artifact to the Object Storage bucket")
if not bucket_uri == self.artifact_zip_path:
bucket_uri_file_name = os.path.basename(bucket_uri)

if not bucket_uri_file_name:
bucket_uri = os.path.join(bucket_uri, f"{self.dsc_model.id}.zip")
elif not bucket_uri.lower().endswith(".zip"):
bucket_uri = f"{bucket_uri}.zip"

if not self.overwrite_existing_artifact and utils.is_path_exists(
uri=bucket_uri, auth=self.auth
):
raise FileExistsError(
f"The bucket_uri=`{self.bucket_uri}` exists. Please use a new file name or "
"set `overwrite_existing_artifact` to `True` if you wish to overwrite."
)

try:
utils.upload_to_os(
src_uri=self.artifact_zip_path,
dst_uri=bucket_uri,
auth=self.auth,
parallel_process_count=self._parallel_process_count,
force_overwrite=self.overwrite_existing_artifact,
progressbar_description="Copying model artifact to the Object Storage bucket.",
)
except Exception as ex:
raise RuntimeError(
f"Failed to upload model artifact to the given Object Storage path `{self.bucket_uri}`."
f"See Exception: {ex}"
)
try:
utils.upload_to_os(
src_uri=self.artifact_zip_path,
dst_uri=bucket_uri,
auth=self.auth,
parallel_process_count=self._parallel_process_count,
force_overwrite=self.overwrite_existing_artifact,
progressbar_description="Copying model artifact to the Object Storage bucket.",
)
except Exception as ex:
raise RuntimeError(
f"Failed to upload model artifact to the given Object Storage path `{self.bucket_uri}`."
f"See Exception: {ex}"
)

self.progress.update("Exporting model artifact to the model catalog")
self.dsc_model.export_model_artifact(bucket_uri=bucket_uri, region=self.region)
Expand Down
12 changes: 12 additions & 0 deletions ads/model/datascience_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import pandas
from ads.common import utils
from ads.common.object_storage_details import ObjectStorageDetails
from ads.config import COMPARTMENT_OCID, PROJECT_OCID
from ads.feature_engineering.schema import Schema
from ads.jobs.builders.base import Builder
Expand Down Expand Up @@ -548,6 +549,8 @@ def create(self, **kwargs) -> "DataScienceModel":
The OCI Object Storage URI where model artifacts will be copied to.
The `bucket_uri` is only necessary for uploading large artifacts which
size is greater than 2GB. Example: `oci://<bucket_name>@<namespace>/prefix/`.
.. versionadded:: 2.8.10
If `artifact` is provided as an object storage path to a zip archive, `bucket_uri` will be ignored.
overwrite_existing_artifact: (bool, optional). Defaults to `True`.
Overwrite target bucket artifact if exists.
remove_existing_artifact: (bool, optional). Defaults to `True`.
Expand Down Expand Up @@ -636,6 +639,8 @@ def upload_artifact(
The OCI Object Storage URI where model artifacts will be copied to.
The `bucket_uri` is only necessary for uploading large artifacts which
size is greater than 2GB. Example: `oci://<bucket_name>@<namespace>/prefix/`.
.. versionadded:: 2.8.10
If `artifact` is provided as an object storage path to a zip archive, `bucket_uri` will be ignored.
auth: (Dict, optional). Defaults to `None`.
The default authentication is set using `ads.set_auth` API.
If you need to override the default, use the `ads.common.auth.api_keys` or
Expand Down Expand Up @@ -668,6 +673,13 @@ def upload_artifact(
"timeout": timeout,
}

if ObjectStorageDetails.is_oci_path(self.artifact):
if bucket_uri and bucket_uri != self.artifact:
logger.warn(
"The `bucket_uri` will be ignored and the value of `self.artifact` will be used instead."
)
bucket_uri = self.artifact

if bucket_uri or utils.folder_size(self.artifact) > _MAX_ARTIFACT_SIZE_IN_BYTES:
if not bucket_uri:
raise ModelArtifactSizeError(
Expand Down
Loading

0 comments on commit ac932aa

Please sign in to comment.