Implement content type propagation (#226)
* Implement content type propagation

* add to changelog

* random filenames

* fix for custom endpoint

* check class and live before updating kwargs

* update header level

* Move content type to base client; add docstring

* move other client settings to its own page

* update version and changelog

* Correct docstring error

* fix typo in history

Co-authored-by: Jay Qi <2721979+jayqi@users.noreply.github.com>

* set release date

Co-authored-by: Jay Qi <2721979+jayqi@users.noreply.github.com>

Co-authored-by: Jay Qi <2721979+jayqi@users.noreply.github.com>
pjbull and jayqi authored May 19, 2022
1 parent 85268c8 commit da86719
Showing 13 changed files with 258 additions and 43 deletions.
3 changes: 2 additions & 1 deletion HISTORY.md
@@ -1,8 +1,9 @@
# cloudpathlib Changelog

## v0.7.2 (UNRELEASED)
## v0.8.0 (2022-05-19)

- Fixed pickling of `CloudPath` objects not working. ([Issue #223](https://github.com/drivendataorg/cloudpathlib/issues/223), [PR #224](https://github.com/drivendataorg/cloudpathlib/pull/224))
- Added functionality to push the MIME (media) type to the content type property on cloud providers by default. ([Issue #222](https://github.com/drivendataorg/cloudpathlib/issues/222), [PR #226](https://github.com/drivendataorg/cloudpathlib/pull/226))

## v0.7.1 (2022-04-06)

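For context on the entry above, a minimal sketch of the new default behavior (the bucket name is hypothetical; configured AWS credentials are assumed):

```python
from cloudpathlib import CloudPath

# As of v0.8.0, writing a file also pushes a guessed media type to the
# cloud provider; mimetypes.guess_type maps ".css" to "text/css".
CloudPath("s3://my-bucket/styles.css").write_text("body { margin: 0; }")
```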
25 changes: 21 additions & 4 deletions cloudpathlib/azure/azblobclient.py
@@ -1,7 +1,8 @@
from datetime import datetime
import mimetypes
import os
from pathlib import Path, PurePosixPath
from typing import Any, Dict, Iterable, Optional, Tuple, Union
from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union


from ..client import Client, register_client_class
@@ -12,7 +13,7 @@

try:
from azure.core.exceptions import ResourceNotFoundError
from azure.storage.blob import BlobServiceClient, BlobProperties
from azure.storage.blob import BlobServiceClient, BlobProperties, ContentSettings
except ModuleNotFoundError:
implementation_registry["azure"].dependencies_loaded = False

@@ -32,6 +33,7 @@ def __init__(
connection_string: Optional[str] = None,
blob_service_client: Optional["BlobServiceClient"] = None,
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
content_type_method: Optional[Callable] = mimetypes.guess_type,
):
"""Class constructor. Sets up a [`BlobServiceClient`](
https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python).
@@ -68,6 +70,8 @@ def __init__(
https://docs.microsoft.com/en-us/python/api/azure-storage-blob/azure.storage.blob.blobserviceclient?view=azure-python).
local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
for downloaded files. If None, will use a temporary directory.
content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
"""
if connection_string is None:
connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING", None)
@@ -86,14 +90,16 @@ def __init__(
"Credentials are required; see docs for options."
)

super().__init__(local_cache_dir=local_cache_dir)
super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method)

def _get_metadata(self, cloud_path: AzureBlobPath) -> Union["BlobProperties", Dict[str, Any]]:
blob = self.service_client.get_blob_client(
container=cloud_path.container, blob=cloud_path.blob
)
properties = blob.get_blob_properties()

properties["content_type"] = properties.content_settings.content_type

return properties

def _download_file(
@@ -220,7 +226,18 @@ def _upload_file(
container=cloud_path.container, blob=cloud_path.blob
)

blob.upload_blob(Path(local_path).read_bytes(), overwrite=True) # type: ignore
extra_args = {}
if self.content_type_method is not None:
content_type, content_encoding = self.content_type_method(str(local_path))

if content_type is not None:
extra_args["content_type"] = content_type
if content_encoding is not None:
extra_args["content_encoding"] = content_encoding

content_settings = ContentSettings(**extra_args)

blob.upload_blob(Path(local_path).read_bytes(), overwrite=True, content_settings=content_settings) # type: ignore

return cloud_path

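A hedged usage sketch of the Azure change above (assumes `AZURE_STORAGE_CONNECTION_STRING` is set; the container name is hypothetical):

```python
import mimetypes
from cloudpathlib import AzureBlobClient

# Uploads now attach ContentSettings guessed from the filename; passing
# content_type_method=None would keep the Azure SDK's default behavior.
client = AzureBlobClient(content_type_method=mimetypes.guess_type)
cp = client.CloudPath("az://my-container/report.csv")
cp.write_text("a,b\n1,2\n")
# Service-side, the blob's content type is now "text/csv".
```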
8 changes: 7 additions & 1 deletion cloudpathlib/client.py
@@ -1,4 +1,5 @@
import abc
import mimetypes
import os
from pathlib import Path
from tempfile import TemporaryDirectory
@@ -25,7 +26,11 @@ class Client(abc.ABC, Generic[BoundedCloudPath]):
_cloud_meta: CloudImplementation
_default_client = None

def __init__(self, local_cache_dir: Optional[Union[str, os.PathLike]] = None):
def __init__(
self,
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
content_type_method: Optional[Callable] = mimetypes.guess_type,
):
self._cloud_meta.validate_completeness()
# setup caching and local versions of file and track if it is a tmp dir
self._cache_tmp_dir = None
@@ -34,6 +39,7 @@ def __init__(self, local_cache_dir: Optional[Union[str, os.PathLike]] = None):
local_cache_dir = self._cache_tmp_dir.name

self._local_cache_dir = Path(local_cache_dir)
self.content_type_method = content_type_method

def __del__(self) -> None:
# make sure temp is cleaned up if we created it
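Because `content_type_method` now lives on the `Client` base class, every subclass accepts it; a quick sketch using one of the local test doubles:

```python
from cloudpathlib.local import LocalS3Client

# The setting is stored on the base Client, so S3Client, GSClient,
# AzureBlobClient, and the Local* test doubles all accept it;
# None opts out of content type guessing entirely.
client = LocalS3Client(content_type_method=None)
assert client.content_type_method is None
```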
16 changes: 13 additions & 3 deletions cloudpathlib/gs/gsclient.py
@@ -1,7 +1,8 @@
from datetime import datetime
import mimetypes
import os
from pathlib import Path, PurePosixPath
from typing import Any, Dict, Iterable, Optional, TYPE_CHECKING, Tuple, Union
from typing import Any, Callable, Dict, Iterable, Optional, TYPE_CHECKING, Tuple, Union

from ..client import Client, register_client_class
from ..cloudpath import implementation_registry
@@ -34,6 +35,7 @@ def __init__(
project: Optional[str] = None,
storage_client: Optional["StorageClient"] = None,
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
content_type_method: Optional[Callable] = mimetypes.guess_type,
):
"""Class constructor. Sets up a [`Storage
Client`](https://googleapis.dev/python/storage/latest/client.html).
@@ -65,6 +67,8 @@ def __init__(
https://googleapis.dev/python/storage/latest/client.html).
local_cache_dir (Optional[Union[str, os.PathLike]]): Path to directory to use as cache
for downloaded files. If None, will use a temporary directory.
content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
"""
if application_credentials is None:
application_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
@@ -81,7 +85,7 @@ def __init__(
except DefaultCredentialsError:
self.client = StorageClient.create_anonymous_client()

super().__init__(local_cache_dir=local_cache_dir)
super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method)

def _get_metadata(self, cloud_path: GSPath) -> Optional[Dict[str, Any]]:
bucket = self.client.bucket(cloud_path.bucket)
@@ -94,6 +98,7 @@ def _get_metadata(self, cloud_path: GSPath) -> Optional[Dict[str, Any]]:
"etag": blob.etag,
"size": blob.size,
"updated": blob.updated,
"content_type": blob.content_type,
}

def _download_file(self, cloud_path: GSPath, local_path: Union[str, os.PathLike]) -> Path:
@@ -207,7 +212,12 @@ def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: GSPath)
bucket = self.client.bucket(cloud_path.bucket)
blob = bucket.blob(cloud_path.blob)

blob.upload_from_filename(str(local_path))
extra_args = {}
if self.content_type_method is not None:
content_type, _ = self.content_type_method(str(local_path))
extra_args["content_type"] = content_type

blob.upload_from_filename(str(local_path), **extra_args)
return cloud_path


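Note that the GS upload path forwards only the guessed type (`upload_from_filename` takes no encoding argument) and forwards it even when the guess is `None`, in which case the SDK default applies. A credential-free sketch of what `mimetypes.guess_type` yields:

```python
import mimetypes

# A recognized extension produces a media type; an unrecognized one
# produces None, which the GS SDK then handles with its own default.
for name in ("styles.css", "mystery.bin123"):
    content_type, _ = mimetypes.guess_type(name)
    print(name, "->", content_type)
#> styles.css -> text/css
#> mystery.bin123 -> None
```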
17 changes: 15 additions & 2 deletions cloudpathlib/local/localclient.py
@@ -1,10 +1,11 @@
import atexit
from hashlib import md5
import mimetypes
import os
from pathlib import Path, PurePosixPath
import shutil
from tempfile import TemporaryDirectory
from typing import Iterable, List, Optional, Tuple, Union
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union

from ..client import Client
from .localpath import LocalPath
@@ -21,14 +22,15 @@ def __init__(
*args,
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
local_storage_dir: Optional[Union[str, os.PathLike]] = None,
content_type_method: Optional[Callable] = mimetypes.guess_type,
**kwargs,
):
# setup caching and local versions of file. use default temp dir if not provided
if local_storage_dir is None:
local_storage_dir = self.get_default_storage_dir()
self._local_storage_dir = Path(local_storage_dir)

super().__init__(local_cache_dir=local_cache_dir)
super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method)

@classmethod
def get_default_storage_dir(cls) -> Path:
@@ -132,6 +134,17 @@ def _upload_file(
shutil.copy(local_path, dst)
return cloud_path

def _get_metadata(self, cloud_path: "LocalPath") -> Dict:
# content_type is the only metadata we test currently
if self.content_type_method is None:
content_type_method = lambda x: (None, None)
else:
content_type_method = self.content_type_method

return {
"content_type": content_type_method(str(self._cloud_path_to_local(cloud_path)))[0],
}


_temp_dirs_to_clean: List[TemporaryDirectory] = []

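With `_get_metadata` implemented on the local client, the new behavior can be asserted in tests without touching a real bucket; a minimal sketch (bucket name hypothetical; `_get_metadata` is a private method, used here only for illustration):

```python
from cloudpathlib.local import LocalS3Client

# The Local* test doubles now report content_type in _get_metadata,
# so content type handling is testable entirely on local disk.
client = LocalS3Client()
p = client.CloudPath("s3://bucket/notes.txt")
p.write_text("hello")
print(client._get_metadata(p)["content_type"])  #> text/plain
```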
21 changes: 17 additions & 4 deletions cloudpathlib/s3/s3client.py
@@ -1,6 +1,7 @@
import mimetypes
import os
from pathlib import Path, PurePosixPath
from typing import Any, Dict, Iterable, Optional, Tuple, Union
from typing import Any, Callable, Dict, Iterable, Optional, Tuple, Union


from ..client import Client, register_client_class
@@ -35,6 +36,7 @@ def __init__(
local_cache_dir: Optional[Union[str, os.PathLike]] = None,
endpoint_url: Optional[str] = None,
boto3_transfer_config: Optional["TransferConfig"] = None,
content_type_method: Optional[Callable] = mimetypes.guess_type,
):
"""Class constructor. Sets up a boto3 [`Session`](
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html).
@@ -63,6 +65,8 @@ def __init__(
Parameterize it to access a custom-deployed S3-compatible object store such as MinIO or Ceph.
boto3_transfer_config (Optional[dict]): Instantiated TransferConfig for managing s3 transfers.
(https://boto3.amazonaws.com/v1/documentation/api/latest/reference/customizations/s3.html#boto3.s3.transfer.TransferConfig)
content_type_method (Optional[Callable]): Function to call to guess media type (mimetype) when
writing a file to the cloud. Defaults to `mimetypes.guess_type`. Must return a tuple (content type, content encoding).
"""
endpoint_url = endpoint_url or os.getenv("AWS_ENDPOINT_URL")
if boto3_session is not None:
@@ -93,7 +97,7 @@ def __init__(

self.boto3_transfer_config = boto3_transfer_config

super().__init__(local_cache_dir=local_cache_dir)
super().__init__(local_cache_dir=local_cache_dir, content_type_method=content_type_method)

def _get_metadata(self, cloud_path: S3Path) -> Dict[str, Any]:
data = self.s3.ObjectSummary(cloud_path.bucket, cloud_path.key).get()
@@ -102,7 +106,7 @@
"last_modified": data["LastModified"],
"size": data["ContentLength"],
"etag": data["ETag"],
"mime": data["ContentType"],
"content_type": data["ContentType"],
"extra": data["Metadata"],
}

@@ -250,7 +254,16 @@ def _remove(self, cloud_path: S3Path) -> None:
def _upload_file(self, local_path: Union[str, os.PathLike], cloud_path: S3Path) -> S3Path:
obj = self.s3.Object(cloud_path.bucket, cloud_path.key)

obj.upload_file(str(local_path), Config=self.boto3_transfer_config)
extra_args = {}

if self.content_type_method is not None:
content_type, content_encoding = self.content_type_method(str(local_path))
if content_type is not None:
extra_args["ContentType"] = content_type
if content_encoding is not None:
extra_args["ContentEncoding"] = content_encoding

obj.upload_file(str(local_path), Config=self.boto3_transfer_config, ExtraArgs=extra_args)
return cloud_path


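Two things to note in this hunk: `_get_metadata` now reports the media type under `content_type` (previously `mime`), and uploads translate the guess into boto3 `ExtraArgs`. A credential-free sketch that mirrors the upload logic (the helper name is ours, not part of the library):

```python
import mimetypes

# Mirrors the S3 upload path above: guessed values map onto boto3
# ExtraArgs keys, and None values are dropped so boto3 keeps its defaults.
def s3_extra_args(local_path, content_type_method=mimetypes.guess_type):
    extra_args = {}
    if content_type_method is not None:
        content_type, content_encoding = content_type_method(str(local_path))
        if content_type is not None:
            extra_args["ContentType"] = content_type
        if content_encoding is not None:
            extra_args["ContentEncoding"] = content_encoding
    return extra_args

print(s3_extra_args("site.tar.gz"))
#> {'ContentType': 'application/x-tar', 'ContentEncoding': 'gzip'}
```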
51 changes: 51 additions & 0 deletions docs/docs/other_client_settings.md
@@ -0,0 +1,51 @@
# Other `Client` settings

## Content type guessing (`content_type_method`)

All of the clients support passing a `content_type_method` when they are instantiated.
This is a method that is used to guess the [MIME (media) type](https://en.wikipedia.org/wiki/Media_type)
(often called the "content type") of the file and set that on the cloud provider.

By default, `content_type_method` uses the Python built-in
[`guess_type`](https://docs.python.org/3/library/mimetypes.html#mimetypes.guess_type)
to set this content type. This guesses based on the file extension, and may not always get the correct type.
In these cases, you can set `content_type_method` to your own function that gets the proper type; for example, by
reading the file content or by looking it up in a dictionary of filename-to-media-type mappings that you maintain.

If you set a custom method, it should follow the signature of `guess_type` and return a tuple of the form:
`(content_type, content_encoding)`; for example, `("text/css", None)`.

If you set `content_type_method` to `None`, the client defers to whatever the cloud provider's SDK does by default; this varies from provider to provider.

Here is an example of using a custom `content_type_method`.

```python
import mimetypes
from pathlib import Path

from cloudpathlib import S3Client, CloudPath

def my_content_type(path):
# do lookup for content types I define; fallback to
# guess_type for anything else
return {
".potato": ("application/potato", None),
}.get(Path(path).suffix, mimetypes.guess_type(path))


# create a client with my custom content type
client = S3Client(content_type_method=my_content_type)

# To use this same method for every cloud path, set our client as the default.
# This is optional, and you could use client.CloudPath to create paths instead.
client.set_as_default_client()

# create a cloud path
cp1 = CloudPath("s3://cloudpathlib-test-bucket/i_am_a.potato")
cp1.write_text("hello")

# check content type with boto3
print(client.s3.Object(cp1.bucket, cp1.key).content_type)
#> application/potato
```
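And the opt-out described above as a one-liner (assumes configured AWS credentials):

```python
from cloudpathlib import S3Client

# Defer entirely to boto3's own content type handling.
client = S3Client(content_type_method=None)
```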
1 change: 1 addition & 0 deletions docs/mkdocs.yml
@@ -20,6 +20,7 @@ nav:
- Authentication: "authentication.md"
- Caching: "caching.ipynb"
- AnyPath: "anypath-polymorphism.md"
- Other Client settings: "other_client_settings.md"
- Testing code that uses cloudpathlib: "testing_mocked_cloudpathlib.ipynb"
- Integrations: "integrations.md"
- Changelog: "changelog.md"
2 changes: 1 addition & 1 deletion setup.py
@@ -60,5 +60,5 @@ def load_requirements(path: Path):
"Source Code": "https://github.com/drivendataorg/cloudpathlib",
},
url="https://github.com/drivendataorg/cloudpathlib",
version="0.7.1",
version="0.8.0",
)