diff --git a/binder/environment-dev.yml b/binder/environment-dev.yml index 202d825b..6783a808 100644 --- a/binder/environment-dev.yml +++ b/binder/environment-dev.yml @@ -10,6 +10,16 @@ dependencies: - ipyleaflet>=0.13 - h5netcdf>=0.11 - cartopy + + - mkdocs>=1.2 + - mkdocs-material>=7.1,<9.0 + - markdown-include>=0.6 + - mkdocstrings>=0.19.0 + - mkdocstrings-python + - mkdocs-jupyter>=0.19.0 + - pymdown-extensions>=9.2 + - pip - pip: - poetry + - markdown-callouts>=0.2.0 diff --git a/earthaccess/api.py b/earthaccess/api.py index 8e518912..a7d35fb0 100644 --- a/earthaccess/api.py +++ b/earthaccess/api.py @@ -13,8 +13,8 @@ from .utils import _validation as validate -def _normalize_location(location: Union[str, None]) -> Union[str, None]: - """Handle user-provided `daac` and `provider` values +def _normalize_location(location: Optional[str]) -> Optional[str]: + """Handle user-provided `daac` and `provider` values. These values must have a capital letter as the first character followed by capital letters, numbers, or an underscore. Here we @@ -31,32 +31,29 @@ def _normalize_location(location: Union[str, None]) -> Union[str, None]: def search_datasets( count: int = -1, **kwargs: Any ) -> List[earthaccess.results.DataCollection]: - """Search datasets using NASA's CMR + """Search datasets using NASA's CMR. [https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html) Parameters: + count: Number of records to get, -1 = all + kwargs (Dict): + arguments to CMR: - count (Integer): Number of records to get, -1 = all - kwargs (Dict): arguments to CMR: - - * **keyword**: case insensitive and support wild cards ? and *, - + * **keyword**: case-insensitive and supports wildcards ? and * * **short_name**: e.g. ATL08 - * **doi**: DOI for a dataset - * **daac**: e.g. NSIDC or PODAAC - * **provider**: particular to each DAAC, e.g. POCLOUD, LPDAAC etc. + * **temporal**: a tuple representing temporal bounds in the form + `("yyyy-mm-dd", "yyyy-mm-dd")` + * **bounding_box**: a tuple representing spatial bounds in the form + `(lower_left_lon, lower_left_lat, upper_right_lon, upper_right_lat)` - * **temporal**: ("yyyy-mm-dd", "yyyy-mm-dd") - - * **bounding_box**: (lower_left_lon, lower_left_lat , - upper_right_lon, upper_right_lat) Returns: - an list of DataCollection results that can be used to get - information such as concept_id, doi, etc. about a dataset. + A list of DataCollection results that can be used to get information about a + dataset, e.g. concept_id, doi, etc. + Examples: ```python datasets = earthaccess.search_datasets( @@ -89,27 +86,24 @@ def search_data( [https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html](https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html) Parameters: + count: Number of records to get, -1 = all + kwargs (Dict): + arguments to CMR: - count (Integer): Number of records to get, -1 = all - kwargs (Dict): arguments to CMR: - - * **short_name**: dataset short name e.g. ATL08 - + * **short_name**: dataset short name, e.g. ATL08 * **version**: dataset version - * **doi**: DOI for a dataset - * **daac**: e.g. NSIDC or PODAAC - * **provider**: particular to each DAAC, e.g. POCLOUD, LPDAAC etc. 
+ * **temporal**: a tuple representing temporal bounds in the form + `("yyyy-mm-dd", "yyyy-mm-dd")` + * **bounding_box**: a tuple representing spatial bounds in the form + `(lower_left_lon, lower_left_lat, upper_right_lon, upper_right_lat)` - * **temporal**: ("yyyy-mm-dd", "yyyy-mm-dd") - - * **bounding_box**: (lower_left_lon, lower_left_lat , - upper_right_lon, upper_right_lat) Returns: - Granules: a list of DataGranules that can be used to access - the granule files by using `download()` or `open()`. + a list of DataGranules that can be used to access the granule files by using + `download()` or `open()`. + Examples: ```python datasets = earthaccess.search_data( @@ -131,22 +125,20 @@ def search_data( def login(strategy: str = "all", persist: bool = False) -> Auth: - """Authenticate with Earthdata login (https://urs.earthdata.nasa.gov/) + """Authenticate with Earthdata login (https://urs.earthdata.nasa.gov/). Parameters: + strategy: + An authentication method. - strategy (String): authentication method. - - "all": (default) try all methods until one works + * **"all"**: (default) try all methods until one works + * **"interactive"**: enter username and password. + * **"netrc"**: retrieve username and password from ~/.netrc. + * **"environment"**: retrieve username and password from `$EARTHDATA_USERNAME` and `$EARTHDATA_PASSWORD`. + persist: will persist credentials in a .netrc file - "interactive": enter username and password. - - "netrc": retrieve username and password from ~/.netrc. - - "environment": retrieve username and password from $EARTHDATA_USERNAME and $EARTHDATA_PASSWORD. - persist (Boolean): will persist credentials in a .netrc file Returns: - an instance of Auth. + An instance of Auth. """ if strategy == "all": for strategy in ["environment", "netrc", "interactive"]: @@ -168,19 +160,20 @@ def login(strategy: str = "all", persist: bool = False) -> Auth: def download( granules: Union[DataGranule, List[DataGranule], str, List[str]], - local_path: Union[str, None], + local_path: Optional[str], provider: Optional[str] = None, threads: int = 8, ) -> List[str]: """Retrieves data granules from a remote storage system. - * If we run this in the cloud, we will be using S3 to move data to `local_path` - * If we run it outside AWS (us-west-2 region) and the dataset is cloud hostes we'll use HTTP links + * If we run this in the cloud, we will be using S3 to move data to `local_path`. + * If we run it outside AWS (us-west-2 region) and the dataset is cloud hosted, + we'll use HTTP links. Parameters: granules: a granule, list of granules, a granule link (HTTP), or a list of granule links (HTTP) local_path: local directory to store the remote data granules - provider: if we download a list of URLs we need to specify the provider. + provider: if we download a list of URLs, we need to specify the provider. threads: parallel number of threads to use to download the files, adjust as necessary, default = 8 Returns: @@ -208,8 +201,10 @@ def open( hosted on S3 or HTTPS by third party libraries like xarray. Parameters: - granules: a list of granule instances **or** list of URLs, e.g. s3://some-granule, - if a list of URLs is passed we need to specify the data provider e.g. POCLOUD, NSIDC_CPRD etc. + granules: a list of granule instances **or** list of URLs, e.g. `s3://some-granule`. + If a list of URLs is passed, we need to specify the data provider. + provider: e.g. POCLOUD, NSIDC_CPRD, etc. + Returns: a list of s3fs "file pointers" to s3 files. 
""" @@ -223,15 +218,16 @@ def get_s3_credentials( provider: Optional[str] = None, results: Optional[List[earthaccess.results.DataGranule]] = None, ) -> Dict[str, Any]: - """Returns temporary (1 hour) credentials for direct access to NASA S3 buckets, we can - use the daac name, the provider or a list of results from earthaccess.search_data() - if we use results earthaccess will use the metadata on the response to get the credentials, - this is useful for missions that do not use the same endpoint as their DAACs e.g. SWOT + """Returns temporary (1 hour) credentials for direct access to NASA S3 buckets. We can + use the daac name, the provider, or a list of results from earthaccess.search_data(). + If we use results, earthaccess will use the metadata on the response to get the credentials, + which is useful for missions that do not use the same endpoint as their DAACs, e.g. SWOT. Parameters: - daac (String): a DAAC short_name like NSIDC or PODAAC etc - provider (String: if we know the provider for the DAAC e.g. POCLOUD, LPCLOUD etc. - results (list[earthaccess.results.DataGranule]): List of results from search_data() + daac: a DAAC short_name like NSIDC or PODAAC, etc. + provider: if we know the provider for the DAAC, e.g. POCLOUD, LPCLOUD etc. + results: List of results from search_data() + Returns: a dictionary with S3 credentials for the DAAC or provider """ @@ -244,12 +240,10 @@ def get_s3_credentials( def collection_query() -> Type[CollectionQuery]: - """Returns a query builder instance for NASA collections (datasets) + """Returns a query builder instance for NASA collections (datasets). - Parameters: - cloud_hosted (Boolean): initializes the query builder for cloud hosted collections. Returns: - class earthaccess.DataCollections: a query builder instance for data collections. + a query builder instance for data collections. """ if earthaccess.__auth__.authenticated: query_builder = DataCollections(earthaccess.__auth__) @@ -261,11 +255,8 @@ class earthaccess.DataCollections: a query builder instance for data collections def granule_query() -> Type[GranuleQuery]: """Returns a query builder instance for data granules - Parameters: - cloud_hosted (Boolean): initializes the query builder for a particular DOI - if we have it. Returns: - class earthaccess.DataGranules: a query builder instance for data granules. + a query builder instance for data granules. """ if earthaccess.__auth__.authenticated: query_builder = DataGranules(earthaccess.__auth__) @@ -275,10 +266,10 @@ class earthaccess.DataGranules: a query builder instance for data granules. def get_fsspec_https_session() -> AbstractFileSystem: - """Returns a fsspec session that can be used to access datafiles across many different DAACs + """Returns a fsspec session that can be used to access datafiles across many different DAACs. Returns: - class AbstractFileSystem: an fsspec instance able to access data across DAACs + An fsspec instance able to access data across DAACs. Examples: ```python @@ -289,19 +280,18 @@ class AbstractFileSystem: an fsspec instance able to access data across DAACs with fs.open(DAAC_GRANULE) as f: f.read(10) ``` - """ session = earthaccess.__store__.get_fsspec_session() return session def get_requests_https_session() -> requests.Session: - """Returns a requests Session instance with an authorized bearer token - this is useful to make requests to restricted URLs like data granules or services that + """Returns a requests Session instance with an authorized bearer token. 
+ This is useful for making requests to restricted URLs, such as data granules or services that require authentication with NASA EDL. Returns: - class requests.Session: an authenticated requests Session instance. + An authenticated requests Session instance. Examples: ```python @@ -323,15 +313,17 @@ def get_s3fs_session( provider: Optional[str] = None, results: Optional[earthaccess.results.DataGranule] = None, ) -> s3fs.S3FileSystem: - """Returns a fsspec s3fs file session for direct access when we are in us-west-2 + """Returns a fsspec s3fs file session for direct access when we are in us-west-2. Parameters: - daac (String): Any DAAC short name e.g. NSIDC, GES_DISC - provider (String): Each DAAC can have a cloud provider, if the DAAC is specified, there is no need to use provider - results (list[class earthaccess.results.DataGranule]): A list of results from search_data(), earthaccess will use the metadata form CMR to obtain the S3 Endpoint + daac: Any DAAC short name e.g. NSIDC, GES_DISC + provider: Each DAAC can have a cloud provider. + If the DAAC is specified, there is no need to use provider. + results: A list of results from search_data(). + `earthaccess` will use the metadata from CMR to obtain the S3 Endpoint. Returns: - class s3fs.S3FileSystem: an authenticated s3fs session valid for 1 hour + An authenticated s3fs session valid for 1 hour. """ daac = _normalize_location(daac) provider = _normalize_location(provider) @@ -345,11 +337,10 @@ class s3fs.S3FileSystem: an authenticated s3fs session valid for 1 hour def get_edl_token() -> str: - """Returns the current token used for EDL + """Returns the current token used for EDL. Returns: - str: EDL token - + EDL token """ token = earthaccess.__auth__.token return token diff --git a/earthaccess/auth.py b/earthaccess/auth.py index 9a3b22cb..004efa7c 100644 --- a/earthaccess/auth.py +++ b/earthaccess/auth.py @@ -49,9 +49,7 @@ def rebuild_auth(self, prepared_request: Any, response: Any) -> None: class Auth(object): - """ - Authentication class for operations that require Earthdata login (EDL) - """ + """Authentication class for operations that require Earthdata login (EDL).""" def __init__(self) -> None: # Maybe all these predefined URLs should be in a constants.py file @@ -63,20 +61,20 @@ def __init__(self) -> None: self.EDL_REVOKE_TOKEN = "https://urs.earthdata.nasa.gov/api/users/revoke_token" def login(self, strategy: str = "netrc", persist: bool = False) -> Any: - """Authenticate with Earthdata login + """Authenticate with Earthdata login. Parameters: + strategy: + The authentication method. - strategy (String): authentication method. - - "interactive": enter username and password. - - "netrc": (default) retrieve username and password from ~/.netrc. + * **"interactive"**: Enter a username and password. + * **"netrc"**: (default) Retrieve a username and password from ~/.netrc. + * **"environment"**: + Retrieve a username and password from $EARTHDATA_USERNAME and $EARTHDATA_PASSWORD. + persist: Will persist credentials in a `.netrc` file. - "environment": retrieve username and password from $EARTHDATA_USERNAME and $EARTHDATA_PASSWORD. - persist (Boolean): will persist credentials in a .netrc file Returns: - an instance of Auth. + An instance of Auth. 
""" if self.authenticated: logger.debug("We are already authenticated with NASA EDL") @@ -90,8 +88,8 @@ def login(self, strategy: str = "netrc", persist: bool = False) -> Any: return self def refresh_tokens(self) -> bool: - """Refresh CMR tokens - Tokens are used to do authenticated queries on CMR for restricted and early access datastes + """Refresh CMR tokens. + Tokens are used to do authenticated queries on CMR for restricted and early access datasets. This method renews the tokens to make sure we can query the collections allowed to our EDL user. """ if len(self.tokens) == 0: @@ -146,17 +144,16 @@ def get_s3_credentials( provider: Optional[str] = None, endpoint: Optional[str] = None, ) -> Dict[str, str]: - """Gets AWS S3 credentials for a given NASA cloud provider, the - easier way is to use the DAAC short name. provider is optional if we know it. + """Gets AWS S3 credentials for a given NASA cloud provider. + The easier way is to use the DAAC short name; provider is optional if we know it. Parameters: - provider: A valid cloud provider, each DAAC has a provider code for their cloud distributions - daac: the name of a NASA DAAC, i.e. NSIDC or PODAAC - endpoint: getting the credentials directly from the S3Credentials URL - - Rreturns: - A Python dictionary with the temporary AWS S3 credentials + daac: The name of a NASA DAAC, e.g. NSIDC or PODAAC. + provider: A valid cloud provider. Each DAAC has a provider code for their cloud distributions. + endpoint: Getting the credentials directly from the S3Credentials URL. + Returns: + A Python dictionary with the temporary AWS S3 credentials. """ if self.authenticated: session = SessionWithHeaderRedirection(self.username, self.password) @@ -199,14 +196,15 @@ def get_s3_credentials( print(f"Credentials for the cloud provider {daac} are not available") return {} else: - print("We need to auhtenticate with EDL first") + print("We need to authenticate with EDL first") return {} def get_session(self, bearer_token: bool = True) -> requests.Session: - """Returns a new request session instance + """Returns a new request session instance. 
Parameters: - bearer_token (Boolean): boolean, include bearer token + bearer_token: whether to include bearer token + Returns: class Session instance with Auth and bearer token headers """ @@ -228,13 +226,13 @@ def get_user_profile(self) -> Dict[str, Any]: else: return {} - def _interactive(self, presist_credentials: bool = False) -> bool: + def _interactive(self, persist_credentials: bool = False) -> bool: username = input("Enter your Earthdata Login username: ") password = getpass.getpass(prompt="Enter your Earthdata password: ") authenticated = self._get_credentials(username, password) if authenticated: logger.debug("Using user provided credentials for EDL") - if presist_credentials: + if persist_credentials: print("Persisting credentials to .netrc") self._persist_user_credentials(username, password) return authenticated diff --git a/earthaccess/daac.py b/earthaccess/daac.py index a15972c1..4f0e99f3 100644 --- a/earthaccess/daac.py +++ b/earthaccess/daac.py @@ -128,7 +128,7 @@ def find_provider( if len(daac["cloud-providers"]) > 0: return daac["cloud-providers"][0] else: - # We found the DAAC but it does not have cloud data + # We found the DAAC, but it does not have cloud data return daac["on-prem-providers"][0] else: # return on prem provider code diff --git a/earthaccess/kerchunk.py b/earthaccess/kerchunk.py index eb3f4cae..02533a0e 100644 --- a/earthaccess/kerchunk.py +++ b/earthaccess/kerchunk.py @@ -7,14 +7,14 @@ def _get_chunk_metadata( - granuale: earthaccess.results.DataGranule, + granule: earthaccess.results.DataGranule, fs: fsspec.AbstractFileSystem | s3fs.S3FileSystem, ) -> list[dict]: from kerchunk.hdf import SingleHdf5ToZarr metadata = [] access = "direct" if isinstance(fs, s3fs.S3FileSystem) else "indirect" - for url in granuale.data_links(access=access): + for url in granule.data_links(access=access): with fs.open(url) as inf: h5chunks = SingleHdf5ToZarr(inf, url) m = h5chunks.translate() @@ -23,7 +23,7 @@ def _get_chunk_metadata( def consolidate_metadata( - granuales: list[earthaccess.results.DataGranule], + granules: list[earthaccess.results.DataGranule], kerchunk_options: dict | None = None, access: str = "direct", outfile: str | None = None, @@ -39,13 +39,13 @@ def consolidate_metadata( ) from e if access == "direct": - fs = earthaccess.get_s3fs_session(provider=granuales[0]["meta"]["provider-id"]) + fs = earthaccess.get_s3fs_session(provider=granules[0]["meta"]["provider-id"]) else: fs = earthaccess.get_fsspec_https_session() - # Get metadata for each granuale + # Get metadata for each granule get_chunk_metadata = dask.delayed(_get_chunk_metadata) - chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granuales]) + chunks = dask.compute(*[get_chunk_metadata(g, fs) for g in granules]) chunks = sum(chunks, start=[]) # Get combined metadata object diff --git a/earthaccess/results.py b/earthaccess/results.py index a466e1c3..6d91ca7e 100644 --- a/earthaccess/results.py +++ b/earthaccess/results.py @@ -44,9 +44,7 @@ def _filter_fields_(self, fields: List[str]) -> Dict[str, Any]: return basic_dict def _filter_related_links(self, filter: str) -> List[str]: - """ - Filter RelatedUrls from the UMM fields on CMR - """ + """Filter RelatedUrls from the UMM fields on CMR.""" matched_links: List = [] if "RelatedUrls" in self["umm"]: for link in self["umm"]["RelatedUrls"]: @@ -56,9 +54,7 @@ def _filter_related_links(self, filter: str) -> List[str]: class DataCollection(CustomDict): - """ - Dictionary-like object to represent a data collection from CMR - """ + 
"""Dictionary-like object to represent a data collection from CMR.""" _basic_meta_fields_ = [ "concept-id", @@ -78,10 +74,10 @@ class DataCollection(CustomDict): ] def summary(self) -> Dict[str, Any]: - """Summary containing short_name, concept-id, file-type, and cloud-info if the dataset is cloud hosted. + """Summary containing short_name, concept-id, file-type, and cloud-info (if cloud-hosted). Returns: - Returns a sumary of the collection metadata + A summary of the collection metadata. """ # we can print only the concept-id @@ -101,8 +97,9 @@ def get_umm(self, umm_field: str) -> Union[str, Dict[str, Any]]: """ Parameters: umm_field: Valid UMM item, i.e. `TemporalExtent` + Returns: - Returns the value of a given field inside the UMM (Unified Metadata Model) + The value of a given field inside the UMM (Unified Metadata Model). """ if umm_field in self["umm"]: return self["umm"][umm_field] @@ -111,14 +108,15 @@ def get_umm(self, umm_field: str) -> Union[str, Dict[str, Any]]: def concept_id(self) -> str: """ Returns: - Retrurns a collection's `concept_id`, this id is the most relevant search field on granule queries. + A collection's `concept_id`. + This id is the most relevant search field on granule queries. """ return self["meta"]["concept-id"] def data_type(self) -> str: """ Returns: - If available, it returns the collection data type, i.e. HDF5, CSV etc + The collection data type, i.e. HDF5, CSV etc., if available. """ if "ArchiveAndDistributionInformation" in self["umm"]: return str( @@ -131,7 +129,7 @@ def data_type(self) -> str: def version(self) -> str: """ Returns: - returns the collection's version. + The collection's version. """ if "Version" in self["umm"]: return self["umm"]["Version"] @@ -140,7 +138,7 @@ def version(self) -> str: def abstract(self) -> str: """ Returns: - Returns the abstract of a collection + The abstract of a collection """ if "Abstract" in self["umm"]: return self["umm"]["Abstract"] @@ -149,7 +147,7 @@ def abstract(self) -> str: def landing_page(self) -> str: """ Returns: - if available it returns the first landing page for the collection, can be many. + The first landing page for the collection (can be many), if available. """ links = self._filter_related_links("LANDING PAGE") if len(links) > 0: @@ -159,7 +157,7 @@ def landing_page(self) -> str: def get_data(self) -> List[str]: """ Returns: - Returns the GET DATA links, usually a link to a landing page, a DAAC portal or an FTP location. + The GET DATA links (usually a landing page link, a DAAC portal, or an FTP location). """ links = self._filter_related_links("GET DATA") return links @@ -167,7 +165,8 @@ def get_data(self) -> List[str]: def s3_bucket(self) -> Dict[str, Any]: """ Returns: - Returns the S3 bucket information if the collection has it (**cloud hosted collections only**) + The S3 bucket information if the collection has it. + (**cloud hosted collections only**) """ if "DirectDistributionInformation" in self["umm"]: return self["umm"]["DirectDistributionInformation"] @@ -180,9 +179,7 @@ def __repr__(self) -> str: class DataGranule(CustomDict): - """ - Dictionary-like object to represent a granule from CMR - """ + """Dictionary-like object to represent a granule from CMR.""" _basic_meta_fields_ = [ "concept-id", @@ -219,7 +216,7 @@ def __init__( def __repr__(self) -> str: """ Returns: - returns a basic representation of a data granule + A basic representation of a data granule. 
""" data_links = [link for link in self.data_links()] rep_str = f""" @@ -234,7 +231,7 @@ def __repr__(self) -> str: def _repr_html_(self) -> str: """ Returns: - Returns a rich representation for a data granule if we are in a Jupyter notebook. + A rich representation for a data granule if we are in a Jupyter notebook. """ granule_html_repr = _repr_granule_html(self) return granule_html_repr @@ -248,7 +245,7 @@ def get_s3_credentials_endpoint(self) -> Union[str, None]: def size(self) -> float: """ Returns: - Returns the total size for the granule in MB + The total size for the granule in MB. """ try: data_granule = self["umm"]["DataGranule"] @@ -287,20 +284,23 @@ def _derive_s3_link(self, links: List[str]) -> List[str]: def data_links( self, access: Optional[str] = None, in_region: bool = False ) -> List[str]: - """Returns the data links form a granule + """Returns the data links from a granule. Parameters: - access: direct or external, direct means in-region access for cloud hosted collections. - in_region: if we are running in us-west-2, meant for the store class, default is False + access: direct or external. + Direct means in-region access for cloud-hosted collections. + in_region: True if we are running in us-west-2. + It is meant for the store class. + Returns: - the data link for the requested access type + The data links for the requested access type. """ https_links = self._filter_related_links("GET DATA") s3_links = self._filter_related_links("GET DATA VIA DIRECT ACCESS") if in_region: # we are in us-west-2 if self.cloud_hosted and access in (None, "direct"): - # this is a cloud collection and we didn't specify the access type + # this is a cloud collection, and we didn't specify the access type # default to S3 links if len(s3_links) == 0 and len(https_links) > 0: # This is guessing the S3 links for some cloud collections that for @@ -310,14 +310,14 @@ def data_links( # we have the s3 links so we return those return s3_links else: - # Even though we are in us-west-2 the user wants the HTTPS links - # used in region they are S3 signed links from TEA - # https://github.com/asfadmin/thin-egress-app + # Even though we are in us-west-2, the user wants the HTTPS links used in-region. + # They are S3 signed links from TEA. + # return https_links else: - # we are not in region + # we are not in-region if access == "direct": - # maybe the user wants to collect S3 links ans use them later + # maybe the user wants to collect S3 links and use them later # from the cloud return s3_links else: @@ -327,7 +327,7 @@ def data_links( def dataviz_links(self) -> List[str]: """ Returns: - Returns the data visualization links, usually the browse images. + The data visualization links, usually the browse images. """ links = self._filter_related_links("GET RELATED VISUALIZATION") return links diff --git a/earthaccess/search.py b/earthaccess/search.py index 8392ca51..aa410985 100644 --- a/earthaccess/search.py +++ b/earthaccess/search.py @@ -15,7 +15,7 @@ class DataCollections(CollectionQuery): """ ???+ Info The DataCollection class queries against https://cmr.earthdata.nasa.gov/search/collections.umm_json, - the response has to be in umm_json in order to use the result classes. + the response has to be in umm_json to use the result classes. 
""" _fields = None @@ -37,13 +37,13 @@ def __init__(self, auth: Optional[Auth] = None, *args: Any, **kwargs: Any) -> No """Builds an instance of DataCollections to query CMR Parameters: - auth (Auth): An authenticated `Auth` instance, this is an optional parameter - for queries that need authentication e.g. restricted datasets + auth: An authenticated `Auth` instance. This is an optional parameter + for queries that need authentication, e.g. restricted datasets. """ super().__init__(*args, **kwargs) self.session = session() if auth is not None and auth.authenticated: - # To search we need the new bearer tokens from NASA Earthdata + # To search, we need the new bearer tokens from NASA Earthdata self.session = auth.get_session(bearer_token=True) self._debug = False @@ -54,16 +54,17 @@ def __init__(self, auth: Optional[Auth] = None, *args: Any, **kwargs: Any) -> No def hits(self) -> int: """Returns the number of hits the current query will return. This is done by making a lightweight query to CMR and inspecting the returned headers. - Restricted datasets will always return 0 results even if there are results. + Restricted datasets will always return zero results even if there are results. Returns: - number of results reported by CMR + The number of results reported by CMR. """ return super().hits() def concept_id(self, IDs: List[str]) -> Type[CollectionQuery]: - """Filter by concept ID (ex: C1299783579-LPDAAC_ECS or G1327299284-LPDAAC_ECS, S12345678-LPDAAC_ECS) + """Filter by concept ID. + For example: C1299783579-LPDAAC_ECS or G1327299284-LPDAAC_ECS, S12345678-LPDAAC_ECS Collections, granules, tools, services are uniquely identified with this ID. > @@ -73,24 +74,24 @@ def concept_id(self, IDs: List[str]) -> Type[CollectionQuery]: * If providing a service's concept ID here, it will uniquely identify those services. Parameters: - IDs (String, List): ID(s) to search by. Can be provided as a string or list of strings. + IDs: ID(s) to search by. Can be provided as a string or list of strings. """ super().concept_id(IDs) return self def keyword(self, text: str) -> Type[CollectionQuery]: - """Case insentive and wildcard (*) search through over two dozen fields in + """Case-insensitive and wildcard (*) search through over two dozen fields in a CMR collection record. This allows for searching against fields like summary and science keywords. Parameters: - text (String): text to search for + text: text to search for """ super().keyword(text) return self def doi(self, doi: str) -> Type[CollectionQuery]: - """Searh datasets by DOI + """Search datasets by DOI. ???+ Tip Not all datasets have an associated DOI, also DOI search works @@ -98,7 +99,7 @@ def doi(self, doi: str) -> Type[CollectionQuery]: We need to search by DOI, grab the concept_id and then get the data. Parameters: - doi (String): DOI of a datasets, e.g. 10.5067/AQR50-3Q7CS + doi: DOI of a datasets, e.g. 
10.5067/AQR50-3Q7CS """ if not isinstance(doi, str): raise TypeError("doi must be of type str") @@ -137,13 +138,14 @@ def parameters(self, **kwargs: Any) -> Type[CollectionQuery]: return self def print_help(self, method: str = "fields") -> None: - """Prints the help information for a given method""" + """Prints the help information for a given method.""" print("Class components: \n") print([method for method in dir(self) if method.startswith("_") is False]) help(getattr(self, method)) def fields(self, fields: Optional[List[str]] = None) -> Type[CollectionQuery]: - """Masks the response by only showing the fields included in this list + """Masks the response by only showing the fields included in this list. + Parameters: fields (List): list of fields to show, these fields come from the UMM model e.g. Abstract, Title """ @@ -152,6 +154,7 @@ def fields(self, fields: Optional[List[str]] = None) -> Type[CollectionQuery]: def debug(self, debug: bool = True) -> Type[CollectionQuery]: """If True, prints the actual query to CMR, notice that the pagination happens in the headers. + Parameters: debug (Boolean): Print CMR query. """ @@ -166,7 +169,7 @@ def cloud_hosted(self, cloud_hosted: bool = True) -> Type[CollectionQuery]: Restricted collections will not be matched using this parameter Parameters: - cloud_hosted (Boolean): True to require granules only be online + cloud_hosted: True to require granules only be online """ if not isinstance(cloud_hosted, bool): raise TypeError("cloud_hosted must be of type bool") @@ -178,27 +181,31 @@ def cloud_hosted(self, cloud_hosted: bool = True) -> Type[CollectionQuery]: return self def provider(self, provider: str = "") -> Type[CollectionQuery]: - """Only match collections from a given provider, a NASA datacenter or DAAC can have 1 or more providers - i.e. PODAAC is a data center or DAAC, PODAAC is the default provider for on prem data, POCLOUD is - the PODAAC provider for their data in the cloud. + """Only match collections from a given provider. + + A NASA datacenter or DAAC can have one or more providers. + E.g., PODAAC is a data center or DAAC; PODAAC is the default provider for on-premises data, + and POCLOUD is the PODAAC provider for their data in the cloud. Parameters: - provider (String): a provider code for any DAAC. e.g. POCLOUD, NSIDC_CPRD, etc. + provider: a provider code for any DAAC, e.g. POCLOUD, NSIDC_CPRD, etc. """ self.params["provider"] = provider return self def data_center(self, data_center_name: str = "") -> Type[CollectionQuery]: - """An alias name for `daac()` + """An alias name for `daac()`. + Parameters: - data_center_name (String): DAAC shortname, e.g. NSIDC, PODAAC, GESDISC + data_center_name: DAAC shortname, e.g. NSIDC, PODAAC, GESDISC """ return self.daac(data_center_name) def daac(self, daac_short_name: str = "") -> Type[CollectionQuery]: - """Only match collections for a given DAAC, by default the on-prem collections for the DAAC + """Only match collections for a given DAAC, by default the on-prem collections for the DAAC. + Parameters: - daac_short_name (String): a DAAC shortname, e.g. NSIDC, PODAAC, GESDISC + daac_short_name: a DAAC shortname, e.g. NSIDC, PODAAC, GESDISC """ if "cloud_hosted" in self.params: cloud_hosted = self.params["cloud_hosted"] @@ -218,7 +225,8 @@ def get(self, limit: int = 2000) -> list: they can be potentially millions of them. Parameters: - limit (Integer): The number of results to return + limit: The number of results to return + Returns: query results as a list of `DataCollection` instances. 
""" @@ -265,13 +273,13 @@ def temporal( self, date_from: str, date_to: str, exclude_boundary: bool = False ) -> Type[CollectionQuery]: """Filter by an open or closed date range. Dates can be provided as datetime objects - or ISO 8601 formatted strings. Multiple ranges can be provided by successive calls. + or ISO 8601 formatted strings. Multiple ranges can be provided by successive calls to this method before calling execute(). Parameters: - date_from (String): earliest date of temporal range - date_to (string): latest date of temporal range - exclude_boundary (Boolean): whether or not to exclude the date_from/to in the matched range + date_from: earliest date of temporal range + date_to: latest date of temporal range + exclude_boundary: whether to exclude the date_from/to in the matched range """ DEFAULT = dt.datetime(1979, 1, 1) if date_from is not None: @@ -291,8 +299,7 @@ def temporal( class DataGranules(GranuleQuery): - """ - A Granule oriented client for NASA CMR + """A Granule oriented client for NASA CMR. API: https://cmr.earthdata.nasa.gov/search/site/docs/search/api.html """ @@ -316,17 +323,17 @@ def __init__(self, auth: Any = None, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) self.session = session() if auth is not None and auth.authenticated: - # To search we need the new bearer tokens from NASA Earthdata + # To search, we need the new bearer tokens from NASA Earthdata self.session = auth.get_session(bearer_token=True) self._debug = False def hits(self) -> int: - """ - Returns the number of hits the current query will return. This is done by - making a lightweight query to CMR and inspecting the returned headers. + """Returns the number of hits the current query will return. + This is done by making a lightweight query to CMR and inspecting the returned headers. - :returns: number of results reported by CMR + Returns: + The number of results reported by CMR. """ url = self._build_url() @@ -353,6 +360,7 @@ def parameters(self, **kwargs: Any) -> Type[CollectionQuery]: temporal=("2015-01","2015-02"), point=(42.5, -101.25)) ``` + Returns: Query instance """ @@ -374,27 +382,31 @@ def parameters(self, **kwargs: Any) -> Type[CollectionQuery]: return self def provider(self, provider: str = "") -> Type[CollectionQuery]: - """Only match collections from a given provider, a NASA datacenter or DAAC can have 1 or more providers - i.e. PODAAC is a data center or DAAC, PODAAC is the default provider for on prem data, POCLOUD is + """Only match collections from a given provider. + A NASA datacenter or DAAC can have one or more providers. + For example, PODAAC is a data center or DAAC, + PODAAC is the default provider for on-prem data, and POCLOUD is the PODAAC provider for their data in the cloud. Parameters: - provider (String): a provider code for any DAAC. e.g. POCLOUD, NSIDC_CPRD, etc. + provider: a provider code for any DAAC, e.g. POCLOUD, NSIDC_CPRD, etc. """ self.params["provider"] = provider return self def data_center(self, data_center_name: str = "") -> Type[CollectionQuery]: - """An alias name for `daac()` + """An alias name for `daac()`. + Parameters: data_center_name (String): DAAC shortname, e.g. NSIDC, PODAAC, GESDISC """ return self.daac(data_center_name) def daac(self, daac_short_name: str = "") -> Type[CollectionQuery]: - """Only match collections for a given DAAC, by default the on-prem collections for the DAAC + """Only match collections for a given DAAC. Default to on-prem collections for the DAAC. 
+ Parameters: - daac_short_name (String): a DAAC shortname, e.g. NSIDC, PODAAC, GESDISC + daac_short_name: a DAAC shortname, e.g. NSIDC, PODAAC, GESDISC """ if "cloud_hosted" in self.params: cloud_hosted = self.params["cloud_hosted"] @@ -416,15 +428,16 @@ def orbit_number(self, orbit1: int, orbit2: int) -> Type[GranuleQuery]: return self def cloud_hosted(self, cloud_hosted: bool = True) -> Type[CollectionQuery]: - """Only match granules that are hosted in the cloud. This is valid for public - collections and if we are using the short_name parameter. Concept-Id is unambiguous. + """Only match granules that are hosted in the cloud. + This is valid for public collections and when using the short_name parameter. + Concept-Id is unambiguous. ???+ Tip - Cloud hosted collections can be public or restricted. - Restricted collections will not be matched using this parameter + Cloud-hosted collections can be public or restricted. + Restricted collections will not be matched using this parameter. Parameters: - cloud_hosted (Boolean): True to require granules only be online + cloud_hosted: True to require granules only be online """ if not isinstance(cloud_hosted, bool): raise TypeError("cloud_hosted must be of type bool") @@ -442,11 +455,11 @@ def granule_name(self, granule_name: str) -> Type[CollectionQuery]: queries using the readable_granule_name metadata field. ???+ Tip - We can use wirldcards on a granule name to further refine our search - i.e. MODGRNLD.*.daily.* + We can use wildcards on a granule name to further refine our search, + e.g. `MODGRNLD.*.daily.*`. Parameters: - granule_name (String): granule name (accepts wildcards) + granule_name: granule name (accepts wildcards) """ if not isinstance(granule_name, str): raise TypeError("granule_name must be of type string") @@ -458,8 +471,9 @@ def granule_name(self, granule_name: str) -> Type[CollectionQuery]: def online_only(self, online_only: bool = True) -> Type[GranuleQuery]: """Only match granules that are listed online and not available for download. The opposite of this method is downloadable(). + Parameters: - online_only (Boolean): True to require granules only be online + online_only: True to require granules only be online """ super().online_only(online_only) return self @@ -477,7 +491,7 @@ def instrument(self, instrument: str = "") -> Type[GranuleQuery]: """Filter by the instrument associated with the granule. Parameters: - instrument (str): name of the instrument + instrument: name of the instrument """ super().instrument(instrument) return self @@ -497,8 +511,8 @@ def cloud_cover( """Filter by the percentage of cloud cover present in the granule. Parameters: - min_cover (int): minimum percentage of cloud cover - max_cover (int): maximum percentage of cloud cover + min_cover: minimum percentage of cloud cover + max_cover: maximum percentage of cloud cover """ super().cloud_cover(min_cover, max_cover) return self @@ -527,10 +541,13 @@ def _is_cloud_hosted(self, granule: Any) -> bool: return False def short_name(self, short_name: str = "") -> Type[GranuleQuery]: - """ - Filter by short name (aka product or collection name). - :param short_name: name of collection - :returns: Query instance + """Filter by short name (aka product or collection name). + + Parameters: + short_name: name of a collection + + Returns: + Query instance """ super().short_name(short_name) return self @@ -545,7 +562,8 @@ def get(self, limit: int = 2000) -> list: they can be potentially millions of them. 
Parameters: - limit (Integer): The number of results to return + limit: The number of results to return + Returns: query results as a list of `DataCollection` instances. """ @@ -606,8 +624,9 @@ def get(self, limit: int = 2000) -> list: def debug(self, debug: bool = True) -> Type[GranuleQuery]: """If True, prints the actual query to CMR, notice that the pagination happens in the headers. + Parameters: - debug (Boolean): Print CMR query. + debug: Print CMR query. """ self._debug = True return self @@ -623,9 +642,9 @@ def temporal( ranges can be provided by successive calls to this method before calling execute(). Parameters: - date_from (Date, String): earliest date of temporal range - date_to (Date, String): latest date of temporal range - exclude_boundary (Boolean): whether or not to exclude the date_from/to in the matched range + date_from: earliest date of temporal range + date_to: latest date of temporal range + exclude_boundary: whether to exclude the date_from/to in the matched range """ DEFAULT = dt.datetime(1979, 1, 1) if date_from is not None: @@ -668,7 +687,7 @@ def polygon(self, coordinates: List[Tuple[str, str]]) -> Type[GranuleQuery]: collection filtering parameter such as short_name or entry_title. Parameters: - coordinates (List): list of (lon, lat) tuples + coordinates: list of (lon, lat) tuples """ super().polygon(coordinates) return self @@ -699,7 +718,7 @@ def line(self, coordinates: List[Tuple[str, str]]) -> Type[GranuleQuery]: with a collection filtering parameter such as short_name or entry_title. Parameters: - coordinates (List): a list of (lon, lat) tuples + coordinates: a list of (lon, lat) tuples """ super().line(coordinates) return self @@ -715,14 +734,14 @@ def downloadable(self, downloadable: bool = True) -> Type[GranuleQuery]: return self def doi(self, doi: str) -> Type[GranuleQuery]: - """Searh data granules by DOI + """Search data granules by DOI. ???+ Tip Not all datasets have an associated DOI, internally if a DOI is found earthaccess will grab the concept_id for the query to CMR. Parameters: - doi (String): DOI of a datasets, e.g. 10.5067/AQR50-3Q7CS + doi: DOI of a dataset, e.g. 10.5067/AQR50-3Q7CS """ collection = DataCollections().doi(doi).get() if len(collection) > 0: diff --git a/earthaccess/store.py b/earthaccess/store.py index 645721ad..4981fe50 100644 --- a/earthaccess/store.py +++ b/earthaccess/store.py @@ -79,7 +79,7 @@ def make_instance( def _get_url_granule_mapping( granules: List[DataGranule], access: str ) -> Mapping[str, DataGranule]: - """Construct a mapping between file urls and granules""" + """Construct a mapping between file urls and granules.""" url_mapping = {} for granule in granules: for url in granule.data_links(access=access): @@ -88,15 +88,13 @@ def _get_url_granule_mapping( class Store(object): - """ - Store class to access granules on-prem or in the cloud. - """ + """Store class to access granules on-prem or in the cloud.""" def __init__(self, auth: Any, pre_authorize: bool = False) -> None: - """Store is the class to access data + """Store is the class to access data. Parameters: - auth (Auth): Required, Auth instance to download and access data. + auth: Auth instance to download and access data. 
""" if auth.authenticated is True: self.auth = auth @@ -108,7 +106,7 @@ def __init__(self, auth: Any, pre_authorize: bool = False) -> None: self._requests_cookies: Dict[str, Any] = {} self.set_requests_session(oauth_profile) if pre_authorize: - # collect cookies from other daacs + # collect cookies from other DAACs for url in DAAC_TEST_URLS: self.set_requests_session(url) @@ -157,7 +155,7 @@ def _running_in_us_west_2(self) -> bool: return False if resp.status_code == 200 and b"us-west-2" == resp.content: - # On AWS in region us-west-2 + # On AWS, in region us-west-2 return True return False @@ -165,13 +163,14 @@ def set_requests_session( self, url: str, method: str = "get", bearer_token: bool = False ) -> None: """Sets up a `requests` session with bearer tokens that are used by CMR. - Mainly used to get the authentication cookies from different DAACs and URS - This HTTPS session can be used to download granules if we want to use a direct, lower level API + Mainly used to get the authentication cookies from different DAACs and URS. + This HTTPS session can be used to download granules if we want to use a direct, + lower level API. Parameters: - url (String): used to test the credentials and populate the class auth cookies - method (String): HTTP method to test. default: "GET" - bearer_token (Boolean): if true will be used for authenticated queries on CMR + url: used to test the credentials and populate the class auth cookies + method: HTTP method to test, default: "GET" + bearer_token: if true, will be used for authenticated queries on CMR Returns: fsspec HTTPFileSystem (aiohttp client session) @@ -202,13 +201,13 @@ def get_s3fs_session( provider: Optional[str] = None, endpoint: Optional[str] = None, ) -> s3fs.S3FileSystem: - """ - Returns a s3fs instance for a given cloud provider / DAAC + """Returns a s3fs instance for a given cloud provider / DAAC. Parameters: - daac: any of the DAACs e.g. NSIDC, PODAAC - provider: a data provider if we know them, e.g PODAAC -> POCLOUD + daac: any of the DAACs, e.g. NSIDC, PODAAC + provider: a data provider if we know them, e.g. PODAAC -> POCLOUD endpoint: pass the URL for the credentials directly + Returns: a s3fs file instance """ @@ -264,7 +263,8 @@ def get_s3fs_session( @lru_cache def get_fsspec_session(self) -> fsspec.AbstractFileSystem: """Returns a fsspec HTTPS session with bearer tokens that are used by CMR. - This HTTPS session can be used to download granules if we want to use a direct, lower level API + This HTTPS session can be used to download granules if we want to use a direct, + lower level API. Returns: fsspec HTTPFileSystem (aiohttp client session) @@ -272,7 +272,7 @@ def get_fsspec_session(self) -> fsspec.AbstractFileSystem: token = self.auth.token["access_token"] client_kwargs = { "headers": {"Authorization": f"Bearer {token}"}, - # This is important! if we trust the env end send a bearer token + # This is important! If we trust the env and send a bearer token, # auth will fail! "trust_env": False, } @@ -281,10 +281,11 @@ def get_fsspec_session(self) -> fsspec.AbstractFileSystem: def get_requests_session(self, bearer_token: bool = True) -> requests.Session: """Returns a requests HTTPS session with bearer tokens that are used by CMR. - This HTTPS session can be used to download granules if we want to use a direct, lower level API + This HTTPS session can be used to download granules if we want to use a direct, + lower level API. 
Parameters: - bearer_token (Boolean): if true will be used for authenticated queries on CMR + bearer_token: if true, will be used for authenticated queries on CMR Returns: requests Session """ @@ -300,9 +301,12 @@ def open( hosted on S3 or HTTPS by third party libraries like xarray. Parameters: - granules (List): a list of granules(DataGranule) instances or list of URLs, e.g. s3://some-granule + granules: a list of granules (DataGranule) instances or list of URLs, + e.g. s3://some-granule + provider: the data provider, e.g. POCLOUD, NSIDC_CPRD, etc.; needed if a list of URLs is passed + Returns: - a list of s3fs "file pointers" to s3 files. + A list of s3fs "file pointers" to s3 files. """ if len(granules): return self._open(granules, provider) @@ -318,9 +322,12 @@ def _open( hosted on S3 or HTTPS by third party libraries like xarray. Parameters: - granules (List): a list of granules(DataGranule) instances or list of URLs, e.g. s3://some-granule + granules: a list of granules (DataGranule) instances or list of URLs, + e.g. s3://some-granule + provider: the data provider, e.g. POCLOUD, NSIDC_CPRD, etc.; needed if a list of URLs is passed + Returns: - a list of s3fs "file pointers" to s3 files. + A list of s3fs "file pointers" to s3 files. """ raise NotImplementedError("granules should be a list of DataGranule or URLs") @@ -344,7 +351,7 @@ def _open_granules( if granules[0].cloud_hosted: access = "direct" provider = granules[0]["meta"]["provider-id"] - # if the data has its own S3 credentials endpoint we'll use it + # if the data has its own S3 credentials endpoint, we will use it endpoint = self._own_s3_credentials(granules[0]["umm"]["RelatedUrls"]) if endpoint is not None: print(f"using endpoint: {endpoint}") @@ -442,17 +449,18 @@ def get( ) -> List[str]: """Retrieves data granules from a remote storage system. - * If we run this in the cloud we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda) + * If we run this in the cloud, + we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda). * If we run it outside the us-west-2 region and the data granules are part of a cloud-based - collection the method will not get any files. - * If we requests data granules from an on-prem collection the data will be effectively downloaded - to a local directory. + collection, the method will not get any files. + * If we request data granules from an on-prem collection, + the data will be effectively downloaded to a local directory. Parameters: granules: A list of granules (DataGranule) instances or a list of granule links (HTTP). local_path: Local directory to store the remote data granules. threads: Parallel number of threads to use to download the files, adjust as necessary, default = 8. Returns: List of downloaded files @@ -467,7 +475,7 @@ def get( files = self._get(granules, local_path, provider, threads) return files else: - raise ValueError("List of URLs or DataGranule isntances expected") + raise ValueError("List of URLs or DataGranule instances expected") @singledispatchmethod def _get( self, granules: Union[List[DataGranule], List[str]], local_path: str, provider: Optional[str] = None, threads: int = 8, ) -> List[str]: """Retrieves data granules from a remote storage system. 
- * If we run this in the cloud we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda) + * If we run this in the cloud, + we are moving data from S3 to a cloud compute instance (EC2, AWS Lambda). * If we run it outside the us-west-2 region and the data granules are part of a cloud-based - collection the method will not get any files. - * If we requests data granules from an on-prem collection the data will be effectively downloaded - to a local directory. + collection, the method will not get any files. + * If we request data granules from an on-prem collection, + the data will be effectively downloaded to a local directory. Parameters: - granules: a list of granules(DataGranule) instances or a list of granule links (HTTP) - local_path: local directory to store the remote data granules - access: direct or on_prem, if set it will use it for the access method. - threads: parallel number of threads to use to download the files, adjust as necessary, default = 8 + granules: A list of granules (DataGranule) instances or a list of granule links (HTTP). + local_path: Local directory to store the remote data granules. + threads: Parallel number of threads to use to download the files; + adjust as necessary, default = 8. Returns: None @@ -541,7 +550,7 @@ def _get_granules( cloud_hosted = granules[0].cloud_hosted access = "direct" if (cloud_hosted and self.in_region) else "external" data_links = list( - # we are not in region + # we are not in-region chain.from_iterable( granule.data_links(access=access, in_region=self.in_region) for granule in granules @@ -568,15 +577,19 @@ def _get_granules( downloaded_files.append(file_name) return downloaded_files else: - # if the data is cloud based bu we are not in AWS it will be downloaded as if it was on prem + # if the data are cloud-based, but we are not in AWS, + # it will be downloaded as if it was on prem return self._download_onprem_granules(data_links, local_path, threads) def _download_file(self, url: str, directory: str) -> str: - """ - download a single file from an on-prem location, a DAAC data center. - :param url: the granule url - :param directory: local directory - :returns: local filepath or an exception + """Download a single file from an on-prem location, a DAAC data center. + + Parameters: + url: the granule url + directory: local directory + + Returns: + The local filepath of the downloaded file. + """ # If the get data link is an Opendap location if "opendap" in url and url.endswith(".html"): ... r.raise_for_status() with open(local_path, "wb") as f: # This is to cap memory usage for large files at 1MB per write to disk per thread - https://docs.python-requests.org/en/master/user/quickstart/#raw-response-content + https://docs.python-requests.org/en/latest/user/quickstart/#raw-response-content shutil.copyfileobj(r.raw, f, length=1024 * 1024) except Exception: print(f"Error while downloading the file {local_filename}") @@ -608,12 +621,16 @@ def _download_file(self, url: str, directory: str) -> str: def _download_onprem_granules( self, urls: List[str], directory: str, threads: int = 8 ) -> List[Any]: - """ - downloads a list of URLS into the data directory. - :param urls: list of granule URLs from an on-prem collection - :param directory: local directory to store the files - :param threads: parallel number of threads to use to download the files, adjust as necessary, default = 8 - :returns: None + """Downloads a list of URLs into the data directory. 
+ + Parameters: + urls: list of granule URLs from an on-prem collection + directory: local directory to store the downloaded files + threads: parallel number of threads to use to download the files; + adjust as necessary, default = 8 + + Returns: + A list of local filepaths to which the files were downloaded. """ if urls is None: raise ValueError("The granules didn't provide a valid GET DATA link") diff --git a/scripts/docs-live.sh b/scripts/docs-live.sh index 286f2a7c..6ed6a4be 100755 --- a/scripts/docs-live.sh +++ b/scripts/docs-live.sh @@ -1,7 +1,4 @@ - #!/usr/bin/env bash - -set -e -set -x +set -ex mkdocs serve --dev-addr 0.0.0.0:8008 --dirtyreload diff --git a/tests/integration/test_kerchunk.py b/tests/integration/test_kerchunk.py index 39c95e99..58f93077 100644 --- a/tests/integration/test_kerchunk.py +++ b/tests/integration/test_kerchunk.py @@ -20,21 +20,21 @@ @pytest.fixture(scope="module") -def granuales(): - granuales = earthaccess.search_data( +def granules(): + granules = earthaccess.search_data( count=2, short_name="SEA_SURFACE_HEIGHT_ALT_GRIDS_L4_2SATS_5DAY_6THDEG_V_JPL2205", cloud_hosted=True, ) - return granuales + return granules @pytest.mark.parametrize("protocol", ["", "file://"]) -def test_consolidate_metadata_outfile(tmp_path, granuales, protocol): +def test_consolidate_metadata_outfile(tmp_path, granules, protocol): outfile = f"{protocol}{tmp_path / 'metadata.json'}" assert not os.path.exists(outfile) result = earthaccess.consolidate_metadata( - granuales, + granules, outfile=outfile, access="indirect", kerchunk_options={"concat_dims": "Time"}, @@ -43,9 +43,9 @@ def test_consolidate_metadata_outfile(tmp_path, granuales, protocol): assert result == outfile -def test_consolidate_metadata_memory(tmp_path, granuales): +def test_consolidate_metadata_memory(tmp_path, granules): result = earthaccess.consolidate_metadata( - granuales, + granules, access="indirect", kerchunk_options={"concat_dims": "Time"}, ) @@ -54,10 +54,10 @@ def test_consolidate_metadata_memory(tmp_path, granuales): @pytest.mark.parametrize("output", ["file", "memory"]) -def test_consolidate_metadata(tmp_path, granuales, output): +def test_consolidate_metadata(tmp_path, granules, output): xr = pytest.importorskip("xarray") # Open directly with `earthaccess.open` - expected = xr.open_mfdataset(earthaccess.open(granuales)) + expected = xr.open_mfdataset(earthaccess.open(granules)) # Open with kerchunk consolidated metadata file if output == "file": @@ -65,7 +65,7 @@ def test_consolidate_metadata(tmp_path, granuales, output): else: kwargs = {} metadata = earthaccess.consolidate_metadata( - granuales, access="indirect", kerchunk_options={"concat_dims": "Time"}, **kwargs + granules, access="indirect", kerchunk_options={"concat_dims": "Time"}, **kwargs ) fs = earthaccess.get_fsspec_https_session()
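As a quick sanity check of the signatures this diff documents, here is a minimal end-to-end usage sketch of the public API touched above. The short name comes from the docstrings themselves; the temporal range, bounding box, and output directory are illustrative placeholders, not values from the changeset:

```python
import earthaccess

# Try the documented strategies in order: environment, netrc, then interactive.
auth = earthaccess.login(strategy="all")

# Query CMR for granules; the kwargs mirror the search_data() docstring.
granules = earthaccess.search_data(
    short_name="ATL08",  # example product used in the docstrings
    temporal=("2020-01-01", "2020-01-31"),  # ("yyyy-mm-dd", "yyyy-mm-dd")
    bounding_box=(-10, 20, 10, 50),  # (lower_left_lon, lower_left_lat, upper_right_lon, upper_right_lat)
    count=2,
)

# Stream the files (S3 in-region, HTTPS otherwise) ...
file_objs = earthaccess.open(granules)

# ... or download them to a local directory.
paths = earthaccess.download(granules, local_path="./data", threads=8)
```

This mirrors the `Examples` blocks already embedded in `search_datasets` and `search_data`, and exercises the `login`, `open`, and `download` signatures exactly as revised in this changeset.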