diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2df3e4352..019849197 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,8 +25,10 @@ jobs: - "-p stac -F wkb" - "-p stac-api" - "-p stac -p stac-api -F geo" + - "-p stac-arrow" - "-p stac-async" - "-p stac-cli --no-default-features" + - "-p stac-cli --no-default-features -F parquet" - "-p stac-server --no-default-features" - "-p stac-server --no-default-features -F axum" - "-p stac-server --no-default-features -F memory-item-search" @@ -168,3 +170,18 @@ jobs: run: pip install -r scripts/requirements.txt - name: Validate run: scripts/validate-stac-server --pgstac + validate-stac-geoparquet: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Rust cache + uses: Swatinem/rust-cache@v2 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "pip" + - name: Install stac-api-validator + run: pip install -r scripts/requirements.txt + - name: Validate + run: scripts/validate-stac-geoparquet \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index ec37e9c39..cf6baafae 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "stac", "pgstac", "stac-api", + "stac-arrow", "stac-async", "stac-cli", "stac-server", @@ -12,6 +13,7 @@ members = [ default-members = [ "stac", "stac-api", + "stac-arrow", "stac-async", "stac-cli", "stac-server", diff --git a/README.md b/README.md index 7a7567e3c..2f4dbe996 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ This monorepo contains several crates: | [stac](./stac/README.md) | Core data structures and synchronous I/O | [![docs.rs](https://img.shields.io/docsrs/stac?style=flat-square)](https://docs.rs/stac/latest/stac/)
[![Crates.io](https://img.shields.io/crates/v/stac?style=flat-square)](https://crates.io/crates/stac) | | [pgstac](./pgstac/README.md) | Bindings for [pgstac](https://github.com/stac-utils/pgstac) | [![docs.rs](https://img.shields.io/docsrs/pgstac?style=flat-square)](https://docs.rs/pgstac/latest/pgstac/)
[![Crates.io](https://img.shields.io/crates/v/pgstac?style=flat-square)](https://crates.io/crates/pgstac) | | [stac-api](./stac-api/README.md) | Data structures for the [STAC API](https://github.com/radiantearth/stac-api-spec) specification | [![docs.rs](https://img.shields.io/docsrs/stac-api?style=flat-square)](https://docs.rs/stac-api/latest/stac_api/)
[![Crates.io](https://img.shields.io/crates/v/stac-api?style=flat-square)](https://crates.io/crates/stac-api) | +| [stac-arrow](./stac-arrow/README.md) | Read STAC data stored in [arrow](https://arrow.apache.org/) | [![docs.rs](https://img.shields.io/docsrs/stac-arrow?style=flat-square)](https://docs.rs/stac-arrow/latest/stac_arrow/)
[![Crates.io](https://img.shields.io/crates/v/stac-arrow?style=flat-square)](https://crates.io/crates/stac-arrow) | | [stac-async](./stac-async/README.md) | Asynchronous I/O with [tokio](https://tokio.rs/) | [![docs.rs](https://img.shields.io/docsrs/stac-async?style=flat-square)](https://docs.rs/stac-async/latest/stac_async/)
[![Crates.io](https://img.shields.io/crates/v/stac-async?style=flat-square)](https://crates.io/crates/stac-async) | | [stac-cli](./stac-cli/README.md)| Command line interface | [![docs.rs](https://img.shields.io/docsrs/stac-cli?style=flat-square)](https://docs.rs/stac-cli/latest/stac_cli/)
[![Crates.io](https://img.shields.io/crates/v/stac-cli?style=flat-square)](https://crates.io/crates/stac-cli) | | [stac-server](./stac-server/README.md)| STAC API server with multiple backends | [![docs.rs](https://img.shields.io/docsrs/stac-server?style=flat-square)](https://docs.rs/stac-server/latest/stac_server/)
[![Crates.io](https://img.shields.io/crates/v/stac-server?style=flat-square)](https://crates.io/crates/stac-server) | diff --git a/scripts/requirements.in b/scripts/requirements.in index 2b7069c2d..b2a4a2686 100644 --- a/scripts/requirements.in +++ b/scripts/requirements.in @@ -1 +1,2 @@ stac-api-validator +stac-geoparquet diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 52a389d18..f7740f15f 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -2,24 +2,40 @@ # uv pip compile scripts/requirements.in attrs==23.2.0 # via + # fiona # jsonschema # referencing -certifi==2024.2.2 +certifi==2024.6.2 # via + # fiona + # pyproj # requests # stac-api-validator charset-normalizer==3.3.2 # via requests +ciso8601==2.3.1 + # via stac-geoparquet click==8.1.7 # via + # click-plugins + # cligj + # fiona # stac-api-validator # stac-check # stac-validator +click-plugins==1.1.1 + # via fiona +cligj==0.7.2 + # via fiona deepdiff==6.7.1 # via stac-api-validator +fiona==1.9.6 + # via geopandas +geopandas==0.14.4 + # via stac-geoparquet idna==3.7 # via requests -jsonschema==4.21.1 +jsonschema==4.22.0 # via # pystac # stac-api-validator @@ -30,52 +46,81 @@ jsonschema-specifications==2023.12.1 more-itertools==8.14.0 # via stac-api-validator numpy==1.26.4 - # via shapely + # via + # geopandas + # pandas + # pyarrow + # shapely ordered-set==4.1.0 # via deepdiff -orjson==3.10.1 +orjson==3.10.3 # via pystac -pystac[orjson,validation]==1.10.0 +packaging==24.0 + # via + # geopandas + # stac-geoparquet +pandas==2.2.2 + # via + # geopandas + # stac-geoparquet +pyarrow==16.1.0 + # via stac-geoparquet +pyproj==3.6.1 + # via geopandas +pystac==1.10.1 # via # pystac-client # stac-api-validator + # stac-geoparquet pystac-client==0.7.7 # via stac-api-validator python-dateutil==2.9.0.post0 # via + # pandas # pystac # pystac-client python-dotenv==1.0.1 # via stac-check +pytz==2024.1 + # via pandas pyyaml==6.0.1 # via # stac-api-validator # stac-check 
-referencing==0.35.0 +referencing==0.35.1 # via # jsonschema # jsonschema-specifications -requests==2.32.0 +requests==2.32.3 # via # pystac-client # stac-api-validator # stac-check # stac-validator -rpds-py==0.18.0 +rpds-py==0.18.1 # via # jsonschema # referencing shapely==2.0.4 - # via stac-api-validator + # via + # geopandas + # stac-api-validator + # stac-geoparquet six==1.16.0 - # via python-dateutil + # via + # fiona + # python-dateutil stac-api-validator==0.6.2 - # via -r requirements.in + # via -r scripts/requirements.in stac-check==1.3.3 # via stac-api-validator +stac-geoparquet==0.5.1 + # via -r scripts/requirements.in stac-validator==3.3.2 # via # stac-api-validator # stac-check +tzdata==2024.1 + # via pandas urllib3==2.2.1 # via requests diff --git a/scripts/validate-stac-geoparquet b/scripts/validate-stac-geoparquet new file mode 100755 index 000000000..56308edd6 --- /dev/null +++ b/scripts/validate-stac-geoparquet @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 + +import subprocess +from pathlib import Path +from tempfile import TemporaryDirectory + +import pyarrow.parquet +import stac_geoparquet.arrow + +with TemporaryDirectory() as temporary_directory: + path = Path(temporary_directory) / "naip.parquet" + subprocess.run( + [ + "cargo", + "run", + "-p", + "stac-cli", + "--no-default-features", + "-F", + "parquet", + "--", + "convert", + "stac-arrow/data/naip.json", + path, + ], + check=True, + ) + table = pyarrow.parquet.read_table(path) + items = list(stac_geoparquet.arrow.stac_table_to_items(table)) + assert len(items) == 5 diff --git a/stac-arrow/CHANGELOG.md b/stac-arrow/CHANGELOG.md new file mode 100644 index 000000000..245034002 --- /dev/null +++ b/stac-arrow/CHANGELOG.md @@ -0,0 +1,7 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+ +## Unreleased diff --git a/stac-arrow/Cargo.toml b/stac-arrow/Cargo.toml new file mode 100644 index 000000000..7b056b06e --- /dev/null +++ b/stac-arrow/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "stac-arrow" +version = "0.1.0" +authors = ["Pete Gadomski "] +edition = "2021" +description = "Read STAC data stored in Apache Arrow" +homepage = "https://github.com/stac-utils/stac-rs" +repository = "https://github.com/stac-utils/stac-rs" +keywords = ["geospatial", "stac", "metadata", "geo", "arrow"] +categories = ["science", "data-structures"] + +[dependencies] +arrow = { version = "51", default-features = false, features = ["chrono-tz"] } +arrow-json = "51" # TODO port `record_batches_to_json_rows` so we can go to the latest version +geo = "0.28" +geoarrow = "0.2.0" +geojson = "0.24" +geozero = { version = "0.13", features = ["with-wkb"] } +serde_json = "1" +stac = { version = "0.7", features = ["wkb"], path = "../stac" } +thiserror = "1" + +[dev-dependencies] +criterion = { version = "0.5", features = ["html_reports"] } +parquet = "51" +stac-validate = { version = "0.1", path = "../stac-validate" } + +[[bench]] +name = "read" +harness = false diff --git a/stac-arrow/README.md b/stac-arrow/README.md new file mode 100644 index 000000000..29a2cae6b --- /dev/null +++ b/stac-arrow/README.md @@ -0,0 +1,42 @@ +# stac-arrow + +[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/stac-utils/stac-rs/ci.yml?branch=main&style=for-the-badge)](https://github.com/stac-utils/stac-rs/actions/workflows/ci.yml) +[![docs.rs](https://img.shields.io/docsrs/stac-arrow?style=for-the-badge)](https://docs.rs/stac-arrow/latest/stac_arrow/) +[![Crates.io](https://img.shields.io/crates/v/stac-arrow?style=for-the-badge)](https://crates.io/crates/stac-arrow) +![Crates.io](https://img.shields.io/crates/l/stac-arrow?style=for-the-badge) +[![Contributor 
Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg?style=for-the-badge)](./CODE_OF_CONDUCT) + +Read [STAC](https://stacspec.org/) data stored in [arrow](https://arrow.apache.org/). + +## Usage + +To use the library in your project: + +```toml +[dependencies] +stac-arrow = "0.1" +``` + +## Examples + +```rust +use std::fs::File; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + +let file = File::open("data/naip.parquet").unwrap(); +let reader = ParquetRecordBatchReaderBuilder::try_new(file) + .unwrap() + .build() + .unwrap(); +let mut items = Vec::new(); +for result in reader { + items.extend(stac_arrow::record_batch_to_items(result.unwrap()).unwrap()); +} +assert_eq!(items.len(), 5); +``` + +Please see the [documentation](https://docs.rs/stac-arrow) for more usage examples. + +## Other info + +This crate is part of the [stac-rs](https://github.com/stac-utils/stac-rs) monorepo, see its README for contributing and license information. diff --git a/stac-arrow/benches/read.rs b/stac-arrow/benches/read.rs new file mode 100644 index 000000000..e4eae7117 --- /dev/null +++ b/stac-arrow/benches/read.rs @@ -0,0 +1,41 @@ +//! How fast is `RecordBatch` -> `Vec<Map<String, Value>>` when going through +//! full serialization vs the deprecated `record_batches_to_json_rows`?
+ +use arrow::array::RecordBatch; +use arrow_json::ArrayWriter; +use criterion::{criterion_group, criterion_main, Criterion}; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use serde_json::{Map, Value}; +use std::fs::File; + +#[allow(deprecated)] +fn record_batches_to_json_rows(record_batch: &RecordBatch) { + let _ = arrow_json::writer::record_batches_to_json_rows(&[record_batch]).unwrap(); +} + +fn writer(record_batch: &RecordBatch) { + let mut writer = ArrayWriter::new(Vec::new()); + writer.write(record_batch).unwrap(); + writer.finish().unwrap(); + let _: Vec<Map<String, Value>> = + serde_json::from_reader(writer.into_inner().as_slice()).unwrap(); +} + +fn criterion_benchmark(c: &mut Criterion) { + let mut group = c.benchmark_group("read"); + let file = File::open("data/naip.parquet").unwrap(); + let mut reader = ParquetRecordBatchReaderBuilder::try_new(file) + .unwrap() + .build() + .unwrap(); + let mut record_batch = reader.next().unwrap().unwrap(); + let index = record_batch.schema().index_of("geometry").unwrap(); + record_batch.remove_column(index); + group.bench_function("record_batches_to_json_rows", |b| { + b.iter(|| record_batches_to_json_rows(&record_batch)) + }); + group.bench_function("writer", |b| b.iter(|| writer(&record_batch))); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/stac-arrow/data/naip.json b/stac-arrow/data/naip.json new file mode 100644 index 000000000..c735ae935 --- /dev/null +++ b/stac-arrow/data/naip.json @@ -0,0 +1,775 @@ +{ + "features": [ + { + "type": "Feature", + "stac_version": "1.0.0", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.0.0/schema.json", + "https://stac-extensions.github.io/projection/v1.0.0/schema.json" + ], + "id": "ok_m_3409901_nw_14_1_20100425", + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + -99.933454, + 34.934815 + ], + [ + -99.93423, + 35.00323 + ], + [ + -100.004084, + 35.002673 + ], + [ + -100.00325, + 34.934259 + ], +
[ + -99.933454, + 34.934815 + ] + ] + ] + }, + "bbox": [ + -100.004084, + 34.934259, + -99.933454, + 35.00323 + ], + "properties": { + "datetime": "2010-04-25T00:00:00Z", + "stac_version": "1.0.0", + "gsd": 1.0, + "naip:year": "2010", + "proj:bbox": [ + 408377.0, + 3866212.0, + 414752.0, + 3873800.0 + ], + "proj:epsg": 26914, + "naip:state": "ok", + "proj:shape": [ + 7588, + 6375 + ], + "proj:transform": [ + 1.0, + 0.0, + 408377.0, + 0.0, + -1.0, + 3873800.0, + 0.0, + 0.0, + 1.0 + ] + }, + "links": [ + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "collection", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "parent", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/", + "rel": "root", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip/items/ok_m_3409901_nw_14_1_20100425", + "rel": "self", + "type": "application/geo+json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection=naip&item=ok_m_3409901_nw_14_1_20100425", + "rel": "preview", + "type": "text/html", + "title": "Map of item" + } + ], + "assets": { + "rendered_preview": { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=naip&item=ok_m_3409901_nw_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + "title": "Rendered preview", + "type": "image/png", + "roles": [ + "overview" + ], + "rel": "preview" + }, + "thumbnail": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409901_nw_14_1_20100425.200.jpg", + "title": "Thumbnail", + "type": "image/jpeg", + "roles": [ + "thumbnail" + ] + }, + "tilejson": { + "href": 
"https://planetarycomputer.microsoft.com/api/data/v1/item/tilejson.json?collection=naip&item=ok_m_3409901_nw_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + "title": "TileJSON with default rendering", + "type": "application/json", + "roles": [ + "tiles" + ] + }, + "image": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409901_nw_14_1_20100425.tif", + "title": "RGBIR COG tile", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "roles": [ + "data" + ], + "eo:bands": [ + { + "common_name": "red", + "name": "Red" + }, + { + "common_name": "green", + "name": "Green" + }, + { + "common_name": "blue", + "name": "Blue" + }, + { + "common_name": "nir", + "description": "near-infrared", + "name": "NIR" + } + ] + } + }, + "collection": "naip" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.0.0/schema.json", + "https://stac-extensions.github.io/projection/v1.0.0/schema.json" + ], + "id": "ok_m_3409901_se_14_1_20100425", + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + -99.871005, + 34.872311 + ], + [ + -99.871728, + 34.940702 + ], + [ + -99.94153, + 34.940181 + ], + [ + -99.94075, + 34.871792 + ], + [ + -99.871005, + 34.872311 + ] + ] + ] + }, + "bbox": [ + -99.94153, + 34.871792, + -99.871005, + 34.940702 + ], + "properties": { + "datetime": "2010-04-25T00:00:00Z", + "stac_version": "1.0.0", + "gsd": 1.0, + "naip:year": "2010", + "proj:bbox": [ + 414020.0, + 3859229.0, + 420395.0, + 3866814.0 + ], + "proj:epsg": 26914, + "naip:state": "ok", + "proj:shape": [ + 7585, + 6375 + ], + "proj:transform": [ + 1.0, + 0.0, + 414020.0, + 0.0, + -1.0, + 3866814.0, + 0.0, + 0.0, + 1.0 + ] + }, + "links": [ + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "collection", + "type": "application/json" + }, + { + "href": 
"https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "parent", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/", + "rel": "root", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip/items/ok_m_3409901_se_14_1_20100425", + "rel": "self", + "type": "application/geo+json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection=naip&item=ok_m_3409901_se_14_1_20100425", + "rel": "preview", + "type": "text/html", + "title": "Map of item" + } + ], + "assets": { + "rendered_preview": { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=naip&item=ok_m_3409901_se_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + "title": "Rendered preview", + "type": "image/png", + "roles": [ + "overview" + ], + "rel": "preview" + }, + "image": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409901_se_14_1_20100425.tif", + "title": "RGBIR COG tile", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "roles": [ + "data" + ], + "eo:bands": [ + { + "common_name": "red", + "name": "Red" + }, + { + "common_name": "green", + "name": "Green" + }, + { + "common_name": "blue", + "name": "Blue" + }, + { + "common_name": "nir", + "description": "near-infrared", + "name": "NIR" + } + ] + }, + "thumbnail": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409901_se_14_1_20100425.200.jpg", + "title": "Thumbnail", + "type": "image/jpeg", + "roles": [ + "thumbnail" + ] + }, + "tilejson": { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/tilejson.json?collection=naip&item=ok_m_3409901_se_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + "title": "TileJSON with default rendering", + "type": "application/json", + "roles": [ + "tiles" + ] + 
} + }, + "collection": "naip" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.0.0/schema.json", + "https://stac-extensions.github.io/projection/v1.0.0/schema.json" + ], + "id": "ok_m_3409901_sw_14_1_20100425", + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + -99.933458, + 34.872317 + ], + [ + -99.934233, + 34.940733 + ], + [ + -100.004078, + 34.940176 + ], + [ + -100.003245, + 34.871761 + ], + [ + -99.933458, + 34.872317 + ] + ] + ] + }, + "bbox": [ + -100.004078, + 34.871761, + -99.933458, + 34.940733 + ], + "properties": { + "datetime": "2010-04-25T00:00:00Z", + "stac_version": "1.0.0", + "gsd": 1.0, + "naip:year": "2010", + "proj:bbox": [ + 408308.0, + 3859281.0, + 414687.0, + 3866869.0 + ], + "proj:epsg": 26914, + "naip:state": "ok", + "proj:shape": [ + 7588, + 6379 + ], + "proj:transform": [ + 1.0, + 0.0, + 408308.0, + 0.0, + -1.0, + 3866869.0, + 0.0, + 0.0, + 1.0 + ] + }, + "links": [ + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "collection", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "parent", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/", + "rel": "root", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip/items/ok_m_3409901_sw_14_1_20100425", + "rel": "self", + "type": "application/geo+json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection=naip&item=ok_m_3409901_sw_14_1_20100425", + "rel": "preview", + "type": "text/html", + "title": "Map of item" + } + ], + "assets": { + "rendered_preview": { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=naip&item=ok_m_3409901_sw_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + 
"title": "Rendered preview", + "type": "image/png", + "roles": [ + "overview" + ], + "rel": "preview" + }, + "tilejson": { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/tilejson.json?collection=naip&item=ok_m_3409901_sw_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + "title": "TileJSON with default rendering", + "type": "application/json", + "roles": [ + "tiles" + ] + }, + "thumbnail": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409901_sw_14_1_20100425.200.jpg", + "title": "Thumbnail", + "type": "image/jpeg", + "roles": [ + "thumbnail" + ] + }, + "image": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409901_sw_14_1_20100425.tif", + "title": "RGBIR COG tile", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "roles": [ + "data" + ], + "eo:bands": [ + { + "common_name": "red", + "name": "Red" + }, + { + "common_name": "green", + "name": "Green" + }, + { + "common_name": "blue", + "name": "Blue" + }, + { + "common_name": "nir", + "description": "near-infrared", + "name": "NIR" + } + ] + } + }, + "collection": "naip" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.0.0/schema.json", + "https://stac-extensions.github.io/projection/v1.0.0/schema.json" + ], + "id": "ok_m_3409902_ne_14_1_20100425", + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + -99.746106, + 34.934802 + ], + [ + -99.746725, + 35.003131 + ], + [ + -99.816441, + 35.002682 + ], + [ + -99.815763, + 34.934355 + ], + [ + -99.746106, + 34.934802 + ] + ] + ] + }, + "bbox": [ + -99.816441, + 34.934355, + -99.746106, + 35.003131 + ], + "properties": { + "datetime": "2010-04-25T00:00:00Z", + "stac_version": "1.0.0", + "gsd": 1.0, + "naip:year": "2010", + "proj:bbox": [ + 425500.0, + 3866067.0, + 431862.0, + 3873645.0 + ], + "proj:epsg": 26914, + "naip:state": "ok", + 
"proj:shape": [ + 7578, + 6362 + ], + "proj:transform": [ + 1.0, + 0.0, + 425500.0, + 0.0, + -1.0, + 3873645.0, + 0.0, + 0.0, + 1.0 + ] + }, + "links": [ + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "collection", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "parent", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/", + "rel": "root", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip/items/ok_m_3409902_ne_14_1_20100425", + "rel": "self", + "type": "application/geo+json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection=naip&item=ok_m_3409902_ne_14_1_20100425", + "rel": "preview", + "type": "text/html", + "title": "Map of item" + } + ], + "assets": { + "rendered_preview": { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=naip&item=ok_m_3409902_ne_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + "title": "Rendered preview", + "type": "image/png", + "roles": [ + "overview" + ], + "rel": "preview" + }, + "image": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409902_ne_14_1_20100425.tif", + "title": "RGBIR COG tile", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "roles": [ + "data" + ], + "eo:bands": [ + { + "common_name": "red", + "name": "Red" + }, + { + "common_name": "green", + "name": "Green" + }, + { + "common_name": "blue", + "name": "Blue" + }, + { + "common_name": "nir", + "description": "near-infrared", + "name": "NIR" + } + ] + }, + "tilejson": { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/tilejson.json?collection=naip&item=ok_m_3409902_ne_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + "title": "TileJSON 
with default rendering", + "type": "application/json", + "roles": [ + "tiles" + ] + }, + "thumbnail": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409902_ne_14_1_20100425.200.jpg", + "title": "Thumbnail", + "type": "image/jpeg", + "roles": [ + "thumbnail" + ] + } + }, + "collection": "naip" + }, + { + "type": "Feature", + "stac_version": "1.0.0", + "stac_extensions": [ + "https://stac-extensions.github.io/eo/v1.0.0/schema.json", + "https://stac-extensions.github.io/projection/v1.0.0/schema.json" + ], + "id": "ok_m_3409902_nw_14_1_20100425", + "geometry": { + "type": "Polygon", + "coordinates": [ + [ + [ + -99.808552, + 34.934809 + ], + [ + -99.809224, + 35.003163 + ], + [ + -99.878982, + 35.002678 + ], + [ + -99.878252, + 34.934325 + ], + [ + -99.808552, + 34.934809 + ] + ] + ] + }, + "bbox": [ + -99.878982, + 34.934325, + -99.808552, + 35.003163 + ], + "properties": { + "datetime": "2010-04-25T00:00:00Z", + "stac_version": "1.0.0", + "gsd": 1.0, + "naip:year": "2010", + "proj:bbox": [ + 419793.0, + 3866112.0, + 426159.0, + 3873693.0 + ], + "proj:epsg": 26914, + "naip:state": "ok", + "proj:shape": [ + 7581, + 6366 + ], + "proj:transform": [ + 1.0, + 0.0, + 419793.0, + 0.0, + -1.0, + 3873693.0, + 0.0, + 0.0, + 1.0 + ] + }, + "links": [ + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "collection", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip", + "rel": "parent", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/", + "rel": "root", + "type": "application/json" + }, + { + "href": "https://planetarycomputer.microsoft.com/api/stac/v1/collections/naip/items/ok_m_3409902_nw_14_1_20100425", + "rel": "self", + "type": "application/geo+json" + }, + { + "href": 
"https://planetarycomputer.microsoft.com/api/data/v1/item/map?collection=naip&item=ok_m_3409902_nw_14_1_20100425", + "rel": "preview", + "type": "text/html", + "title": "Map of item" + } + ], + "assets": { + "thumbnail": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409902_nw_14_1_20100425.200.jpg", + "title": "Thumbnail", + "type": "image/jpeg", + "roles": [ + "thumbnail" + ] + }, + "image": { + "href": "https://naipeuwest.blob.core.windows.net/naip/v002/ok/2010/ok_100cm_2010/34099/m_3409902_nw_14_1_20100425.tif", + "title": "RGBIR COG tile", + "type": "image/tiff; application=geotiff; profile=cloud-optimized", + "roles": [ + "data" + ], + "eo:bands": [ + { + "common_name": "red", + "name": "Red" + }, + { + "common_name": "green", + "name": "Green" + }, + { + "common_name": "blue", + "name": "Blue" + }, + { + "common_name": "nir", + "description": "near-infrared", + "name": "NIR" + } + ] + }, + "tilejson": { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/tilejson.json?collection=naip&item=ok_m_3409902_nw_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + "title": "TileJSON with default rendering", + "type": "application/json", + "roles": [ + "tiles" + ] + }, + "rendered_preview": { + "href": "https://planetarycomputer.microsoft.com/api/data/v1/item/preview.png?collection=naip&item=ok_m_3409902_nw_14_1_20100425&assets=image&asset_bidx=image%7C1%2C2%2C3", + "title": "Rendered preview", + "type": "image/png", + "roles": [ + "overview" + ], + "rel": "preview" + } + }, + "collection": "naip" + } + ], + "type": "FeatureCollection" +} \ No newline at end of file diff --git a/stac-arrow/data/naip.parquet b/stac-arrow/data/naip.parquet new file mode 100644 index 000000000..e54d47a2e Binary files /dev/null and b/stac-arrow/data/naip.parquet differ diff --git a/stac-arrow/src/error.rs b/stac-arrow/src/error.rs new file mode 100644 index 000000000..4df6d4236 --- /dev/null +++ 
b/stac-arrow/src/error.rs @@ -0,0 +1,60 @@ +use arrow::datatypes::Schema; +use thiserror::Error; + +/// Crate-specific error enum +#[derive(Debug, Error)] +pub enum Error { + /// [arrow::error::ArrowError] + #[error(transparent)] + Arrow(#[from] arrow::error::ArrowError), + + /// Two record batches have different schemas. + #[error("different schemas")] + DifferentSchemas(Schema, Schema), + + /// [geoarrow::error::GeoArrowError] + #[error(transparent)] + GeoArrow(#[from] geoarrow::error::GeoArrowError), + + /// [geojson::Error] + #[error(transparent)] + Geojson(#[from] geojson::Error), + + /// [geozero::error::GeozeroError] + #[error(transparent)] + Geozero(#[from] geozero::error::GeozeroError), + + /// [std::io::Error] + #[error(transparent)] + Io(#[from] std::io::Error), + + /// Invalid bbox. + /// + /// TODO this should probably be in the stac crate. + #[error("invalid bbox")] + InvalidBbox(Vec<f64>), + + /// An invalid datetime string. + #[error("invalid datetime: {0}")] + InvalidDatetime(String), + + /// A required field is missing. + #[error("missing required field: {0}")] + MissingField(&'static str), + + /// The geometry column is not binary. + #[error("non-binary geometry column")] + NonBinaryGeometryColumn, + + /// No items to serialize. + #[error("no items")] + NoItems, + + /// [serde_json::Error] + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + + /// [stac::Error] + #[error(transparent)] + Stac(#[from] stac::Error), +} diff --git a/stac-arrow/src/lib.rs b/stac-arrow/src/lib.rs new file mode 100644 index 000000000..dd4492aae --- /dev/null +++ b/stac-arrow/src/lib.rs @@ -0,0 +1,312 @@ +//! Read and write STAC to/from [geoarrow](https://github.com/geoarrow/geoarrow). +//! +//! The arrow data formatted per the [stac-geoparquet +//! spec](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/stac-geoparquet-spec.md).
+ +#![deny( + elided_lifetimes_in_paths, + explicit_outlives_requirements, + keyword_idents, + macro_use_extern_crate, + meta_variable_misuse, + missing_abi, + missing_debug_implementations, + missing_docs, + non_ascii_idents, + noop_method_call, + pointer_structural_match, + rust_2021_incompatible_closure_captures, + rust_2021_incompatible_or_patterns, + rust_2021_prefixes_incompatible_syntax, + rust_2021_prelude_collisions, + single_use_lifetimes, + trivial_casts, + trivial_numeric_casts, + unreachable_pub, + unsafe_code, + unsafe_op_in_unsafe_fn, + unused_crate_dependencies, + unused_extern_crates, + unused_import_braces, + unused_lifetimes, + unused_qualifications, + unused_results, + warnings +)] + +mod error; + +use arrow::{ + array::{AsArray, Float64Builder, RecordBatch, StructBuilder, TimestampMicrosecondBuilder}, + compute::kernels::cast_utils::Parser, + datatypes::{ArrowPrimitiveType, DataType, Field, SchemaBuilder, TimestampMicrosecondType}, +}; +use arrow_json::ReaderBuilder; +pub use error::Error; +use geo::Geometry; +use geoarrow::{array::MixedGeometryBuilder, table::GeoTable, GeometryArrayTrait}; +use geozero::wkb::{FromWkb, WkbDialect}; +use serde_json::{Map, Value}; +use stac::{item::GeoparquetItem, Item}; +use std::{collections::HashMap, io::Cursor, sync::Arc}; + +/// Crate-specific result type. +pub type Result<T> = std::result::Result<T, Error>; + +const DATETIME_ATTRIBUTES: [&str; 8] = [ + "datetime", + "start_datetime", + "end_datetime", + "created", + "updated", + "expires", + "published", + "unpublished", +]; + +/// Converts a [RecordBatch] into a vector of [Items](Item).
+///
+/// The `geometry` column is removed from the batch and decoded from well-known binary,
+/// while every other column is converted to JSON rows and deserialized into items.
+///
+/// # Errors
+///
+/// Returns an error if the batch has no `geometry` column, if that column is not a binary
+/// array with 32-bit offsets ([Error::NonBinaryGeometryColumn]), if a row cannot be
+/// deserialized into an item, or if a geometry is not valid well-known binary.
+///
+/// # Examples
+///
+/// ```
+/// use std::fs::File;
+/// use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+///
+/// let file = File::open("data/naip.parquet").unwrap();
+/// let reader = ParquetRecordBatchReaderBuilder::try_new(file)
+///     .unwrap()
+///     .build()
+///     .unwrap();
+/// let mut items = Vec::new();
+/// for result in reader {
+///     items.extend(stac_arrow::record_batch_to_items(result.unwrap()).unwrap());
+/// }
+/// assert_eq!(items.len(), 5);
+/// ```
+#[allow(deprecated)] // We find that `record_batches_to_json_rows` is faster than serializing-then-deserializing with `Writer`
+pub fn record_batch_to_items(mut record_batch: RecordBatch) -> Result<Vec<Item>> {
+    let index = record_batch.schema().index_of("geometry")?;
+    let geometry = record_batch.remove_column(index);
+    // TODO allow for i64 offsets
+    let geometry = geometry
+        .as_binary_opt::<i32>()
+        .ok_or(Error::NonBinaryGeometryColumn)?;
+    let rows: Vec<Map<String, Value>> =
+        arrow_json::writer::record_batches_to_json_rows(&[&record_batch])?;
+    rows.into_iter()
+        .enumerate()
+        .map(|(i, row)| {
+            let geoparquet_item: GeoparquetItem = serde_json::from_value(Value::Object(row))?;
+            // TODO handle null geometries
+            let mut item: Item = geoparquet_item.try_into()?;
+            // Row `i` of the JSON rows lines up with row `i` of the removed geometry column.
+            item.geometry = Some(
+                (&Geometry::from_wkb(&mut Cursor::new(geometry.value(i)), WkbDialect::Wkb)?).into(),
+            );
+            Ok(item)
+        })
+        .collect()
+}
+
+/// Converts items to a record batch.
+///
+/// Each item's geometry and bbox are written to dedicated `geometry` and `bbox` columns,
+/// and each datetime attribute that is set on at least one item is written to a nullable
+/// timestamp column. The schema of all remaining attributes is inferred from their JSON.
+///
+/// # Errors
+///
+/// Returns [Error::NoItems] if `items` is empty, [Error::InvalidBbox] if an item's bbox
+/// does not have exactly four values, and [Error::InvalidDatetime] if a datetime attribute
+/// is neither null nor a parseable datetime string. Geometry, serialization, and arrow
+/// errors are passed through.
+///
+/// # Examples
+///
+/// ```
+/// use stac::ItemCollection;
+///
+/// let item_collection: ItemCollection = stac::read_json("data/naip.json").unwrap();
+/// let record_batch = stac_arrow::items_to_record_batch(item_collection.items).unwrap();
+/// ```
+pub fn items_to_record_batch(items: Vec<Item>) -> Result<RecordBatch> {
+    if items.is_empty() {
+        return Err(Error::NoItems);
+    }
+    let mut values = Vec::with_capacity(items.len());
+    let mut geometry_builder = MixedGeometryBuilder::<i32>::new();
+    let mut datetime_builders = HashMap::new();
+    let mut datetime_keys = Vec::new();
+    // TODO support 3D bboxes
+    let bbox_fields = vec![
+        Field::new("xmin", DataType::Float64, false),
+        Field::new("ymin", DataType::Float64, false),
+        Field::new("xmax", DataType::Float64, false),
+        Field::new("ymax", DataType::Float64, false),
+    ];
+    let mut bbox_builder = StructBuilder::from_fields(bbox_fields.clone(), items.len());
+    for mut item in items {
+        // TODO allow configuring dropping of invalid attributes.
+        let geometry: Option<Geometry> = item.geometry.take().map(|g| g.try_into()).transpose()?;
+        geometry_builder.push_geometry(geometry.as_ref())?;
+        let geoparquet_item = item.into_geoparquet_item(true)?;
+        if geoparquet_item.bbox.len() != 4 {
+            return Err(Error::InvalidBbox(geoparquet_item.bbox));
+        }
+        for (i, &coordinate) in geoparquet_item.bbox.iter().enumerate() {
+            bbox_builder
+                .field_builder::<Float64Builder>(i)
+                .expect("every bbox field should have a Float64 builder")
+                .append_value(coordinate);
+        }
+        bbox_builder.append(true);
+
+        let mut value = serde_json::to_value(geoparquet_item)?;
+        let object = value
+            .as_object_mut()
+            .expect("geoparquet item should be a map");
+        // The geometry and bbox get their own typed columns (appended below), so they are
+        // dropped before the JSON schema is inferred.
+        let _ = object.remove("geometry");
+        let _ = object.remove("bbox");
+        for key in DATETIME_ATTRIBUTES {
+            // Every builder receives exactly one entry per item so that all timestamp
+            // columns end up with one row per item.
+            let builder = datetime_builders
+                .entry(key)
+                .or_insert_with(TimestampMicrosecondBuilder::new);
+            match object.remove(key) {
+                // An explicit JSON null (e.g. `"datetime": null` on an item that only has
+                // a start and end datetime) is a missing timestamp, not an invalid one.
+                None | Some(Value::Null) => builder.append_null(),
+                Some(datetime) => {
+                    let timestamp = datetime
+                        .as_str()
+                        .and_then(TimestampMicrosecondType::parse)
+                        .ok_or_else(|| Error::InvalidDatetime(datetime.to_string()))?;
+                    // Only keys with at least one real value become columns.
+                    if !datetime_keys.contains(&key) {
+                        datetime_keys.push(key);
+                    }
+                    builder.append_value(timestamp);
+                }
+            }
+        }
+        values.push(value);
+    }
+    let geometry = geometry_builder.finish();
+    // TODO allow configuration of how many items to iterate
+    let schema = arrow_json::reader::infer_json_schema_from_iterator(values.iter().map(Ok))?;
+    let mut decoder = ReaderBuilder::new(Arc::new(schema.clone())).build_decoder()?;
+    decoder.serialize(&values)?;
+    // At least one value was serialized above, so a batch is expected here; report a
+    // missing batch as an error instead of panicking.
+    let record_batch = decoder.flush()?.ok_or(Error::NoItems)?;
+    let mut builder = SchemaBuilder::from(schema.fields);
+    builder.push(geometry.extension_field());
+    builder.push(Field::new(
+        "bbox",
+        DataType::Struct(bbox_fields.into()),
+        false,
+    ));
+    for key in &datetime_keys {
+        builder.push(Field::new(*key, TimestampMicrosecondType::DATA_TYPE, true));
+    }
+    let schema = builder.finish();
+    // The columns are appended in the same order as their fields were pushed above.
+    let mut columns = record_batch.columns().to_vec();
+    columns.push(geometry.to_array_ref());
+    columns.push(Arc::new(bbox_builder.finish()));
+    for key in datetime_keys {
+        columns.push(Arc::new(
+            datetime_builders
+                .get_mut(key)
+                .expect("should be a builder for every key")
+                .finish(),
+        ));
+    }
+    RecordBatch::try_new(Arc::new(schema), columns).map_err(Error::from)
+}
+
+/// Converts items to a [GeoTable].
+///
+/// The items are split into record batches of at most `batch_size` items. A `batch_size`
+/// of zero puts every item into a single batch.
+///
+/// # Errors
+///
+/// Returns [Error::NoItems] if `items` is empty and [Error::DifferentSchemas] if a batch's
+/// schema differs from the first batch's schema. Errors from converting a batch are passed
+/// through.
+///
+/// # Examples
+///
+/// ```
+/// use stac::ItemCollection;
+///
+/// let item_collection: ItemCollection = stac::read_json("data/naip.json").unwrap();
+/// let geo_table = stac_arrow::items_to_geo_table(item_collection.items, 100).unwrap();
+/// ```
+pub fn items_to_geo_table(items: Vec<Item>, batch_size: usize) -> Result<GeoTable> {
+    // A batch size of zero cannot hold any items, so it means "one batch with everything".
+    let batch_size = if batch_size == 0 {
+        items.len()
+    } else {
+        batch_size
+    };
+    let mut record_batches: Vec<RecordBatch> = Vec::new();
+    let mut items = items.into_iter();
+    loop {
+        // Taking from the owning iterator moves the items into the batch without cloning.
+        let chunk: Vec<Item> = items.by_ref().take(batch_size).collect();
+        if chunk.is_empty() {
+            break;
+        }
+        let record_batch = items_to_record_batch(chunk)?;
+        if let Some(first) = record_batches.first() {
+            if first.schema() != record_batch.schema() {
+                return Err(Error::DifferentSchemas(
+                    (*first.schema()).clone(),
+                    (*record_batch.schema()).clone(),
+                ));
+            }
+        }
+        record_batches.push(record_batch);
+    }
+    if record_batches.is_empty() {
+        return Err(Error::NoItems);
+    }
+    let schema = record_batches[0].schema();
+    let (geometry_column_index, _) = schema
+        .column_with_name("geometry")
+        .expect("should have a geometry field");
+    GeoTable::try_new(schema, record_batches, geometry_column_index).map_err(Error::from)
+}
+
+#[cfg(test)]
+mod tests {
+    use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+    use stac::ItemCollection;
+    use stac_validate::Validate;
+    use std::fs::File;
+
+    #[test]
+    fn record_batch_to_items() {
+        let file = File::open("data/naip.parquet").unwrap();
+        let mut reader = ParquetRecordBatchReaderBuilder::try_new(file)
+            .unwrap()
+            .build()
+            .unwrap();
+        let items = reader
+            .next()
+            .map(|result| super::record_batch_to_items(result.unwrap()).unwrap())
+            .unwrap();
+        assert_eq!(items.len(), 5);
+        for item in items {
+            assert_eq!(item.extensions.len(), 2);
+            assert!(item.geometry.is_some());
+            assert!(item.bbox.is_some());
+            assert!(!item.links.is_empty());
+            assert!(!item.assets.is_empty());
+            assert!(item.collection.is_some());
+            item.validate().unwrap();
+        }
+    }
+
+    #[test]
+    fn items_to_record_batch() {
+        let items: ItemCollection = stac::read_json("data/naip.json").unwrap();
+        let record_batch = super::items_to_record_batch(items.items).unwrap();
+        assert_eq!(record_batch.num_rows(), 5);
+    }
+
+    #[test]
+    fn items_to_geo_table() {
+        let items: ItemCollection = stac::read_json("data/naip.json").unwrap();
+        let geo_table = super::items_to_geo_table(items.items, 5).unwrap();
+        assert_eq!(geo_table.len(), 5);
+    }
+
+    #[test]
+    fn items_to_geo_table_zero_batch_size() {
+        let items: ItemCollection = stac::read_json("data/naip.json").unwrap();
+        let geo_table = super::items_to_geo_table(items.items, 0).unwrap();
+        assert_eq!(geo_table.len(), 5);
+    }
+}
+
+// From https://github.com/rust-lang/cargo/issues/383#issuecomment-720873790,
+// may they be forever blessed.
+#[cfg(doctest)]
+mod readme {
+    macro_rules! external_doc_test {
+        ($x:expr) => {
+            #[doc = $x]
+            extern "C" {}
+        };
+    }
+
+    external_doc_test!(include_str!("../README.md"));
+}
+
+#[cfg(test)]
+use criterion as _;
diff --git a/stac-cli/Cargo.toml b/stac-cli/Cargo.toml index 59a97aeee..4db685dcd 100644 --- a/stac-cli/Cargo.toml +++ b/stac-cli/Cargo.toml @@ -11,16 +11,21 @@ keywords = ["geospatial", "stac", "metadata", "geo", "raster"] categories = ["science", "data-structures"] [features] -default = ["gdal", "pgstac"] +default = ["parquet", "gdal", "pgstac"] +parquet = ["stac-arrow", "dep:parquet", "dep:arrow", "dep:geoarrow"] gdal = ["stac/gdal"] pgstac = ["stac-server/pgstac"] [dependencies] +arrow = { version = "51", optional = true } axum = "0.7" clap = { version = "4", features = ["derive"] } +geoarrow = { version = "0.2", features = ["parquet"], optional = true } +parquet = { version = "51", optional = true } serde = "1" serde_json = "1" stac = { version = "0.7", path = "../stac" } +stac-arrow = { version = "0.1", path = "../stac-arrow", optional = true } stac-api = { version = "0.4", path = "../stac-api" } stac-async = { version = "0.5", path = "../stac-async" } stac-server = { version = "0.1", path = "../stac-server", features = [ diff --git a/stac-cli/src/args.rs b/stac-cli/src/args.rs index 2b85679ba..1eebdf99f 100644 --- a/stac-cli/src/args.rs +++ b/stac-cli/src/args.rs @@ -1,13 +1,15 @@ -use crate::{Error, Result, Subcommand}; +use crate::{Error, Format, Result, Subcommand}; use clap::Parser; -use
serde::{de::DeserializeOwned, Serialize}; +#[cfg(feature = "parquet")] +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use serde::Serialize; use serde_json::json; use stac::{item::Builder, Asset, Value}; use stac_api::{GetItems, GetSearch, Item, ItemCollection}; use stac_async::ApiClient; use stac_server::{Api, Backend, MemoryBackend}; use stac_validate::Validate; -use std::path::Path; +use std::{fs::File, io::Write, path::Path}; use tokio::net::TcpListener; use tokio_stream::StreamExt; use url::Url; @@ -30,6 +32,15 @@ impl Args { pub async fn execute(self) -> i32 { use Subcommand::*; let result = match &self.subcommand { + Convert { + from, + to, + in_format, + out_format, + } => { + self.convert(from.as_deref(), to.as_deref(), *in_format, *out_format) + .await + } Item { id_or_href, id, @@ -94,6 +105,17 @@ impl Args { } }
+    /// Converts STAC data from one format to another.
+    ///
+    /// An href of `-` is treated like a missing href: standard input for `from` and
+    /// standard output for `to`. This matches the documentation of the `convert` subcommand.
+    async fn convert(
+        &self,
+        from: Option<&str>,
+        to: Option<&str>,
+        in_format: Option<Format>,
+        out_format: Option<Format>,
+    ) -> Result<()> {
+        // Without this, `-` would be opened as a file named "-".
+        let from = from.filter(|href| *href != "-");
+        let to = to.filter(|href| *href != "-");
+        let value = self.read_href(from, in_format).await?;
+        self.write_href(value, to, out_format).await
+    }
+
 #[allow(clippy::too_many_arguments)] fn item( &self, @@ -267,12 +289,18 @@ impl Args { } async fn sort(&self, href: Option<&str>) -> Result<()> { - let value: Value = self.read_href(href).await?; + // TODO allow specifying formats + let value: Value = self.read_href(href, None).await?; self.println(value) } async fn validate(&self, href: Option<&str>) -> Result<()> { - let value: serde_json::Value = self.read_href(href).await?; + // TODO allow specifying formats + let value: serde_json::Value = if let Some(href) = href { + stac_async::read_json(href).await? + } else { + serde_json::from_reader(std::io::stdin())?
+ }; let mut errors: Vec = Vec::new(); let mut update_errors = |result: std::result::Result<(), stac_validate::Error>| match result { @@ -323,11 +351,82 @@ impl Args { } }
-    async fn read_href(&self, href: Option<&str>) -> Result {
-        if let Some(href) = href {
-            stac_async::read_json(href).await.map_err(Error::from)
-        } else {
-            serde_json::from_reader(std::io::stdin()).map_err(Error::from)
+    /// Reads a STAC value from an href, or from standard input if there is no href.
+    ///
+    /// If `format` is `None`, the format is inferred from the href's extension, falling
+    /// back to the default format (JSON).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the data cannot be read or parsed, or if STAC GeoParquet is
+    /// requested from standard input, which is not supported yet.
+    async fn read_href(&self, href: Option<&str>, format: Option<Format>) -> Result<Value> {
+        let format = format.unwrap_or_else(|| href.and_then(Format::from_href).unwrap_or_default());
+        match format {
+            Format::Json => {
+                if let Some(href) = href {
+                    stac_async::read_json(href).await.map_err(Error::from)
+                } else {
+                    serde_json::from_reader(std::io::stdin()).map_err(Error::from)
+                }
+            }
+            #[cfg(feature = "parquet")]
+            Format::GeoParquet => {
+                let reader = if let Some(href) = href {
+                    let file = File::open(href)?;
+                    ParquetRecordBatchReaderBuilder::try_new(file)?.build()?
+                } else {
+                    // FIXME support standard input; until then, report an error instead
+                    // of panicking on user input.
+                    return Err(Error::Custom(
+                        "cannot read STAC GeoParquet from standard input".to_string(),
+                    ));
+                };
+                let mut items = Vec::new();
+                for result in reader {
+                    items.extend(stac_arrow::record_batch_to_items(result?)?);
+                }
+                Ok(Value::ItemCollection(items.into()))
+            }
+        }
+    }
+
+    /// Writes a STAC value to an href, or to standard output if there is no href.
+    ///
+    /// If `format` is `None`, the format is inferred from the href's extension, falling
+    /// back to the default format (JSON).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the value cannot be serialized or written, or if anything other
+    /// than an item or an item collection is written as STAC GeoParquet.
+    async fn write_href(
+        &self,
+        value: Value,
+        href: Option<&str>,
+        format: Option<Format>,
+    ) -> Result<()> {
+        let format = format.unwrap_or_else(|| href.and_then(Format::from_href).unwrap_or_default());
+        match format {
+            Format::Json => {
+                if let Some(href) = href {
+                    let output = if self.compact {
+                        serde_json::to_string(&value)?
+                    } else {
+                        serde_json::to_string_pretty(&value)?
+                    };
+                    let mut file = File::create(href)?;
+                    file.write_all(output.as_bytes())?;
+                } else {
+                    self.println(value)?;
+                }
+                Ok(())
+            }
+            #[cfg(feature = "parquet")]
+            Format::GeoParquet => {
+                let items = match value {
+                    Value::ItemCollection(item_collection) => item_collection.items,
+                    Value::Item(item) => vec![item],
+                    _ => {
+                        return Err(Error::Custom(format!(
+                            "cannot write STAC GeoParquet of type: {}",
+                            value.type_name()
+                        )))
+                    }
+                };
+                // TODO allow customizing batch size
+                let mut geo_table = stac_arrow::items_to_geo_table(items, 1000)?;
+                if let Some(href) = href {
+                    let file = File::create(href)?;
+                    geoarrow::io::parquet::write_geoparquet(&mut geo_table, file, None)?;
+                } else {
+                    geoarrow::io::parquet::write_geoparquet(
+                        &mut geo_table,
+                        std::io::stdout(),
+                        None,
+                    )?;
+                }
+                Ok(())
+            }
         }
     }
diff --git a/stac-cli/src/error.rs b/stac-cli/src/error.rs index e851f84b5..63e07156d 100644 --- a/stac-cli/src/error.rs +++ b/stac-cli/src/error.rs @@ -4,6 +4,11 @@ use thiserror::Error; #[derive(Error, Debug)] #[non_exhaustive] pub enum Error { + /// [arrow::error::ArrowError] + #[cfg(feature = "parquet")] + #[error(transparent)] + Arrow(#[from] arrow::error::ArrowError), + /// Custom error. #[error("{0}")] Custom(String), @@ -12,6 +17,20 @@ pub enum Error { #[error(transparent)] Io(#[from] std::io::Error), + /// Invalid format.
+ #[error("invalid format: {0}")] + InvalidFormat(String), + + /// [geoarrow::error::GeoArrowError] + #[cfg(feature = "parquet")] + #[error(transparent)] + GeoArrow(#[from] geoarrow::error::GeoArrowError), + + /// [parquet::errors::ParquetError] + #[cfg(feature = "parquet")] + #[error(transparent)] + Parquet(#[from] parquet::errors::ParquetError), + /// [serde_json::Error] #[error(transparent)] SerdeJson(#[from] serde_json::Error), @@ -24,6 +43,11 @@ pub enum Error { #[error(transparent)] Stac(#[from] stac::Error), + /// [stac_arrow::Error] + #[cfg(feature = "parquet")] + #[error(transparent)] + StacArrow(#[from] stac_arrow::Error), + /// [stac_async::Error] #[error(transparent)] StacAsync(#[from] stac_async::Error),
diff --git a/stac-cli/src/format.rs b/stac-cli/src/format.rs
new file mode 100644
index 000000000..73e43ec84
--- /dev/null
+++ b/stac-cli/src/format.rs
@@ -0,0 +1,58 @@
+use crate::Error;
+use std::{path::Path, str::FromStr};
+
+/// Formats that can be used for STAC data.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
+pub enum Format {
+    /// JSON format, the default.
+    #[default]
+    Json,
+
+    /// [Geoparquet](https://github.com/stac-utils/stac-geoparquet)
+    #[cfg(feature = "parquet")]
+    GeoParquet,
+}
+
+impl Format {
+    /// Returns the correct format for this href's extension.
+    ///
+    /// The extension is matched case-insensitively, using the same names that
+    /// [`FromStr`] accepts. Returns `None` if the href has no extension or the
+    /// extension is not a known format.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use stac_cli::Format;
+    ///
+    /// assert_eq!(Format::Json, Format::from_href("item.json").unwrap());
+    /// assert_eq!(Format::Json, Format::from_href("ITEM.JSON").unwrap());
+    /// assert_eq!(None, Format::from_href("item.txt"));
+    /// #[cfg(feature = "parquet")]
+    /// assert_eq!(Format::GeoParquet, Format::from_href("items.parquet").unwrap());
+    /// ```
+    pub fn from_href(href: &str) -> Option<Format> {
+        Path::new(href)
+            .extension()
+            .and_then(|extension| extension.to_str())
+            .and_then(|extension| extension.parse().ok())
+    }
+}
+
+impl FromStr for Format {
+    type Err = Error;
+
+    /// Parses a format name, ignoring case.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::InvalidFormat`] if the name is not a known format.
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "json" => Ok(Format::Json),
+            #[cfg(feature = "parquet")]
+            "geoparquet" | "parquet" => Ok(Format::GeoParquet),
+            _ => Err(Error::InvalidFormat(s.to_string())),
+        }
+    }
+}
diff --git a/stac-cli/src/lib.rs b/stac-cli/src/lib.rs index 37108f597..fbd41edc9 100644 --- a/stac-cli/src/lib.rs +++ b/stac-cli/src/lib.rs @@ -34,9 +34,10 @@ mod args; mod error; +mod format; mod subcommand; -pub use {args::Args, error::Error, subcommand::Subcommand}; +pub use {args::Args, error::Error, format::Format, subcommand::Subcommand}; /// Crate-specific result type. pub type Result = std::result::Result; diff --git a/stac-cli/src/subcommand.rs b/stac-cli/src/subcommand.rs index fcf2a9de8..ff61c4b7a 100644 --- a/stac-cli/src/subcommand.rs +++ b/stac-cli/src/subcommand.rs @@ -1,7 +1,36 @@ +use crate::Format; + /// A CLI subcommand. #[derive(Debug, clap::Subcommand)] #[allow(clippy::large_enum_variant)] pub enum Subcommand { + /// Converts STAC data from one format to another. + Convert { + /// The href of the STAC object. + /// + /// If this is not provided or is `-`, will read from standard input. + from: Option, + + /// The output href. + /// + /// If this is not provided or is `-`, will be printed to standard output. + to: Option, + + /// The input format.
+ /// + /// If not provided, will be inferred from the extension of the input + /// href or, if that is not provided, will default to JSON. + #[arg(short, long)] + in_format: Option, + + /// The output format. + /// + /// If not provided, will be inferred from the extension of the output + /// href or, if that is not provided, will default to JSON. + #[arg(short, long)] + out_format: Option, + }, + /// Creates a STAC Item. Item { /// The item id or asset href. diff --git a/stac/src/item.rs b/stac/src/item.rs index 0bcbe5029..8c3515c8c 100644 --- a/stac/src/item.rs +++ b/stac/src/item.rs @@ -118,8 +118,10 @@ pub struct GeoparquetItem { pub id: String, /// For GeoParquet 1.0 this must be well-known Binary - #[serde(default)] - pub geometry: Vec, + /// + /// We allow this to be null because we sometimes need to remove the + /// geometry for extra processing. + pub geometry: Option>, /// Can be a 4 or 6 value struct, depending on dimension of the data. /// @@ -628,8 +630,7 @@ impl Item { tm: false, }) }) - .transpose()? 
- .ok_or_else(|| Error::MissingGeometry)?, + .transpose()?, bbox: self.bbox.ok_or_else(|| Error::MissingBbox)?, links: self.links, assets: self.assets, @@ -648,13 +649,17 @@ impl TryFrom for Item { use geozero::wkb::{FromWkb, WkbDialect}; use std::io::Cursor; - let geometry = Geometry::::from_wkb(&mut Cursor::new(item.geometry), WkbDialect::Wkb)?; + let geometry = if let Some(geometry) = item.geometry { + Some((&Geometry::::from_wkb(&mut Cursor::new(geometry), WkbDialect::Wkb)?).into()) + } else { + None + }; Ok(Item { r#type: item.r#type.unwrap_or_else(|| ITEM_TYPE.to_string()), version: STAC_VERSION.to_string(), extensions: item.extensions, id: item.id, - geometry: Some((&geometry).into()), + geometry, bbox: Some(item.bbox), links: item.links, assets: item.assets, @@ -1024,14 +1029,16 @@ mod tests { r#type: None, extensions: Vec::new(), id: "an-id".to_string(), - geometry: Geometry::Point((-105., 41.).into()) - .to_wkb(CoordDimensions { - z: false, - m: false, - t: false, - tm: false, - }) - .unwrap(), + geometry: Some( + Geometry::Point((-105., 41.).into()) + .to_wkb(CoordDimensions { + z: false, + m: false, + t: false, + tm: false, + }) + .unwrap(), + ), bbox: vec![-105., 41., -105., 41.], links: Vec::new(), assets: Default::default(), @@ -1049,6 +1056,6 @@ mod tests { let mut value = serde_json::to_value(item).unwrap(); let _ = value.as_object_mut().unwrap().remove("geometry").unwrap(); let geoparquet_item: GeoparquetItem = serde_json::from_value(value).unwrap(); - assert_eq!(geoparquet_item.geometry, Vec::::new()); + assert_eq!(geoparquet_item.geometry, None); } }