Skip to content

Commit

Permalink
feat: add stac-arrow
Browse files Browse the repository at this point in the history
  • Loading branch information
gadomski committed Jun 7, 2024
1 parent c0a3e97 commit 703d0aa
Show file tree
Hide file tree
Showing 24 changed files with 1,763 additions and 38 deletions.
17 changes: 17 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@ jobs:
- "-p stac -F wkb"
- "-p stac-api"
- "-p stac -p stac-api -F geo"
- "-p stac-arrow"
- "-p stac-async"
- "-p stac-cli --no-default-features"
- "-p stac-cli --no-default-features -F parquet"
- "-p stac-server --no-default-features"
- "-p stac-server --no-default-features -F axum"
- "-p stac-server --no-default-features -F memory-item-search"
Expand Down Expand Up @@ -174,3 +176,18 @@ jobs:
run: pip install -r scripts/requirements.txt
- name: Validate
run: scripts/validate-stac-server --pgstac
validate-stac-geoparquet:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Set up Rust cache
uses: Swatinem/rust-cache@v2
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.12"
cache: "pip"
- name: Install Python dependencies
run: pip install -r scripts/requirements.txt
- name: Validate
run: scripts/validate-stac-geoparquet
2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ members = [
"stac",
"pgstac",
"stac-api",
"stac-arrow",
"stac-async",
"stac-cli",
"stac-server",
Expand All @@ -12,6 +13,7 @@ members = [
default-members = [
"stac",
"stac-api",
"stac-arrow",
"stac-async",
"stac-cli",
"stac-server",
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ This monorepo contains several crates:
| [stac](./stac/README.md) | Core data structures and synchronous I/O | [![docs.rs](https://img.shields.io/docsrs/stac?style=flat-square)](https://docs.rs/stac/latest/stac/) <br> [![Crates.io](https://img.shields.io/crates/v/stac?style=flat-square)](https://crates.io/crates/stac) |
| [pgstac](./pgstac/README.md) | Bindings for [pgstac](https://github.com/stac-utils/pgstac) | [![docs.rs](https://img.shields.io/docsrs/pgstac?style=flat-square)](https://docs.rs/pgstac/latest/pgstac/) <br> [![Crates.io](https://img.shields.io/crates/v/pgstac?style=flat-square)](https://crates.io/crates/pgstac) |
| [stac-api](./stac-api/README.md) | Data structures for the [STAC API](https://github.com/radiantearth/stac-api-spec) specification | [![docs.rs](https://img.shields.io/docsrs/stac-api?style=flat-square)](https://docs.rs/stac-api/latest/stac_api/) <br> [![Crates.io](https://img.shields.io/crates/v/stac-api?style=flat-square)](https://crates.io/crates/stac-api) |
| [stac-arrow](./stac-arrow/README.md) | Read and write STAC data stored in [arrow](https://arrow.apache.org/) | [![docs.rs](https://img.shields.io/docsrs/stac-arrow?style=flat-square)](https://docs.rs/stac-arrow/latest/stac_arrow/) <br> [![Crates.io](https://img.shields.io/crates/v/stac-arrow?style=flat-square)](https://crates.io/crates/stac-arrow) |
| [stac-async](./stac-async/README.md) | Asynchronous I/O with [tokio](https://tokio.rs/) | [![docs.rs](https://img.shields.io/docsrs/stac-async?style=flat-square)](https://docs.rs/stac-async/latest/stac_async/) <br> [![Crates.io](https://img.shields.io/crates/v/stac-async?style=flat-square)](https://crates.io/crates/stac-async) |
| [stac-cli](./stac-cli/README.md)| Command line interface | [![docs.rs](https://img.shields.io/docsrs/stac-cli?style=flat-square)](https://docs.rs/stac-cli/latest/stac_cli/) <br> [![Crates.io](https://img.shields.io/crates/v/stac-cli?style=flat-square)](https://crates.io/crates/stac-cli) |
| [stac-server](./stac-server/README.md)| STAC API server with multiple backends | [![docs.rs](https://img.shields.io/docsrs/stac-server?style=flat-square)](https://docs.rs/stac-server/latest/stac_server/) <br> [![Crates.io](https://img.shields.io/crates/v/stac-server?style=flat-square)](https://crates.io/crates/stac-server) |
Expand Down
1 change: 1 addition & 0 deletions scripts/requirements.in
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
stac-api-validator
stac-geoparquet
67 changes: 56 additions & 11 deletions scripts/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,40 @@
# uv pip compile scripts/requirements.in
attrs==23.2.0
# via
# fiona
# jsonschema
# referencing
certifi==2024.2.2
certifi==2024.6.2
# via
# fiona
# pyproj
# requests
# stac-api-validator
charset-normalizer==3.3.2
# via requests
ciso8601==2.3.1
# via stac-geoparquet
click==8.1.7
# via
# click-plugins
# cligj
# fiona
# stac-api-validator
# stac-check
# stac-validator
click-plugins==1.1.1
# via fiona
cligj==0.7.2
# via fiona
deepdiff==6.7.1
# via stac-api-validator
fiona==1.9.6
# via geopandas
geopandas==0.14.4
# via stac-geoparquet
idna==3.7
# via requests
jsonschema==4.21.1
jsonschema==4.22.0
# via
# pystac
# stac-api-validator
Expand All @@ -30,52 +46,81 @@ jsonschema-specifications==2023.12.1
more-itertools==8.14.0
# via stac-api-validator
numpy==1.26.4
# via shapely
# via
# geopandas
# pandas
# pyarrow
# shapely
ordered-set==4.1.0
# via deepdiff
orjson==3.10.1
orjson==3.10.3
# via pystac
pystac[orjson,validation]==1.10.0
packaging==24.0
# via
# geopandas
# stac-geoparquet
pandas==2.2.2
# via
# geopandas
# stac-geoparquet
pyarrow==16.1.0
# via stac-geoparquet
pyproj==3.6.1
# via geopandas
pystac==1.10.1
# via
# pystac-client
# stac-api-validator
# stac-geoparquet
pystac-client==0.7.7
# via stac-api-validator
python-dateutil==2.9.0.post0
# via
# pandas
# pystac
# pystac-client
python-dotenv==1.0.1
# via stac-check
pytz==2024.1
# via pandas
pyyaml==6.0.1
# via
# stac-api-validator
# stac-check
referencing==0.35.0
referencing==0.35.1
# via
# jsonschema
# jsonschema-specifications
requests==2.32.0
requests==2.32.3
# via
# pystac-client
# stac-api-validator
# stac-check
# stac-validator
rpds-py==0.18.0
rpds-py==0.18.1
# via
# jsonschema
# referencing
shapely==2.0.4
# via stac-api-validator
# via
# geopandas
# stac-api-validator
# stac-geoparquet
six==1.16.0
# via python-dateutil
# via
# fiona
# python-dateutil
stac-api-validator==0.6.2
# via -r requirements.in
# via -r scripts/requirements.in
stac-check==1.3.3
# via stac-api-validator
stac-geoparquet==0.5.1
# via -r scripts/requirements.in
stac-validator==3.3.2
# via
# stac-api-validator
# stac-check
tzdata==2024.1
# via pandas
urllib3==2.2.1
# via requests
30 changes: 30 additions & 0 deletions scripts/validate-stac-geoparquet
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python3

import subprocess
from pathlib import Path
from tempfile import TemporaryDirectory

import pyarrow.parquet
import stac_geoparquet.arrow

# Convert the NAIP item collection to parquet with stac-cli, then check that
# stac-geoparquet reads five items back out of the resulting file.
with TemporaryDirectory() as scratch_directory:
    parquet_path = Path(scratch_directory) / "naip.parquet"
    convert_command = [
        "cargo",
        "run",
        "-p",
        "stac-cli",
        "--no-default-features",
        "-F",
        "parquet",
        "--",
        "convert",
        "stac-arrow/data/naip.json",
        parquet_path,
    ]
    # check=True raises CalledProcessError when the command exits non-zero.
    subprocess.run(convert_command, check=True)

    # Read the file back while the temporary directory still exists.
    naip_table = pyarrow.parquet.read_table(parquet_path)
    item_count = sum(
        1 for _ in stac_geoparquet.arrow.stac_table_to_items(naip_table)
    )
    assert item_count == 5
7 changes: 7 additions & 0 deletions stac-arrow/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased
31 changes: 31 additions & 0 deletions stac-arrow/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
[package]
name = "stac-arrow"
version = "0.1.0"
authors = ["Pete Gadomski <pete.gadomski@gmail.com>"]
edition = "2021"
description = "Read and write STAC data stored in geoarrow and geoparquet formats"
homepage = "https://github.com/stac-utils/stac-rs"
repository = "https://github.com/stac-utils/stac-rs"
keywords = ["geospatial", "stac", "metadata", "geo", "arrow", "parquet"]
categories = ["science", "data-structures"]

[dependencies]
arrow = { version = "51", default-features = false, features = ["chrono-tz"] }
arrow-json = "51" # TODO port `record_batches_to_json_rows` so we can go to the latest version
geo = "0.28"
geoarrow = "0.2"
geojson = "0.24"
geozero = { version = "0.13", features = ["with-wkb"] }
serde_json = "1"
stac = { version = "0.7", features = ["wkb"], path = "../stac" }
thiserror = "1"

[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
geoarrow = { version = "0.2", features = ["parquet"] }
parquet = "51"
stac-validate = { version = "0.1", path = "../stac-validate" }

[[bench]]
name = "read"
harness = false
57 changes: 57 additions & 0 deletions stac-arrow/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# stac-arrow

[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/stac-utils/stac-rs/ci.yml?branch=main&style=for-the-badge)](https://github.com/stac-utils/stac-rs/actions/workflows/ci.yml)
[![docs.rs](https://img.shields.io/docsrs/stac-arrow?style=for-the-badge)](https://docs.rs/stac-arrow/latest/stac_arrow/)
[![Crates.io](https://img.shields.io/crates/v/stac-arrow?style=for-the-badge)](https://crates.io/crates/stac-arrow)
![Crates.io](https://img.shields.io/crates/l/stac-arrow?style=for-the-badge)
[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg?style=for-the-badge)](./CODE_OF_CONDUCT)

Read and write [STAC](https://stacspec.org/) data stored in [arrow](https://arrow.apache.org/).
Data are formatted per the [stac-geoparquet spec](https://github.com/stac-utils/stac-geoparquet/blob/main/spec/stac-geoparquet-spec.md).

## Usage

To use the library in your project:

```toml
[dependencies]
stac-arrow = "0.1"
```

## Examples

Reading from a [geoparquet](https://geoparquet.org/) file:

```rust
use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

// Build a record batch reader over the geoparquet file.
let file = File::open("data/naip.parquet").unwrap();
let reader = ParquetRecordBatchReaderBuilder::try_new(file)
.unwrap()
.build()
.unwrap();
// Convert each record batch with `stac_arrow::record_batch_to_items` and
// gather the results in `items`.
let mut items = Vec::new();
for result in reader {
items.extend(stac_arrow::record_batch_to_items(result.unwrap()).unwrap());
}
assert_eq!(items.len(), 5);
```

Writing:

```rust
use stac::ItemCollection;
use std::io::Cursor;

// Read the item collection from JSON and pass its items to
// `stac_arrow::items_to_geo_table`.
let item_collection: ItemCollection = stac::read_json("data/naip.json").unwrap();
let mut geo_table = stac_arrow::items_to_geo_table(item_collection.items).unwrap();
// Write the table as geoparquet into an in-memory cursor.
let mut cursor = Cursor::new(Vec::new());
geoarrow::io::parquet::write_geoparquet(&mut geo_table, &mut cursor, None).unwrap();
```

Please see the [documentation](https://docs.rs/stac-arrow) for more usage examples.

## Other info

This crate is part of the [stac-rs](https://github.com/stac-utils/stac-rs) monorepo, see its README for contributing and license information.
41 changes: 41 additions & 0 deletions stac-arrow/benches/read.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
//! How fast is `RecordBatch` -> `Vec<Map<String, Value>>` when going through
//! full serialization vs the deprecated `record_batches_to_json_rows`?
use arrow::array::RecordBatch;
use arrow_json::ArrayWriter;
use criterion::{criterion_group, criterion_main, Criterion};
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use serde_json::{Map, Value};
use std::fs::File;

/// Converts `record_batch` with the deprecated
/// `arrow_json::writer::record_batches_to_json_rows`, panicking if the
/// conversion fails.
///
/// The converted rows are passed through [`std::hint::black_box`] before being
/// dropped. This function returns `()`, so Criterion's own black-boxing of the
/// closure result never sees the rows; without the explicit barrier the
/// optimizer is free to treat the conversion's output as unused.
#[allow(deprecated)] // The deprecated function is exactly what this benchmark measures.
fn record_batches_to_json_rows(record_batch: &RecordBatch) {
    let rows = arrow_json::writer::record_batches_to_json_rows(&[record_batch]).unwrap();
    drop(std::hint::black_box(rows));
}

/// Serializes `record_batch` to JSON bytes with [`ArrayWriter`], then parses
/// those bytes back into a `Vec<Map<String, Value>>`, panicking on any failure.
///
/// The parsed rows are passed through [`std::hint::black_box`] before being
/// dropped. This function returns `()`, so Criterion's own black-boxing of the
/// closure result never sees the rows; without the explicit barrier the
/// optimizer is free to treat the parsed output as unused.
fn writer(record_batch: &RecordBatch) {
    let mut writer = ArrayWriter::new(Vec::new());
    writer.write(record_batch).unwrap();
    // `finish` must run before the buffer is taken so the output is complete.
    writer.finish().unwrap();
    let rows: Vec<Map<String, Value>> =
        serde_json::from_reader(writer.into_inner().as_slice()).unwrap();
    drop(std::hint::black_box(rows));
}

/// Registers the `read` benchmark group, which times both `RecordBatch` to
/// JSON-row strategies against the first record batch of `data/naip.parquet`.
///
/// The `geometry` column is removed from the batch before either strategy is
/// timed. Panics if the file cannot be opened or read, if it yields no record
/// batch, or if the batch has no `geometry` column.
fn criterion_benchmark(c: &mut Criterion) {
    let mut read_group = c.benchmark_group("read");

    // Load the first record batch from the parquet fixture.
    let parquet_file = File::open("data/naip.parquet").unwrap();
    let builder = ParquetRecordBatchReaderBuilder::try_new(parquet_file).unwrap();
    let mut batches = builder.build().unwrap();
    let mut batch = batches.next().unwrap().unwrap();

    // Drop the geometry column before benchmarking.
    let geometry_index = batch.schema().index_of("geometry").unwrap();
    batch.remove_column(geometry_index);

    read_group.bench_function("record_batches_to_json_rows", |bencher| {
        bencher.iter(|| record_batches_to_json_rows(&batch))
    });
    read_group.bench_function("writer", |bencher| bencher.iter(|| writer(&batch)));
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
Loading

0 comments on commit 703d0aa

Please sign in to comment.