diff --git a/LICENSE b/LICENSE index 8472429..c88b3c1 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2024, zoj613 +Copyright (c) 2024, Zolisa Bleki Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index d2c458a..28f358d 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,112 @@ # zarr-ml -An Ocaml implementation of the Zarr version 3 specification. +An implementation of the Zarr version 3 specification. + + +## Example +```ocaml +open Zarr +open Zarr.Codecs +open Zarr.Storage +open Zarr.Metadata +open Zarr.Extensions +module Ndarray = Owl.Dense.Ndarray.Generic + +let store = + Result.get_ok @@ + FilesystemStore.open_or_create ~file_perm:0o777 "testdata.zarr";; + +let group_node = + Result.get_ok @@ Node.of_path "/some/group";; + +FilesystemStore.create_group store group_node;; + +let array_node = + Result.get_ok @@ Node.(group_node / "name");; + +let shard_config = { + chunk_shape = [|5; 5; 10|]; + codecs = Chain.create [] (Bytes Little) [Gzip L5]; + index_codecs = Chain.create [] (Bytes Big) [Crc32c]; + index_location = Start +};; +let codec_chain = + Chain.create [Transpose [|0; 1; 2|]] (ShardingIndexed shard_config) [];; + +FilesystemStore.create_array + ~codecs:codec_chain + ~shape:[|100; 100; 50|] + ~chunks:[|15; 15; 20|] + (FillValue.Float Float.neg_infinity) + Datatype.Float32 + array_node + store;; + +FilesystemStore.find_all_nodes store |> List.map Node.to_path;; +(* - : string list = ["/"; "/some"; "/some/group/name"; "/some/group"] *) + +let slice = Owl_types.[|R [0; 20]; I 10; R []|];; +let x = + Result.get_ok @@ + FilesystemStore.get_array array_node slice Bigarray.Float32 store;; +(* + C0 C1 C2 C3 C4 C45 C46 C47 C48 C49 + R[0,0] -INF -INF -INF -INF -INF ... -INF -INF -INF -INF -INF + R[1,0] -INF -INF -INF -INF -INF ... -INF -INF -INF -INF -INF + R[2,0] -INF -INF -INF -INF -INF ... 
-INF -INF -INF -INF -INF + R[3,0] -INF -INF -INF -INF -INF ... -INF -INF -INF -INF -INF + R[4,0] -INF -INF -INF -INF -INF ... -INF -INF -INF -INF -INF + ... ... ... ... ... ... ... ... ... ... ... +R[16,0] -INF -INF -INF -INF -INF ... -INF -INF -INF -INF -INF +R[17,0] -INF -INF -INF -INF -INF ... -INF -INF -INF -INF -INF +R[18,0] -INF -INF -INF -INF -INF ... -INF -INF -INF -INF -INF +R[19,0] -INF -INF -INF -INF -INF ... -INF -INF -INF -INF -INF +R[20,0] -INF -INF -INF -INF -INF ... -INF -INF -INF -INF -INF *) + +(* Do some computation on the array slice *) +let x' = Ndarray.map (fun _ -> Owl_stats_dist.uniform_rvs 0. 100.) x;; +FilesystemStore.set_array array_node slice x' store;; + +FilesystemStore.get_array + array_node + Owl_types.[|R [0; 73]; L [10; 16]; R[0; 5]|] + Bigarray.Float32 + store;; +(* + C0 C1 C2 C3 C4 C5 + R[0,0] 68.0272 44.914 85.2431 39.0772 26.582 16.577 + R[0,1] -INF -INF -INF -INF -INF -INF + R[1,0] 88.418 77.0368 43.4968 45.1263 8.95641 76.9155 + R[1,1] -INF -INF -INF -INF -INF -INF + R[2,0] 98.4036 77.8744 67.6689 56.8803 37.0718 97.042 + ... ... ... ... ... ... +R[71,1] -INF -INF -INF -INF -INF -INF +R[72,0] -INF -INF -INF -INF -INF -INF +R[72,1] -INF -INF -INF -INF -INF -INF +R[73,0] -INF -INF -INF -INF -INF -INF +R[73,1] -INF -INF -INF -INF -INF -INF *) + +FilesystemStore.reshape store array_node [|25; 32; 10|];; +FilesystemStore.get_array + array_node + Owl_types.[|R []; I 10; R[0; 5]|] + Bigarray.Float32 + store;; +(* + C0 C1 C2 C3 C4 C5 + R[0,0] 68.0272 44.914 85.2431 39.0772 26.582 16.577 + R[1,0] 88.418 77.0368 43.4968 45.1263 8.95641 76.9155 + R[2,0] 98.4036 77.8744 67.6689 56.8803 37.0718 97.042 + R[3,0] 22.8653 20.1767 88.9549 22.1052 9.86822 10.8826 + R[4,0] 55.6043 93.8599 60.3723 40.543 46.8199 97.282 + ... ... ... ... ... ... 
+R[20,0] 61.2473 78.8035 52.3056 59.5631 78.2462 52.4205 +R[21,0] -INF -INF -INF -INF -INF -INF +R[22,0] -INF -INF -INF -INF -INF -INF +R[23,0] -INF -INF -INF -INF -INF -INF +R[24,0] -INF -INF -INF -INF -INF -INF *) + +FilesystemStore.array_metadata array_node store +|> Result.get_ok +|> ArrayMetadata.shape;; +(* - : int array = [|25; 32; 10|] *) +``` diff --git a/dune-project b/dune-project new file mode 100644 index 0000000..57c1c23 --- /dev/null +++ b/dune-project @@ -0,0 +1,34 @@ +(lang dune 3.15) + +(name zarr) + +(generate_opam_files true) + +(source + (github zoj613/zarr-ml)) + +(authors "Author Name") + +(maintainers "Maintainer Name") + +(license BSD-3-Clause) + +(documentation https://url/to/documentation) + +(package + (name zarr) + (synopsis "A short synopsis") + (description "A longer description") + (depends + dune + (ocaml (>= 4.14.2)) + yojson + ppx_deriving_yojson + ezgzip + checkseum + stdint + owl) + (tags + (topics "to describe" your project))) + +; See the complete stanza docs at https://dune.readthedocs.io/en/stable/dune-files.html#dune-project diff --git a/lib/codecs/array_to_array.ml b/lib/codecs/array_to_array.ml new file mode 100644 index 0000000..7b301b2 --- /dev/null +++ b/lib/codecs/array_to_array.ml @@ -0,0 +1,66 @@ +module Ndarray = Owl.Dense.Ndarray.Generic + +type dimension_order = int array + +type array_to_array = + | Transpose of dimension_order + +type error = + [ `Invalid_transpose_order of dimension_order * string ] + +(* https://zarr-specs.readthedocs.io/en/latest/v3/codecs/transpose/v1.0.html *) +module TransposeCodec = struct + type config = {order : int array} [@@deriving yojson] + type transpose = config Util.ext_point [@@deriving yojson] + + let compute_encoded_size input_size = input_size + + let encode o x = + try Ok (Ndarray.transpose ~axis:o x) with + | Failure s -> Error (`Invalid_transpose_order (o, s)) + + let decode o x = + let inv_order = Array.(make (length o) 0) in + Array.iteri (fun i x -> inv_order.(x) <- 
i) o; + try Ok (Ndarray.transpose ~axis:inv_order x) with + | Failure s -> Error (`Invalid_transpose_order (o, s)) + + let to_yojson order = + transpose_to_yojson + {name = "transpose"; configuration = {order}} + + let parse_order o = + let o' = Array.copy o in + Array.fast_sort Int.compare o'; + if o' <> Array.init (Array.length o') Fun.id then + Error (`Invalid_transpose_order (o, "")) + else + Result.ok @@ Transpose o + + let of_yojson x = + let open Util.Result_syntax in + transpose_of_yojson x >>= fun trans -> + parse_order trans.configuration.order + >>? fun (`Invalid_transpose_order _) -> "Invalid transpose order" +end + +module ArrayToArray = struct + let compute_encoded_size input_size = function + | Transpose _ -> TransposeCodec.compute_encoded_size input_size + + let encode t x = + match t with + | Transpose order -> TransposeCodec.encode order x + + let decode t x = + match t with + | Transpose order -> TransposeCodec.decode order x + + let to_yojson = function + | Transpose order -> TransposeCodec.to_yojson order + + let of_yojson x = + match Util.get_name x with + | "transpose" -> TransposeCodec.of_yojson x + | s -> Error ("array->array codec not supported: " ^ s) +end diff --git a/lib/codecs/array_to_array.mli b/lib/codecs/array_to_array.mli new file mode 100644 index 0000000..4d55b45 --- /dev/null +++ b/lib/codecs/array_to_array.mli @@ -0,0 +1,23 @@ +module Ndarray = Owl.Dense.Ndarray.Generic + +type dimension_order = int array + +type array_to_array = + | Transpose of dimension_order + +type error = + [ `Invalid_transpose_order of dimension_order * string ] + +module ArrayToArray : sig + val compute_encoded_size : int -> array_to_array -> int + val encode + : array_to_array -> + ('a, 'b) Ndarray.t -> + (('a, 'b) Ndarray.t, [> error]) result + val decode + : array_to_array -> + ('a, 'b) Ndarray.t -> + (('a, 'b) Ndarray.t, [> error]) result + val of_yojson : Yojson.Safe.t -> (array_to_array, string) result + val to_yojson : array_to_array -> 
Yojson.Safe.t +end diff --git a/lib/codecs/array_to_bytes.ml b/lib/codecs/array_to_bytes.ml new file mode 100644 index 0000000..7562038 --- /dev/null +++ b/lib/codecs/array_to_bytes.ml @@ -0,0 +1,468 @@ +open Array_to_array +open Bytes_to_bytes +open Util.Result_syntax + +module Ndarray = Owl.Dense.Ndarray.Generic + +type endianness = Little | Big + +type loc = Start | End + +type array_to_bytes = + | Bytes of endianness + | ShardingIndexed of shard_config + +and shard_config = + {chunk_shape : int array + ;codecs : chain + ;index_codecs : chain + ;index_location : loc} + +and chain = { + a2a: array_to_array list; + a2b: array_to_bytes; + b2b: bytes_to_bytes list; +} + +type error = + [ `Bytes_encode_error of string + | `Bytes_decode_error of string + | Array_to_array.error + | Bytes_to_bytes.error ] + +(* https://zarr-specs.readthedocs.io/en/latest/v3/codecs/bytes/v1.0.html *) +module BytesCodec = struct + type config = {endian : string} [@@deriving yojson] + type bytes = config Util.ext_point [@@deriving yojson] + type t = endianness + + let compute_encoded_size (input_size : int) = input_size + + let endian_module = function + | Little -> (module Ebuffer.Little : Ebuffer.S) + | Big -> (module Ebuffer.Big : Ebuffer.S) + + let encode + : type a b. 
(a, b) Ndarray.t -> t -> (string, [> error]) result + = fun x t -> + let open Bigarray in + let open (val endian_module t) in + let buf = Buffer.create @@ Ndarray.size_in_bytes x in + match Ndarray.kind x with + | Char -> Ndarray.iter (add_char buf) x; Ok (contents buf) + | Int8_signed -> Ndarray.iter (add_int8 buf) x; Ok (contents buf) + | Int8_unsigned -> Ndarray.iter (add_uint8 buf) x; Ok (contents buf) + | Int16_signed -> Ndarray.iter (add_int16 buf) x; Ok (contents buf) + | Int16_unsigned -> Ndarray.iter (add_uint16 buf) x; Ok (contents buf) + | Int32 -> Ndarray.iter (add_int32 buf) x; Ok (contents buf) + | Int64 -> Ndarray.iter (add_int64 buf) x; Ok (contents buf) + | Float32 -> Ndarray.iter (add_float32 buf) x; Ok (contents buf) + | Float64 -> Ndarray.iter (add_float64 buf) x; Ok (contents buf) + | Complex32 -> Ndarray.iter (add_complex32 buf) x; Ok (contents buf) + | Complex64 -> Ndarray.iter (add_complex64 buf) x; Ok (contents buf) + | _ -> Error (`Bytes_encode_error "unsupported data kind.") + + let mk_array kind shape init_fn = + let x = Array.init (Util.prod shape) init_fn in + Ok (Ndarray.of_array kind x shape) + + let decode + : type a b. 
+ string -> + (a, b) Util.array_repr -> + t -> + ((a, b) Ndarray.t, [> error]) result + = fun buf decoded t -> + let open Bigarray in + let open (val endian_module t) in + let k, shp = decoded.kind, decoded.shape in + match k, kind_size_in_bytes k with + | Char, _ -> mk_array k shp @@ get_char buf + | Int8_signed, _ -> mk_array k shp @@ get_int8 buf + | Int8_unsigned, _ -> mk_array k shp @@ get_uint8 buf + | Int16_signed, s -> mk_array k shp @@ fun i -> get_int16 buf (i*s) + | Int16_unsigned, s -> mk_array k shp @@ fun i -> get_uint16 buf (i*s) + | Int32, s -> mk_array k shp @@ fun i -> get_int32 buf (i*s) + | Int64, s -> mk_array k shp @@ fun i -> get_int64 buf (i*s) + | Float32, s -> mk_array k shp @@ fun i -> get_float32 buf (i*s) + | Float64, s -> mk_array k shp @@ fun i -> get_float64 buf (i*s) + | Complex32, s -> mk_array k shp @@ fun i -> get_complex32 buf (i*s) + | Complex64, s -> mk_array k shp @@ fun i -> get_complex64 buf (i*s) + | _ -> Error (`Bytes_decode_error "unsupported data kind.") + + let to_yojson = function + | Little -> + bytes_to_yojson {name = "bytes"; configuration = {endian = "little"}} + | Big -> + bytes_to_yojson {name = "bytes"; configuration = {endian = "big"}} + + let of_yojson x = + let open Util.Result_syntax in + bytes_of_yojson x >>= fun b -> + match b.configuration.endian with + | "little" -> Ok Little + | "big" -> Ok Big + | _ -> Error "unsupported endianness." 
+end + +module rec ArrayToBytes : sig + val compute_encoded_size : int -> array_to_bytes -> int + val default : array_to_bytes + val encode + : ('a, 'b) Ndarray.t -> + array_to_bytes -> + (string, [> error]) result + val decode + : string -> + ('a, 'b) Util.array_repr -> + array_to_bytes -> + (('a, 'b) Ndarray.t, [> error]) result + val of_yojson : Yojson.Safe.t -> (array_to_bytes, string) result + val to_yojson : array_to_bytes -> Yojson.Safe.t +end = struct + + let default = Bytes Little + + let compute_encoded_size input_size = function + | Bytes _ -> + BytesCodec.compute_encoded_size input_size + | ShardingIndexed s -> + ShardingIndexedCodec.compute_encoded_size input_size s + + let encode + : type a b. + (a, b) Ndarray.t -> + array_to_bytes -> + (string, [> error]) result + = fun x -> function + | Bytes endian -> BytesCodec.encode x endian + | ShardingIndexed c -> ShardingIndexedCodec.encode x c + + let decode + : type a b. + string -> + (a, b) Util.array_repr -> + array_to_bytes -> + ((a, b) Ndarray.t, [> error]) result + = fun b repr -> function + | Bytes endian -> BytesCodec.decode b repr endian + | ShardingIndexed c -> ShardingIndexedCodec.decode b repr c + + let to_yojson = function + | Bytes endian -> BytesCodec.to_yojson endian + | ShardingIndexed c -> ShardingIndexedCodec.to_yojson c + + let of_yojson x = + match Util.get_name x with + | "bytes" -> + BytesCodec.of_yojson x >>| fun e -> Bytes e + | "sharding_indexed" -> + ShardingIndexedCodec.of_yojson x >>| fun c -> ShardingIndexed c + | s -> Error ("array->bytes codec not supported: " ^ s) +end + +and ShardingIndexedCodec : sig + type t = shard_config + val compute_encoded_size : int -> t -> int + val encode + : ('a, 'b) Ndarray.t -> + t -> + (string, [> error]) result + val decode + : string -> + ('a, 'b) Util.array_repr -> + t -> + (('a, 'b) Ndarray.t, [> error]) result + val of_yojson : Yojson.Safe.t -> (t, string) result + val to_yojson : t -> Yojson.Safe.t +end = struct + + type t = shard_config + + 
let compute_encoded_size input_size t = + List.fold_left BytesToBytes.compute_encoded_size + (ArrayToBytes.compute_encoded_size + (List.fold_left + ArrayToArray.compute_encoded_size + input_size t.index_codecs.a2a) + t.index_codecs.a2b) + t.index_codecs.b2b + + let rec encode_chain + : type a b. + chain -> + (a, b) Ndarray.t -> + (string, [> error]) result + = fun t x -> + List.fold_left + (fun acc c -> acc >>= ArrayToArray.encode c) (Ok x) t.a2a + >>= fun y -> + List.fold_left + (fun acc c -> acc >>= BytesToBytes.encode c) + (ArrayToBytes.encode y t.a2b) t.b2b + + and encode + : type a b. + (a, b) Ndarray.t -> + shard_config -> + (string, [> error]) result + = fun x t -> + let open Util in + let open Util.Result_syntax in + let shard_shape = Ndarray.shape x in + let cps = Array.map2 (/) shard_shape t.chunk_shape in + let idx_shp = Array.append cps [|2|] in + let shard_idx = + Ndarray.create Bigarray.Int64 idx_shp Int64.max_int in + let sg = + Extensions.RegularGrid.create t.chunk_shape in + let slice = + Array.make + (Ndarray.num_dims x) (Owl_types.R []) in + let coords = Indexing.coords_of_slice slice shard_shape in + let tbl = Arraytbl.create @@ Array.length coords in + Ndarray.iteri (fun i y -> + let k, c = Extensions.RegularGrid.index_coord_pair sg coords.(i) in + Arraytbl.add tbl k (c, y)) x; + let fill_value = + Arraytbl.to_seq_values tbl + |> Seq.uncons + |> Option.get + |> fst + |> snd + in + let repr = + {kind = Ndarray.kind x + ;fill_value + ;shape = t.chunk_shape} in + let cindices = ArraySet.of_seq @@ Arraytbl.to_seq_keys tbl in + let buf = Buffer.create @@ Ndarray.size_in_bytes x in + let offset = ref 0L in + let coord = idx_shp in + ArraySet.fold (fun idx acc -> + acc >>= fun () -> + (* find_all returns bindings in reverse order. To restore the + * C-ordering of elements we must call List.rev. 
*) + let vals = + Arraytbl.find_all tbl idx + |> List.rev + |> List.split + |> snd + |> Array.of_list + in + let x' = Ndarray.of_array repr.kind vals t.chunk_shape in + encode_chain t.codecs x' >>| fun b -> + Buffer.add_string buf b; + let len = Array.length idx in + Array.blit idx 0 coord 0 len; + coord.(len) <- 0; + Ndarray.set shard_idx coord !offset; + coord.(len) <- 1; + let nbytes = Int64.of_int @@ String.length b in + Ndarray.set shard_idx coord nbytes; + offset := Int64.add !offset nbytes) cindices (Ok ()) + >>= fun () -> + encode_chain t.index_codecs shard_idx >>| fun b' -> + match t.index_location with + | Start -> + let buf' = Buffer.create @@ String.length b' in + Buffer.add_string buf' b'; + Buffer.add_buffer buf' buf; + Buffer.contents buf' + | End -> + Buffer.add_string buf b'; + Buffer.contents buf + + let rec decode_chain + : type a b. + chain -> + string -> + (a, b) Util.array_repr -> + ((a, b) Ndarray.t, [> error]) result + = fun t x repr -> + List.fold_right + (fun c acc -> acc >>= BytesToBytes.decode c) t.b2b (Ok x) + >>= fun y -> + List.fold_right + (fun c acc -> acc >>= ArrayToArray.decode c) + t.a2a (ArrayToBytes.decode y repr t.a2b) + + and decode_index + : string -> + int array -> + shard_config -> + ((int64, Bigarray.int64_elt) Ndarray.t * string, [> error]) result + = fun b shard_shape t -> + let open Util in + let cps = Array.map2 (/) shard_shape t.chunk_shape in + let l = index_size t cps in + let o = String.length b - l in + let b', rest = + match t.index_location with + | End -> String.sub b o l, String.sub b 0 o + | Start -> String.sub b 0 l, String.sub b l o + in + let repr = + {fill_value = Int64.max_int + ;kind = Bigarray.Int64 + ;shape = Array.append cps [|2|]} + in + decode_chain t.index_codecs b' repr >>| fun decoded -> + (decoded, rest) + + and index_size t cps = + compute_encoded_size (16 * Util.prod cps) t + + and decode + : type a b. 
+ string -> + (a, b) Util.array_repr -> + t -> + ((a, b) Ndarray.t, [> error]) result + = fun b repr t -> + let open Util in + let open Extensions in + let open Util.Result_syntax in + decode_index b repr.shape t >>= fun (shard_idx, b') -> + if Ndarray.for_all (Int64.equal Int64.max_int) shard_idx then + Ok (Ndarray.create repr.kind repr.shape repr.fill_value) + else + let sg = RegularGrid.create t.chunk_shape in + let slice = + Array.make + (Array.length repr.shape) + (Owl_types.R []) in + (* pair (i, c) is a pair of shard chunk index (i) and shard coordinate c *) + let pair = + Array.map + (RegularGrid.index_coord_pair sg) + (Indexing.coords_of_slice slice repr.shape) in + let tbl = Arraytbl.create @@ Array.length pair in + let inner = + {kind = repr.kind + ;shape = t.chunk_shape + ;fill_value = repr.fill_value} + in + Array.fold_right (fun (idx, coord) acc -> + acc >>= fun l -> + match Arraytbl.find_opt tbl idx with + | Some arr -> + Ok (Ndarray.get arr coord :: l) + | None -> + match Ndarray.(slice_left shard_idx idx) with + | pair when Ndarray.for_all (Int64.equal Int64.max_int) pair -> + let x = Ndarray.create inner.kind inner.shape inner.fill_value in + Arraytbl.add tbl idx x; + Ok (Ndarray.get x coord :: l) + | pair -> + let p = Bigarray.array1_of_genarray pair in + let c = String.sub b' (Int64.to_int p.{0}) (Int64.to_int p.{1}) in + decode_chain t.codecs c inner >>= fun x -> + Arraytbl.add tbl idx x; + Ok (Ndarray.get x coord :: l)) pair (Ok []) + >>| fun res -> + Ndarray.of_array + inner.kind (Array.of_list res) repr.shape + + let rec chain_to_yojson chain = + [%to_yojson: Yojson.Safe.t list] @@ + List.map ArrayToArray.to_yojson chain.a2a @ + (ArrayToBytes.to_yojson chain.a2b) :: + List.map BytesToBytes.to_yojson chain.b2b + + and to_yojson t = + let codecs = chain_to_yojson t.codecs + in + let index_codecs = chain_to_yojson t.index_codecs + in + let index_location = + match t.index_location with + | End -> `String "end" + | Start -> `String "start" + in 
+ let chunk_shape = + `List ( + Array.to_list @@ + Array.map (fun x -> `Int x) t.chunk_shape) + in + `Assoc + [("name", `String "sharding_indexed"); + ("configuration", + `Assoc + [("chunk_shape", chunk_shape); + ("index_location", index_location); + ("index_codecs", index_codecs); + ("codecs", codecs)])] + + let rec chain_of_yojson + : Yojson.Safe.t list -> (chain, string) result + = fun codecs -> + let filter_partition f encoded = + List.fold_right (fun c (l, r) -> + match f c with + | Ok v -> v :: l, r + | Error _ -> l, c :: r) encoded ([], []) + in + if List.length codecs = 0 then + Error "No codec chain specified for sharding_indexed." + else + let a2b, rest = filter_partition ArrayToBytes.of_yojson codecs in + if List.length a2b <> 1 then + Error "Must be exactly one array->bytes codec." + else + let a2a, rest = filter_partition ArrayToArray.of_yojson rest in + let b2b, rest = filter_partition BytesToBytes.of_yojson rest in + if List.length rest <> 0 then + Error ("Unsupported codec: " ^ (Util.get_name @@ List.hd rest)) + else + Ok {a2a; a2b = List.hd a2b; b2b} + + and of_yojson x = + let assoc = + Yojson.Safe.Util.(member "configuration" x |> to_assoc) + in + match + Yojson.Safe.Util.filter_map + (fun (n, v) -> if n = "chunk_shape" then Some v else None) assoc + with + | [] -> Error ("sharding_indexed must contain a chunk_shape field") + | l -> + List.fold_right (fun a acc -> + acc >>= fun k -> + match a with + | `Int x -> Ok (x :: k) + | _ -> Error "chunk_shape must only contain integers.") + (Yojson.Safe.Util.to_list @@ List.hd l) (Ok []) + >>= fun l'-> + let chunk_shape = Array.of_list l' + in + match + Yojson.Safe.Util.filter_map + (fun (n, v) -> if n = "index_location" then Some v else None) assoc + with + | [] -> Error "sharding_indexed must have a index_location field" + | l -> + (match List.hd l with + | `String "end" -> Ok End + | `String "start" -> Ok Start + | _ -> Error "index_location must only be 'end' or 'start'") + >>= fun index_location -> + 
match + Yojson.Safe.Util.filter_map + (fun (n, v) -> if n = "codecs" then Some v else None) assoc + with + | [] -> Error "sharding_indexed must have a codecs field" + | l -> + chain_of_yojson (Yojson.Safe.Util.to_list @@ List.hd l) + >>= fun codecs -> + match + Yojson.Safe.Util.filter_map + (fun (n, v) -> if n = "index_codecs" then Some v else None) + assoc + with + | [] -> Error "sharding_indexed must have a index_codecs field" + | l -> + chain_of_yojson (Yojson.Safe.Util.to_list @@ List.hd l) + >>| fun index_codecs -> + {index_codecs; index_location; codecs; chunk_shape} +end diff --git a/lib/codecs/array_to_bytes.mli b/lib/codecs/array_to_bytes.mli new file mode 100644 index 0000000..a3a22b1 --- /dev/null +++ b/lib/codecs/array_to_bytes.mli @@ -0,0 +1,43 @@ +module Ndarray = Owl.Dense.Ndarray.Generic + +type endianness = Little | Big + +type loc = Start | End + +type array_to_bytes = + | Bytes of endianness + | ShardingIndexed of shard_config + +and shard_config = + {chunk_shape : int array + ;codecs : chain + ;index_codecs : chain + ;index_location : loc} + +and chain = { + a2a: Array_to_array.array_to_array list; + a2b: array_to_bytes; + b2b: Bytes_to_bytes.bytes_to_bytes list; +} + +type error = + [ `Bytes_encode_error of string + | `Bytes_decode_error of string + | Array_to_array.error + | Bytes_to_bytes.error ] + +module ArrayToBytes : sig + val compute_encoded_size : int -> array_to_bytes -> int + val default : array_to_bytes + val encode + : ('a, 'b) Ndarray.t -> + array_to_bytes -> + (string, [> error]) result + val decode + : string -> + ('a, 'b) Util.array_repr -> + array_to_bytes -> + (('a, 'b) Ndarray.t, [> error]) result + val of_yojson : Yojson.Safe.t -> (array_to_bytes, string) result + val to_yojson : array_to_bytes -> Yojson.Safe.t +end diff --git a/lib/codecs/bytes_to_bytes.ml b/lib/codecs/bytes_to_bytes.ml new file mode 100644 index 0000000..4bea12a --- /dev/null +++ b/lib/codecs/bytes_to_bytes.ml @@ -0,0 +1,94 @@ +module Ndarray = 
Owl.Dense.Ndarray.Generic + +type compression_level = + | L0 | L1 | L2 | L3 | L4 | L5 | L6 | L7 | L8 | L9 + +type bytes_to_bytes = + | Crc32c + | Gzip of compression_level + +type error = + [ `Gzip of Ezgzip.error ] + +(* https://zarr-specs.readthedocs.io/en/latest/v3/codecs/gzip/v1.0.html *) +module GzipCodec = struct + type config = {level : int} [@@deriving yojson] + type gzip = config Util.ext_point [@@deriving yojson] + + let compute_encoded_size _ = + failwith "Cannot compute encoded size of Gzip codec." + + let to_int = function + | L0 -> 0 | L1 -> 1 | L2 -> 2 | L3 -> 3 | L4 -> 4 + | L5 -> 5 | L6 -> 6 | L7 -> 7 | L8 -> 8 | L9 -> 9 + + let of_int = function + | 0 -> Ok L0 | 1 -> Ok L1 | 2 -> Ok L2 | 3 -> Ok L3 + | 4 -> Ok L4 | 5 -> Ok L5 | 6 -> Ok L6 | 7 -> Ok L7 + | 8 -> Ok L8 | 9 -> Ok L9 | i -> + Error ("Invalid Gzip compression level: " ^ (string_of_int i)) + + let encode l x = + Result.ok @@ Ezgzip.compress ~level:(to_int l) x + + let decode x = Ezgzip.decompress x + + let to_yojson l = + gzip_to_yojson + {name = "gzip"; configuration = {level = to_int l}} + + let of_yojson x = + let open Util.Result_syntax in + gzip_of_yojson x >>= fun gzip -> + of_int gzip.configuration.level >>| fun level -> + Gzip level +end + +(* https://zarr-specs.readthedocs.io/en/latest/v3/codecs/crc32c/v1.0.html *) +module Crc32cCodec = struct + type config = {name : string} [@@deriving yojson] + + let compute_encoded_size input_size = input_size + 4 + + let encode x = + let size = String.length x in + let buf = Buffer.create size in + Buffer.add_string buf x; + Buffer.add_int32_le buf @@ + Checkseum.Crc32c.(default |> digest_string x 0 size |> to_int32); + Result.ok @@ Buffer.contents buf + + let decode x = + Ok String.(length x - 4 |> sub x 0) + + let to_yojson = + config_to_yojson {name = "crc32c"} + + let of_yojson x = + let open Util.Result_syntax in + config_of_yojson x >>= fun _ -> Ok Crc32c +end + +module BytesToBytes = struct + let compute_encoded_size input_size = 
function + | Gzip _ -> GzipCodec.compute_encoded_size input_size + | Crc32c -> Crc32cCodec.compute_encoded_size input_size + + let encode t x = match t with + | Gzip l -> GzipCodec.encode l x + | Crc32c -> Crc32cCodec.encode x + + let decode t x = match t with + | Gzip _ -> GzipCodec.decode x + | Crc32c -> Crc32cCodec.decode x + + let to_yojson = function + | Gzip l -> GzipCodec.to_yojson l + | Crc32c -> Crc32cCodec.to_yojson + + let of_yojson x = + match Util.get_name x with + | "gzip" -> GzipCodec.of_yojson x + | "crc32c" -> Crc32cCodec.of_yojson x + | s -> Error ("bytes->bytes codec not supported: " ^ s) +end diff --git a/lib/codecs/bytes_to_bytes.mli b/lib/codecs/bytes_to_bytes.mli new file mode 100644 index 0000000..fa5a0f5 --- /dev/null +++ b/lib/codecs/bytes_to_bytes.mli @@ -0,0 +1,19 @@ +module Ndarray = Owl.Dense.Ndarray.Generic + +type compression_level = + | L0 | L1 | L2 | L3 | L4 | L5 | L6 | L7 | L8 | L9 + +type bytes_to_bytes = + | Crc32c + | Gzip of compression_level + +type error = + [ `Gzip of Ezgzip.error ] + +module BytesToBytes : sig + val compute_encoded_size : int -> bytes_to_bytes -> int + val encode : bytes_to_bytes -> string -> (string, [> error]) result + val decode : bytes_to_bytes -> string -> (string, [> error]) result + val of_yojson : Yojson.Safe.t -> (bytes_to_bytes, string) result + val to_yojson : bytes_to_bytes -> Yojson.Safe.t +end diff --git a/lib/codecs/codecs.ml b/lib/codecs/codecs.ml new file mode 100644 index 0000000..b9d66a6 --- /dev/null +++ b/lib/codecs/codecs.ml @@ -0,0 +1,72 @@ +include Bytes_to_bytes +include Array_to_array +include Array_to_bytes +open Util.Result_syntax + +module Ndarray = Owl.Dense.Ndarray.Generic + +module Chain = struct + type t = chain + + let create a2a a2b b2b = {a2a; a2b; b2b} + (*let open Util.Result_syntax in + match a2b with + | Bytes _ -> {a2a; a2b; b2b} + | ShardingIndexed c -> + ArrayToBytes.parse_config c >>= fun () -> + {a2a; a2b; b2b} *) + + let default = + {a2a = []; a2b = 
ArrayToBytes.default; b2b = []} + + let compute_encoded_size input_size t = + List.fold_left BytesToBytes.compute_encoded_size + (ArrayToBytes.compute_encoded_size + (List.fold_left ArrayToArray.compute_encoded_size + input_size t.a2a) t.a2b) t.b2b + + let encode t x = + List.fold_left + (fun acc c -> acc >>= ArrayToArray.encode c) (Ok x) t.a2a + >>= fun y -> + List.fold_left + (fun acc c -> acc >>= BytesToBytes.encode c) + (ArrayToBytes.encode y t.a2b) t.b2b + + let decode t repr x = + List.fold_right + (fun c acc -> acc >>= BytesToBytes.decode c) t.b2b (Ok x) + >>= fun y -> + List.fold_right + (fun c acc -> acc >>= ArrayToArray.decode c) + t.a2a (ArrayToBytes.decode y repr t.a2b) + + let to_yojson t = + [%to_yojson: Yojson.Safe.t list] @@ + List.map ArrayToArray.to_yojson t.a2a @ + (ArrayToBytes.to_yojson t.a2b) :: + List.map BytesToBytes.to_yojson t.b2b + + let of_yojson x = + let filter_partition f encoded = + List.fold_right (fun c (l, r) -> + match f c with + | Ok v -> v :: l, r + | Error _ -> l, c :: r) encoded ([], []) + in + let codecs = Yojson.Safe.Util.to_list x + in + if List.length codecs = 0 then + Error "No codec specified." + else + let a2b, rest = filter_partition ArrayToBytes.of_yojson codecs in + if List.length a2b <> 1 then + Error "Must be exactly one array->bytes codec." 
+ else + let a2a, rest = filter_partition ArrayToArray.of_yojson rest in + let b2b, rest = filter_partition BytesToBytes.of_yojson rest in + if List.length rest <> 0 then + Error ("Unsupported codec: " ^ (Util.get_name @@ List.hd rest)) + else + Ok {a2a; a2b = List.hd a2b; b2b} +end diff --git a/lib/codecs/codecs.mli b/lib/codecs/codecs.mli new file mode 100644 index 0000000..3e1fada --- /dev/null +++ b/lib/codecs/codecs.mli @@ -0,0 +1,61 @@ +module Ndarray = Owl.Dense.Ndarray.Generic + +type dimension_order = int array + +type array_to_array = + | Transpose of dimension_order + +type compression_level = + | L0 | L1 | L2 | L3 | L4 | L5 | L6 | L7 | L8 | L9 + +type bytes_to_bytes = + | Crc32c + | Gzip of compression_level + +type endianness = Little | Big + +type loc = Start | End + +type array_to_bytes = + | Bytes of endianness + | ShardingIndexed of shard_config + +and shard_config = + {chunk_shape : int array + ;codecs : chain + ;index_codecs : chain + ;index_location : loc} + +and chain = { + a2a: array_to_array list; + a2b: array_to_bytes; + b2b: bytes_to_bytes list; +} + +type error = Array_to_bytes.error + +module Chain : sig + type t = chain + + val create + : array_to_array list -> + array_to_bytes -> + bytes_to_bytes list -> t + + val default : t + + val compute_encoded_size : int -> t -> int + + val encode + : t -> ('a, 'b) Ndarray.t -> (string, [> error]) result + + val decode + : t -> + ('a, 'b) Util.array_repr -> + string -> + (('a, 'b) Ndarray.t, [> error]) result + + val of_yojson : Yojson.Safe.t -> (t, string) result + + val to_yojson : t -> Yojson.Safe.t +end diff --git a/lib/codecs/ebuffer.ml b/lib/codecs/ebuffer.ml new file mode 100644 index 0000000..edcf184 --- /dev/null +++ b/lib/codecs/ebuffer.ml @@ -0,0 +1,96 @@ +module type S = sig + val contents : Buffer.t -> string + val add_char : Buffer.t -> char -> unit + val add_int8 : Buffer.t -> int -> unit + val add_uint8 : Buffer.t -> int -> unit + val add_int16 : Buffer.t -> int -> unit + val 
add_uint16 : Buffer.t -> int -> unit + val add_int32 : Buffer.t -> int32 -> unit + val add_int64 : Buffer.t -> int64 -> unit + val add_float32 : Buffer.t -> float -> unit + val add_float64 : Buffer.t -> float -> unit + val add_complex32 : Buffer.t -> Complex.t -> unit + val add_complex64 : Buffer.t -> Complex.t -> unit + + val get_char : string -> int -> char + val get_int8 : string -> int -> int + val get_uint8 : string -> int -> int + val get_int16 : string -> int -> int + val get_uint16 : string -> int -> int + val get_int32 : string -> int -> int32 + val get_int64 : string -> int -> int64 + val get_float32 : string -> int -> float + val get_float64 : string -> int -> float + val get_complex32 : string -> int -> Complex.t + val get_complex64 : string -> int -> Complex.t +end + +module Little = struct + let contents = Buffer.contents + let add_int8 = Buffer.add_int8 + let add_char buf v = Char.code v |> add_int8 buf + let add_uint8 = Buffer.add_uint8 + let add_int16 = Buffer.add_int16_le + let add_uint16 = Buffer.add_uint16_le + let add_int32 = Buffer.add_int32_le + let add_int64 = Buffer.add_int64_le + let add_float32 buf v = Int32.bits_of_float v |> add_int32 buf + let add_float64 buf v = Int64.bits_of_float v |> add_int64 buf + let add_complex32 buf Complex.{re; im} = + Int32.bits_of_float re |> add_int32 buf; + Int32.bits_of_float im |> add_int32 buf + let add_complex64 buf Complex.{re; im} = + Int64.bits_of_float re |> add_int64 buf; + Int64.bits_of_float im |> add_int64 buf + + let get_int8 = String.get_int8 + let get_char buf i = get_int8 buf i |> Char.chr + let get_uint8 = String.get_uint8 + let get_int16 = String.get_int16_le + let get_uint16 = String.get_uint16_le + let get_int32 = String.get_int32_le + let get_int64 = String.get_int64_le + let get_float32 buf i = get_int32 buf i |> Int32.float_of_bits + let get_float64 buf i = get_int64 buf i |> Int64.float_of_bits + let get_complex32 buf i = + let re, im = get_float32 buf i, get_float32 buf (i + 4) in 
+ Complex.{re; im} + let get_complex64 buf i = + let re, im = get_float64 buf i, get_float64 buf (i + 8) in + Complex.{re; im} +end + +module Big = struct + let contents = Buffer.contents + let add_int8 = Buffer.add_int8 + let add_char buf v = Char.code v |> add_int8 buf + let add_uint8 = Buffer.add_uint8 + let add_int16 = Buffer.add_int16_be + let add_uint16 = Buffer.add_uint16_be + let add_int32 = Buffer.add_int32_be + let add_int64 = Buffer.add_int64_be + let add_float32 buf v = Int32.bits_of_float v |> Buffer.add_int32_be buf + let add_float64 buf v = Int64.bits_of_float v |> Buffer.add_int64_be buf + let add_complex32 buf Complex.{re; im} = + Int32.bits_of_float re |> Buffer.add_int32_be buf; + Int32.bits_of_float im |> Buffer.add_int32_be buf + let add_complex64 buf Complex.{re; im} = + Int64.bits_of_float re |> Buffer.add_int64_be buf; + Int64.bits_of_float im |> Buffer.add_int64_be buf + + let get_int8 = String.get_int8 + let get_char buf i = get_int8 buf i |> Char.chr + let get_uint8 = String.get_uint8 + let get_int16 = String.get_int16_be + let get_uint16 = String.get_uint16_be + let get_int32 = String.get_int32_be + let get_int64 = String.get_int64_be + let get_float32 buf i = get_int32 buf i |> Int32.float_of_bits + let get_float64 buf i = get_int64 buf i |> Int64.float_of_bits + let get_complex32 buf i = + let re, im = get_float32 buf i, get_float32 buf (i + 4) in + Complex.{re; im} + let get_complex64 buf i = + let re, im = get_float64 buf i, get_float64 buf (i + 8) in + Complex.{re; im} +end diff --git a/lib/codecs/ebuffer.mli b/lib/codecs/ebuffer.mli new file mode 100644 index 0000000..0cbbd7b --- /dev/null +++ b/lib/codecs/ebuffer.mli @@ -0,0 +1,30 @@ +module type S = sig + val contents : Buffer.t -> string + val add_char : Buffer.t -> char -> unit + val add_int8 : Buffer.t -> int -> unit + val add_uint8 : Buffer.t -> int -> unit + val add_int16 : Buffer.t -> int -> unit + val add_uint16 : Buffer.t -> int -> unit + val add_int32 : Buffer.t -> int32 
-> unit + val add_int64 : Buffer.t -> int64 -> unit + val add_float32 : Buffer.t -> float -> unit + val add_float64 : Buffer.t -> float -> unit + val add_complex32 : Buffer.t -> Complex.t -> unit + val add_complex64 : Buffer.t -> Complex.t -> unit + + val get_char : string -> int -> char + val get_int8 : string -> int -> int + val get_uint8 : string -> int -> int + val get_int16 : string -> int -> int + val get_uint16 : string -> int -> int + val get_int32 : string -> int -> int32 + val get_int64 : string -> int -> int64 + val get_float32 : string -> int -> float + val get_float64 : string -> int -> float + val get_complex32 : string -> int -> Complex.t + val get_complex64 : string -> int -> Complex.t +end + +module Little : sig include S end + +module Big : sig include S end diff --git a/lib/dune b/lib/dune new file mode 100644 index 0000000..6cbd40a --- /dev/null +++ b/lib/dune @@ -0,0 +1,15 @@ +(library + (name zarr) + (public_name zarr) + (ocamlopt_flags (:standard -O3)) + (libraries + yojson + ppx_deriving_yojson.runtime + ezgzip + owl + stdint + checkseum) + (preprocess + (pps ppx_deriving_yojson))) + +(include_subdirs unqualified) diff --git a/lib/extensions.ml b/lib/extensions.ml new file mode 100644 index 0000000..957dffc --- /dev/null +++ b/lib/extensions.ml @@ -0,0 +1,141 @@ +open Util.Result_syntax + +module RegularGrid = struct + type config = + {chunk_shape : int array} [@@deriving yojson] + type chunk_grid = + config Util.ext_point [@@deriving yojson] + type t = int array + + let chunk_shape t = t + + let create chunk_shape = chunk_shape + + let ceildiv x y = + Float.(to_int @@ ceil (of_int x /. of_int y)) + + let floordiv x y = + Float.(to_int @@ floor (of_int x /. 
of_int y)) + + let grid_shape t array_shape = + Array.map2 ceildiv array_shape t + + let index_coord_pair t coord = + Array.map2 + (fun x y -> floordiv x y, Int.rem x y) coord t + |> Array.split + + (* returns all chunk indices in this regular grid *) + let indices t array_shape = + grid_shape t array_shape + |> Array.to_list + |> List.map (fun x -> List.init x Fun.id) + |> Util.Indexing.cartesian_prod + |> List.map Array.of_list + + let to_yojson t = + chunk_grid_to_yojson + {name = "regular"; configuration = {chunk_shape = t}} + + let of_yojson x = + chunk_grid_of_yojson x >>= fun y -> + if y.name <> "regular" then + Error ("chunk grid name should be 'regular' not: " ^ y.name) + else + Ok y.configuration.chunk_shape +end + +type separator = Dot | Slash + +module ChunkKeyEncoding = struct + type encoding = Default | V2 + type config = {separator : string} [@@deriving yojson] + type key_encoding = config Util.ext_point [@@deriving yojson] + type t = {encoding : encoding; sep : string} + + let create = function + | Dot -> {encoding = Default; sep = "."} + | Slash -> {encoding = Default; sep = "/"} + + (* map a chunk coordinate index to a key. 
E.g, (2,3,1) maps to c/2/3/1 *) + let encode t index = + let f i acc = + string_of_int i :: acc + in + match t.encoding with + | Default -> + String.concat t.sep @@ + "c" :: Array.fold_right f index [] + | V2 -> + if Array.length index = 0 + then + "0" + else + String.concat t.sep @@ + Array.fold_right f index [] + + let to_yojson t = + match t.encoding with + | Default -> + key_encoding_to_yojson + {name = "default"; configuration = {separator = t.sep}} + | V2 -> + key_encoding_to_yojson + {name = "v2"; configuration = {separator = t.sep}} + + let of_yojson x = + key_encoding_of_yojson x >>= fun y -> + match y with + | {name = "default"; configuration = {separator = "/"}} -> + Ok {encoding = Default; sep = "/"} + | {name = "default"; configuration = {separator = "."}} -> + Ok {encoding = Default; sep = "."} + | {name = "v2"; configuration = {separator = "."}} -> + Ok {encoding = V2; sep = "."} + | {name = "v2"; configuration = {separator = "/"}} -> + Ok {encoding = V2; sep = "/"} + | {name = e; configuration = {separator = s}} -> + Error ("Unsupported chunk key configuration: " ^ e ^ ", " ^ s) +end + +module Datatype = struct + type t = + | Char + | Int8 + | Uint8 + | Int16 + | Uint16 + | Int32 + | Int64 + | Float32 + | Float64 + | Complex32 + | Complex64 + + let to_yojson = function + | Char -> `String "char" + | Int8 -> `String "int8" + | Uint8 -> `String "uint8" + | Int16 -> `String "int16" + | Uint16 -> `String "uint16" + | Int32 -> `String "int32" + | Int64 -> `String "int64" + | Float32 -> `String "float32" + | Float64 -> `String "float64" + | Complex32 -> `String "complex32" + | Complex64 -> `String "complex64" + + let of_yojson = function + | `String "char" -> Ok Char + | `String "int8" -> Ok Int8 + | `String "uint8" -> Ok Uint8 + | `String "int16" -> Ok Int16 + | `String "uint16" -> Ok Uint16 + | `String "int32" -> Ok Int32 + | `String "int64" -> Ok Int64 + | `String "float32" -> Ok Float32 + | `String "float64" -> Ok Float64 + | `String "complex32" -> Ok 
Complex32 + | `String "complex64" -> Ok Complex64 + | _ -> Error ("Unsupported metadata data_type") +end diff --git a/lib/extensions.mli b/lib/extensions.mli new file mode 100644 index 0000000..b86ecc6 --- /dev/null +++ b/lib/extensions.mli @@ -0,0 +1,44 @@ +module RegularGrid : sig + type t + val create : int array -> t + val chunk_shape : t -> int array + val grid_shape : t -> int array -> int array + val indices : t -> int array -> int array list + val index_coord_pair : t -> int array -> int array * int array + val of_yojson : Yojson.Safe.t -> (t, string) result + val to_yojson : t -> Yojson.Safe.t +end + +type separator = Dot | Slash +(** A type representing the separator in an array chunk's key encoding. + For example, [Slash] is "/", and is used to encode the chunk index + [(0, 3, 5)] as [0/3/5]. *) + +module ChunkKeyEncoding : sig + type t + val create : separator -> t + val encode : t -> int array -> string + val of_yojson : Yojson.Safe.t -> (t, string) result + val to_yojson : t -> Yojson.Safe.t +end + +module Datatype : sig + (** Data types as defined in the Zarr V3 specification *) + + type t = + | Char + | Int8 + | Uint8 + | Int16 + | Uint16 + | Int32 + | Int64 + | Float32 + | Float64 + | Complex32 + | Complex64 + (** A type for the supported data types of a Zarr array. 
*) + + val of_yojson : Yojson.Safe.t -> (t, string) result + val to_yojson : t -> Yojson.Safe.t +end diff --git a/lib/metadata.ml b/lib/metadata.ml new file mode 100644 index 0000000..ce03f91 --- /dev/null +++ b/lib/metadata.ml @@ -0,0 +1,228 @@ +type error = + [ `Json_decode_error of string ] + +module FillValue = struct + type t = + | Char of char + | Bool of bool + | Int of int + | Float of float + | FloatBits of float + | IntComplex of Complex.t + | FloatComplex of Complex.t + | FFComplex of Complex.t + | FBComplex of Complex.t + | BFComplex of Complex.t + | BBComplex of Complex.t + + let rec of_yojson x = + let open Util.Result_syntax in + match x with + | `Bool b -> Ok (Bool b) + | `Int i -> Ok (Int i) + | `Float f -> Ok (Float f) + | `String "Infinity" -> + Ok (Float Float.infinity) + | `String "-Infinity" -> + Ok (Float Float.neg_infinity) + | `String "NaN" -> + Ok (Float Float.nan) + | `String s when String.length s = 1 -> + Ok (Char (String.get s 0)) + | `String s when String.starts_with ~prefix:"0x" s -> + let b = Int64.of_string s in + Ok (FloatBits (Int64.float_of_bits b)) + | `List [`Int x; `Int y] -> + let re = Float.of_int x + and im = Float.of_int y in + Ok (IntComplex Complex.{re; im}) + | `List [`Float re; `Float im] -> + Ok (FloatComplex Complex.{re; im}) + | `List [`String _ as a; `String _ as b] -> + of_yojson a >>= fun x -> + of_yojson b >>= fun y -> + (match x, y with + | Float re, Float im -> + Ok (FFComplex Complex.{re; im}) + | Float re, FloatBits im -> + Ok (FBComplex Complex.{re; im}) + | FloatBits re, Float im -> + Ok (BFComplex Complex.{re; im}) + | FloatBits re, FloatBits im -> + Ok (BBComplex Complex.{re; im}) + | _ -> Error "Unsupported fill value") + | _ -> Error "Unsupported fill value." 
+ + let rec to_yojson = function + | Bool b -> `Bool b + | Int i -> `Int i + | Char c -> + `String (String.of_seq @@ List.to_seq [c]) + | Float f when f = Float.infinity -> + `String "Infinity" + | Float f when f = Float.neg_infinity -> + `String "-Infinity" + | Float f when f = Float.nan -> + `String "NaN" + | Float f -> `Float f + | FloatBits f -> + `String (Stdint.Int64.to_string_hex @@ Int64.bits_of_float f) + | IntComplex Complex.{re; im} -> + `List [`Int (Float.to_int re); `Int (Float.to_int im)] + | FloatComplex Complex.{re; im} -> + `List [`Float re; `Float im] + | FFComplex Complex.{re; im} -> + `List [to_yojson (Float re); to_yojson (Float im)] + | FBComplex Complex.{re; im} -> + `List [to_yojson (Float re); to_yojson (FloatBits im)] + | BFComplex Complex.{re; im} -> + `List [to_yojson (FloatBits re); to_yojson (Float im)] + | BBComplex Complex.{re; im} -> + `List [to_yojson (FloatBits re); to_yojson (FloatBits im)] +end + +module ArrayMetadata = struct + type t = + {zarr_format : int + ;shape : int array + ;node_type : string + ;data_type : Extensions.Datatype.t + ;codecs : Codecs.Chain.t + ;fill_value : FillValue.t + ;chunk_grid : Extensions.RegularGrid.t + ;chunk_key_encoding : Extensions.ChunkKeyEncoding.t + ;attributes : Yojson.Safe.t option [@yojson.option] + ;dimension_names : string option list option [@yojson.option] + ;storage_transformers : Yojson.Safe.t Util.ext_point list option [@yojson.option]} + [@@deriving yojson] + + let create + ?(sep=Extensions.Slash) + ?(codecs=Codecs.Chain.default) + ~shape + fill_value + data_type + chunks + = + {shape + ;codecs + ;fill_value + ;data_type + ;chunk_grid = Extensions.RegularGrid.create chunks + ;chunk_key_encoding = Extensions.ChunkKeyEncoding.create sep + ;zarr_format = 3 + ;node_type = "array" + ;storage_transformers = None + ;dimension_names = None + ;attributes = None} + + let shape t = t.shape + + let codecs t = t.codecs + + let dtype t = t.data_type + + let fill_value t = t.fill_value + + let 
ndim t = Array.length @@ shape t + + let dimension_names t = t.dimension_names + + let attributes t : Yojson.Safe.t option = t.attributes + + let chunk_shape t = + Extensions.RegularGrid.chunk_shape t.chunk_grid + + let grid_shape t shape = + Extensions.RegularGrid.grid_shape t.chunk_grid shape + + let index_coord_pair t coord = + Extensions.RegularGrid.index_coord_pair t.chunk_grid coord + + let chunk_key t index = + Extensions.ChunkKeyEncoding.encode t.chunk_key_encoding index + + let chunk_indices t shape = + Extensions.RegularGrid.indices t.chunk_grid shape + + let encode t = + Yojson.Safe.to_string @@ to_yojson t + + let decode b = + let open Util.Result_syntax in + of_yojson @@ Yojson.Safe.from_string b >>? fun s -> + `Json_decode_error s + + let update_attributes attrs t = + {t with attributes = Some attrs} + + let update_shape t shape = {t with shape} + + let is_valid_kind + : type a b. t -> (a, b) Bigarray.kind -> bool + = fun t kind -> + match kind, t.data_type with + | Bigarray.Char, Extensions.Datatype.Char + | Bigarray.Int8_signed, Extensions.Datatype.Int8 + | Bigarray.Int8_unsigned, Extensions.Datatype.Uint8 + | Bigarray.Int16_signed, Extensions.Datatype.Int16 + | Bigarray.Int16_unsigned, Extensions.Datatype.Uint16 + | Bigarray.Int32, Extensions.Datatype.Int32 + | Bigarray.Int64, Extensions.Datatype.Int64 + | Bigarray.Float32, Extensions.Datatype.Float32 + | Bigarray.Float64, Extensions.Datatype.Float64 + | Bigarray.Complex32, Extensions.Datatype.Complex32 + | Bigarray.Complex64, Extensions.Datatype.Complex64 -> true + | _ -> false + + let fillvalue_of_kind + : type a b. 
t -> (a, b) Bigarray.kind -> a + = fun t kind -> + match kind, t.fill_value with + | Bigarray.Char, FillValue.Char c -> c + | Bigarray.Int8_signed, FillValue.Int i -> i + | Bigarray.Int8_unsigned, FillValue.Int i -> i + | Bigarray.Int16_signed, FillValue.Int i -> i + | Bigarray.Int16_unsigned, FillValue.Int i -> i + | Bigarray.Int32, FillValue.Int i -> Int32.of_int i + | Bigarray.Int64, FillValue.Int i -> Int64.of_int i + | Bigarray.Float32, FillValue.Float f -> f + | Bigarray.Float32, FillValue.FloatBits f -> f + | Bigarray.Float64, FillValue.Float f -> f + | Bigarray.Float64, FillValue.FloatBits f -> f + | Bigarray.Complex32, FillValue.IntComplex c -> c + | Bigarray.Complex32, FillValue.FloatComplex c -> c + | Bigarray.Complex32, FillValue.FFComplex c -> c + | Bigarray.Complex32, FillValue.FBComplex c -> c + | Bigarray.Complex32, FillValue.BFComplex c -> c + | Bigarray.Complex32, FillValue.BBComplex c -> c + | Bigarray.Complex64, FillValue.IntComplex c -> c + | Bigarray.Complex64, FillValue.FloatComplex c -> c + | Bigarray.Complex64, FillValue.FFComplex c -> c + | Bigarray.Complex64, FillValue.FBComplex c -> c + | Bigarray.Complex64, FillValue.BFComplex c -> c + | Bigarray.Complex64, FillValue.BBComplex c -> c + | _ -> failwith "kind is not compatible with node's fill value." +end + +module GroupMetadata = struct + type t = + {zarr_format : int + ;node_type : string + ;attributes : Yojson.Safe.t option [@yojson.option]} + [@@deriving yojson] + + let default = + {zarr_format = 3; node_type = "group"; attributes = None} + + let decode s = + let open Util.Result_syntax in + of_yojson @@ Yojson.Safe.from_string s >>? 
fun s -> + `Json_decode_error s + + let encode t = + Yojson.Safe.to_string @@ to_yojson t + + let update_attributes attrs t = + {t with attributes = Some attrs} +end diff --git a/lib/metadata.mli b/lib/metadata.mli new file mode 100644 index 0000000..83fd718 --- /dev/null +++ b/lib/metadata.mli @@ -0,0 +1,151 @@ +(** This module provides functionality for manipulating a Zarr node's + metadata JSON document. + + The Zarr V3 specification defines two types of metadata documents: + array and group metadata. Both types are stored under the key + [zarr.json] within the prefix of a group or array.*) + +type error = + [ `Json_decode_error of string ] +(** A type for JSON decoding errors. *) + +module FillValue : sig + type t = + | Char of char (** A single character string. *) + | Bool of bool (** Must be a JSON boolean. *) + | Int of int (** Value must be a JSON number with no fractional or exponent part that is within the representable range of the corresponding integer data type. *) + | Float of float (** Value representing a JSON float. *) + | FloatBits of float (** A JSON string specifying a byte representation of the float a hexstring. *) + | IntComplex of Complex.t (** A JSON 2-element array of integers representing a complex number. *) + | FloatComplex of Complex.t (** A JSON 2-element array of floats representing a complex number. *) + | FFComplex of Complex.t + | FBComplex of Complex.t + | BFComplex of Complex.t + | BBComplex of Complex.t + (** Provides an element value to use for uninitialised portions of + a Zarr array. The permitted values depend on the data type. *) +end + +module ArrayMetadata : sig + (** A module which contains functionality to work with a parsed JSON + Zarr array metadata document. *) + + type t + (** A type representing a parsed array metadata document. 
*) + + val create : + ?sep:Extensions.separator -> + ?codecs:Codecs.Chain.t -> + shape:int array -> + FillValue.t -> + Extensions.Datatype.t -> + int array -> + t + (** [create ~shape fv dtype cshp] Creates a new array metadata document + with shape [shape], fill value [fv], data type [dtype] and chunk shape + [cshp]. *) + + val encode : t -> string + (** [encode t] returns a byte string representing a JSON Zarr array metadata. *) + + val decode : string -> (t, [> error]) result + (** [decode s] decodes a byte string [s] into a {!Metadata.ArrayMetadata.t} + type, and returns an {!Metadata.error} error if the decoding process fails. *) + + val shape : t -> int array + (** [shape t] returns the shape of the zarr array represented by metadata type [t]. *) + + val fill_value : t -> FillValue.t + (** [fill_value t] returns the fill value of the zarr array represented by [t]. *) + + val ndim : t -> int + (** [ndim t] returns the number of dimensions in a Zarr array. *) + + val chunk_shape : t -> int array + (** [chunk_shape t] returns the shape of a chunk in this zarr array. *) + + val dtype : t -> Extensions.Datatype.t + (** [dtype t] returns the data type as specified in the array metadata. *) + + val is_valid_kind : t -> ('a, 'b) Bigarray.kind -> bool + (** [is_valid_kind t kind] checks if [kind] is a valid {!Bigarray.kind} that + matches the data type of the zarr array represented by this metadata type. *) + + val fillvalue_of_kind : t -> ('a, 'b) Bigarray.kind -> 'a + (** [fillvalue_of_kind t kind] returns the fill value of uninitialized + chunks in this zarr array given [kind]. + + @raise Failure if the kind is not compatible with this array's fill value. *) + + val attributes : t -> Yojson.Safe.t option + (** [attributes t] Returns a Yojson type containing user attributes assigned + to the zarr array represented by [t]. 
*) + + val dimension_names : t -> string option list option + (** [dimension_names t] returns a list of dimension names, if any are + defined in the array's JSON metadata document. *) + + val codecs : t -> Codecs.Chain.t + (** [codecs t] Returns a type representing the chain of codecs applied + when decoding/encoding a Zarr array chunk. *) + + val grid_shape : t -> int array -> int array + (** [grid_shape t] returns the shape of the Zarr array's regular chunk grid, + as defined in the Zarr V3 specification. *) + + val index_coord_pair : t -> int array -> int array * int array + (** [index_coord_pair t coord] maps a coordinate of this Zarr array to + a pair of chunk index and coordinate {i within} that chunk. *) + + val chunk_indices : t -> int array -> int array list + (** [chunk_indices t shp] returns a list of all chunk indices that would + be contained in a zarr array of shape [shp] given the regular grid + defined in array metadata [t]. *) + + val chunk_key : t -> int array -> string + (** [chunk_key t idx] returns a key encoding of the chunk index [idx]. *) + + val update_attributes : Yojson.Safe.t -> t -> t + (** [update_attributes json t] returns a new metadata type with an updated + attribute field containing contents in [json] *) + + val update_shape : t -> int array -> t + (** [update_shape t new_shp] returns a new metadata type containing + shape [new_shp]. *) + + val of_yojson : Yojson.Safe.t -> (t, string) result + (** [of_yojson json] converts a {!Yojson.Safe.t} object into a {!Metadata.ArrayMetadata.t} + and returns an error message upon failure. *) + + val to_yojson : t -> Yojson.Safe.t + (** [to_yojson t] serializes an array metadata type into a {!Yojson.Safe.t} object. *) +end + +module GroupMetadata : sig + (** A module which contains functionality to work with a parsed JSON + Zarr group metadata document. *) + + type t + (** A type representing a parsed group metadata document. 
*) + + val default : t + (** Return a group metadata type with default values for all fields. *) + + val encode : t -> string + (** [encode t] returns a byte string representing a JSON Zarr group metadata. *) + + val decode : string -> (t, [> error]) result + (** [decode s] decodes a bytes string [s] into a {!Metadata.GroupMetadata.t} + type, and returns an {!Metadata.error} error if the decoding process fails. *) + + val update_attributes : Yojson.Safe.t -> t -> t + (** [update_attributes json t] returns a new metadata type with an updated + attribute field containing contents in [json]. *) + + val of_yojson : Yojson.Safe.t -> (t, string) result + (** [of_yojson json] converts a {!Yojson.Safe.t} object into a {!Metadata.GroupMetadata.t} + and returns an error message upon failure. *) + + val to_yojson : t -> Yojson.Safe.t + (** [to_yojson t] serializes a group metadata type into a {!Yojson.Safe.t} object. *) +end diff --git a/lib/node.ml b/lib/node.ml new file mode 100644 index 0000000..3340b39 --- /dev/null +++ b/lib/node.ml @@ -0,0 +1,89 @@ +type t = + | Root + | Cons of t * name +and name = string + +type error = + [ `Node_invariant_error of string ] + +(* Check if the path's name satisfies path invariants *) +let rep_ok name = + (String.empty <> name) && + not (String.contains name '/') && + not (String.for_all (Char.equal '.') name) && + not (String.starts_with ~prefix:"__" name) + +let root = Root + +let unsafe_create p n = Cons (p, n) + +let create parent name = + if rep_ok name then + Ok (unsafe_create parent name) + else + Error (`Node_invariant_error name) + +let ( / ) = create + +let of_path = function + | "/" -> Ok Root + | str -> + if not String.(starts_with ~prefix:"/" str) then + Result.error @@ + `Node_invariant_error "path should start with a /" + else if String.ends_with ~suffix:"/" str then + Result.error @@ + `Node_invariant_error "path should not end with a /" + else + Result.ok @@ + List.fold_left + unsafe_create + Root (List.tl @@ 
String.split_on_char '/' str) + +let name = function + | Root -> "" + | Cons (_, n) -> n + +let parent = function + | Root -> None + | Cons (parent, _) -> Some parent + +let rec fold f acc = function + | Root -> f acc Root + | Cons (parent, _) as p -> + fold f (f acc p) parent + +let rec ( = ) x y = + match x, y with + | Root, Root -> true + | Root, Cons _ | Cons _, Root -> false + | Cons (p, n), Cons (q, m) -> ( = ) p q && String.equal n m + +let to_path = function + | Root -> "/" + | p -> + fold (fun acc -> function + | Root -> acc + | Cons (_, n) -> "/" :: n :: acc) [] p + |> String.concat "" + +let ancestors p = + fold (fun acc -> function + | Root -> acc + | Cons (parent, _) -> parent :: acc) [] p + +let to_key p = + let str = to_path p in + String.(length str - 1 |> sub str 1) + +let to_prefix = function + | Root -> "" + | p -> to_key p ^ "/" + +let to_metakey p = + to_prefix p ^ "zarr.json" + +let is_parent x y = + match x, y with + | Root, _ -> false + | Cons (parent, _), v -> parent = v diff --git a/lib/node.mli b/lib/node.mli new file mode 100644 index 0000000..13e69d8 --- /dev/null +++ b/lib/node.mli @@ -0,0 +1,70 @@ +(** This module provides functionality for manipulating Zarr nodes. + + A Zarr V3 node is associated with either a group or an array. + All nodes in a hierarchy have a name and a path. The root node does not + have a name and is the empty string "". Except for the root node, each + node in a hierarchy must have a name, which is a string of unicode code + points. The following constraints apply to node names: + - must not be the empty string (""). + - must not include the character "/". + - must not be a string composed only of period characters, e.g. "." or "..". + - must not start with the reserved prefix "__".*) + +type t +(** The type of a node. *) + +type error = + [ `Node_invariant_error of string ] +(** The error type for operations on the {!Node.t} type. 
It is returned by + functions that create a {!Node.t} type when one or more of a Node's + invariants are not satisfied as defined in the Zarr V3 specification.*) + +val root : t +(** Returns the root node *) + +val create : t -> string -> (t, [> error]) result +(** [create p n] returns a node with parent [p] and name [n] + or an error of type {!error} if this operation fails. *) + +val ( / ) : t -> string -> (t, [> error]) result +(** The infix operator alias of {!Node.create} *) + +val of_path : string -> (t, [> error]) result +(** [of_path s] returns a node from string [s] or an error of + type {!error} upon failure. *) + +val to_path : t -> string +(** [to_path n] returns node [n] as a string path. *) + +val name : t -> string +(** [name n] returns the name of node [n]. The root node does not have a + name and thus the empty string [""] is returned if [n] is a root node. *) + +val parent : t -> t option +(** [parent n] returns [Some p] where [p] is the parent node of [n] + or [None] if node [n] is the root node. *) + +val ( = ) : t -> t -> bool +(** [x = y] returns [true] if nodes [x] and [y] are equal, + and [false] otherwise. *) + +val ancestors : t -> t list +(** [ancestors n] returns ancestor nodes of [n] including the root node. + The root node has no ancestors, thus this returns the empty list + when called on a root node. *) + +val to_key : t -> string +(** [to_key n] converts a node's path to a key, as defined in the Zarr V3 + specification. *) + +val to_prefix : t -> string +(** [to_prefix n] converts a node's path to a prefix key, as defined + in the Zarr V3 specification. *) + +val to_metakey : t -> string +(** [to_metakey n] returns the metadata key associated with node [n], + as defined in the Zarr V3 specification. *) + +val is_parent : t -> t -> bool +(** [is_parent n m] Tests if node [n] is the immediate parent of + node [m]. Returns [true] when the test passes and [false] otherwise. 
*) diff --git a/lib/storage/base.ml b/lib/storage/base.ml new file mode 100644 index 0000000..07cc6b3 --- /dev/null +++ b/lib/storage/base.ml @@ -0,0 +1,93 @@ +open Interface + +(* general implementation agnostic STORE interface functions *) + +module StrSet = Set.Make (String) + +let erase_values ~erase_fn t keys = + StrSet.iter (erase_fn t) @@ StrSet.of_list keys + +let erase_prefix ~list_fn ~erase_fn t pre = + List.iter (fun k -> + if String.starts_with ~prefix:pre k + then begin + erase_fn t k + end) @@ list_fn t + +let list_prefix ~list_fn t pre = + List.filter + (String.starts_with ~prefix:pre) + (list_fn t) + +let list_dir ~list_fn t pre = + let paths = + List.map + (fun k -> + Result.get_ok @@ + Node.of_path @@ + String.cat "/" k) + (list_prefix ~list_fn t pre) + in + let is_prefix_child k = + match Node.parent k with + | Some par -> + String.equal pre @@ Node.to_prefix par + | None -> false in + let keys, rest = + List.partition_map (fun k -> + match is_prefix_child k with + | true -> Either.left @@ Node.to_key k + | false -> Either.right k) + paths + in + let prefixes = + List.fold_left (fun acc k -> + match + List.find_opt + is_prefix_child + (Node.ancestors k) + with + | None -> acc + | Some v -> + let w = Node.to_prefix v in + if List.mem w acc then acc + else w :: acc) + [] rest + in + keys, prefixes + +let rec get_partial_values ~get_fn t kr_pairs = + match kr_pairs with + | [] -> [None] + | (k, r) :: xs -> + match get_fn t k with + | Error _ -> + None :: (get_partial_values ~get_fn t xs) + | Ok v -> + try + let sub = match r with + | ByteRange (rs, None) -> + String.sub v rs @@ String.length v + | ByteRange (rs, Some rl) -> + String.sub v rs rl in + Some sub :: (get_partial_values ~get_fn t xs) + with + | Invalid_argument _ -> + None :: (get_partial_values ~get_fn t xs) + +let rec set_partial_values ~set_fn ~get_fn t = function + | [] -> Ok () + | (k, rs, v) :: xs -> + match get_fn t k with + | Error _ -> + set_fn t k v; + set_partial_values ~set_fn 
~get_fn t xs + | Ok ov -> + try + let ov' = Bytes.of_string ov in + String.(length v |> blit v 0 ov' rs); + set_fn t k @@ Bytes.to_string ov'; + set_partial_values ~set_fn ~get_fn t xs + with + | Invalid_argument s -> + Error (`Invalid_byte_range s) diff --git a/lib/storage/filesystem.ml b/lib/storage/filesystem.ml new file mode 100644 index 0000000..0eca39a --- /dev/null +++ b/lib/storage/filesystem.ml @@ -0,0 +1,115 @@ +module Impl = struct + type t = + {dirname : string; file_perm : Unix.file_perm} + + let fspath_to_key t p = + let ld = String.length t.dirname in + String.sub p ld @@ String.length p - ld + + let key_to_fspath t key = t.dirname ^ key + + (* Obtained from: + https://discuss.ocaml.org/t/how-to-create-a-new-file-while-automatically-creating-any-intermediate-directories/14837/5?u=zoj613 *) + let rec create_parent_dir fn perm = + let parent_dir = Filename.dirname fn in + if not (Sys.file_exists parent_dir) then begin + create_parent_dir parent_dir perm; + Sys.mkdir parent_dir perm + end + + let get t key = + let fpath = key_to_fspath t key in + try + In_channel.with_open_gen + In_channel.[Open_rdonly] + t.file_perm + fpath + (fun ic -> Ok (In_channel.input_all ic)) + with + | Sys_error _ | End_of_file -> + Error (`Store_read_error fpath) + + let set t key value = + let filename = key_to_fspath t key in + create_parent_dir filename t.file_perm; + Out_channel.with_open_gen + Out_channel.[Open_wronly; Open_trunc; Open_creat] + t.file_perm + filename + (fun oc -> Out_channel.output_string oc value) + + let list t = + let module StrSet = Base.StrSet in + let rec aux acc path = + try + match Sys.readdir path with + | [||] -> acc + | xs -> + Array.fold_left (fun set x -> + match path ^ x with + | p when Sys.is_directory p -> + aux set @@ p ^ "/" + | p -> + StrSet.add (fspath_to_key t p) set) acc xs + with + | Sys_error _ -> acc + in + match + StrSet.to_list @@ + aux StrSet.empty @@ + key_to_fspath t "" + with + | [] -> [] + | xs -> "" :: xs + + let is_member 
t key = + Sys.file_exists @@ key_to_fspath t key + + let erase t key = + try Sys.remove @@ key_to_fspath t key with + | Sys_error _ -> () + + let get_partial_values t kr_pairs = + Base.get_partial_values ~get_fn:get t kr_pairs + + let set_partial_values t krv_triplet = + Base.set_partial_values ~set_fn:set ~get_fn:get t krv_triplet + + let erase_values t keys = + Base.erase_values ~erase_fn:erase t keys + + let erase_prefix t pre = + Base.erase_prefix ~list_fn:list ~erase_fn:erase t pre + + let list_prefix pre t = + Base.list_prefix ~list_fn:list t pre + + let list_dir t pre = + Base.list_dir ~list_fn:list t pre +end + +let create ?(file_perm=0o640) path = + Impl.create_parent_dir path file_perm; + Sys.mkdir path file_perm; + let dirname = + if String.ends_with ~suffix:"/" path then + path + else + path ^ "/" in + Impl.{dirname; file_perm} + +let open_store ?(file_perm=0o640) path = + if Sys.is_directory path then + let dirname = + if String.ends_with ~suffix:"/" path then + path + else + path ^ "/" in + Ok Impl.{dirname; file_perm} + else + Result.error @@ + `Store_read_error (path ^ " is not a Filesystem store.") + +let open_or_create ?(file_perm=0o640) path = + try open_store ~file_perm path with + | Sys_error _ -> Ok (create ~file_perm path) diff --git a/lib/storage/interface.ml b/lib/storage/interface.ml new file mode 100644 index 0000000..7557002 --- /dev/null +++ b/lib/storage/interface.ml @@ -0,0 +1,311 @@ +open Metadata +open Util.Result_syntax + +type key = string +type range = ByteRange of int * int option + +type error = + [ `Store_read_error of string + | `Invalid_slice of string + | `Invalid_kind of string + | `Reshape_error of string + | `Invalid_byte_range of string + | Codecs.error + | Metadata.error ] + +module type STORE = sig + type t + val get : t -> key -> (string, [> error]) result + val get_partial_values : t -> (key * range) list -> string option list + val set : t -> key -> string -> unit + val set_partial_values : t -> (key * int * 
string) list -> (unit, [> error]) result + val erase : t -> key -> unit + val erase_values : t -> key list -> unit + val erase_prefix : t -> key -> unit + val list : t -> key list + val list_prefix : key -> t -> key list + val list_dir : t -> key -> key list * string list + val is_member : t -> key -> bool +end + +module Ndarray = Owl.Dense.Ndarray.Generic + +module type S = sig + type t + + val create_group + : ?metadata:GroupMetadata.t -> t -> Node.t -> unit + + val create_array + : ?sep:Extensions.separator -> + ?codecs:Codecs.Chain.t -> + shape:int array -> + chunks:int array -> + Metadata.FillValue.t -> + Extensions.Datatype.t -> + Node.t -> + t -> unit + + val array_metadata + : Node.t -> t -> (ArrayMetadata.t, [> error]) result + + val group_metadata + : Node.t -> t -> (GroupMetadata.t, [> error]) result + + val find_child_nodes + : t -> Node.t -> (Node.t list * Node.t list, string) result + + val find_all_nodes : t -> Node.t list + + val erase_node : t -> Node.t -> unit + + val is_member : t -> Node.t -> bool + + val set_array + : Node.t -> + Owl_types.index array -> + ('a, 'b) Ndarray.t -> + t -> + (unit, [> error]) result + + val get_array + : Node.t -> + Owl_types.index array -> + ('a, 'b) Bigarray.kind -> + t -> + (('a, 'b) Ndarray.t, [> error]) result + + val reshape + : t -> Node.t -> int array -> (unit, [> error]) result +end + +module Make (M : STORE) : S with type t = M.t = struct + module ArraySet = Util.ArraySet + module Arraytbl = Util.Arraytbl + module AM = ArrayMetadata + module GM = GroupMetadata + include M + + let rec create_group ?metadata t node = + match metadata, Node.to_metakey node with + | Some m, k -> set t k @@ GM.encode m; + | None, k -> set t k @@ GM.(default |> encode); + make_implicit_groups_explicit t node + + and make_implicit_groups_explicit t node = + List.iter (fun n -> + match get t @@ Node.to_metakey n with + | Ok _ -> () + | Error _ -> create_group t n) @@ Node.ancestors node + + let create_array + 
?(sep=Extensions.Slash) ?(codecs=Codecs.Chain.default) ~shape ~chunks fillvalue dtype node t = + set t (Node.to_metakey node) @@ + AM.encode @@ AM.create ~sep ~codecs ~shape fillvalue dtype chunks; + make_implicit_groups_explicit t node + + (* All nodes are explicit upon creation so just check the node's metadata key.*) + let is_member t node = + M.is_member t @@ Node.to_metakey node + + (* Assumes without checking that [metakey] is a valid node metadata key.*) + let unsafe_node_type t metakey = + let open Yojson.Safe in + get t metakey |> Result.get_ok |> from_string + |> Util.member "node_type" |> Util.to_string + + let get_metadata node t = + match is_member t node, Node.to_metakey node with + | true, k when unsafe_node_type t k = "array" -> + get t k >>= fun bytes -> + AM.decode bytes >>= fun meta -> + Ok (Either.left meta) + | true, k -> + get t k >>= fun bytes -> + GM.decode bytes >>= fun meta -> + Ok (Either.right meta) + | false, _ -> + Error (`Store_read_error (Node.to_path node ^ " is not a store member.")) + + let group_metadata node t = + match get_metadata node t with + | Ok x -> Ok (Either.find_right x |> Option.get) + | Error _ as err -> err + + let array_metadata node t = + match get_metadata node t with + | Ok x -> Ok (Either.find_left x |> Option.get) + | Error _ as err -> err + + let find_child_nodes t node = + match is_member t node, Node.to_metakey node with + | true, k when unsafe_node_type t k = "group" -> + Result.ok @@ + List.fold_left (fun (lacc, racc) pre -> + match + Node.of_path @@ + "/" ^ String.(length pre - 1 |> sub pre 0) + with + | Ok x -> + if unsafe_node_type t (pre ^ "zarr.json") = "array" then + x :: lacc, racc + else + lacc, x :: racc + | Error _ -> lacc, racc) + ([], []) (snd @@ list_dir t @@ Node.to_prefix node) + | true, _ -> + Error (Node.to_path node ^ " is not a group node.") + | false, _ -> + Error (Node.to_path node ^ " is not a node in this heirarchy.") + + let find_all_nodes t = + let rec aux acc p = + match 
find_child_nodes t p with + | Error _ -> acc + | Ok ([], []) -> p :: acc + | Ok (arrays, groups) -> + arrays @ p :: List.concat_map (aux acc) groups + in aux [] Node.root + + let erase_node t node = + erase_prefix t @@ Node.to_prefix node + + let set_array + : type a b. + Node.t -> + Owl_types.index array -> + (a, b) Ndarray.t -> + t -> + (unit, [> error]) result + = fun node slice x t -> + let open Util in + get t @@ Node.to_metakey node >>= fun bytes -> + AM.decode bytes >>= fun meta -> + (if Ndarray.shape x = Indexing.slice_shape slice @@ AM.shape meta then + Ok () + else + Error (`Invalid_slice "slice and input array shapes are unequal.")) + >>= fun () -> + (if AM.is_valid_kind meta @@ Ndarray.kind x then + Ok () + else + Result.error @@ + `Invalid_kind ( + "input array's kind is not compatible with node's data type.")) + >>= fun () -> + let coords = Indexing.coords_of_slice slice @@ AM.shape meta in + let tbl = Arraytbl.create @@ Array.length coords + in + Ndarray.iteri (fun i y -> + let k, c = AM.index_coord_pair meta coords.(i) in + Arraytbl.add tbl k (c, y)) x; + let repr = + {kind = Ndarray.kind x + ;shape = AM.chunk_shape meta + ;fill_value = AM.fillvalue_of_kind meta @@ Ndarray.kind x} + in + let codecs = AM.codecs meta in + let prefix = Node.to_prefix node in + let cindices = ArraySet.of_seq @@ Arraytbl.to_seq_keys tbl in + ArraySet.fold (fun idx acc -> + acc >>= fun () -> + let chunkkey = prefix ^ AM.chunk_key meta idx in + (match get t chunkkey with + | Ok b -> + Codecs.Chain.decode codecs repr b + | Error _ -> + Ok (Ndarray.create repr.kind repr.shape repr.fill_value)) + >>= fun arr -> + (* find_all returns bindings in reverse order. To restore the + * C-ordering of elements we must call List.rev. 
*) + let coords, vals = + List.split @@ + List.rev @@ + Arraytbl.find_all tbl idx in + let slice' = Indexing.slice_of_coords coords in + let shape' = Indexing.slice_shape slice' repr.shape in + let x' = Ndarray.of_array repr.kind (Array.of_list vals) shape' in + (* Ndarray.set_fancy* unfortunately doesn't work for array kinds + other than Float32, Float64, Complex32 and Complex64. + See: https://github.com/owlbarn/owl/issues/671 *) + Ndarray.set_fancy_ext slice' arr x'; (* possible to rewrite this function? *) + Codecs.Chain.encode codecs arr >>| fun encoded -> + set t chunkkey encoded) cindices (Ok ()) + + let get_array + : type a b. + Node.t -> + Owl_types.index array -> + (a, b) Bigarray.kind -> + t -> + ((a, b) Ndarray.t, [> error]) result + = fun node slice kind t -> + let open Util in + get t @@ Node.to_metakey node >>= fun bytes -> + AM.decode bytes >>= fun meta -> + (if AM.is_valid_kind meta kind then + Ok () + else + Result.error @@ + `Invalid_kind ("input kind is not compatible with node's data type.")) + >>= fun () -> + (try + Ok (Indexing.slice_shape slice @@ AM.shape meta) + with + | Assert_failure _ -> + Result.error @@ + `Store_read_error "slice shape is not compatible with node's shape.") + >>= fun sshape -> + let pair = + Array.map + (AM.index_coord_pair meta) + (Indexing.coords_of_slice slice @@ AM.shape meta) in + let tbl = Arraytbl.create @@ Array.length pair in + let prefix = Node.to_prefix node in + let chain = AM.codecs meta in + let repr = + {kind + ;shape = AM.chunk_shape meta + ;fill_value = AM.fillvalue_of_kind meta kind} + in + Array.fold_right (fun (idx, coord) acc -> + acc >>= fun l -> + match Arraytbl.find_opt tbl idx with + | Some arr -> + Ok (Ndarray.get arr coord :: l) + | None -> + (match get t @@ prefix ^ AM.chunk_key meta idx with + | Ok b -> + Codecs.Chain.decode chain repr b + | Error _ -> + Ok (Ndarray.create repr.kind repr.shape repr.fill_value)) + >>= fun arr -> + Arraytbl.add tbl idx arr; + Ok (Ndarray.get arr coord :: l)) 
pair (Ok []) + >>| fun res -> + Ndarray.of_array kind (Array.of_list res) sshape + + let reshape t node shape = + let mkey = Node.to_metakey node in + (if "array" = unsafe_node_type t mkey then + Ok () + else + Error (`Reshape_error (Node.to_path node ^ " is not an array node."))) + >>= fun () -> + get t mkey >>= fun bytes -> + AM.decode bytes >>= fun meta -> + (if Array.length shape = Array.length @@ AM.shape meta then + Ok () + else + Error (`Reshape_error "new shape must have same number of dimensions.")) + >>= fun () -> + let pre = Node.to_prefix node in + let s = + ArraySet.of_list @@ AM.chunk_indices meta @@ AM.shape meta in + let s' = + ArraySet.of_list @@ AM.chunk_indices meta shape in + ArraySet.iter + (fun v -> erase t @@ pre ^ AM.chunk_key meta v) + ArraySet.(diff s s'); + Ok (set t mkey @@ AM.encode @@ AM.update_shape meta shape) +end diff --git a/lib/storage/memory.ml b/lib/storage/memory.ml new file mode 100644 index 0000000..7c7a2f6 --- /dev/null +++ b/lib/storage/memory.ml @@ -0,0 +1,45 @@ +module StrMap = Hashtbl.Make (String) + +module Impl = struct + type t = string StrMap.t + + let get t key = + Option.to_result + ~none:(`Store_read_error key) @@ + StrMap.find_opt t key + + let set t key value = + StrMap.replace t key value + + let list t = + StrMap.to_seq_keys t |> List.of_seq + + let is_member = StrMap.mem + + let erase = StrMap.remove + + let erase_prefix t pre = + StrMap.filter_map_inplace + (fun k v -> + if String.starts_with ~prefix:pre k then + None + else + Some v) t + + let get_partial_values t kr_pairs = + Base.get_partial_values ~get_fn:get t kr_pairs + + let set_partial_values t krv_triplet = + Base.set_partial_values ~set_fn:set ~get_fn:get t krv_triplet + + let erase_values t keys = + Base.erase_values ~erase_fn:erase t keys + + let list_prefix pre t = + Base.list_prefix ~list_fn:list t pre + + let list_dir t pre = + Base.list_dir ~list_fn:list t pre +end + +let create () = StrMap.create 16 diff --git a/lib/storage/storage.ml 
b/lib/storage/storage.ml new file mode 100644 index 0000000..1910687 --- /dev/null +++ b/lib/storage/storage.ml @@ -0,0 +1,17 @@ +type error = Interface.error + +module type S = Interface.S + +module MemoryStore = struct + module MS = Interface.Make (Memory.Impl) + let create = Memory.create + include MS +end + +module FilesystemStore = struct + module FS = Interface.Make (Filesystem.Impl) + let create = Filesystem.create + let open_store = Filesystem.open_store + let open_or_create = Filesystem.open_or_create + include FS +end diff --git a/lib/storage/storage.mli b/lib/storage/storage.mli new file mode 100644 index 0000000..2edfbca --- /dev/null +++ b/lib/storage/storage.mli @@ -0,0 +1,18 @@ +type error = Interface.error + +module type S = Interface.S + +module MemoryStore : sig + include S + val create : unit -> t +end + +module FilesystemStore : sig + include S + val create + : ?file_perm:Unix.file_perm -> string -> t + val open_store + : ?file_perm:Unix.file_perm -> string -> (t, [> error]) result + val open_or_create + : ?file_perm:Unix.file_perm -> string -> (t, [> error]) result +end diff --git a/lib/util.ml b/lib/util.ml new file mode 100644 index 0000000..f7232b8 --- /dev/null +++ b/lib/util.ml @@ -0,0 +1,107 @@ +type 'a ext_point = + {name : string + ;configuration : 'a} +[@@deriving yojson] + +type ('a, 'b) array_repr = + {kind : ('a, 'b) Bigarray.kind + ;shape : int array + ;fill_value : 'a} + +module HashableArray = struct + include Array + type t = int array + let hash = Hashtbl.hash + let equal x y = Array.for_all2 Int.equal x y +end + +module ComparableArray = struct + include Array + type t = int array + let compare = Stdlib.compare +end + +module ArraySet = Set.Make (ComparableArray) + +module Arraytbl = Hashtbl.Make (HashableArray) + +module Result_syntax = struct + let ( let* ) = Result.bind + let ( let+ ) = Result.map + + let ( >>= ) = Result.bind + + let ( >>| ) x f = (* infix map *) + match x with + | Ok v -> Ok (f v) + | Error _ as e -> e + 
+ let ( >>? ) x f = (* map_error *) + match x with + | Ok _ as k -> k + | Error e -> Error (f e) + + let ( and+ ) x y = (* product *) + match x, y with + | Ok a, Ok b -> Ok (a, b) + | Error e, Ok _ | Ok _, Error e | Error e, Error _ -> Error e +end + +module Indexing = struct + let rec cartesian_prod = function + | [] -> [[]] + | x :: xs -> + List.concat_map (fun i -> + List.map (List.cons i) (cartesian_prod xs)) x + + let range ?(step=1) start stop = + List.of_seq @@ if step > 0 then + Seq.unfold (function + | x when x > stop -> None + | x -> Some (x, x + step)) start + else + Seq.unfold (function + | x when x < start -> None + | x -> Some (x, x + step)) stop + + (* get indices from a reformated slice *) + let indices_of_slice = function + | Owl_types.R_ [|start; stop; step|] -> range ~step start stop + | Owl_types.L_ l -> Array.to_list l + (* this is added for exhaustiveness but is never reached since + a reformatted slice replaces a I_ index with an R_ index.*) + | _ -> failwith "Invalid slice index." 
+ + let reformat_slice slice shape = + Owl_slicing.check_slice_definition + (Owl_slicing.sdarray_to_sdarray slice) shape + + let coords_of_slice slice shape = + (Array.map indices_of_slice @@ + reformat_slice slice shape) + |> Array.to_list + |> cartesian_prod + |> List.map Array.of_list + |> Array.of_list + + let slice_of_coords = function + | [] -> [||] + | xs -> + let ndims = Array.length @@ List.hd xs in + let indices = Array.make ndims [] in + Array.map (fun x -> Owl_types.L x) @@ + List.fold_right (fun x acc -> + Array.iteri (fun i y -> + if List.mem y acc.(i) then () + else acc.(i) <- y :: acc.(i)) x; acc) xs indices + + let slice_shape slice array_shape = + Owl_slicing.calc_slice_shape @@ + reformat_slice slice array_shape +end + +let get_name j = + Yojson.Safe.Util.(member "name" j |> to_string) + +let prod x = + Array.fold_left Int.mul 1 x diff --git a/lib/util.mli b/lib/util.mli new file mode 100644 index 0000000..2231731 --- /dev/null +++ b/lib/util.mli @@ -0,0 +1,68 @@ +type 'a ext_point = + {name : string ; configuration : 'a} +[@@deriving yojson] +(** The type representing a JSON extension point metadata configuration. *) + +type ('a, 'b) array_repr = + {kind : ('a, 'b) Bigarray.kind + ;shape : int array + ;fill_value : 'a} +(** The type summarizing the decoded/encoded representation of a Zarr array + or chunk. *) + +module Arraytbl : sig include Hashtbl.S with type key = int array end +(** A hashtable with integer array keys. *) + +module ArraySet : sig include Set.S with type elt = int array end +(** A hash set of integer array elements. *) + +module Result_syntax : sig + (** Result monad operator syntax. *) + + val ( let* ) + : ('a, 'e) result -> ('a -> ('b, 'e) result ) -> ('b, 'e) result + val ( >>= ) + : ('a, 'e) result -> ('a -> ('b, 'e) result ) -> ('b, 'e) result + val ( let+ ) + : ('a -> 'b) -> ('a, 'e) result -> ('b, 'e) result + val ( >>| ) + : ('a, 'e) result -> ('a -> 'b) -> ('b, 'e) result + val ( >>? 
) + : ('a, 'e) result -> ('e -> 'f) -> ('a, 'f) result + val ( and+ ) + : ('a, 'e) result -> ('b, 'e) result -> (('a * 'b), 'e) result +end + +module Indexing : sig + (** A module housing functions for creating and manipulating indices and + slices for working with Zarr arrays. *) + + val slice_of_coords + : int array list -> Owl_types.index array + (** [slice_of_coords c] takes a list of array coordinates and returns + a slice corresponding to the coordinates. *) + + val coords_of_slice + : Owl_types.index array -> int array -> int array array + (** [coords_of_slice s shp] returns an array of coordinates given + a slice [s] and array shape [shp]. *) + + val cartesian_prod + : 'a list list -> 'a list list + (** [cartesian_prod ll] returns a cartesian product of the elements of + list [ll]. It is mainly used to generate a C-order of chunk indices + in a regular Zarr array grid. *) + + val slice_shape + : Owl_types.index array -> int array -> int array + (** [slice_shape s shp] returns the shape of slice [s] within an array + of shape [shp]. *) +end + +val get_name : Yojson.Safe.t -> string +(** [get_name c] returns the name value of a JSON metadata extension point + configuration of the form [{"name": value, "configuration": ...}], + as defined in the Zarr V3 specification. *) + +val prod : int array -> int +(** [prod x] returns the product of the elements of [x]. 
*) diff --git a/zarr.opam b/zarr.opam new file mode 100644 index 0000000..fb7c710 --- /dev/null +++ b/zarr.opam @@ -0,0 +1,37 @@ +# This file is generated by dune, edit dune-project instead +opam-version: "2.0" +synopsis: "A short synopsis" +description: "A longer description" +maintainer: ["Maintainer Name"] +authors: ["Author Name"] +license: "BSD-3-Clause" +tags: ["topics" "to describe" "your" "project"] +homepage: "https://github.com/zoj613/zarr-ml" +doc: "https://url/to/documentation" +bug-reports: "https://github.com/zoj613/zarr-ml/issues" +depends: [ + "dune" {>= "3.15"} + "ocaml" {>= "4.14.2"} + "yojson" + "ppx_deriving_yojson" + "ezgzip" + "checkseum" + "stdint" + "owl" + "odoc" {with-doc} +] +build: [ + ["dune" "subst"] {dev} + [ + "dune" + "build" + "-p" + name + "-j" + jobs + "@install" + "@runtest" {with-test} + "@doc" {with-doc} + ] +] +dev-repo: "git+https://github.com/zoj613/zarr-ml.git"