From c85da00d24bdac3fe5ec16444f2443a3ad4f2725 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 17 Feb 2023 09:41:16 +0100 Subject: [PATCH] Add functionality for gzipped .jsonl reading and writing (#84) * Add functionality for gzipped .jsonl reading and writing. * Remove srsly import. * Skip cloudpickle test test_builtin_classmethod(). * Fix docstring comment for test_read_jsonl_gzip(). * Update srsly/tests/cloudpickle/cloudpickle_test.py * Format. --------- Co-authored-by: Adriane Boyd --- README.md | 75 ++++++++++++++------- srsly/_json_api.py | 43 +++++++++++- srsly/tests/cloudpickle/cloudpickle_test.py | 6 +- srsly/tests/test_json_api.py | 60 ++++++++++++++++- 4 files changed, 155 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index a171d8f..70cc285 100644 --- a/README.md +++ b/README.md @@ -136,11 +136,11 @@ data = {"foo": "bar", "baz": 123} srsly.write_json("/path/to/file.json", data) ``` -| Argument | Type | Description | -| ---------- | ------------ | ------------------------------------------------------ | -| `path` | str / `Path` | The file path or `"-"` to write to stdout. | -| `data` | - | The JSON-serializable data to output. | -| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. | +| Argument | Type | Description | +| -------- | ------------ | ------------------------------------------------------ | +| `path` | str / `Path` | The file path or `"-"` to write to stdout. | +| `data` | - | The JSON-serializable data to output. | +| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. | #### function `srsly.read_json` @@ -152,7 +152,7 @@ data = srsly.read_json("/path/to/file.json") | Argument | Type | Description | | ----------- | ------------ | ------------------------------------------ | -| `path` | str / `Path` | The file path or `"-"` to read from stdin. | +| `path` | str / `Path` | The file path or `"-"` to read from stdin. | | **RETURNS** | dict / list | The loaded JSON content. 
| #### function `srsly.write_gzip_json` @@ -164,11 +164,27 @@ data = {"foo": "bar", "baz": 123} srsly.write_gzip_json("/path/to/file.json.gz", data) ``` -| Argument | Type | Description | -| ---------- | ------------ | ------------------------------------------------------ | -| `path` | str / `Path` | The file path. | -| `data` | - | The JSON-serializable data to output. | -| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. | +| Argument | Type | Description | +| -------- | ------------ | ------------------------------------------------------ | +| `path` | str / `Path` | The file path. | +| `data` | - | The JSON-serializable data to output. | +| `indent` | int | Number of spaces used to indent JSON. Defaults to `2`. | + +#### function `srsly.write_gzip_jsonl` + +Create a gzipped JSONL file and dump contents. + +```python +data = [{"foo": "bar"}, {"baz": 123}] +srsly.write_gzip_jsonl("/path/to/file.jsonl.gz", data) +``` + +| Argument | Type | Description | +| ----------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | str / `Path` | The file path. | +| `lines` | - | The JSON-serializable contents of each line. | +| `append` | bool | Whether or not to append to the location. Appending to .gz files is generally not recommended, as it doesn't allow the algorithm to take advantage of all data when compressing - files may hence be poorly compressed. | +| `append_new_line` | bool | Whether or not to write a new line before appending to the file. | #### function `srsly.read_gzip_json` @@ -180,9 +196,22 @@ data = srsly.read_gzip_json("/path/to/file.json.gz") | Argument | Type | Description | | ----------- | ------------ | ------------------------ | -| `path` | str / `Path` | The file path. 
| | **RETURNS** | dict / list | The loaded JSON content. | +#### function `srsly.read_gzip_jsonl` + +Load gzipped JSONL from a file. + +```python +data = srsly.read_gzip_jsonl("/path/to/file.jsonl.gz") +``` + +| Argument | Type | Description | +| ----------- | ------------ | ------------------------- | +| `path` | str / `Path` | The file path. | +| **YIELDS** | - | The loaded JSON contents of each line. | + #### function `srsly.write_jsonl` Create a JSONL file (newline-delimited JSON) and dump contents line by line, or @@ -195,7 +224,7 @@ srsly.write_jsonl("/path/to/file.jsonl", data) | Argument | Type | Description | | ----------------- | ------------ | ---------------------------------------------------------------------------------------------------------------------- | -| `path` | str / `Path` | The file path or `"-"` to write to stdout. | +| `path` | str / `Path` | The file path or `"-"` to write to stdout. | | `lines` | iterable | The JSON-serializable lines. | | `append` | bool | Append to an existing file. Will open it in `"a"` mode and insert a newline before writing lines. Defaults to `False`. | | `append_new_line` | bool | Defines whether a new line should first be written when appending to an existing file. Defaults to `True`. | @@ -211,7 +240,7 @@ data = srsly.read_jsonl("/path/to/file.jsonl") | Argument | Type | Description | | ---------- | ---------- | -------------------------------------------------------------------- | -| `path` | str / Path | The file path or `"-"` to read from stdin. | +| `path` | str / Path | The file path or `"-"` to read from stdin. | | `skip` | bool | Skip broken lines and don't raise `ValueError`. Defaults to `False`. | | **YIELDS** | - | The loaded JSON contents of each line. | @@ -272,10 +301,10 @@ data = {"foo": "bar", "baz": 123} srsly.write_msgpack("/path/to/file.msg", data) ``` -| Argument | Type | Description | -| ---------- | ------------ | ---------------------- | -| `path` | str / `Path` | The file path. 
| -| `data` | - | The data to serialize. | +| Argument | Type | Description | +| -------- | ------------ | ---------------------- | +| `path` | str / `Path` | The file path. | +| `data` | - | The data to serialize. | #### function `srsly.read_msgpack` @@ -287,7 +316,7 @@ data = srsly.read_msgpack("/path/to/file.msg") | Argument | Type | Description | | ----------- | ------------ | --------------------------------------------------------------------------------------- | -| `path` | str / `Path` | The file path. | +| `path` | str / `Path` | The file path. | | `use_list` | bool | Don't use tuples instead of lists. Can make deserialization slower. Defaults to `True`. | | **RETURNS** | - | The loaded and deserialized content. | @@ -343,7 +372,7 @@ yaml_string = srsly.yaml_dumps(data) | ----------------- | ---- | ------------------------------------------ | | `data` | - | The JSON-serializable data to output. | | `indent_mapping` | int | Mapping indentation. Defaults to `2`. | -| `indent_sequence` | int | Sequence indentation. Defaults to `4`. | +| `indent_sequence` | int | Sequence indentation. Defaults to `4`. | | `indent_offset` | int | Indentation offset. Defaults to `2`. | | `sort_keys` | bool | Sort dictionary keys. Defaults to `False`. | | **RETURNS** | str | The serialized string. | @@ -373,10 +402,10 @@ srsly.write_yaml("/path/to/file.yml", data) | Argument | Type | Description | | ----------------- | ------------ | ------------------------------------------ | -| `path` | str / `Path` | The file path or `"-"` to write to stdout. | +| `path` | str / `Path` | The file path or `"-"` to write to stdout. | | `data` | - | The JSON-serializable data to output. | | `indent_mapping` | int | Mapping indentation. Defaults to `2`. | -| `indent_sequence` | int | Sequence indentation. Defaults to `4`. | +| `indent_sequence` | int | Sequence indentation. Defaults to `4`. | | `indent_offset` | int | Indentation offset. Defaults to `2`. 
| | `sort_keys` | bool | Sort dictionary keys. Defaults to `False`. | @@ -390,7 +419,7 @@ data = srsly.read_yaml("/path/to/file.yml") | Argument | Type | Description | | ----------- | ------------ | ------------------------------------------ | -| `path` | str / `Path` | The file path or `"-"` to read from stdin. | +| `path` | str / `Path` | The file path or `"-"` to read from stdin. | | **RETURNS** | dict / list | The loaded YAML content. | #### function `srsly.is_yaml_serializable` diff --git a/srsly/_json_api.py b/srsly/_json_api.py index 900e42b..24d25fd 100644 --- a/srsly/_json_api.py +++ b/srsly/_json_api.py @@ -1,4 +1,4 @@ -from typing import Union, Iterable, Sequence, Any, Optional +from typing import Union, Iterable, Sequence, Any, Optional, Iterator import sys import json as _builtin_json import gzip @@ -56,14 +56,27 @@ def read_json(path: FilePath) -> JSONOutput: def read_gzip_json(path: FilePath) -> JSONOutput: """Load JSON from a gzipped file. - location (FilePath): The file path. - RETURNS (JSONOutput): The loaded JSON content. + location (FilePath): The file path. + RETURNS (JSONOutput): The loaded JSON content. """ file_path = force_string(path) with gzip.open(file_path, "r") as f: return ujson.load(f) +def read_gzip_jsonl(path: FilePath, skip: bool = False) -> Iterator[JSONOutput]: + """Read a gzipped .jsonl file and yield contents line by line. + Blank lines will always be skipped. + + path (FilePath): The file path. + skip (bool): Skip broken lines and don't raise ValueError. + YIELDS (JSONOutput): The unpacked, deserialized Python objects. + """ + with gzip.open(force_path(path), "r") as f: + for line in _yield_json_lines(f, skip=skip): + yield line + + def write_json(path: FilePath, data: JSONInput, indent: int = 2) -> None: """Create a .json file and dump contents or write to standard output. 
@@ -94,6 +107,30 @@ def write_gzip_json(path: FilePath, data: JSONInput, indent: int = 2) -> None: f.write(json_data.encode("utf-8")) +def write_gzip_jsonl( + path: FilePath, + lines: Iterable[JSONInput], + append: bool = False, + append_new_line: bool = True, +) -> None: + """Create a .jsonl.gz file and dump contents. + + path (FilePath): The file path. + lines (Iterable[JSONInput]): The JSON-serializable contents of each line. + append (bool): Whether or not to append to the location. Appending to .gz files is generally not recommended, as it + doesn't allow the algorithm to take advantage of all data when compressing - files may hence be poorly + compressed. + append_new_line (bool): Whether or not to write a new line before appending + to the file. + """ + mode = "a" if append else "w" + file_path = force_path(path, require_exists=False) + with gzip.open(file_path, mode=mode) as f: + if append and append_new_line: + f.write("\n".encode("utf-8")) + f.writelines([(json_dumps(line) + "\n").encode("utf-8") for line in lines]) + + def read_jsonl(path: FilePath, skip: bool = False) -> Iterable[JSONOutput]: """Read a .jsonl file or standard input and yield contents line by line. Blank lines will always be skipped. 
diff --git a/srsly/tests/cloudpickle/cloudpickle_test.py b/srsly/tests/cloudpickle/cloudpickle_test.py index 1d33369..b293c53 100644 --- a/srsly/tests/cloudpickle/cloudpickle_test.py +++ b/srsly/tests/cloudpickle/cloudpickle_test.py @@ -872,8 +872,10 @@ def test_builtin_classicmethod(self): @pytest.mark.skipif( (platform.machine() == "aarch64" and sys.version_info[:2] >= (3, 10)) or platform.python_implementation() == "PyPy" - or (sys.version_info[:2] == (3, 10) and sys.version_info >= (3, 10, 8)), - reason="Fails on aarch64 + python 3.10+ in cibuildwheel, currently unable to replicate failure elsewhere; fails sometimes for pypy on conda-forge; fails for python 3.10.8") + or (sys.version_info[:2] == (3, 10) and sys.version_info >= (3, 10, 8)) + # Skipping tests on 3.11 due to https://github.com/cloudpipe/cloudpickle/pull/486. + or sys.version_info[:2] == (3, 11), + reason="Fails on aarch64 + python 3.10+ in cibuildwheel, currently unable to replicate failure elsewhere; fails sometimes for pypy on conda-forge; fails for python 3.10.8+ and 3.11") def test_builtin_classmethod(self): obj = 1.5 # float object diff --git a/srsly/tests/test_json_api.py b/srsly/tests/test_json_api.py index dc23952..89ce400 100644 --- a/srsly/tests/test_json_api.py +++ b/srsly/tests/test_json_api.py @@ -4,7 +4,14 @@ import gzip import numpy -from .._json_api import read_json, write_json, read_jsonl, write_jsonl +from .._json_api import ( + read_json, + write_json, + read_jsonl, + write_jsonl, + read_gzip_jsonl, + write_gzip_jsonl, +) from .._json_api import write_gzip_json, json_dumps, is_json_serializable from .._json_api import json_loads from ..util import force_string @@ -204,3 +211,54 @@ def test_unsupported_type_error(): f = numpy.float32() with pytest.raises(TypeError): s = json_dumps(f) + + +def test_write_jsonl_gzip(): + """Tests writing data to a gzipped .jsonl file.""" + data = [{"hello": "world"}, {"test": 123}] + expected = ['{"hello":"world"}\n', '{"test":123}\n'] + + with 
make_tempdir() as temp_dir: + file_path = temp_dir / "tmp.json" + write_gzip_jsonl(file_path, data) + with gzip.open(file_path, "r") as f: + assert [line.decode("utf8") for line in f.readlines()] == expected + + +def test_write_jsonl_gzip_append(): + """Tests appending data to a gzipped .jsonl file.""" + data = [{"hello": "world"}, {"test": 123}] + expected = [ + '{"hello":"world"}\n', + '{"test":123}\n', + "\n", + '{"hello":"world"}\n', + '{"test":123}\n', + ] + with make_tempdir() as temp_dir: + file_path = temp_dir / "tmp.json" + write_gzip_jsonl(file_path, data) + write_gzip_jsonl(file_path, data, append=True) + with gzip.open(file_path, "r") as f: + assert [line.decode("utf8") for line in f.readlines()] == expected + + +def test_read_jsonl_gzip(): + """Tests reading data from a gzipped .jsonl file.""" + file_contents = [{"hello": "world"}, {"test": 123}] + with make_tempdir() as temp_dir: + file_path = temp_dir / "tmp.json" + with gzip.open(file_path, "w") as f: + f.writelines( + [(json_dumps(line) + "\n").encode("utf-8") for line in file_contents] + ) + assert file_path.exists() + data = read_gzip_jsonl(file_path) + # Make sure this returns a generator, not just a list + assert not hasattr(data, "__len__") + data = list(data) + assert len(data) == 2 + assert len(data[0]) == 1 + assert len(data[1]) == 1 + assert data[0]["hello"] == "world" + assert data[1]["test"] == 123