Skip to content

Commit

Permalink
feat: python native JSON format parser (#53)
Browse files Browse the repository at this point in the history
Add `substrait.json.load_json` and `substrait.json.parse_json` functions
able to load
the JSON representation of a Substrait plan to a `substrait.proto.Plan`
object.

It also adds the `substrait-cpp` repository as a git submodule to reuse
the test files.
This is reasonable because we might end up using the cpp library in the
future
to create bindings to other features too, so it's helpful to already
have it as a submodule.

---------

Co-authored-by: Matthijs Brobbel <m1brobbel@gmail.com>
  • Loading branch information
amol- and mbrobbel authored Apr 17, 2024
1 parent 528cad8 commit 783e68a
Show file tree
Hide file tree
Showing 10 changed files with 189 additions and 4 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ jobs:
python -m pip install ".[test]"
- name: Run tests
run: |
python -m pytest
python -m pytest tests
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[submodule "third_party/substrait"]
path = third_party/substrait
url = https://github.com/substrait-io/substrait
[submodule "third_party/substrait-cpp"]
path = third_party/substrait-cpp
url = https://github.com/substrait-io/substrait-cpp
4 changes: 2 additions & 2 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,14 @@ git submodule update --init --recursive
```


# Upgrade the substrait submodule
# Upgrade the substrait protocol definition

## a) Use the upgrade script

Run the upgrade script to upgrade the submodule and regenerate the protobuf stubs.

```
./upgrade.sh <version>
./update_proto.sh <version>
```

## b) Manual upgrade
Expand Down
80 changes: 79 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,84 @@ relations {
}
```

## Load a Substrait Plan from JSON
A Substrait plan can be loaded from its JSON representation
using the ``substrait.json.load_json`` (from a file) and
``substrait.json.parse_json`` (from a string) functions:

```
>>> import substrait.json
>>> jsontext = """{
... "relations":[
... {
... "root":{
... "input":{
... "read":{
... "baseSchema":{
... "names":[
... "first_name",
... "surname"
... ],
... "struct":{
... "types":[
... {
... "string":{
... "nullability":"NULLABILITY_REQUIRED"
... }
... },
... {
... "string":{
... "nullability":"NULLABILITY_REQUIRED"
... }
... }
... ]
... }
... },
... "namedTable":{
... "names":[
... "people"
... ]
... }
... }
... },
... "names":[
... "first_name"
... ]
... }
... }
... ]
... }"""
>>> substrait.json.parse_json(jsontext)
relations {
root {
input {
read {
base_schema {
names: "first_name"
names: "surname"
struct {
types {
string {
nullability: NULLABILITY_REQUIRED
}
}
types {
string {
nullability: NULLABILITY_REQUIRED
}
}
}
}
named_table {
names: "people"
}
}
}
names: "first_name"
}
}
```

## Produce a Substrait Plan with Ibis
Let's use an existing Substrait producer, [Ibis](https://ibis-project.org),
to provide an example using Python Substrait as the consumer.
Expand Down Expand Up @@ -280,4 +358,4 @@ version {
minor_number: 24
producer: "ibis-substrait"
}
```
```
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ test = ["pytest >= 7.0.0"]

[tool.pytest.ini_options]
pythonpath = "src"
testpaths = "tests"

[build-system]
requires = ["setuptools>=61.0.0", "setuptools_scm[toml]>=6.2.0"]
Expand Down
25 changes: 25 additions & 0 deletions src/substrait/json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from google.protobuf import json_format

from substrait.proto import Plan


def load_json(filename):
    """Read a JSON file and return the Substrait Plan it describes.

    The file is decoded as UTF-8 and its whole content is handed
    to ``parse_json``.
    """
    with open(filename, encoding="utf-8") as json_file:
        json_text = json_file.read()
    return parse_json(json_text)


def parse_json(text):
    """Build a Substrait Plan from the JSON document in ``text``."""
    # json_format.Parse fills the message it is given and returns
    # that same message.
    empty_plan = Plan()
    return json_format.Parse(text=text, message=empty_plan)


def write_json(plan, filename):
    """Write a Substrait Plan to a json file.

    The plan is serialized with ``dump_json`` and stored UTF-8 encoded,
    which is the encoding ``load_json`` uses when reading a file back.

    :param plan: the Substrait Plan to serialize.
    :param filename: path of the file to create or overwrite.
    """
    # Serialize before opening the file: opening for writing truncates
    # the target, so a serialization failure must happen first to avoid
    # leaving an empty file behind.
    json_text = dump_json(plan)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(json_text)


def dump_json(plan):
    """Return the JSON text representation of a Substrait Plan."""
    json_text = json_format.MessageToJson(plan)
    return json_text
72 changes: 72 additions & 0 deletions tests/test_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
import pathlib
import tempfile
import json

from substrait.proto import Plan
from substrait.json import load_json, parse_json, dump_json, write_json

import pytest


# Directory holding the JSON plans reused from the substrait-cpp submodule,
# expressed relative to the directory that contains this test module.
JSON_FIXTURES = pathlib.Path(os.path.dirname(__file__)).joinpath(
    "..",
    "third_party",
    "substrait-cpp",
    "src",
    "substrait",
    "textplan",
    "data",
)
# Sorted so the parametrized test cases are generated in a stable order.
JSON_TEST_FILE = sorted(JSON_FIXTURES.glob("*.json"))
# Bare file names, used as the ids of the parametrized test cases.
JSON_TEST_FILENAMES = [fixture.name for fixture in JSON_TEST_FILE]


@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
def test_json_load(jsonfile):
    """Loading a plan from a file must match parsing the same JSON text."""
    # Read with an explicit encoding so the result does not depend on
    # the locale of the machine running the tests.
    with open(jsonfile, encoding="utf-8") as f:
        jsondata = _strip_json_comments(f)
    parsed_plan = parse_json(jsondata)

    # Save to a temporary file so we can test load_json
    # on content stripped of comments.
    with tempfile.TemporaryDirectory() as tmpdir:
        # We use a TemporaryDirectory as on Windows NamedTemporaryFile
        # doesn't allow for easy reopening of the file.
        stripped_path = pathlib.Path(tmpdir) / "jsonfile.json"
        # load_json decodes the file as UTF-8, so it has to be written
        # as UTF-8 too. write_text closes the file before returning,
        # so the content is fully on disk when load_json reads it.
        stripped_path.write_text(jsondata, encoding="utf-8")
        loaded_plan = load_json(stripped_path)

    # The Plan constructor itself will throw an exception
    # in case there is anything wrong in parsing the JSON
    # so we can take for granted that if the plan was created
    # it is a valid plan in terms of protobuf definition.
    assert isinstance(loaded_plan, Plan)

    # Ensure that when loading from file or from string
    # the outcome is the same
    assert parsed_plan == loaded_plan


@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
def test_json_roundtrip(jsonfile):
    """Dumping a plan and parsing it back must yield an equal plan."""
    # Read with an explicit encoding so the result does not depend on
    # the locale of the machine running the tests.
    with open(jsonfile, encoding="utf-8") as f:
        jsondata = _strip_json_comments(f)

    parsed_plan = parse_json(jsondata)

    # Test the in-memory string roundtrip
    assert parse_json(dump_json(parsed_plan)) == parsed_plan

    # Test with write/load
    with tempfile.TemporaryDirectory() as tmpdir:
        filename = pathlib.Path(tmpdir) / "jsonfile.json"
        write_json(parsed_plan, filename)
        assert load_json(filename) == parsed_plan


def _strip_json_comments(jsonfile):
    """Return the content of ``jsonfile`` without its ``#`` comment lines.

    :param jsonfile: an open text file (or any iterable of text lines).
    :return: the remaining lines concatenated into a single string.
    """
    # The JSON files in the cpp testsuite are prefixed with
    # a comment containing the SQL that matches the json plan.
    # As Python JSON parser doesn't support comments,
    # we have to strip them to make the content readable.
    #
    # Lines yielded by a file keep their own line terminator, so they
    # are concatenated as-is: joining them with "\n" would double
    # every line break.
    return "".join(line for line in jsonfile if not line.startswith("#"))
1 change: 1 addition & 0 deletions third_party/substrait-cpp
Submodule substrait-cpp added at cc8d08
5 changes: 5 additions & 0 deletions update_cpp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
#
# Update the third_party/substrait-cpp git submodule to the latest
# commit available on its remote.

# Stop at the first failing command, unset variable or failing pipeline stage.
set -euo pipefail

# The submodule path below is relative to the directory containing this
# script, so move there first to allow running the script from anywhere.
cd "$(dirname "${BASH_SOURCE[0]}")"

echo "Updating substrait-cpp submodule..."
git submodule update --remote third_party/substrait-cpp

File renamed without changes.

0 comments on commit 783e68a

Please sign in to comment.