Skip to content

Commit 783e68a

Browse files
amol- and mbrobbel authored
feat: python native JSON format parser (#53)
Add `substrait.json.load_json` and `substrait.json.parse_json` functions able to load the JSON representation of a Substrait plan to a `substrait.proto.Plan` object. It also adds the `substrait-cpp` repository as a git submodule to reuse the test files. This is reasonable because we might end up using the cpp library in the future to create bindings to other features too, so it's helpful to already have it as a submodule. --------- Co-authored-by: Matthijs Brobbel <m1brobbel@gmail.com>
1 parent 528cad8 commit 783e68a

10 files changed

Lines changed: 189 additions & 4 deletions

File tree

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@ jobs:
3232
python -m pip install ".[test]"
3333
- name: Run tests
3434
run: |
35-
python -m pytest
35+
python -m pytest tests

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
[submodule "third_party/substrait"]
22
path = third_party/substrait
33
url = https://github.com/substrait-io/substrait
4+
[submodule "third_party/substrait-cpp"]
5+
path = third_party/substrait-cpp
6+
url = https://github.com/substrait-io/substrait-cpp

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,14 @@ git submodule update --init --recursive
2121
```
2222

2323

24-
# Upgrade the substrait submodule
24+
# Upgrade the substrait protocol definition
2525

2626
## a) Use the upgrade script
2727

2828
Run the upgrade script to upgrade the submodule and regenerate the protobuf stubs.
2929

3030
```
31-
./upgrade.sh <version>
31+
./update_proto.sh <version>
3232
```
3333

3434
## b) Manual upgrade

README.md

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,84 @@ relations {
135135
}
136136
```
137137

138+
## Load a Substrait Plan from JSON
139+
A Substrait plan can be loaded from its JSON representation
140+
using the ``substrait.json.load_json`` and ``substrait.json.parse_json``
141+
functions:
142+
143+
```
144+
>>> import substrait.json
145+
>>> jsontext = """{
146+
... "relations":[
147+
... {
148+
... "root":{
149+
... "input":{
150+
... "read":{
151+
... "baseSchema":{
152+
... "names":[
153+
... "first_name",
154+
... "surname"
155+
... ],
156+
... "struct":{
157+
... "types":[
158+
... {
159+
... "string":{
160+
... "nullability":"NULLABILITY_REQUIRED"
161+
... }
162+
... },
163+
... {
164+
... "string":{
165+
... "nullability":"NULLABILITY_REQUIRED"
166+
... }
167+
... }
168+
... ]
169+
... }
170+
... },
171+
... "namedTable":{
172+
... "names":[
173+
... "people"
174+
... ]
175+
... }
176+
... }
177+
... },
178+
... "names":[
179+
... "first_name"
180+
... ]
181+
... }
182+
... }
183+
... ]
184+
... }"""
185+
>>> substrait.json.parse_json(jsontext)
186+
relations {
187+
root {
188+
input {
189+
read {
190+
base_schema {
191+
names: "first_name"
192+
names: "surname"
193+
struct {
194+
types {
195+
string {
196+
nullability: NULLABILITY_REQUIRED
197+
}
198+
}
199+
types {
200+
string {
201+
nullability: NULLABILITY_REQUIRED
202+
}
203+
}
204+
}
205+
}
206+
named_table {
207+
names: "people"
208+
}
209+
}
210+
}
211+
names: "first_name"
212+
}
213+
}
214+
```
215+
138216
## Produce a Substrait Plan with Ibis
139217
Let's use an existing Substrait producer, [Ibis](https://ibis-project.org),
140218
to provide an example using Python Substrait as the consumer.
@@ -280,4 +358,4 @@ version {
280358
minor_number: 24
281359
producer: "ibis-substrait"
282360
}
283-
```
361+
```

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ test = ["pytest >= 7.0.0"]
1717

1818
[tool.pytest.ini_options]
1919
pythonpath = "src"
20+
testpaths = "tests"
2021

2122
[build-system]
2223
requires = ["setuptools>=61.0.0", "setuptools_scm[toml]>=6.2.0"]

src/substrait/json.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from google.protobuf import json_format
2+
3+
from substrait.proto import Plan
4+
5+
6+
def load_json(filename):
    """Read a JSON file and return the Substrait Plan it describes.

    The file at ``filename`` is decoded as UTF-8 and its whole content
    is handed to ``parse_json``.
    """
    with open(filename, encoding="utf-8") as source:
        plan = parse_json(source.read())
    return plan
10+
11+
12+
def parse_json(text):
    """Build a Substrait Plan from the JSON document held in ``text``."""
    # json_format.Parse fills the message it is given and returns that
    # same message, so a fresh Plan is created for every call.
    plan = Plan()
    return json_format.Parse(text, plan)
15+
16+
17+
def write_json(plan, filename):
    """Write a Substrait Plan to a JSON file.

    Args:
        plan: The Substrait Plan to serialize.
        filename: Path of the file to create or overwrite.
    """
    # Write with the same explicit encoding load_json reads with, so a
    # file produced here can always be read back regardless of the
    # platform's default encoding. Plain "w" is enough: the file is
    # never read through this handle.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(dump_json(plan))
21+
22+
23+
def dump_json(plan):
    """Serialize a Substrait Plan and return it as a JSON string."""
    serialized = json_format.MessageToJson(plan)
    return serialized

tests/test_json.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
import os
2+
import pathlib
3+
import tempfile
4+
import json
5+
6+
from substrait.proto import Plan
7+
from substrait.json import load_json, parse_json, dump_json, write_json
8+
9+
import pytest
10+
11+
12+
# Directory of the JSON plans shipped with the substrait-cpp submodule,
# expressed relative to the directory containing this test module.
JSON_FIXTURES = pathlib.Path(os.path.dirname(__file__)).joinpath(
    "..",
    "third_party",
    "substrait-cpp",
    "src",
    "substrait",
    "textplan",
    "data",
)
# Every *.json fixture, sorted so the parametrized test order is stable.
JSON_TEST_FILE = sorted(JSON_FIXTURES.glob("*.json"))
# Bare file names, used as readable ids for the parametrized tests.
JSON_TEST_FILENAMES = [fixture.name for fixture in JSON_TEST_FILE]
24+
25+
26+
@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
def test_json_load(jsonfile):
    """Check that load_json and parse_json agree on the same JSON content."""
    # Read the fixture with the same explicit encoding load_json uses,
    # so the test does not depend on the platform's default encoding.
    with open(jsonfile, encoding="utf-8") as f:
        jsondata = _strip_json_comments(f)
    parsed_plan = parse_json(jsondata)

    # Save to a temporary file so we can test load_json
    # on content stripped of comments.
    with tempfile.TemporaryDirectory() as tmpdir:
        # We use a TemporaryDirectory as on Windows NamedTemporaryFile
        # doesn't allow for easy reopening of the file.
        stripped_path = pathlib.Path(tmpdir) / "jsonfile.json"
        with open(stripped_path, "w", encoding="utf-8") as stripped_file:
            stripped_file.write(jsondata)
        # Load only after the handle is closed, so the content is
        # guaranteed to be flushed to disk.
        loaded_plan = load_json(stripped_path)

    # Parsing is expected to raise if the JSON does not match the
    # protobuf definition, so getting a Plan back means the content
    # was accepted as a valid plan.
    assert isinstance(loaded_plan, Plan)

    # Ensure that when loading from file or from string
    # the outcome is the same
    assert parsed_plan == loaded_plan
50+
51+
52+
@pytest.mark.parametrize("jsonfile", JSON_TEST_FILE, ids=JSON_TEST_FILENAMES)
def test_json_roundtrip(jsonfile):
    """Check that a plan survives dump/parse and write/load round trips."""
    # Read the fixture with the same explicit encoding load_json uses,
    # so the test does not depend on the platform's default encoding.
    with open(jsonfile, encoding="utf-8") as f:
        jsondata = _strip_json_comments(f)

    parsed_plan = parse_json(jsondata)

    # In-memory round trip: Plan -> JSON string -> Plan.
    assert parse_json(dump_json(parsed_plan)) == parsed_plan

    # Test with write/load
    with tempfile.TemporaryDirectory() as tmpdir:
        filename = pathlib.Path(tmpdir) / "jsonfile.json"
        write_json(parsed_plan, filename)
        assert load_json(filename) == parsed_plan
65+
66+
67+
def _strip_json_comments(jsonfile):
68+
# The JSON files in the cpp testsuite are prefixed with
69+
# a comment containing the SQL that matches the json plan.
70+
# As Python JSON parser doesn't support comments,
71+
# we have to strip them to make the content readable
72+
return "\n".join(l for l in jsonfile.readlines() if l[0] != "#")

third_party/substrait-cpp

Submodule substrait-cpp added at cc8d08a

update_cpp.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/bash
# Update the third_party/substrait-cpp git submodule from its remote.

# Abort on the first failing command, on unset variables and on pipe failures.
set -euo pipefail

# The submodule path below is relative to the directory holding this script,
# so switch there first; this lets the script be run from any directory.
cd "$(dirname "${BASH_SOURCE[0]}")"

echo "Updating substrait-cpp submodule..."
git submodule update --remote third_party/substrait-cpp
5+
File renamed without changes.

0 commit comments

Comments
 (0)