Skip to content

Commit bde7764

Browse files
authored
Merge pull request #74 from daggaz/feature/72
#72 simplify and document support for reading streams with mixed data
2 parents dfb4b70 + e662019 commit bde7764

5 files changed

Lines changed: 180 additions & 10 deletions

File tree

README.md

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ Simple streaming JSON parser and encoder.
1111
When [reading](#reading) JSON data, `json-stream` can decode JSON data in
1212
a streaming manner, providing a pythonic dict/list-like interface, or a
1313
[visitor-based interface](#visitor). It can stream from files, [URLs](#urls)
14-
or [iterators](#iterators). It can process [multiple JSON documents](#multiple) in a single stream.
14+
or [iterators](#iterators). It can process [multiple JSON documents](#multiple)
15+
in a single stream, and can read JSON [mixed with other non-JSON data](#reading-mixed-data).
1516

1617
When [writing](#writing) JSON data, `json-stream` can stream JSON objects
1718
as you generate them.
@@ -495,13 +496,81 @@ significant parsing speedup compared to pure python implementation.
495496
`json-stream` will fallback to its pure python tokenizer implementation
496497
if `json-stream-rs-tokenizer` is not available.
497498

499+
#### <a id="reading-mixed-data"></a> Reading mixed data
500+
501+
When using the Rust tokenizer, you can also use `json-stream` to parse mixed
502+
data, for example a file containing a JSON document followed by binary data.
503+
504+
To do this, you should pass `correct_cursor=True` to `load()`. This ensures the
505+
Rust tokenizer keeps track of the exact stream position it has read up to. This
506+
comes with a **significant performance cost** for un-seekable streams.
507+
508+
After reading the JSON data, call `read_all()` on the top-level object returned
509+
by `load()` to ensure you have read up to the end of the JSON data, and then call
510+
`.tokenizer.park_cursor()` to "park" the underlying file cursor at the correct
511+
position.
512+
513+
```python
514+
import json_stream
515+
516+
with open('test.bin', 'rb') as f:
517+
# read JSON header
518+
header = json_stream.load(f, correct_cursor=True)
519+
# ... process JSON header ...
520+
header.read_all()
521+
522+
# ensure the tokenizer has "parked" the file
523+
# cursor at the end of the JSON data
524+
header.tokenizer.park_cursor()
525+
526+
# now we can read binary data from the same file
527+
binary_start = f.tell()
528+
data = f.read()
529+
```
530+
#### <a id="mixed-scenarios"></a> Other mixed data scenarios
531+
532+
`json-stream` can also handle streams that start with binary data, or have binary
533+
data between multiple JSON documents.
534+
535+
##### Binary then JSON
536+
537+
You can simply read the binary data from the file before calling `load()`.
538+
539+
```python
540+
with open('test.bin', 'rb') as f:
541+
binary_data = f.read(1024)
542+
data = json_stream.load(f)
543+
# ... process JSON ...
544+
```
545+
546+
##### JSON then binary then JSON
547+
548+
You must use `correct_cursor=True` for any JSON document that is followed by
549+
binary data.
550+
551+
```python
552+
with open('test.bin', 'rb') as f:
553+
# 1. Read first JSON
554+
data1 = json_stream.load(f, correct_cursor=True)
555+
# ... process data1 ...
556+
data1.read_all()
557+
data1.tokenizer.park_cursor()
558+
559+
# 2. Read binary data
560+
binary_data = f.read(1024)
561+
562+
# 3. Read second JSON
563+
data2 = json_stream.load(f)
564+
# ... process data2 ...
565+
```
566+
498567
### Custom tokenizer
499568

500569
You can supply an alternative JSON tokenizer implementation. Simply pass
501570
a tokenizer to the `load()` or `visit()` methods.
502571

503572
```python
504-
json_stream.load(f, tokenizer=some_tokenizer)
573+
json_stream.load(f, tokenizer=some_tokenizer, **tokenizer_kwargs)
505574
```
506575

507576
The requests methods also accept a custom tokenizer parameter.

src/json_stream/base.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ def __init__(self, token_stream):
3636
self._stream = token_stream
3737
self._child: Optional[StreamingJSONBase] = None
3838

39+
@property
40+
def tokenizer(self):
41+
return self._stream
42+
3943
def _clear_child(self):
4044
if self._child is not None:
4145
self._child.read_all()

src/json_stream/loader.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@
33
from json_stream.select_tokenizer import default_tokenizer
44

55

6-
def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer):
7-
return next(load_many(fp_or_iterable, persistent, tokenizer))
6+
def load(fp_or_iterable, persistent=False, tokenizer=default_tokenizer, **tokenizer_kwargs):
7+
return next(load_many(fp_or_iterable, persistent, tokenizer, **tokenizer_kwargs))
88

99

10-
def load_many(fp_or_iterable, persistent=False, tokenizer=default_tokenizer):
10+
def load_many(fp_or_iterable, persistent=False, tokenizer=default_tokenizer, **tokenizer_kwargs):
1111
fp = ensure_file(fp_or_iterable)
12-
token_stream = tokenizer(fp)
12+
token_stream = tokenizer(fp, **tokenizer_kwargs)
1313
for token_type, token in token_stream:
1414
if token_type == TokenType.OPERATOR:
1515
data = StreamingJSONBase.factory(token, token_stream, persistent)
Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import io
2+
import json
3+
from unittest import TestCase, skipUnless
4+
import json_stream
5+
6+
try:
7+
import json_stream_rs_tokenizer
8+
HAS_RS_TOKENIZER = hasattr(json_stream_rs_tokenizer, 'RustTokenizer')
9+
except ImportError:
10+
HAS_RS_TOKENIZER = False
11+
12+
class TestBinaryResumption(TestCase):
13+
@skipUnless(HAS_RS_TOKENIZER, 'Rust tokenizer not available')
14+
def test_json_then_binary(self):
15+
json_header = json.dumps({"header": "info"})
16+
binary_data = b'\x00\x01\x02\x03'
17+
test_data = json_header.encode('utf-8') + binary_data
18+
test_file = io.BytesIO(test_data)
19+
20+
# Load with correct_cursor=True
21+
header = json_stream.load(test_file, correct_cursor=True)
22+
23+
# Consume all data from header
24+
header.read_all()
25+
26+
# Signal that we are done with JSON and want to resume binary read
27+
header.tokenizer.park_cursor()
28+
29+
# Verify file cursor position
30+
self.assertEqual(test_file.tell(), len(json_header))
31+
32+
# Verify binary data
33+
remaining = test_file.read()
34+
self.assertEqual(remaining, binary_data)
35+
36+
@skipUnless(HAS_RS_TOKENIZER, 'Rust tokenizer not available')
37+
def test_binary_then_json(self):
38+
binary_data = b'binary_start'
39+
json_data = b'{"a": 1}'
40+
test_data = binary_data + json_data
41+
test_file = io.BytesIO(test_data)
42+
43+
# Read binary
44+
read_binary = test_file.read(len(binary_data))
45+
self.assertEqual(read_binary, binary_data)
46+
47+
# Load JSON
48+
data = json_stream.load(test_file)
49+
self.assertEqual(dict(data.items()), {"a": 1})
50+
51+
@skipUnless(HAS_RS_TOKENIZER, 'Rust tokenizer not available')
52+
def test_json_then_binary_then_json(self):
53+
json_1 = b'{"first": true}'
54+
binary_middle = b'middle_binary'
55+
json_2 = b'{"second": false}'
56+
test_data = json_1 + binary_middle + json_2
57+
test_file = io.BytesIO(test_data)
58+
59+
# Load first JSON
60+
data1 = json_stream.load(test_file, correct_cursor=True)
61+
self.assertEqual(dict(data1.items()), {"first": True})
62+
data1.read_all()
63+
data1.tokenizer.park_cursor()
64+
self.assertEqual(test_file.tell(), len(json_1))
65+
66+
# Read middle binary
67+
read_middle = test_file.read(len(binary_middle))
68+
self.assertEqual(read_middle, binary_middle)
69+
70+
# Load second JSON
71+
data2 = json_stream.load(test_file)
72+
self.assertEqual(dict(data2.items()), {"second": False})
73+
74+
@skipUnless(HAS_RS_TOKENIZER, 'Rust tokenizer not available')
75+
def test_load_many_then_binary(self):
76+
json_1 = '{"a": 1}'
77+
json_2 = '{"b": 2}'
78+
binary_data = b'binary'
79+
test_data = json_1.encode('utf-8') + json_2.encode('utf-8') + binary_data
80+
81+
test_file = io.BytesIO(test_data)
82+
83+
loader = json_stream.load_many(test_file, correct_cursor=True)
84+
85+
# Read first JSON
86+
doc1 = next(loader)
87+
doc1.read_all()
88+
89+
# Read second JSON
90+
doc2 = next(loader)
91+
doc2.read_all()
92+
93+
# Now park cursor
94+
doc2.tokenizer.park_cursor()
95+
96+
self.assertEqual(test_file.tell(), len(json_1) + len(json_2))
97+
self.assertEqual(test_file.read(), binary_data)

src/json_stream/visitor.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@ def _visit(obj, visitor, path):
2020
visitor(obj, path)
2121

2222

23-
def visit_many(fp_or_iterator, visitor, tokenizer=default_tokenizer):
23+
def visit_many(fp_or_iterator, visitor, tokenizer=default_tokenizer, **tokenizer_kwargs):
2424
fp = ensure_file(fp_or_iterator)
25-
token_stream = tokenizer(fp)
25+
token_stream = tokenizer(fp, **tokenizer_kwargs)
2626
for token_type, token in token_stream:
2727
if token_type == TokenType.OPERATOR:
2828
obj = StreamingJSONBase.factory(token, token_stream, persistent=False)
@@ -33,5 +33,5 @@ def visit_many(fp_or_iterator, visitor, tokenizer=default_tokenizer):
3333
yield
3434

3535

36-
def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer):
37-
next(visit_many(fp_or_iterator, visitor, tokenizer))
36+
def visit(fp_or_iterator, visitor, tokenizer=default_tokenizer, **tokenizer_kwargs):
37+
next(visit_many(fp_or_iterator, visitor, tokenizer, **tokenizer_kwargs))

0 commit comments

Comments
 (0)