Skip to content

Commit 188a631

Browse files
fix(wire): tighten encoder/decoder against protocol drift and corruption
- ISSUE-59: FilesResponse.encode_body validates per-file content is 8-byte aligned. Upstream C (gateway.c::dumpFile) asserts ``len % 8 == 0`` because SQLite pages are always word-aligned and per-file entries are written back-to-back without padding. Python- produced mock frames with non-aligned content would decode under our permissive decoder but fail under a real C peer. Raise EncodeError at construction rather than silently producing malformed bytes. - ISSUE-60: encode_value for BOOLEAN rejects arbitrary ints. Previous ``1 if value else 0`` coercion silently mapped ``5`` or ``-1`` to True, making the round-trip lossy (encode(5, BOOLEAN) → wire 1 → decode → True). Now only ``bool`` or exact 0/1 ints are accepted. - ISSUE-61: RowsResponse.__post_init__ defensively deep-copies row_types and rows (outer + inner lists). Prior fix copied only column_names and column_types. User-supplied row_types / rows remained aliased: mutating the caller's list would silently mutate the message's internals. - ISSUE-63: decode_row_header validates the full 8-byte marker, not just the first byte. Upstream C uses the full uint64 sentinel (0xff..ff for DONE, 0xee..ee for PART); Go's permissive "first byte wins" is tolerant but lets torn/corrupt frames silently truncate results. We reject non-uniform markers. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 07002ec commit 188a631

6 files changed

Lines changed: 155 additions & 55 deletions

File tree

src/dqlitewire/messages/responses.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -246,8 +246,8 @@ class RowsResponse(Message):
246246
has_more: bool = False
247247

248248
def __post_init__(self) -> None:
249-
# Defensive copies (issue 042). Two sources of aliasing
250-
# motivate this:
249+
# Defensive copies (issue 042, ISSUE-61). Two sources of
250+
# aliasing motivate this:
251251
#
252252
# 1. ``decode_body`` stores ``column_types = types`` where
253253
# ``types`` is also stored as ``all_row_types[0]``, so
@@ -257,11 +257,14 @@ def __post_init__(self) -> None:
257257
# 2. User code constructing ``RowsResponse`` directly with a
258258
# list they intend to keep mutating elsewhere.
259259
#
260-
# Copying here catches both sites uniformly and survives
261-
# future construction sites. The cost is two list allocations
262-
# per response — negligible compared to the row payload.
260+
# Copy all list-valued fields uniformly. ``row_types`` is a
261+
# list-of-lists so it needs both outer and inner copies; the
262+
# same for ``rows``. Cost is O(n) on the row dimension —
263+
# dominated by the row payload itself, so negligible.
263264
self.column_names = list(self.column_names)
264265
self.column_types = list(self.column_types)
266+
self.row_types = [list(t) for t in self.row_types]
267+
self.rows = [list(r) for r in self.rows]
265268

266269
def _get_row_types(self, row_idx: int, row: list[Any]) -> list[ValueType]:
267270
"""Get types for a row: from row_types, column_types, or inferred.
@@ -467,12 +470,21 @@ class FilesResponse(Message):
467470
def encode_body(self) -> bytes:
468471
result = encode_uint64(len(self.files))
469472
for name, content in self.files.items():
473+
# The upstream C server (gateway.c::dumpFile) asserts
474+
# ``len % 8 == 0`` for every file's content, because per-file
475+
# entries are written back-to-back with no explicit padding
476+
# and SQLite pages are always 8-byte aligned multiples of
477+
# 512. Validate here so a Python-encoded mock-server frame
478+
# cannot diverge from what a real C peer produces (ISSUE-59).
479+
if len(content) % 8 != 0:
480+
raise EncodeError(
481+
f"FilesResponse content for {name!r} must be 8-byte aligned "
482+
f"(got {len(content)} bytes); dqlite file entries carry no "
483+
"per-file padding"
484+
)
470485
result += encode_text(name)
471486
result += encode_uint64(len(content))
472487
result += content
473-
# No padding after content — matches Go's byte-by-byte read.
474-
# The C server only produces word-aligned content (SQLite pages
475-
# are multiples of 512), so padding is never needed in practice.
476488
return result
477489

478490
@classmethod

src/dqlitewire/tuples.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
# Valid ValueType codes as integers, for fast membership testing in hot paths.
1818
_VALID_TYPE_CODES = frozenset(int(v) for v in ValueType)
1919

20+
# Full 8-byte sentinels matching DQLITE_RESPONSE_ROWS_DONE/PART. Used to
21+
# reject torn/corrupt row markers instead of accepting any 8 bytes that
22+
# happen to start with 0xff/0xee (ISSUE-63).
23+
_ROW_DONE_MARKER = bytes([ROW_DONE_BYTE]) * 8
24+
_ROW_PART_MARKER = bytes([ROW_PART_BYTE]) * 8
25+
2026
# Defense-in-depth cap on parameter count, matching the pattern used by
2127
# _MAX_COLUMN_COUNT, _MAX_FILE_COUNT, and _MAX_NODE_COUNT in responses.py.
2228
# SQLite's default limit is 999 (compile-time max 32766).
@@ -221,16 +227,22 @@ def decode_row_header(
221227
before validating header size, matching the Go reference implementation.
222228
Returns (types_or_marker, bytes_consumed).
223229
"""
224-
# Check for markers first — markers are always exactly one 8-byte word,
225-
# regardless of column count. Must check before header size validation
226-
# because for large column counts the header would be >8 bytes.
227-
# Go checks the first byte (0xFF -> DONE, 0xEE -> PART); we match that
228-
# behavior so non-uniform markers are also detected.
230+
# Check for markers first — markers are always exactly one 8-byte word
231+
# of a repeated sentinel byte, regardless of column count. Must check
232+
# before header size validation because for large column counts the
233+
# header would be >8 bytes.
234+
#
235+
# Upstream C uses the full uint64 sentinel (DQLITE_RESPONSE_ROWS_DONE
236+
# = 0xff..ff, _PART = 0xee..ee). Go's reference client checks only the
237+
# first byte; we validate all 8 bytes so torn/corrupt markers like
238+
# ``0xff 0x00..`` are rejected as malformed rather than silently
239+
# truncating the result stream (ISSUE-63). This is strictly tighter
240+
# than the Go behavior.
229241
if len(data) >= 8:
230-
first_byte = data[0]
231-
if first_byte == ROW_DONE_BYTE:
242+
marker = bytes(data[:8]) if isinstance(data, memoryview) else data[:8]
243+
if marker == _ROW_DONE_MARKER:
232244
return RowMarker.DONE, 8
233-
if first_byte == ROW_PART_BYTE:
245+
if marker == _ROW_PART_MARKER:
234246
return RowMarker.PART, 8
235247

236248
# Calculate bytes needed: 2 types per byte, rounded up

src/dqlitewire/types.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -242,9 +242,19 @@ def encode_value(value: Any, value_type: ValueType | None = None) -> tuple[bytes
242242
)
243243

244244
if value_type == ValueType.BOOLEAN:
245-
if not isinstance(value, (bool, int)):
246-
raise EncodeError(f"Expected bool or int for BOOLEAN, got {type(value).__name__}")
247-
return encode_uint64(1 if value else 0), value_type
245+
# Accept bool directly; allow the exact ints 0 and 1 as a
246+
# pragmatic escape for callers working with raw column values.
247+
# Reject arbitrary ints — the previous ``1 if value else 0``
248+
# coercion silently mapped values like ``5`` or ``-1`` to True,
249+
# which round-trips as the bool True and loses the caller's
250+
# original value (ISSUE-60).
251+
if isinstance(value, bool):
252+
return encode_uint64(1 if value else 0), value_type
253+
if isinstance(value, int) and value in (0, 1):
254+
return encode_uint64(value), value_type
255+
raise EncodeError(
256+
f"BOOLEAN requires bool (or exactly 0/1), got {type(value).__name__}={value!r}"
257+
)
248258
elif value_type in (ValueType.INTEGER, ValueType.UNIXTIME):
249259
# Note: UNIXTIME is a server-to-client-only type (the C server's
250260
# tuple_decoder has no inbound case for DQLITE_UNIXTIME). Explicit

tests/test_messages_responses.py

Lines changed: 55 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,35 @@ class TestRowsResponseAliasing:
184184
The fix copies both lists on construction via ``__post_init__``.
185185
"""
186186

187+
def test_constructor_copies_row_types_and_rows(self) -> None:
188+
"""Caller-supplied lists must be independent from the message
189+
after construction (ISSUE-61). Applies uniformly to column_names,
190+
column_types, row_types (outer + inner), and rows (outer + inner).
191+
"""
192+
supplied_row_types = [[ValueType.INTEGER, ValueType.TEXT]]
193+
supplied_rows = [[1, "alice"]]
194+
msg = RowsResponse(
195+
column_names=["id", "name"],
196+
column_types=[ValueType.INTEGER, ValueType.TEXT],
197+
row_types=supplied_row_types,
198+
rows=supplied_rows,
199+
has_more=False,
200+
)
201+
202+
# Outer-list copies: mutating the supplied list does not affect
203+
# the message.
204+
supplied_row_types.append([ValueType.NULL])
205+
supplied_rows.append([99, "mallory"])
206+
assert len(msg.row_types) == 1
207+
assert len(msg.rows) == 1
208+
209+
# Inner-list copies: mutating the supplied inner list does not
210+
# affect the message's inner copies.
211+
supplied_row_types[0][0] = ValueType.NULL
212+
supplied_rows[0][0] = 999
213+
assert msg.row_types[0][0] == ValueType.INTEGER
214+
assert msg.rows[0][0] == 1
215+
187216
def test_column_types_is_not_aliased_to_row_types_first(self) -> None:
188217
"""After decoding, ``column_types`` must be a distinct list
189218
object from ``row_types[0]`` so mutation of one does not
@@ -897,7 +926,7 @@ def test_wire_format_starts_with_count(self) -> None:
897926
"""Body must start with uint64 file count per Go wire protocol."""
898927
from dqlitewire.types import decode_uint64
899928

900-
msg = FilesResponse(files={"test.db": b"data"})
929+
msg = FilesResponse(files={"test.db": b"datadata"}) # 8 bytes (ISSUE-59)
901930
body = msg.encode_body()
902931
count = decode_uint64(body[:8])
903932
assert count == 1
@@ -917,17 +946,18 @@ def test_wire_format_has_size_field(self) -> None:
917946
assert size == len(content)
918947

919948
def test_roundtrip(self) -> None:
920-
msg = FilesResponse(files={"db.sqlite": b"database content", "wal": b"wal data"})
949+
# 16 and 8 bytes — word-aligned per upstream C's dumpFile assert.
950+
msg = FilesResponse(files={"db.sqlite": b"databasecontent!", "wal": b"wal data"})
921951
encoded = msg.encode()
922952
decoded = FilesResponse.decode_body(encoded[HEADER_SIZE:])
923-
assert decoded.files["db.sqlite"] == b"database content"
953+
assert decoded.files["db.sqlite"] == b"databasecontent!"
924954
assert decoded.files["wal"] == b"wal data"
925955

926956
def test_roundtrip_single_file(self) -> None:
927-
msg = FilesResponse(files={"main.db": b"\x00\x01\x02\x03"})
957+
msg = FilesResponse(files={"main.db": b"\x00\x01\x02\x03\x04\x05\x06\x07"})
928958
encoded = msg.encode()
929959
decoded = FilesResponse.decode_body(encoded[HEADER_SIZE:])
930-
assert decoded.files["main.db"] == b"\x00\x01\x02\x03"
960+
assert decoded.files["main.db"] == b"\x00\x01\x02\x03\x04\x05\x06\x07"
931961

932962
def test_roundtrip_aligned_content(self) -> None:
933963
"""Real dqlite content is always word-aligned (SQLite pages are multiples of 512)."""
@@ -943,23 +973,22 @@ def test_roundtrip_aligned_content(self) -> None:
943973
assert decoded.files["main.db"] == page
944974
assert decoded.files["wal.db"] == page + page
945975

946-
def test_roundtrip_non_aligned_content(self) -> None:
947-
"""Non-aligned content roundtrips correctly without padding.
976+
def test_encode_rejects_non_aligned_content(self) -> None:
977+
"""Content whose length is not a multiple of 8 is rejected at encode.
948978
949-
The C server asserts content is always word-aligned. Neither Go nor
950-
Python adds padding after file content. This test verifies that
951-
non-aligned content still works for encode/decode symmetry.
979+
ISSUE-59: the upstream C server (gateway.c::dumpFile) asserts
980+
``len % 8 == 0``. We enforce the same invariant on the encoder
981+
side so mock-server frames cannot diverge from real C output.
952982
"""
953-
msg = FilesResponse(
954-
files={
955-
"file1.db": b"\x01\x02\x03", # 3 bytes, needs 5 padding
956-
"file2.db": b"\x04\x05\x06\x07\x08\x09\x0a", # 7 bytes, needs 1 padding
957-
}
958-
)
959-
encoded = msg.encode()
960-
decoded = FilesResponse.decode_body(encoded[HEADER_SIZE:])
961-
assert decoded.files["file1.db"] == b"\x01\x02\x03"
962-
assert decoded.files["file2.db"] == b"\x04\x05\x06\x07\x08\x09\x0a"
983+
from dqlitewire.exceptions import EncodeError
984+
985+
msg = FilesResponse(files={"file1.db": b"\x01\x02\x03"}) # 3 bytes
986+
with pytest.raises(EncodeError, match="8-byte aligned"):
987+
msg.encode_body()
988+
989+
msg = FilesResponse(files={"file2.db": b"\x04\x05\x06\x07\x08\x09\x0a"}) # 7
990+
with pytest.raises(EncodeError, match="8-byte aligned"):
991+
msg.encode_body()
963992

964993
def test_roundtrip_empty_content(self) -> None:
965994
"""116: zero-length file content must round-trip correctly."""
@@ -969,12 +998,15 @@ def test_roundtrip_empty_content(self) -> None:
969998
assert decoded.files == {"empty.db": b""}
970999

9711000
def test_roundtrip_mixed_empty_and_nonempty(self) -> None:
972-
"""116: empty and non-empty files in the same response."""
1001+
"""116: empty and non-empty files in the same response.
1002+
1003+
All non-empty content must be 8-byte aligned (ISSUE-59).
1004+
"""
9731005
msg = FilesResponse(
9741006
files={
975-
"main.db": b"data",
1007+
"main.db": b"datadata",
9761008
"empty.db": b"",
977-
"wal.db": b"more data",
1009+
"wal.db": b"moredata",
9781010
}
9791011
)
9801012
encoded = msg.encode()

tests/test_tuples.py

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -357,18 +357,27 @@ def test_full_uint64_marker_comparison(self) -> None:
357357
assert decode_row_header(b"\xff" * 8, 1) == (RowMarker.DONE, 8)
358358
assert decode_row_header(b"\xee" * 8, 1) == (RowMarker.PART, 8)
359359

360-
def test_first_byte_marker_detection(self) -> None:
361-
"""Marker detection uses first byte, matching Go's byte-by-byte check.
362-
363-
Go checks the first byte (0xFF -> DONE, 0xEE -> PART). A non-uniform
364-
marker where only the first byte matches must still be detected.
360+
def test_non_uniform_marker_rejected(self) -> None:
361+
"""Non-uniform markers are rejected as corrupt (ISSUE-63).
362+
363+
Upstream C uses the full uint64 sentinel (DQLITE_RESPONSE_ROWS_DONE
364+
= 0xff..ff, _PART = 0xee..ee). Go's reference client accepts any
365+
8 bytes starting with 0xff/0xee as a marker; we validate all 8
366+
bytes so torn/corrupt frames are rejected rather than silently
367+
truncating results.
368+
369+
ValueType max is 11 (0xb) — a real row-header byte can never be
370+
0xff or 0xee — so the "strictly C-aligned" check and the
371+
"ValueType rejection on nibble 0xf/0xe" arrive at the same
372+
outcome from different angles.
365373
"""
366-
# Non-uniform marker: first byte 0xFF, rest different
367-
data = b"\xff\x00\x00\x00\x00\x00\x00\x00"
368-
assert decode_row_header(data, 1) == (RowMarker.DONE, 8)
374+
# First byte 0xff, remaining zero → falls through to ValueType
375+
# nibble decode, which rejects 0xf.
376+
with pytest.raises(DecodeError, match="Invalid value type code"):
377+
decode_row_header(b"\xff\x00\x00\x00\x00\x00\x00\x00", 1)
369378

370-
data = b"\xee\x00\x00\x00\x00\x00\x00\x00"
371-
assert decode_row_header(data, 1) == (RowMarker.PART, 8)
379+
with pytest.raises(DecodeError, match="Invalid value type code"):
380+
decode_row_header(b"\xee\x00\x00\x00\x00\x00\x00\x00", 1)
372381

373382
def test_marker_sentinel_bytes_match_full_constants(self) -> None:
374383
"""ROW_DONE_BYTE/ROW_PART_BYTE must match the first byte of the full marker words.

tests/test_types.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -354,13 +354,38 @@ def test_boolean_explicit_rejects_non_bool_non_int(self) -> None:
354354
"""encode_value with explicit BOOLEAN should reject non-bool/int types."""
355355
import pytest
356356

357-
with pytest.raises(EncodeError, match="[Bb]ool"):
357+
with pytest.raises(EncodeError, match="BOOLEAN"):
358358
encode_value("hello", ValueType.BOOLEAN)
359-
with pytest.raises(EncodeError, match="[Bb]ool"):
359+
with pytest.raises(EncodeError, match="BOOLEAN"):
360360
encode_value([1, 2], ValueType.BOOLEAN)
361-
with pytest.raises(EncodeError, match="[Bb]ool"):
361+
with pytest.raises(EncodeError, match="BOOLEAN"):
362362
encode_value({"key": "val"}, ValueType.BOOLEAN)
363363

364+
def test_boolean_rejects_arbitrary_int(self) -> None:
365+
"""BOOLEAN requires exact bool or 0/1 — arbitrary ints are rejected.
366+
367+
Previously any truthy int was silently coerced to True via
368+
``1 if value else 0``. That made the round-trip lossy: encode(5,
369+
BOOLEAN) → wire 1 → decode → True. Callers that mean "store the
370+
integer 5" should use INTEGER; callers that mean "bool" should
371+
pass bool (ISSUE-60).
372+
"""
373+
import pytest
374+
375+
with pytest.raises(EncodeError, match="BOOLEAN"):
376+
encode_value(5, ValueType.BOOLEAN)
377+
with pytest.raises(EncodeError, match="BOOLEAN"):
378+
encode_value(-1, ValueType.BOOLEAN)
379+
with pytest.raises(EncodeError, match="BOOLEAN"):
380+
encode_value(2, ValueType.BOOLEAN)
381+
382+
# 0 and 1 remain acceptable as a pragmatic escape for callers
383+
# working with raw column values.
384+
encoded_zero, _ = encode_value(0, ValueType.BOOLEAN)
385+
encoded_one, _ = encode_value(1, ValueType.BOOLEAN)
386+
assert decode_uint64(encoded_zero) == 0
387+
assert decode_uint64(encoded_one) == 1
388+
364389
def test_decode_integer(self) -> None:
365390
value, consumed = decode_value(encode_int64(42), ValueType.INTEGER)
366391
assert value == 42

0 commit comments

Comments
 (0)