Skip to content

Commit 177778e

Browse files
Reject bool under explicit INTEGER and report UTF-8 byte offset on NUL
Two small encoder hygiene changes:
- encode_value now rejects bool when the caller passes an explicit ValueType.INTEGER (or UNIXTIME), matching the FLOAT branch which already rejects bool. Previously INTEGER silently coerced True/False to 1/0, producing the round-trip surprise "True bound as INTEGER decodes as 1 (int), not True (bool)". The default-inference path still picks BOOLEAN for bools, so callers that want integer semantics must cast via int(x) explicitly — consistent with "explicit over implicit".
- encode_text's embedded-NUL error now reports the UTF-8 byte offset rather than the Python code-point index. For pure-ASCII strings the two agree; for strings containing multi-byte codepoints (e.g. "café\x00" — "café" is 4 codepoints but 5 UTF-8 bytes, so the NUL sits at byte 5 / codepoint 4) the byte offset matches wire captures and hex-editor output that an operator would use to debug. The "embedded null byte" match string is preserved so existing tests pass unchanged.
1 parent 51f7850 commit 177778e

2 files changed

Lines changed: 53 additions & 16 deletions

File tree

src/dqlitewire/types.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -131,15 +131,20 @@ def encode_text(value: str) -> bytes:
131131
"""Encode text as null-terminated UTF-8, padded to 8-byte boundary."""
132132
if not isinstance(value, str):
133133
raise EncodeError(f"encode_text expected str, got {type(value).__name__}")
134-
if "\x00" in value:
135-
raise EncodeError(
136-
f"Text value contains embedded null byte at position {value.index(chr(0))}; "
137-
"null-terminated encoding would lose data"
138-
)
139134
try:
140-
encoded = value.encode("utf-8") + b"\x00"
135+
utf8 = value.encode("utf-8")
141136
except UnicodeEncodeError as e:
142137
raise EncodeError(f"Text contains invalid UTF-8: {e}") from e
138+
nul_byte_offset = utf8.find(b"\x00")
139+
if nul_byte_offset != -1:
140+
# Report the byte offset of the embedded NUL rather than the
141+
# Python-string character index — the encoder produces bytes so
142+
# operators debugging a wire capture expect byte offsets.
143+
raise EncodeError(
144+
f"Text value contains embedded null byte at byte offset "
145+
f"{nul_byte_offset}; null-terminated encoding would lose data"
146+
)
147+
encoded = utf8 + b"\x00"
143148
padding = pad_to_word(len(encoded))
144149
return encoded + (b"\x00" * padding)
145150

@@ -308,8 +313,18 @@ def encode_value(value: Any, value_type: ValueType | None = None) -> tuple[bytes
308313
# the outgoing-params path, uses inference and never picks
309314
# UNIXTIME, so the server-rejection case cannot arise via the
310315
# documented client API.
316+
#
317+
# Reject bool under explicit non-BOOLEAN types for symmetry with
318+
# the FLOAT branch. The default-inference path (no explicit
319+
# value_type) still picks BOOLEAN for bools, so a caller who
320+
# wants a bool encoded as an integer must coerce explicitly via
321+
# ``int(x)``. This prevents the silent "True in an INTEGER
322+
# column decodes as 1 (int), not True (bool)" surprise.
311323
if isinstance(value, bool):
312-
value = 1 if value else 0
324+
raise EncodeError(
325+
f"Cannot encode bool as {value_type.name}; cast with int(x) "
326+
"explicitly if integer semantics are intended."
327+
)
313328
if not isinstance(value, int):
314329
raise EncodeError(f"Expected int for {value_type.name}, got {type(value).__name__}")
315330
return encode_int64(value), value_type

tests/test_types.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,16 @@ def test_embedded_null_raises_encode_error(self) -> None:
264264
with pytest.raises(EncodeError, match="embedded null byte"):
265265
encode_text("\x00")
266266

267+
def test_embedded_null_error_reports_byte_offset(self) -> None:
268+
"""The embedded-NUL error must report a UTF-8 byte offset, not a
269+
Python code-point index. A multi-byte character preceding the
270+
NUL surfaces different offsets for bytes vs codepoints.
271+
"""
272+
# "café" is 5 UTF-8 bytes (c, a, f, 0xC3, 0xA9) but 4 codepoints.
273+
# The NUL is at codepoint index 4 and byte offset 5.
274+
with pytest.raises(EncodeError, match="byte offset 5"):
275+
encode_text("café\x00")
276+
267277
@pytest.mark.parametrize(
268278
"bad",
269279
[None, 42, b"x", 3.14, ["x"], bytearray(b"x"), memoryview(b"x")],
@@ -747,16 +757,28 @@ def test_encode_value_none_without_type_ok(self) -> None:
747757
assert vtype == ValueType.NULL
748758
assert encoded == b"\x00" * 8
749759

750-
def test_encode_value_bool_as_explicit_integer(self) -> None:
751-
"""Bool with explicit ValueType.INTEGER should coerce to int."""
752-
encoded, vtype = encode_value(True, ValueType.INTEGER)
753-
assert vtype == ValueType.INTEGER
754-
assert decode_int64(encoded) == 1
755-
756-
def test_encode_value_false_as_explicit_integer(self) -> None:
757-
encoded, vtype = encode_value(False, ValueType.INTEGER)
760+
def test_encode_value_bool_with_explicit_integer_is_rejected(self) -> None:
761+
"""Bool with explicit ValueType.INTEGER is rejected for symmetry
762+
with FLOAT. The default-inference path picks BOOLEAN for bools;
763+
callers that genuinely want an integer encoding must cast via
764+
``int(x)`` explicitly. Prevents the silent "True stored as INTEGER
765+
decodes as 1 (int) not True (bool)" round-trip surprise.
766+
"""
767+
with pytest.raises(EncodeError, match="Cannot encode bool as INTEGER"):
768+
encode_value(True, ValueType.INTEGER)
769+
with pytest.raises(EncodeError, match="Cannot encode bool as INTEGER"):
770+
encode_value(False, ValueType.INTEGER)
771+
772+
def test_encode_value_bool_as_explicit_unixtime_is_rejected(self) -> None:
773+
"""UNIXTIME shares the code path with INTEGER; same bool rejection."""
774+
with pytest.raises(EncodeError, match="Cannot encode bool as UNIXTIME"):
775+
encode_value(True, ValueType.UNIXTIME)
776+
777+
def test_encode_value_int_as_explicit_integer_still_works(self) -> None:
778+
"""Regression: the bool-specific reject must not affect plain int."""
779+
encoded, vtype = encode_value(42, ValueType.INTEGER)
758780
assert vtype == ValueType.INTEGER
759-
assert decode_int64(encoded) == 0
781+
assert decode_int64(encoded) == 42
760782

761783

762784
class TestEncodeValueUnsupportedTypes:

0 commit comments

Comments
 (0)