Reject bool under explicit INTEGER and report UTF-8 byte offset on NUL

antoineleclair · antoineleclair · commit 177778e7272c · 2026-04-20T21:15:29.000-04:00
Two small encoder hygiene changes:

- encode_value now rejects bool when the caller passes an explicit
  ValueType.INTEGER (or UNIXTIME), matching the FLOAT branch which
  already rejects bool. Previously INTEGER silently coerced
  True/False to 1/0, producing the round-trip surprise "True bound
  as INTEGER decodes as 1 (int), not True (bool)". The default-
  inference path still picks BOOLEAN for bools, so callers that
  want integer semantics must cast via int(x) explicitly —
  consistent with "explicit over implicit".

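- encode_text's embedded-NUL error now reports the UTF-8 byte
  offset rather than the Python code-point index. For pure-ASCII
  strings the two agree; for strings containing multi-byte
  codepoints they differ. Example: in "café\x00" the prefix "café"
  is 4 codepoints but 5 UTF-8 bytes, so the NUL sits at byte
  offset 5 / codepoint index 4. The byte offset matches wire
  captures and hex-editor output that an operator would use to
  debug. The "embedded null byte" match string is preserved so
  existing tests pass unchanged; only the wording after it changes
  from "at position N" to "at byte offset N". Because the NUL
  check now runs after UTF-8 encoding, a string that also fails
  to encode reports the "invalid UTF-8" error instead.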
diff --git a/src/dqlitewire/types.py b/src/dqlitewire/types.py
@@ -131,15 +131,20 @@ def encode_text(value: str) -> bytes:
     """Encode text as null-terminated UTF-8, padded to 8-byte boundary."""
     if not isinstance(value, str):
         raise EncodeError(f"encode_text expected str, got {type(value).__name__}")
-    if "\x00" in value:
-        raise EncodeError(
-            f"Text value contains embedded null byte at position {value.index(chr(0))}; "
-            "null-terminated encoding would lose data"
-        )
     try:
-        encoded = value.encode("utf-8") + b"\x00"
+        utf8 = value.encode("utf-8")
     except UnicodeEncodeError as e:
         raise EncodeError(f"Text contains invalid UTF-8: {e}") from e
+    nul_byte_offset = utf8.find(b"\x00")
+    if nul_byte_offset != -1:
+        # Report the byte offset of the embedded NUL rather than the
+        # Python-string character index — the encoder produces bytes so
+        # operators debugging a wire capture expect byte offsets.
+        raise EncodeError(
+            f"Text value contains embedded null byte at byte offset "
+            f"{nul_byte_offset}; null-terminated encoding would lose data"
+        )
+    encoded = utf8 + b"\x00"
     padding = pad_to_word(len(encoded))
     return encoded + (b"\x00" * padding)
 
@@ -308,8 +313,18 @@ def encode_value(value: Any, value_type: ValueType | None = None) -> tuple[bytes
         # the outgoing-params path, uses inference and never picks
         # UNIXTIME, so the server-rejection case cannot arise via the
         # documented client API.
+        #
+        # Reject bool under explicit INTEGER/UNIXTIME for symmetry with
+        # the FLOAT branch. The default-inference path (no explicit
+        # value_type) still picks BOOLEAN for bools, so a caller who
+        # wants a bool encoded as an integer must coerce explicitly via
+        # ``int(x)``. This prevents the silent "True in an INTEGER
+        # column decodes as 1 (int), not True (bool)" surprise.
         if isinstance(value, bool):
-            value = 1 if value else 0
+            raise EncodeError(
+                f"Cannot encode bool as {value_type.name}; cast with int(x) "
+                "explicitly if integer semantics are intended."
+            )
         if not isinstance(value, int):
             raise EncodeError(f"Expected int for {value_type.name}, got {type(value).__name__}")
         return encode_int64(value), value_type
diff --git a/tests/test_types.py b/tests/test_types.py
@@ -264,6 +264,16 @@ def test_embedded_null_raises_encode_error(self) -> None:
         with pytest.raises(EncodeError, match="embedded null byte"):
             encode_text("\x00")
 
+    def test_embedded_null_error_reports_byte_offset(self) -> None:
+        """The embedded-NUL error must report a UTF-8 byte offset, not a
+        Python code-point index. A multi-byte character preceding the
+        NUL surfaces different offsets for bytes vs codepoints.
+        """
+        # "café" is 5 UTF-8 bytes (c, a, f, 0xC3, 0xA9) but 4 codepoints.
+        # The NUL is at codepoint index 4 and byte offset 5.
+        with pytest.raises(EncodeError, match="byte offset 5"):
+            encode_text("café\x00")
+
     @pytest.mark.parametrize(
         "bad",
         [None, 42, b"x", 3.14, ["x"], bytearray(b"x"), memoryview(b"x")],
@@ -747,16 +757,28 @@ def test_encode_value_none_without_type_ok(self) -> None:
         assert vtype == ValueType.NULL
         assert encoded == b"\x00" * 8
 
-    def test_encode_value_bool_as_explicit_integer(self) -> None:
-        """Bool with explicit ValueType.INTEGER should coerce to int."""
-        encoded, vtype = encode_value(True, ValueType.INTEGER)
-        assert vtype == ValueType.INTEGER
-        assert decode_int64(encoded) == 1
-
-    def test_encode_value_false_as_explicit_integer(self) -> None:
-        encoded, vtype = encode_value(False, ValueType.INTEGER)
+    def test_encode_value_bool_with_explicit_integer_is_rejected(self) -> None:
+        """Bool with explicit ValueType.INTEGER is rejected for symmetry
+        with FLOAT. The default-inference path picks BOOLEAN for bools;
+        callers that genuinely want an integer encoding must cast via
+        ``int(x)`` explicitly. Prevents the silent "True stored as INTEGER
+        decodes as 1 (int) not True (bool)" round-trip surprise.
+        """
+        with pytest.raises(EncodeError, match="Cannot encode bool as INTEGER"):
+            encode_value(True, ValueType.INTEGER)
+        with pytest.raises(EncodeError, match="Cannot encode bool as INTEGER"):
+            encode_value(False, ValueType.INTEGER)
+
+    def test_encode_value_bool_as_explicit_unixtime_is_rejected(self) -> None:
+        """UNIXTIME shares the code path with INTEGER; same bool rejection."""
+        with pytest.raises(EncodeError, match="Cannot encode bool as UNIXTIME"):
+            encode_value(True, ValueType.UNIXTIME)
+
+    def test_encode_value_int_as_explicit_integer_still_works(self) -> None:
+        """Regression: the bool-specific reject must not affect plain int."""
+        encoded, vtype = encode_value(42, ValueType.INTEGER)
         assert vtype == ValueType.INTEGER
-        assert decode_int64(encoded) == 0
+        assert decode_int64(encoded) == 42
 
 
 class TestEncodeValueUnsupportedTypes: