|
| 1 | +"""Error-path tests for the memoryview chunked branch of ``decode_text``. |
| 2 | +
|
| 3 | +The one-shot branch (payloads <= 64 KiB) has null-terminator and |
| 4 | +invalid-UTF-8 tests already. The chunked branch is only exercised for |
| 5 | +the happy path. Without dedicated error-path tests, a regression in |
| 6 | +the accumulator — wrong ``scanned`` update, dropped partial multi-byte |
| 7 | +UTF-8 split across chunk boundaries — would silently corrupt large |
| 8 | +CLOB / JSON-text columns. |
| 9 | +""" |
| 10 | + |
| 11 | +import pytest |
| 12 | + |
| 13 | +from dqlitewire.exceptions import DecodeError |
| 14 | +from dqlitewire.types import decode_text |
| 15 | + |
| 16 | + |
| 17 | +def _align(n: int) -> int: |
| 18 | + """Pad up to a multiple of 8 bytes (word alignment for text fields).""" |
| 19 | + rem = n % 8 |
| 20 | + return 0 if rem == 0 else 8 - rem |
| 21 | + |
| 22 | + |
class TestDecodeTextChunkedErrors:
    """Error-path and chunk-boundary tests for the chunked ``decode_text`` branch."""

    def test_chunked_missing_null_terminator_raises(self) -> None:
        """A large buffer that contains no NUL byte must raise ``DecodeError``."""
        # 70 KiB > 64 KiB threshold, forcing the chunked branch.
        payload = b"a" * (70 * 1024)
        # The payload must already be word-aligned: NUL padding would put
        # a terminator into the buffer and the test would no longer
        # exercise the missing-terminator path. Fail loudly if someone
        # changes the size to an unaligned value.
        assert _align(len(payload)) == 0
        buf = memoryview(payload)
        with pytest.raises(DecodeError, match="not null-terminated"):
            decode_text(buf)

    def test_chunked_invalid_utf8_before_null_raises(self) -> None:
        """Invalid UTF-8 ahead of the terminator must raise ``DecodeError``."""
        # 70 KiB of ASCII + a truncated multi-byte UTF-8 sequence, then null.
        prefix = b"a" * (70 * 1024)
        payload = prefix + b"\xc3" + b"\x00"
        buf = memoryview(payload + b"\x00" * _align(len(payload)))
        with pytest.raises(DecodeError, match="Invalid UTF-8"):
            decode_text(buf)

    def test_chunked_multibyte_codepoint_across_boundary(self) -> None:
        """A 3-byte UTF-8 codepoint straddling the internal
        ``_TEXT_SCAN_CHUNK`` boundary must decode correctly. Verifies
        the accumulator joins chunks before decoding, rather than
        decoding each chunk independently.
        """
        # Imported locally: these are private constants that only this
        # test depends on.
        from dqlitewire.types import _TEXT_ONE_SHOT_MAX, _TEXT_SCAN_CHUNK

        # Fill with 'a' until one byte before the chunk boundary, so the
        # 3-byte codepoint that follows straddles it.
        filler_len = _TEXT_SCAN_CHUNK - 1
        codepoint = "€".encode()  # 3 bytes: e2 82 ac
        assert len(codepoint) == 3
        # Pad the tail so the payload exceeds _TEXT_ONE_SHOT_MAX, forcing
        # the chunked branch. A negative count yields empty padding, which
        # only happens when the payload already exceeds the threshold.
        tail_padding = b"z" * (_TEXT_ONE_SHOT_MAX + 1 - filler_len - len(codepoint))
        payload = b"a" * filler_len + codepoint + tail_padding + b"\x00"
        buf = memoryview(payload + b"\x00" * _align(len(payload)))

        text, _consumed = decode_text(buf)
        assert text == "a" * filler_len + "€" + "z" * len(tail_padding)
0 commit comments