@@ -119,6 +119,13 @@ def encode_text(value: str) -> bytes:
119119 return encoded + (b"\x00 " * padding )
120120
121121
122+ # Length, in bytes of the whole view (not just the text), at or below
123+ # which we materialize a memoryview to bytes in one shot (one allocation
124+ # + one ``bytes.find``) instead of the chunked scan. Row text payloads
125+ # are almost always well under 64 KiB, so the one-shot path dominates
126+ # the common case (ISSUE-65). Above the threshold we fall back to chunked
127+ # scanning to bound peak memory for pathologically long texts.
128+ _TEXT_ONE_SHOT_MAX = 65_536
122129_TEXT_SCAN_CHUNK = 4096
123130
124131
@@ -132,37 +139,46 @@ def decode_text(data: bytes | memoryview) -> tuple[str, int]:
132139 ServersResponse) wrap the body in a ``memoryview`` so
133140 per-iteration slices are O(1) rather than O(remaining) — see
134141 issue 228. ``bytes`` inputs use zero-copy ``.index(b"\\x00")``.
135- ``memoryview`` inputs walk the buffer in fixed-size chunks so the
136- per-chunk ``bytes(...) `` copy is bounded; arbitrarily long text
137- values (e.g. multi-KiB SQL strings or TEXT column values) are
138- supported because the scan simply visits more chunks. Per-call
139- cost scales with the actual text length, not with the remaining
140- body .
142+
143+ ``memoryview`` inputs use a single ``bytes(mv).find(b"\\x00")``
144+ when the remaining buffer is small (<= 64 KiB). This is one
145+ allocation and one C-level scan, matching the hot-path cost of the
146+ ``bytes`` branch. For larger buffers we fall back to a chunked
147+ scan so peak memory stays bounded (ISSUE-65).
141148 """
142149 if isinstance (data , memoryview ):
143- # Memoryview has no ``.index(bytes)``. Scan in fixed chunks and
144- # accumulate so we can decode the full text without re-copying
145- # after the NUL is found.
146- chunks : list [bytes ] = []
147- scanned = 0
148- null_pos = - 1
149150 data_len = len (data )
150- while scanned < data_len :
151- chunk_end = min (scanned + _TEXT_SCAN_CHUNK , data_len )
152- chunk = bytes (data [scanned :chunk_end ])
153- local = chunk .find (b"\x00 " )
154- if local >= 0 :
155- chunks .append (chunk [:local ])
156- null_pos = scanned + local
157- break
158- chunks .append (chunk )
159- scanned = chunk_end
160- if null_pos < 0 :
161- raise DecodeError ("Text not null-terminated" )
162- try :
163- text = b"" .join (chunks ).decode ("utf-8" )
164- except UnicodeDecodeError as e :
165- raise DecodeError (f"Invalid UTF-8 in text field: { e } " ) from e
151+ if data_len <= _TEXT_ONE_SHOT_MAX :
152+ # One-shot path: single materialization + C-level find.
153+ materialized = bytes (data )
154+ null_pos = materialized .find (b"\x00 " )
155+ if null_pos < 0 :
156+ raise DecodeError ("Text not null-terminated" )
157+ try :
158+ text = materialized [:null_pos ].decode ("utf-8" )
159+ except UnicodeDecodeError as e :
160+ raise DecodeError (f"Invalid UTF-8 in text field: { e } " ) from e
161+ else :
162+ # Chunked fallback for pathologically long text payloads.
163+ chunks : list [bytes ] = []
164+ scanned = 0
165+ null_pos = - 1
166+ while scanned < data_len :
167+ chunk_end = min (scanned + _TEXT_SCAN_CHUNK , data_len )
168+ chunk = bytes (data [scanned :chunk_end ])
169+ local = chunk .find (b"\x00 " )
170+ if local >= 0 :
171+ chunks .append (chunk [:local ])
172+ null_pos = scanned + local
173+ break
174+ chunks .append (chunk )
175+ scanned = chunk_end
176+ if null_pos < 0 :
177+ raise DecodeError ("Text not null-terminated" )
178+ try :
179+ text = b"" .join (chunks ).decode ("utf-8" )
180+ except UnicodeDecodeError as e :
181+ raise DecodeError (f"Invalid UTF-8 in text field: { e } " ) from e
166182 else :
167183 try :
168184 null_pos = data .index (b"\x00 " )
0 commit comments