Skip to content

Commit 8f81ed7

Browse files
committed
fix(storage): correct off-by-one in BlobReader chunked range fetch
1 parent 53f72fb commit 8f81ed7

2 files changed

Lines changed: 55 additions & 19 deletions

File tree

packages/google-cloud-storage/google/cloud/storage/fileio.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,9 @@ def read(self, size=-1):
134134
fetch_start = self._pos
135135
if size > 0:
136136
# Fetch the larger of self._chunk_size or the remaining_size.
137-
fetch_end = fetch_start + max(remaining_size, self._chunk_size)
137+
# `end` is inclusive in `download_as_bytes` (HTTP Range semantics),
138+
# so subtract 1 to fetch exactly chunk_size / remaining_size bytes.
139+
fetch_end = fetch_start + max(remaining_size, self._chunk_size) - 1
138140
else:
139141
fetch_end = None
140142

packages/google-cloud-storage/tests/unit/test_fileio.py

Lines changed: 52 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,9 @@ def test_read(self):
7171
blob = mock.Mock()
7272

7373
def read_from_fake_data(start=0, end=None, **_):
74-
return TEST_BINARY_DATA[start:end]
74+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
75+
stop = end + 1 if end is not None else None
76+
return TEST_BINARY_DATA[start:stop]
7577

7678
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
7779
download_kwargs = {"if_metageneration_match": 1}
@@ -80,7 +82,7 @@ def read_from_fake_data(start=0, end=None, **_):
8082
# Read and trigger the first download of chunk_size.
8183
self.assertEqual(reader.read(1), TEST_BINARY_DATA[0:1])
8284
blob.download_as_bytes.assert_called_once_with(
83-
start=0, end=8, checksum=None, retry=DEFAULT_RETRY, **download_kwargs
85+
start=0, end=7, checksum=None, retry=DEFAULT_RETRY, **download_kwargs
8486
)
8587

8688
# Read from buffered data only.
@@ -92,15 +94,15 @@ def read_from_fake_data(start=0, end=None, **_):
9294
self.assertEqual(reader._pos, 12)
9395
self.assertEqual(blob.download_as_bytes.call_count, 2)
9496
blob.download_as_bytes.assert_called_with(
95-
start=8, end=16, checksum=None, retry=DEFAULT_RETRY, **download_kwargs
97+
start=8, end=15, checksum=None, retry=DEFAULT_RETRY, **download_kwargs
9698
)
9799

98100
# Read a larger amount, requiring a download larger than chunk_size.
99101
self.assertEqual(reader.read(16), TEST_BINARY_DATA[12:28])
100102
self.assertEqual(reader._pos, 28)
101103
self.assertEqual(blob.download_as_bytes.call_count, 3)
102104
blob.download_as_bytes.assert_called_with(
103-
start=16, end=28, checksum=None, retry=DEFAULT_RETRY, **download_kwargs
105+
start=16, end=27, checksum=None, retry=DEFAULT_RETRY, **download_kwargs
104106
)
105107

106108
# Read all remaining data.
@@ -112,11 +114,27 @@ def read_from_fake_data(start=0, end=None, **_):
112114

113115
reader.close()
114116

117+
def test_read_uses_inclusive_end_range(self):
118+
# Regression: download_as_bytes documents `end` as inclusive
119+
# (HTTP Range semantics), so a chunk_size=N read must request
120+
# end=N-1, not end=N. Otherwise we over-fetch by one byte per
121+
# request.
122+
blob = mock.Mock()
123+
blob.download_as_bytes = mock.Mock(return_value=b"")
124+
reader = self._make_blob_reader(blob, chunk_size=64)
125+
reader.read(1)
126+
_, kwargs = blob.download_as_bytes.call_args
127+
self.assertEqual(kwargs["start"], 0)
128+
self.assertEqual(kwargs["end"], 63)
129+
reader.close()
130+
115131
def test_read_with_raw_download(self):
116132
blob = mock.Mock()
117133

118134
def read_from_fake_data(start=0, end=None, **_):
119-
return TEST_BINARY_DATA[start:end]
135+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
136+
stop = end + 1 if end is not None else None
137+
return TEST_BINARY_DATA[start:stop]
120138

121139
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
122140
download_kwargs = {"raw_download": True}
@@ -125,7 +143,7 @@ def read_from_fake_data(start=0, end=None, **_):
125143
# Read and trigger the first download of chunk_size.
126144
self.assertEqual(reader.read(1), TEST_BINARY_DATA[0:1])
127145
blob.download_as_bytes.assert_called_once_with(
128-
start=0, end=8, checksum=None, retry=DEFAULT_RETRY, raw_download=True
146+
start=0, end=7, checksum=None, retry=DEFAULT_RETRY, raw_download=True
129147
)
130148

131149
reader.close()
@@ -134,7 +152,9 @@ def test_retry_passed_through(self):
134152
blob = mock.Mock()
135153

136154
def read_from_fake_data(start=0, end=None, **_):
137-
return TEST_BINARY_DATA[start:end]
155+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
156+
stop = end + 1 if end is not None else None
157+
return TEST_BINARY_DATA[start:stop]
138158

139159
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
140160
download_kwargs = {"if_metageneration_match": 1}
@@ -145,7 +165,7 @@ def read_from_fake_data(start=0, end=None, **_):
145165
# Read and trigger the first download of chunk_size.
146166
self.assertEqual(reader.read(1), TEST_BINARY_DATA[0:1])
147167
blob.download_as_bytes.assert_called_once_with(
148-
start=0, end=8, checksum=None, retry=None, **download_kwargs
168+
start=0, end=7, checksum=None, retry=None, **download_kwargs
149169
)
150170

151171
reader.close()
@@ -163,22 +183,24 @@ def test_readline(self):
163183
blob = mock.Mock()
164184

165185
def read_from_fake_data(start=0, end=None, **_):
166-
return TEST_BINARY_DATA[start:end]
186+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
187+
stop = end + 1 if end is not None else None
188+
return TEST_BINARY_DATA[start:stop]
167189

168190
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
169191
reader = self._make_blob_reader(blob, chunk_size=10)
170192

171193
# Read a line. With chunk_size=10, expect three chunks downloaded.
172194
self.assertEqual(reader.readline(), TEST_BINARY_DATA[:27])
173195
blob.download_as_bytes.assert_called_with(
174-
start=20, end=30, checksum=None, retry=DEFAULT_RETRY
196+
start=20, end=29, checksum=None, retry=DEFAULT_RETRY
175197
)
176198
self.assertEqual(blob.download_as_bytes.call_count, 3)
177199

178200
# Read another line.
179201
self.assertEqual(reader.readline(), TEST_BINARY_DATA[27:])
180202
blob.download_as_bytes.assert_called_with(
181-
start=50, end=60, checksum=None, retry=DEFAULT_RETRY
203+
start=50, end=59, checksum=None, retry=DEFAULT_RETRY
182204
)
183205
self.assertEqual(blob.download_as_bytes.call_count, 6)
184206

@@ -189,7 +211,7 @@ def read_from_fake_data(start=0, end=None, **_):
189211
self.assertEqual(b"".join(reader.readlines()), TEST_BINARY_DATA)
190212
blob.download_as_bytes.assert_called_with(
191213
start=len(TEST_BINARY_DATA),
192-
end=len(TEST_BINARY_DATA) + 10,
214+
end=len(TEST_BINARY_DATA) + 9,
193215
checksum=None,
194216
retry=DEFAULT_RETRY,
195217
)
@@ -201,7 +223,9 @@ def test_seek(self):
201223
blob = mock.Mock()
202224

203225
def read_from_fake_data(start=0, end=None, **_):
204-
return TEST_BINARY_DATA[start:end]
226+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
227+
stop = end + 1 if end is not None else None
228+
return TEST_BINARY_DATA[start:stop]
205229

206230
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
207231
blob.size = None
@@ -254,7 +278,9 @@ def test_advanced_seek(self):
254278
blob = mock.Mock()
255279

256280
def read_from_fake_data(start=0, end=None, **_):
257-
return TEST_BINARY_DATA[start:end] * 1024
281+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
282+
stop = end + 1 if end is not None else None
283+
return TEST_BINARY_DATA[start:stop] * 1024
258284

259285
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
260286
blob.size = None
@@ -758,7 +784,9 @@ def test_read(self):
758784
blob = mock.Mock()
759785

760786
def read_from_fake_data(start=0, end=None, **_):
761-
return TEST_TEXT_DATA.encode("utf-8")[start:end]
787+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
788+
stop = end + 1 if end is not None else None
789+
return TEST_TEXT_DATA.encode("utf-8")[start:stop]
762790

763791
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
764792
blob.chunk_size = None
@@ -789,7 +817,9 @@ def test_multibyte_read(self):
789817
blob = mock.Mock()
790818

791819
def read_from_fake_data(start=0, end=None, **_):
792-
return TEST_MULTIBYTE_TEXT_DATA.encode("utf-8")[start:end]
820+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
821+
stop = end + 1 if end is not None else None
822+
return TEST_MULTIBYTE_TEXT_DATA.encode("utf-8")[start:stop]
793823

794824
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
795825
blob.chunk_size = None
@@ -820,7 +850,9 @@ def test_seek(self):
820850
blob = mock.Mock()
821851

822852
def read_from_fake_data(start=0, end=None, **_):
823-
return TEST_TEXT_DATA.encode("utf-8")[start:end]
853+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
854+
stop = end + 1 if end is not None else None
855+
return TEST_TEXT_DATA.encode("utf-8")[start:stop]
824856

825857
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
826858
blob.size = None
@@ -853,7 +885,9 @@ def test_multibyte_seek(self):
853885
blob = mock.Mock()
854886

855887
def read_from_fake_data(start=0, end=None, **_):
856-
return TEST_MULTIBYTE_TEXT_DATA.encode("utf-8")[start:end]
888+
# `end` is inclusive in download_as_bytes (HTTP Range semantics).
889+
stop = end + 1 if end is not None else None
890+
return TEST_MULTIBYTE_TEXT_DATA.encode("utf-8")[start:stop]
857891

858892
blob.download_as_bytes = mock.Mock(side_effect=read_from_fake_data)
859893
blob.size = None

0 commit comments

Comments
 (0)