Skip to content

Commit c6461a4

Browse files
authored
perf(storage): use google_crc32c.value for checksums (googleapis#16719)
Updates _ReadResumptionStrategy to use google_crc32c.value(data) instead of manually converting Checksum(data).digest() to an integer. Updated unit tests to mock google_crc32c.value accordingly. Test code ``` import timeit import os import google_crc32c from google_crc32c import Checksum def method1(data): return int.from_bytes(Checksum(data).digest(), "big") def method2(data): return google_crc32c.value(data) def benchmark(): # Testing larger sizes and more iterations # 1KB, 1MB, 10MB, 100MB data_sizes = [1024, 1024 * 1024, 10 * 1024 * 1024, 100 * 1024 * 1024] for size in data_sizes: data = os.urandom(size) print(f"\nData size: {size / (1024*1024):.2f} MB" if size >= 1024*1024 else f"\nData size: {size / 1024:.2f} KB") # Correctness check res1 = method1(data) res2 = method2(data) assert res1 == res2, f"Failed for size {size}: {res1} != {res2}" print("Assertion passed: results are equal.") # Increase iterations for more stable results, with a minimum of 1000 if size <= 1024: number = 100000 elif size <= 1024 * 1024: number = 10000 else: number = 1000 t1 = timeit.timeit(lambda: method1(data), number=number) t2 = timeit.timeit(lambda: method2(data), number=number) avg1 = t1 / number avg2 = t2 / number print(f"Method 1 (Checksum(data).digest()): {t1:.6f} s total ({avg1:.8f} s/call)") print(f"Method 2 (google_crc32c.value(data)): {t2:.6f} s total ({avg2:.8f} s/call)") print(f"Improvement: {(t1 - t2) / t1 * 100:.2f}%") if __name__ == "__main__": print(f"google_crc32c implementation: {google_crc32c.implementation}") benchmark() ``` Output ``` google_crc32c implementation: c Data size: 1.00 KB Assertion passed: results are equal. Method 1 (Checksum(data).digest()): 0.046088 s total (0.00000046 s/call) Method 2 (google_crc32c.value(data)): 0.016062 s total (0.00000016 s/call) Improvement: 65.15% Data size: 1.00 MB Assertion passed: results are equal. Method 1 (Checksum(data).digest()): 0.464044 s total (0.00004640 s/call) Method 2 (google_crc32c.value(data)): 0.439121 s total (0.00004391 s/call) Improvement: 5.37% Data size: 10.00 MB Assertion passed: results are equal. Method 1 (Checksum(data).digest()): 0.450953 s total (0.00045095 s/call) Method 2 (google_crc32c.value(data)): 0.445793 s total (0.00044579 s/call) Improvement: 1.14% Data size: 100.00 MB Assertion passed: results are equal. Method 1 (Checksum(data).digest()): 6.287893 s total (0.00628789 s/call) Method 2 (google_crc32c.value(data)): 6.095833 s total (0.00609583 s/call) Improvement: 3.05% ```
1 parent 7c8412a commit c6461a4

2 files changed

Lines changed: 12 additions & 20 deletions

File tree

packages/google-cloud-storage/google/cloud/storage/asyncio/retry/reads_resumption_strategy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import logging
1616
from typing import IO, Any, Dict, List
1717

18-
from google_crc32c import Checksum
18+
import google_crc32c
1919

2020
from google.cloud import _storage_v2 as storage_v2
2121
from google.cloud.storage.asyncio.retry._helpers import (
@@ -127,7 +127,7 @@ def update_state_from_response(
127127

128128
if checksummed_data.HasField("crc32c"):
129129
server_checksum = checksummed_data.crc32c
130-
client_checksum = int.from_bytes(Checksum(data).digest(), "big")
130+
client_checksum = google_crc32c.value(data)
131131
if server_checksum != client_checksum:
132132
raise DataCorruption(
133133
response,

packages/google-cloud-storage/tests/unit/asyncio/test_async_multi_range_downloader.py

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -324,10 +324,10 @@ def test_init_raises_if_crc32c_c_extension_is_missing(self, mock_google_crc32c):
324324
)
325325

326326
@pytest.mark.asyncio
327-
@mock.patch("google.cloud.storage.asyncio.retry.reads_resumption_strategy.Checksum")
328-
async def test_download_ranges_raises_on_checksum_mismatch(
329-
self, mock_checksum_class
330-
):
327+
@mock.patch(
328+
"google.cloud.storage.asyncio.retry.reads_resumption_strategy.google_crc32c.value"
329+
)
330+
async def test_download_ranges_raises_on_checksum_mismatch(self, mock_crc32c_value):
331331
from google.cloud.storage.asyncio._stream_multiplexer import _StreamMultiplexer
332332
from google.cloud.storage.asyncio.async_multi_range_downloader import (
333333
AsyncMultiRangeDownloader,
@@ -340,8 +340,7 @@ async def test_download_ranges_raises_on_checksum_mismatch(
340340

341341
test_data = b"some-data"
342342
server_checksum = 12345
343-
mock_checksum_instance = mock_checksum_class.return_value
344-
mock_checksum_instance.digest.return_value = (54321).to_bytes(4, "big")
343+
mock_crc32c_value.return_value = 54321
345344

346345
mock_response = _storage_v2.BidiReadObjectResponse(
347346
object_data_ranges=[
@@ -372,7 +371,7 @@ async def test_download_ranges_raises_on_checksum_mismatch(
372371
await mrd.download_ranges([(0, len(test_data), BytesIO())])
373372

374373
assert "Checksum mismatch" in str(exc_info.value)
375-
mock_checksum_class.assert_called_once_with(test_data)
374+
mock_crc32c_value.assert_called_once_with(test_data)
376375

377376
@mock.patch(
378377
"google.cloud.storage.asyncio.async_multi_range_downloader.AsyncMultiRangeDownloader.open",
@@ -575,18 +574,11 @@ async def staged_recv():
575574
# Act
576575
buffer = BytesIO()
577576

578-
# Patch Checksum where it is likely used (reads_resumption_strategy or similar),
579-
# but actually if we use google_crc32c directly, we should patch that or provide valid CRC.
580-
# Since we can't reliably predict where Checksum is imported/used without more digging,
581-
# let's provide a valid CRC for b"data".
582-
# Checksum(b"data").digest() -> needs to match crc32c=123.
583-
# But we can't force b"data" to have crc=123.
584-
# So we MUST patch Checksum.
585-
# It is used in google.cloud.storage.asyncio.retry.reads_resumption_strategy
577+
# Patch google_crc32c.value where it is used in reads_resumption_strategy
586578
with mock.patch(
587-
"google.cloud.storage.asyncio.retry.reads_resumption_strategy.Checksum"
588-
) as mock_chk:
589-
mock_chk.return_value.digest.return_value = (123).to_bytes(4, "big")
579+
"google.cloud.storage.asyncio.retry.reads_resumption_strategy.google_crc32c.value"
580+
) as mock_crc_value:
581+
mock_crc_value.return_value = 123
590582
await mock_mrd.download_ranges([(0, 4, buffer)])
591583

592584
# Assert

0 commit comments

Comments
 (0)