Skip to content

Commit ec1476f

Browse files
committed
PB-1024: add large asset download guide for assets > 50 GB
Add a new page explaining how to download STAC assets larger than 50 GB, which exceed CloudFront's object size limit and return HTTP 400 on regular GET/HEAD requests.
1 parent dabc5eb commit ec1476f

2 files changed

Lines changed: 287 additions & 0 deletions

File tree

.vitepress/config.mts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ function downloadDataItems(): DefaultTheme.SidebarItem[] {
248248
link: '/download-data/stac-api/supported-media',
249249
},
250250
{ text: 'Caching', link: '/download-data/stac-api/caching' },
251+
{ text: 'Large Assets (> 50 GB)', link: '/download-data/stac-api/large-assets' },
251252
{ text: 'Item Expiration', link: '/download-data/stac-api/item-expiration' },
252253
{ text: 'Migrate v0.9 to v1', link: '/download-data/stac-api/migrate09-10' },
253254
{
Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
# Downloading Large Assets (> 50 GB)
2+
3+
Assets larger than **50 GB** cannot be downloaded with a regular HTTP `GET` or `HEAD` request through CloudFront, which enforces a 50 GB object size limit and returns `HTTP 400` for objects above that threshold.
4+
5+
The workaround is to use **HTTP range requests**, which bypass the CloudFront limit by fetching the file in sequential chunks directly from the S3 origin.
6+
7+
## How It Works
8+
9+
A `GET` request with the header `Range: bytes=0-0` is sent first to probe the asset.
10+
The S3 origin responds with `HTTP 206 Partial Content` and includes two useful headers:
11+
12+
| Header | Value |
13+
| ------------------- | ----------------------------------------------------------------- |
14+
| `Content-Range` | `bytes 0-0/<total_size>` — the total size of the object |
15+
| `x-amz-meta-sha256` | SHA-256 hex digest of the full object (when set by the publisher) |
16+
17+
The file is then downloaded chunk by chunk using `Range: bytes=<start>-<end>`, and the final file is verified against the expected size and checksum.
18+
19+
::: tip
20+
`HEAD` requests are **also blocked** by CloudFront for objects > 50 GB. Always use `GET` with a `Range` header to probe asset metadata.
21+
:::
22+
23+
## Download Script
24+
25+
The script below requires **Python 3.6+ and no third-party packages** (stdlib only). It works on Linux, macOS, and Windows.
26+
27+
**Usage:**
28+
29+
```bash
30+
# Basic usage (auto-detects size and checksum via range probe)
31+
python3 range_download.py "https://data.geo.admin.ch/.../file.tar" /path/to/file.tar
32+
33+
# Custom chunk size of 2 GB (the value is given in MB; the default is 20 GB)
34+
python3 range_download.py "https://..." /path/to/file.tar --chunk-size 2048
35+
36+
# Skip the probe if you already know the total size
37+
python3 range_download.py "https://..." /path/to/file.tar --total-size 102265118720
38+
```
39+
40+
**Script (`range_download.py`):**
41+
42+
```python
43+
#!/usr/bin/env python3
44+
"""Download large assets via HTTP range requests, bypassing CDN size limits.
45+
46+
Usage:
47+
python3 range_download.py <URL> <OUTPUT_FILE> [--chunk-size MB] [--total-size BYTES]
48+
49+
Examples:
50+
python3 range_download.py "https://data.geo.admin.ch/.../file.tar" /path/to/file.tar
51+
python3 range_download.py "https://..." /path/to/file.tar --chunk-size 2048
52+
python3 range_download.py "https://..." /path/to/file.tar --total-size 102265118720
53+
"""
54+
55+
import argparse
56+
import hashlib
57+
import os
58+
import sys
59+
import time
60+
import urllib.error
61+
import urllib.request
62+
63+
DEFAULT_CHUNK_MB = 20 * 1024 # 20 GB
64+
READ_BUFFER = 16 * 1024 * 1024 # 16 MB read buffer for streaming to disk
65+
MAX_ATTEMPTS = 3
66+
67+
68+
def format_size(n):
    """Render a byte count as a human-readable string, e.g. '1.5 GB'."""
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    value = n
    idx = 0
    while idx < len(units):
        if value < 1024:
            return f'{value:.1f} {units[idx]}'
        value /= 1024
        idx += 1
    # Anything beyond TB is reported in petabytes.
    return f'{value:.1f} PB'
74+
75+
76+
def format_eta(secs):
    """Format a duration in seconds as 'Xh Ym', 'Xm Ys', or 'Xs'."""
    whole = int(secs)
    if whole < 60:
        return f'{whole}s'
    if whole < 3600:
        minutes, seconds = divmod(whole, 60)
        return f'{minutes}m {seconds}s'
    hours, remainder = divmod(whole, 3600)
    return f'{hours}h {remainder // 60}m'
83+
84+
85+
def probe_asset(url):
    """Probe asset size and checksum via a Range: bytes=0-0 GET request.

    Uses GET (not HEAD) because CloudFront blocks HEAD for objects > 50 GB.
    The S3 origin returns HTTP 206 with:
        Content-Range: bytes 0-0/<total>
        x-amz-meta-sha256: <hex>

    If the server ignores the Range header and answers 200 with the full
    body, Content-Length is used as the total size instead.

    Args:
        url: URL of the asset to probe.

    Returns:
        (total_size, sha256_hex_or_None).

    Exits with status 1 when the total size cannot be determined.
    """
    print('Probing asset via range request...')
    req = urllib.request.Request(url, headers={'Range': 'bytes=0-0'})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            status = resp.status
            content_range = resp.headers.get('Content-Range', '')
            content_length = resp.headers.get('Content-Length', '')
            sha256 = resp.headers.get('x-amz-meta-sha256', '')
    except urllib.error.HTTPError as exc:
        # Defensive: urllib treats any 2xx as success, but a custom opener
        # could surface 206 here; recover the headers in that case.
        if exc.code == 206:
            status = exc.code
            content_range = exc.headers.get('Content-Range', '')
            content_length = exc.headers.get('Content-Length', '')
            sha256 = exc.headers.get('x-amz-meta-sha256', '')
        else:
            raise

    total_size = None
    if '/' in content_range:
        try:
            # Content-Range: bytes 0-0/<total> -- take the part after '/'.
            total_size = int(content_range.split('/')[-1].strip())
        except ValueError:
            pass

    # Fallback: a server that ignored the Range header sends 200 with the
    # whole object, so Content-Length is the total size.
    if total_size is None and status == 200 and content_length:
        try:
            total_size = int(content_length)
        except ValueError:
            pass

    if total_size is None:
        print('ERROR: Could not detect total size from Content-Range header.', file=sys.stderr)
        print(f' Content-Range received: "{content_range}"', file=sys.stderr)
        print(' Use --total-size to provide it manually.', file=sys.stderr)
        sys.exit(1)

    print(f'Total size: {format_size(total_size)} ({total_size} bytes)')
    if sha256:
        print(f'SHA256: {sha256}')
    else:
        print('SHA256: (not available in headers)')

    return total_size, sha256 or None
127+
128+
129+
def download_chunk(url, file_handle, offset, end, label):
    """Download the byte range [offset, end] of *url* into *file_handle*.

    Seeks to *offset* before writing, so chunks always land at their correct
    position. Retries up to MAX_ATTEMPTS times with exponential backoff.

    A short read (fewer bytes than requested) is treated as a failure and
    retried: because each chunk seeks to its own offset, silently accepting
    a truncated chunk would leave a zero-filled hole that the final size
    check cannot detect.

    Exits with status 1 when every attempt fails.
    """
    chunk_bytes = end - offset + 1
    print(f' Chunk {label}: bytes {offset}-{end} ({format_size(chunk_bytes)}) ... ',
          end='', flush=True)

    for attempt in range(1, MAX_ATTEMPTS + 1):
        req = urllib.request.Request(url, headers={'Range': f'bytes={offset}-{end}'})
        try:
            received = 0
            with urllib.request.urlopen(req, timeout=300) as resp:
                file_handle.seek(offset)
                while True:
                    data = resp.read(READ_BUFFER)
                    if not data:
                        break
                    file_handle.write(data)
                    received += len(data)
                file_handle.flush()
            # Guard against a connection dropped mid-body: EOF before the
            # full range arrived must trigger a retry, not silent success.
            if received != chunk_bytes:
                raise IOError(f'short read: got {received} of {chunk_bytes} bytes')
            return
        except Exception as exc:  # pylint: disable=broad-except
            if attempt < MAX_ATTEMPTS:
                delay = 2 ** attempt
                print(f'\n Attempt {attempt} failed ({exc}). Retrying in {delay}s...',
                      end=' ', flush=True)
                # Discard any partial data from the failed attempt.
                file_handle.seek(offset)
                file_handle.truncate()
                time.sleep(delay)
            else:
                print(f'\n All {MAX_ATTEMPTS} attempts failed: {exc}', file=sys.stderr)
                sys.exit(1)
157+
158+
159+
def download_loop(url, output, total_size, chunk_size):
    """Download the whole asset as sequential range requests with progress output."""
    # Ceiling division: how many fixed-size ranges cover total_size bytes.
    num_chunks = (total_size + chunk_size - 1) // chunk_size

    print(f'Output: {output}')
    print(f'Chunk size: {format_size(chunk_size)}')
    print(f'Total chunks: {num_chunks}')
    print()

    started = time.monotonic()
    bytes_done = 0

    with open(output, 'wb') as out_fh:
        for index in range(num_chunks):
            first_byte = index * chunk_size
            last_byte = min(first_byte + chunk_size - 1, total_size - 1)
            this_chunk = last_byte - first_byte + 1

            t0 = time.monotonic()
            download_chunk(url, out_fh, first_byte, last_byte, f'{index + 1}/{num_chunks}')
            # Clamp elapsed time to avoid division by zero on very fast chunks.
            spent = max(time.monotonic() - t0, 0.001)

            rate = this_chunk / spent
            bytes_done += this_chunk

            elapsed = max(time.monotonic() - started, 0.001)
            mean_rate = bytes_done / elapsed
            left = total_size - bytes_done
            eta = left / mean_rate if mean_rate > 0 else 0
            pct = bytes_done * 100 // total_size

            print(
                f'{format_size(rate)}/s | '
                f'{format_size(bytes_done)}/{format_size(total_size)} ({pct}%) | '
                f'ETA {format_eta(eta)}'
            )

    print()
    overall = max(time.monotonic() - started, 0.001)
    mean_speed = bytes_done / overall
    print(f'Download complete in {format_eta(overall)} ({format_size(mean_speed)}/s average)')
199+
200+
201+
def verify_download(output, expected):
    """Check that the on-disk size of *output* equals *expected* bytes; exit(1) otherwise."""
    actual = os.path.getsize(output)
    print(f'Final size: {format_size(actual)} ({actual} bytes)')
    if actual == expected:
        print('Size OK.')
        return
    print(f'ERROR: final size {actual} != expected {expected}.', file=sys.stderr)
    sys.exit(1)
208+
209+
210+
def verify_checksum(output, expected):
    """Compute the SHA-256 of *output* and compare it to *expected* (hex).

    Skips the check when no expected digest is available. The comparison is
    case- and whitespace-insensitive, since hex digests in metadata headers
    may be published in either case; hexdigest() is always lowercase.

    Exits with status 1 on mismatch.
    """
    if not expected:
        print('Checksum: skipped (not available).')
        return

    print('Computing SHA256 (this may take a moment for large files)...')
    sha256 = hashlib.sha256()
    with open(output, 'rb') as fh:
        while True:
            data = fh.read(READ_BUFFER)
            if not data:
                break
            sha256.update(data)
    actual = sha256.hexdigest()

    if actual == expected.strip().lower():
        print(f'SHA256 OK: {actual}')
    else:
        print('ERROR: SHA256 mismatch!', file=sys.stderr)
        print(f' expected: {expected}', file=sys.stderr)
        print(f' actual: {actual}', file=sys.stderr)
        sys.exit(1)
232+
233+
234+
def get_args():
    """Build the CLI parser and return the parsed command-line arguments."""
    description = ('Download large assets via HTTP range requests, '
                   'bypassing CDN size limits (e.g. CloudFront 50 GB cap).')
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('url', help='URL of the asset to download')
    parser.add_argument('output', help='Local output file path')
    parser.add_argument('--chunk-size', dest='chunk_size_mb', type=int,
                        metavar='MB', default=DEFAULT_CHUNK_MB,
                        help=f'Chunk size in MB (default: {DEFAULT_CHUNK_MB} = 20 GB)')
    parser.add_argument('--total-size', dest='total_size', type=int,
                        metavar='BYTES', default=None,
                        help='Known total size in bytes (skips range probe)')

    return parser.parse_args()
258+
259+
260+
def main():
    """Entry point: determine the asset size, download it in ranges, verify it."""
    opts = get_args()

    # Refuse to clobber an existing file: the range writer would overwrite it.
    if os.path.exists(opts.output):
        print(f"ERROR: Output file '{opts.output}' already exists. Delete it first.",
              file=sys.stderr)
        sys.exit(1)

    range_bytes = opts.chunk_size_mb * 1024 * 1024

    if opts.total_size is None:
        size, digest = probe_asset(opts.url)
    else:
        # Caller supplied the size, so skip the probe; no checksum available.
        size = opts.total_size
        digest = None
        print(f'Total size (provided): {format_size(size)} ({size} bytes)')

    print()
    download_loop(opts.url, opts.output, size, range_bytes)
    print()
    verify_download(opts.output, size)
    verify_checksum(opts.output, digest)


if __name__ == '__main__':
    main()
286+
```

0 commit comments

Comments
 (0)