Skip to content

Commit 733b41a

Browse files
authored
Merge pull request #127 from geoadmin/feat_PB-1024_add_range_download_sample
PB-1024: add large asset download guide for assets > 50 GB
2 parents dabc5eb + c09fbba commit 733b41a

File tree

2 files changed

+305
-0
lines changed

2 files changed

+305
-0
lines changed

.vitepress/config.mts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ function downloadDataItems(): DefaultTheme.SidebarItem[] {
248248
link: '/download-data/stac-api/supported-media',
249249
},
250250
{ text: 'Caching', link: '/download-data/stac-api/caching' },
251+
{ text: 'Large Assets (> 50 GB)', link: '/download-data/stac-api/large-assets' },
251252
{ text: 'Item Expiration', link: '/download-data/stac-api/item-expiration' },
252253
{ text: 'Migrate v0.9 to v1', link: '/download-data/stac-api/migrate09-10' },
253254
{
Lines changed: 304 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,304 @@
1+
# Downloading Large Assets (> 50 GB)
2+
3+
Assets larger than **50 GB** cannot be downloaded with a regular HTTP `GET` or `HEAD` request through CloudFront, which enforces a 50 GB object size limit and returns `HTTP 400` for objects above that threshold.
4+
5+
The workaround is to use **HTTP range requests**, which bypass the CloudFront limit by fetching the file in sequential chunks directly from the S3 origin.
6+
7+
## How It Works
8+
9+
A `GET` request with the header `Range: bytes=0-0` is sent first to probe the asset.
10+
The S3 origin responds with `HTTP 206 Partial Content` and includes two useful headers:
11+
12+
| Header | Value |
13+
| ------------------- | ----------------------------------------------------------------- |
14+
| `Content-Range` | `bytes 0-0/<total_size>` — the total size of the object |
15+
| `x-amz-meta-sha256` | SHA-256 hex digest of the full object (when set by the publisher) |
16+
17+
The file is then downloaded chunk by chunk using `Range: bytes=<start>-<end>`, and the final file is verified against the expected size and checksum.
18+
19+
You can probe an asset manually with `curl`:
20+
21+
```bash
22+
curl --silent --show-error --location \
23+
--header "Range: bytes=0-0" \
24+
--dump-header - \
25+
--output /dev/null \
26+
"https://data.geo.admin.ch/ch.swisstopo.pixelkarte-farbe/pixelkarte-farbe/ch.swisstopo.pixelkarte-farbe_3857_0.5.tar"
27+
```
28+
29+
The response headers will contain the total size and checksum:
30+
31+
```
32+
HTTP/2 206
33+
content-range: bytes 0-0/102265118720
34+
x-amz-meta-sha256: <hex>
35+
...
36+
```
37+
38+
::: tip
39+
`HEAD` requests are **also blocked** by CloudFront for objects > 50 GB. Always use `GET` with a `Range` header to probe asset metadata.
40+
:::
41+
42+
## Download Script
43+
44+
The script below requires **Python 3.6+ and no third-party packages** (stdlib only). It works on Linux, macOS, and Windows.
45+
46+
**Usage:**
47+
48+
```bash
49+
# Basic usage (auto-detects size and checksum via range probe)
50+
python3 range_download.py "https://data.geo.admin.ch/.../file.tar" /path/to/file.tar
51+
52+
# Use a custom 2 GB chunk size (value is in MB; the default is 20 GB)
53+
python3 range_download.py "https://..." /path/to/file.tar --chunk-size 2048
54+
55+
# Skip the probe if you already know the total size
56+
python3 range_download.py "https://..." /path/to/file.tar --total-size 102265118720
57+
```
58+
59+
**Script (`range_download.py`):**
60+
61+
```python
62+
#!/usr/bin/env python3
63+
"""Download large assets via HTTP range requests, bypassing CDN size limits.
64+
65+
Usage:
66+
python3 range_download.py <URL> <OUTPUT_FILE> [--chunk-size MB] [--total-size BYTES]
67+
68+
Examples:
69+
python3 range_download.py "https://data.geo.admin.ch/.../file.tar" /path/to/file.tar
70+
python3 range_download.py "https://..." /path/to/file.tar --chunk-size 2048
71+
python3 range_download.py "https://..." /path/to/file.tar --total-size 102265118720
72+
"""
73+
74+
import argparse
75+
import hashlib
76+
import os
77+
import sys
78+
import time
79+
import urllib.error
80+
import urllib.request
81+
82+
DEFAULT_CHUNK_MB = 20 * 1024 # 20 GB
83+
READ_BUFFER = 16 * 1024 * 1024 # 16 MB read buffer for streaming to disk
84+
MAX_ATTEMPTS = 3
85+
86+
87+
def format_size(n):
    """Render a byte count as a human-readable string, e.g. '1.5 GB'."""
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if n < 1024:
            return f'{n:.1f} {unit}'
        n = n / 1024
    # Anything that survives the loop is at least 1024 TB.
    return f'{n:.1f} PB'
93+
94+
95+
def format_eta(secs):
    """Format a duration in seconds as 'Xh Ym', 'Xm Ys', or 'Xs'."""
    secs = int(secs)
    if secs < 60:
        return f'{secs}s'
    if secs < 3600:
        return f'{secs // 60}m {secs % 60}s'
    return f'{secs // 3600}h {(secs % 3600) // 60}m'
102+
103+
104+
def probe_asset(url):
    """Probe asset size and checksum via a Range: bytes=0-0 GET request.

    Uses GET (not HEAD) because CloudFront blocks HEAD for objects > 50 GB.
    The S3 origin returns HTTP 206 with:
        Content-Range: bytes 0-0/<total>
        x-amz-meta-sha256: <hex>
    Exits the process when the total size cannot be determined.
    Returns (total_size, sha256_hex_or_None).
    """
    print('Probing asset via range request...')
    req = urllib.request.Request(url, headers={'Range': 'bytes=0-0'})
    with urllib.request.urlopen(req, timeout=30) as resp:
        content_range = resp.headers.get('Content-Range', '')
        sha256 = resp.headers.get('x-amz-meta-sha256', '')

    total_size = None
    if '/' in content_range:
        try:
            total_size = int(content_range.split('/')[-1].strip())
        except ValueError:
            # An origin may answer 'bytes 0-0/*' when the size is unknown.
            pass

    if total_size is None:
        # No f-prefix on literals without placeholders (only the middle line interpolates).
        print('ERROR: Could not detect total size from Content-Range header.', file=sys.stderr)
        print(f' Content-Range received: "{content_range}"', file=sys.stderr)
        print(' Use --total-size to provide it manually.', file=sys.stderr)
        sys.exit(1)

    print(f'Total size: {format_size(total_size)} ({total_size} bytes)')
    if sha256:
        print(f'SHA256: {sha256}')
    else:
        print('SHA256: (not available in headers)')

    return total_size, sha256 or None
139+
140+
141+
def download_chunk(url, file_handle, offset, end, label):
    """Download the inclusive byte range [offset, end] of *url* into *file_handle*.

    Seeks to *offset* before writing so chunks land at their correct position.
    Retries up to MAX_ATTEMPTS times with exponential backoff; exits the
    process when all attempts fail. *label* is used only for progress output.
    """
    chunk_bytes = end - offset + 1
    print(f' Chunk {label}: bytes {offset}-{end} ({format_size(chunk_bytes)}) ... ',
          end='', flush=True)

    for attempt in range(1, MAX_ATTEMPTS + 1):
        req = urllib.request.Request(url, headers={'Range': f'bytes={offset}-{end}'})
        try:
            with urllib.request.urlopen(req, timeout=300) as resp:
                file_handle.seek(offset)
                written = 0
                while True:
                    data = resp.read(READ_BUFFER)
                    if not data:
                        break
                    file_handle.write(data)
                    written += len(data)
                file_handle.flush()
                # A connection that closes cleanly mid-stream returns b'' from
                # read() without raising; without this check the chunk would be
                # silently truncated. Treat the shortfall as a retryable error.
                if written != chunk_bytes:
                    raise IOError(f'incomplete chunk: got {written} of {chunk_bytes} bytes')
                return
        except Exception as exc:  # pylint: disable=broad-except
            if attempt < MAX_ATTEMPTS:
                delay = 2 ** attempt
                print(f'\n Attempt {attempt} failed ({exc}). Retrying in {delay}s...',
                      end=' ', flush=True)
                # Rewind and drop any partial data before retrying this chunk.
                file_handle.seek(offset)
                file_handle.truncate()
                time.sleep(delay)
            else:
                print(f'\n All {MAX_ATTEMPTS} attempts failed: {exc}', file=sys.stderr)
                sys.exit(1)
169+
170+
171+
def download_loop(url, output, total_size, chunk_size):
    """Fetch *url* into *output* as sequential byte-range chunks.

    Prints per-chunk speed, overall progress, and an ETA, followed by a
    final summary with the average transfer speed.
    """
    total_chunks = (total_size + chunk_size - 1) // chunk_size  # ceiling division

    print(f'Output: {output}')
    print(f'Chunk size: {format_size(chunk_size)}')
    print(f'Total chunks: {total_chunks}')
    print()

    started = time.monotonic()
    downloaded = 0

    with open(output, 'wb') as fh:
        for index, offset in enumerate(range(0, total_size, chunk_size), start=1):
            end = min(offset + chunk_size, total_size) - 1
            size = end - offset + 1

            t0 = time.monotonic()
            download_chunk(url, fh, offset, end, f'{index}/{total_chunks}')
            took = max(time.monotonic() - t0, 0.001)

            speed = size / took
            downloaded += size

            elapsed = max(time.monotonic() - started, 0.001)
            overall = downloaded / elapsed
            eta = (total_size - downloaded) / overall if overall > 0 else 0
            progress = downloaded * 100 // total_size

            print(
                f'{format_size(speed)}/s | '
                f'{format_size(downloaded)}/{format_size(total_size)} ({progress}%) | '
                f'ETA {format_eta(eta)}'
            )

    print()
    elapsed = max(time.monotonic() - started, 0.001)
    avg_speed = downloaded / elapsed
    print(f'Download complete in {format_eta(elapsed)} ({format_size(avg_speed)}/s average)')
211+
212+
213+
def verify_download(output, expected):
    """Check that the downloaded file's size matches *expected* bytes.

    Exits the process with an error when the sizes differ.
    """
    actual = os.path.getsize(output)
    print(f'Final size: {format_size(actual)} ({actual} bytes)')
    if actual == expected:
        print('Size OK.')
        return
    print(f'ERROR: final size {actual} != expected {expected}.', file=sys.stderr)
    sys.exit(1)
220+
221+
222+
def verify_checksum(output, expected):
    """Compare the SHA-256 digest of *output* against *expected* (hex).

    Skips verification when no expected digest is available; exits the
    process on a mismatch.
    """
    if not expected:
        print('Checksum: skipped (not available).')
        return

    print('Computing SHA256 (this may take a moment for large files)...')
    digest = hashlib.sha256()
    with open(output, 'rb') as stream:
        # iter(callable, sentinel) streams the file in READ_BUFFER-sized pieces.
        for block in iter(lambda: stream.read(READ_BUFFER), b''):
            digest.update(block)
    actual = digest.hexdigest()

    if actual != expected:
        print('ERROR: SHA256 mismatch!', file=sys.stderr)
        print(f' expected: {expected}', file=sys.stderr)
        print(f' actual: {actual}', file=sys.stderr)
        sys.exit(1)
    print(f'SHA256 OK: {actual}')
244+
245+
246+
def get_args():
    """Parse and return the command-line arguments for the downloader."""
    parser = argparse.ArgumentParser(
        description='Download large assets via HTTP range requests, '
                    'bypassing CDN size limits (e.g. CloudFront 50 GB cap).'
    )
    parser.add_argument('url', help='URL of the asset to download')
    parser.add_argument('output', help='Local output file path')
    parser.add_argument(
        '--chunk-size',
        dest='chunk_size_mb',
        type=int,
        metavar='MB',
        default=DEFAULT_CHUNK_MB,
        help=f'Chunk size in MB (default: {DEFAULT_CHUNK_MB} = 20 GB)',
    )
    parser.add_argument(
        '--total-size',
        dest='total_size',
        type=int,
        metavar='BYTES',
        default=None,
        help='Known total size in bytes (skips range probe)',
    )
    return parser.parse_args()
270+
271+
272+
def main():
    """Entry point: determine the asset size, download it, then verify it."""
    args = get_args()

    # Refuse to clobber an existing file; the caller must remove it explicitly.
    if os.path.exists(args.output):
        print(f"ERROR: Output file '{args.output}' already exists. Delete it first.",
              file=sys.stderr)
        sys.exit(1)

    chunk_size = args.chunk_size_mb * 1024 * 1024

    if args.total_size is None:
        total_size, sha256 = probe_asset(args.url)
    else:
        total_size = args.total_size
        sha256 = None  # no probe means no checksum header was read
        print(f'Total size (provided): {format_size(total_size)} ({total_size} bytes)')

    print()
    download_loop(args.url, args.output, total_size, chunk_size)
    print()
    verify_download(args.output, total_size)
    verify_checksum(args.output, sha256)


if __name__ == '__main__':
    main()
298+
```
299+
300+
::: tip Parallel Downloads
301+
The script above downloads chunks sequentially, which is simple and reliable. For faster downloads on high-bandwidth connections, you can parallelize by downloading multiple chunks simultaneously using threads or asyncio.
302+
303+
Since each chunk has a specific byte range and the script uses `file_handle.seek(offset)` before writing, chunks can be written to their correct positions in the file regardless of the order they complete. This allows multiple chunks to download in parallel and write to different parts of the file safely.
304+
:::

0 commit comments

Comments
 (0)