|
| 1 | +# Downloading Large Assets (> 50 GB) |
| 2 | + |
| 3 | +Assets larger than **50 GB** cannot be downloaded with a regular HTTP `GET` or `HEAD` request through CloudFront, which enforces a 50 GB object size limit and returns `HTTP 400` for objects above that threshold. |
| 4 | + |
| 5 | +The workaround is to use **HTTP range requests**, which bypass the CloudFront limit by fetching the file in sequential chunks directly from the S3 origin. |
| 6 | + |
| 7 | +## How It Works |
| 8 | + |
| 9 | +A `GET` request with the header `Range: bytes=0-0` is sent first to probe the asset. |
| 10 | +The S3 origin responds with `HTTP 206 Partial Content` and includes two useful headers: |
| 11 | + |
| 12 | +| Header | Value | |
| 13 | +| ------------------- | ----------------------------------------------------------------- | |
| 14 | +| `Content-Range` | `bytes 0-0/<total_size>` — the total size of the object | |
| 15 | +| `x-amz-meta-sha256` | SHA-256 hex digest of the full object (when set by the publisher) | |
| 16 | + |
| 17 | +The file is then downloaded chunk by chunk using `Range: bytes=<start>-<end>`, and the final file is verified against the expected size and checksum. |
| 18 | + |
| 19 | +::: tip |
| 20 | +`HEAD` requests are **also blocked** by CloudFront for objects > 50 GB. Always use `GET` with a `Range` header to probe asset metadata. |
| 21 | +::: |
| 22 | + |
| 23 | +## Download Script |
| 24 | + |
| 25 | +The script below requires **Python 3.6+ and no third-party packages** (stdlib only). It works on Linux, macOS, and Windows. |
| 26 | + |
| 27 | +**Usage:** |
| 28 | + |
| 29 | +```bash |
| 30 | +# Basic usage (auto-detects size and checksum via range probe) |
| 31 | +python3 range_download.py "https://data.geo.admin.ch/.../file.tar" /path/to/file.tar |
| 32 | + |
| 33 | +# Custom chunk size of 2 GB (the value is given in MB; the default is 20 GB, i.e. 20480)
| 34 | +python3 range_download.py "https://..." /path/to/file.tar --chunk-size 2048 |
| 35 | + |
| 36 | +# Skip the probe if you already know the total size |
| 37 | +python3 range_download.py "https://..." /path/to/file.tar --total-size 102265118720 |
| 38 | +``` |
| 39 | + |
| 40 | +**Script (`range_download.py`):** |
| 41 | + |
| 42 | +```python |
| 43 | +#!/usr/bin/env python3 |
| 44 | +"""Download large assets via HTTP range requests, bypassing CDN size limits. |
| 45 | +
|
| 46 | +Usage: |
| 47 | + python3 range_download.py <URL> <OUTPUT_FILE> [--chunk-size MB] [--total-size BYTES] |
| 48 | +
|
| 49 | +Examples: |
| 50 | + python3 range_download.py "https://data.geo.admin.ch/.../file.tar" /path/to/file.tar |
| 51 | + python3 range_download.py "https://..." /path/to/file.tar --chunk-size 2048 |
| 52 | + python3 range_download.py "https://..." /path/to/file.tar --total-size 102265118720 |
| 53 | +""" |
| 54 | + |
| 55 | +import argparse |
| 56 | +import hashlib |
| 57 | +import os |
| 58 | +import sys |
| 59 | +import time |
| 60 | +import urllib.error |
| 61 | +import urllib.request |
| 62 | + |
DEFAULT_CHUNK_MB = 20 * 1024  # 20 GB
READ_BUFFER = 16 * 1024 * 1024  # 16 MB read buffer for streaming to disk
MAX_ATTEMPTS = 3  # per-chunk retry budget; exponential backoff between attempts
| 66 | + |
| 67 | + |
def format_size(n):
    """Render a byte count as a human-readable string with one decimal place."""
    value = n
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    idx = 0
    while idx < len(units) and value >= 1024:
        value /= 1024
        idx += 1
    if idx < len(units):
        return f'{value:.1f} {units[idx]}'
    return f'{value:.1f} PB'
| 74 | + |
| 75 | + |
def format_eta(secs):
    """Format a duration in seconds as a short h/m or m/s or s string."""
    total = int(secs)
    hours, rem = divmod(total, 3600)
    minutes, seconds = divmod(rem, 60)
    if hours:
        return f'{hours}h {minutes}m'
    if minutes:
        return f'{minutes}m {seconds}s'
    return f'{seconds}s'
| 83 | + |
| 84 | + |
def probe_asset(url):
    """Probe asset size and checksum via a Range: bytes=0-0 GET request.

    Uses GET (not HEAD) because CloudFront blocks HEAD for objects > 50 GB.
    The S3 origin returns HTTP 206 with:
        Content-Range: bytes 0-0/<total>
        x-amz-meta-sha256: <hex>

    Returns:
        tuple: (total_size, sha256_hex_or_None).

    Exits the process with status 1 when the total size cannot be parsed
    from the Content-Range header.
    """
    print('Probing asset via range request...')
    req = urllib.request.Request(url, headers={'Range': 'bytes=0-0'})
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            content_range = resp.headers.get('Content-Range', '')
            sha256 = resp.headers.get('x-amz-meta-sha256', '')
    except urllib.error.HTTPError as exc:
        # Defensive: urlopen treats all 2xx codes as success, so this branch
        # should not fire for a well-behaved stack; kept in case a proxy or
        # custom opener surfaces 206 as an HTTPError.
        if exc.code == 206:
            content_range = exc.headers.get('Content-Range', '')
            sha256 = exc.headers.get('x-amz-meta-sha256', '')
        else:
            raise

    total_size = None
    if '/' in content_range:
        try:
            total_size = int(content_range.split('/')[-1].strip())
        except ValueError:
            pass

    if total_size is None:
        # These two messages have no placeholders, so no f-prefix is needed.
        print('ERROR: Could not detect total size from Content-Range header.', file=sys.stderr)
        print(f'    Content-Range received: "{content_range}"', file=sys.stderr)
        print('    Use --total-size to provide it manually.', file=sys.stderr)
        sys.exit(1)

    print(f'Total size: {format_size(total_size)} ({total_size} bytes)')
    if sha256:
        print(f'SHA256: {sha256}')
    else:
        print('SHA256: (not available in headers)')

    return total_size, sha256 or None
| 127 | + |
| 128 | + |
def download_chunk(url, file_handle, offset, end, label):
    """Download bytes [offset, end] of *url* into *file_handle* at *offset*.

    Retries up to MAX_ATTEMPTS times with exponential backoff and exits the
    process when every attempt fails.

    Args:
        url: Asset URL that supports HTTP range requests.
        file_handle: Seekable binary file object opened for writing.
        offset: First byte index of the chunk (inclusive).
        end: Last byte index of the chunk (inclusive).
        label: Progress label such as "3/12" for console output.
    """
    chunk_bytes = end - offset + 1
    print(f'  Chunk {label}: bytes {offset}-{end} ({format_size(chunk_bytes)}) ... ',
          end='', flush=True)

    for attempt in range(1, MAX_ATTEMPTS + 1):
        req = urllib.request.Request(url, headers={'Range': f'bytes={offset}-{end}'})
        try:
            with urllib.request.urlopen(req, timeout=300) as resp:
                file_handle.seek(offset)
                written = 0
                while True:
                    data = resp.read(READ_BUFFER)
                    if not data:
                        break
                    file_handle.write(data)
                    written += len(data)
            # A connection dropped mid-body can end the read loop without an
            # exception; verify the full chunk arrived so a short response is
            # retried instead of silently leaving a hole in the output file
            # (the next chunk would seek past it).
            if written != chunk_bytes:
                raise IOError(f'short read: got {written} of {chunk_bytes} bytes')
            file_handle.flush()
            return
        except Exception as exc:  # pylint: disable=broad-except
            if attempt < MAX_ATTEMPTS:
                delay = 2 ** attempt
                print(f'\n    Attempt {attempt} failed ({exc}). Retrying in {delay}s...',
                      end=' ', flush=True)
                # Discard any partial data for this chunk before retrying.
                file_handle.seek(offset)
                file_handle.truncate()
                time.sleep(delay)
            else:
                print(f'\n    All {MAX_ATTEMPTS} attempts failed: {exc}', file=sys.stderr)
                sys.exit(1)
| 157 | + |
| 158 | + |
def download_loop(url, output, total_size, chunk_size):
    """Fetch the asset chunk by chunk into *output*, printing progress stats."""
    total_chunks = (total_size + chunk_size - 1) // chunk_size

    print(f'Output: {output}')
    print(f'Chunk size: {format_size(chunk_size)}')
    print(f'Total chunks: {total_chunks}')
    print()

    started = time.monotonic()
    bytes_done = 0

    with open(output, 'wb') as out_fh:
        for index in range(total_chunks):
            start_byte = index * chunk_size
            end_byte = min(start_byte + chunk_size, total_size) - 1
            size_this_chunk = end_byte - start_byte + 1

            t0 = time.monotonic()
            download_chunk(url, out_fh, start_byte, end_byte,
                           f'{index + 1}/{total_chunks}')
            elapsed = max(time.monotonic() - t0, 0.001)

            chunk_speed = size_this_chunk / elapsed
            bytes_done += size_this_chunk

            wall = max(time.monotonic() - started, 0.001)
            avg = bytes_done / wall
            eta = (total_size - bytes_done) / avg if avg > 0 else 0
            pct = bytes_done * 100 // total_size

            print(
                f'{format_size(chunk_speed)}/s | '
                f'{format_size(bytes_done)}/{format_size(total_size)} ({pct}%) | '
                f'ETA {format_eta(eta)}'
            )

    print()
    wall = max(time.monotonic() - started, 0.001)
    avg = bytes_done / wall
    print(f'Download complete in {format_eta(wall)} ({format_size(avg)}/s average)')
| 199 | + |
| 200 | + |
def verify_download(output, expected):
    """Exit with an error unless the file at *output* is *expected* bytes long."""
    actual = os.path.getsize(output)
    print(f'Final size: {format_size(actual)} ({actual} bytes)')
    if actual == expected:
        print('Size OK.')
        return
    print(f'ERROR: final size {actual} != expected {expected}.', file=sys.stderr)
    sys.exit(1)
| 208 | + |
| 209 | + |
def verify_checksum(output, expected):
    """Compare the SHA-256 of *output* against *expected*; exit on mismatch.

    A falsy *expected* (no checksum advertised by the publisher) skips
    verification entirely.
    """
    if not expected:
        print('Checksum: skipped (not available).')
        return

    print('Computing SHA256 (this may take a moment for large files)...')
    digest = hashlib.sha256()
    with open(output, 'rb') as stream:
        for block in iter(lambda: stream.read(READ_BUFFER), b''):
            digest.update(block)
    actual = digest.hexdigest()

    if actual != expected:
        print('ERROR: SHA256 mismatch!', file=sys.stderr)
        print(f'  expected: {expected}', file=sys.stderr)
        print(f'  actual: {actual}', file=sys.stderr)
        sys.exit(1)
    print(f'SHA256 OK: {actual}')
| 232 | + |
| 233 | + |
def get_args():
    """Parse command-line arguments.

    Returns:
        argparse.Namespace with url, output, chunk_size_mb, total_size.
    """
    parser = argparse.ArgumentParser(
        description='Download large assets via HTTP range requests, '
                    'bypassing CDN size limits (e.g. CloudFront 50 GB cap).'
    )
    parser.add_argument('url', help='URL of the asset to download')
    parser.add_argument('output', help='Local output file path')
    parser.add_argument('--chunk-size', dest='chunk_size_mb', metavar='MB',
                        type=int, default=DEFAULT_CHUNK_MB,
                        help=f'Chunk size in MB (default: {DEFAULT_CHUNK_MB} = 20 GB)')
    parser.add_argument('--total-size', dest='total_size', metavar='BYTES',
                        type=int, default=None,
                        help='Known total size in bytes (skips range probe)')
    return parser.parse_args()
| 258 | + |
| 259 | + |
def main():
    """Entry point: parse args, probe (unless size given), download, verify."""
    args = get_args()

    if os.path.exists(args.output):
        print(f"ERROR: Output file '{args.output}' already exists. Delete it first.",
              file=sys.stderr)
        sys.exit(1)

    chunk_size = args.chunk_size_mb * 1024 * 1024

    if args.total_size is None:
        total_size, sha256 = probe_asset(args.url)
    else:
        # Caller supplied the size, so skip the probe; no checksum is known.
        total_size, sha256 = args.total_size, None
        print(f'Total size (provided): {format_size(total_size)} ({total_size} bytes)')

    print()
    download_loop(args.url, args.output, total_size, chunk_size)
    print()
    verify_download(args.output, total_size)
    verify_checksum(args.output, sha256)


if __name__ == '__main__':
    main()
| 286 | +``` |
0 commit comments