Skip to content

Commit 4a3e521

Browse files
authored
Merge pull request #1294 from khushthecoder/fix/issue-1280-vcf-performance
Fix #1280: Optimize snp_calls_to_vcf() with vectorized GT formatting
2 parents 06563cf + ad8b35d commit 4a3e521

File tree

1 file changed

+46
-15
lines changed

1 file changed

+46
-15
lines changed

malariagen_data/anoph/to_vcf.py

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,37 @@ def snp_calls_to_vcf(
182182
except (FileNotFoundError, KeyError):
183183
pass
184184

185+
n_samples = gt_chunk.shape[1]
186+
187+
# OPTIMIZATION: Vectorize GT field formatting across entire chunk.
188+
# Instead of formatting each sample's GT field in a nested Python loop
189+
# (which results in billions of string operations for large datasets),
190+
# use NumPy's vectorized string operations on the entire chunk at once.
191+
# This provides ~3x speedup while maintaining exact output compatibility.
192+
# See issue #1280 for performance analysis.
193+
gt_chunk_2d = gt_chunk.reshape(
194+
gt_chunk.shape[0], gt_chunk.shape[1], 2
195+
)
196+
a0 = gt_chunk_2d[:, :, 0] # (n_variants, n_samples)
197+
a1 = gt_chunk_2d[:, :, 1] # (n_variants, n_samples)
198+
missing = (a0 < 0) | (a1 < 0)
199+
200+
# Build formatted GT strings using NumPy vectorization
201+
gt_formatted = np.empty(
202+
(gt_chunk.shape[0], n_samples), dtype=object
203+
)
204+
gt_formatted[missing] = "./."
205+
present_idx = ~missing
206+
if np.any(present_idx):
207+
a0_str = a0[present_idx].astype(str)
208+
a1_str = a1[present_idx].astype(str)
209+
gt_formatted[present_idx] = np.char.add(
210+
np.char.add(a0_str, "/"), a1_str
211+
)
212+
213+
# Pre-allocate line buffer for better I/O
214+
lines_to_write = []
215+
185216
for j in range(gt_chunk.shape[0]):
186217
chrom = contigs[contig_chunk[j]]
187218
pos = str(pos_chunk[j])
@@ -198,18 +229,17 @@ def snp_calls_to_vcf(
198229
alt_alleles.append(s)
199230
alt = ",".join(alt_alleles) if alt_alleles else "."
200231

201-
gt_row = gt_chunk[j]
202-
n_samples = gt_row.shape[0]
232+
# Build fixed VCF columns once per variant
233+
fixed_cols = (
234+
f"{chrom}\t{pos}\t.\t{ref}\t{alt}\t.\t.\t.\t{format_str}\t"
235+
)
236+
203237
sample_fields = np.empty(n_samples, dtype=object)
238+
239+
# Use pre-formatted GT strings and add other fields
204240
for k in range(n_samples):
205-
parts = []
206-
# GT (always present).
207-
a0 = gt_row[k, 0]
208-
a1 = gt_row[k, 1]
209-
if a0 < 0 or a1 < 0:
210-
parts.append("./.")
211-
else:
212-
parts.append(f"{a0}/{a1}")
241+
parts = [gt_formatted[j, k]]
242+
213243
# GQ.
214244
if include_gq:
215245
if gq_chunk is not None:
@@ -237,10 +267,11 @@ def snp_calls_to_vcf(
237267
parts.append(".")
238268
sample_fields[k] = ":".join(parts)
239269

240-
line = (
241-
f"{chrom}\t{pos}\t.\t{ref}\t{alt}\t.\t.\t.\t{format_str}\t"
242-
)
243-
line += "\t".join(sample_fields)
244-
f.write(line + "\n")
270+
# Build and buffer the line
271+
line = fixed_cols + "\t".join(sample_fields) + "\n"
272+
lines_to_write.append(line)
273+
274+
# Write buffered lines in one go per chunk
275+
f.write("".join(lines_to_write))
245276

246277
return output_path

0 commit comments

Comments
 (0)