Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 74 additions & 35 deletions malariagen_data/anoph/to_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import date
from typing import Optional

import warnings
import numpy as np
from numpydoc_decorator import doc # type: ignore

Expand Down Expand Up @@ -95,11 +96,60 @@ def snp_calls_to_vcf(
compress = output_path.endswith(".gz")
opener = gzip.open if compress else open

# Determine which extra fields to include.
include_gq = "GQ" in fields
include_ad = "AD" in fields
include_mq = "MQ" in fields
format_str = ":".join(fields)
# Extract dask arrays.
gt_data = ds["call_genotype"].data
pos_data = ds["variant_position"].data
contig_data = ds["variant_contig"].data
allele_data = ds["variant_allele"].data

# Optional field arrays — may not exist in all datasets.
gq_data = None
ad_data = None
mq_data = None

# Determine which extra fields to actually include in output.
output_fields = list(fields)

if "GQ" in fields:
try:
gq_data = ds["call_GQ"].data
except KeyError:
warnings.warn(
"Requested FORMAT field 'GQ' not found in dataset. "
"GQ values will be omitted from the output VCF.",
UserWarning,
stacklevel=2,
)
output_fields.remove("GQ")

if "AD" in fields:
try:
ad_data = ds["call_AD"].data
except KeyError:
warnings.warn(
"Requested FORMAT field 'AD' not found in dataset. "
"AD values will be omitted from the output VCF.",
UserWarning,
stacklevel=2,
)
output_fields.remove("AD")
Comment on lines +113 to +135
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

New behavior (omitting requested FORMAT fields when the backing arrays are missing, and emitting warnings) isn’t covered by tests. Since there are already VCF exporter tests, it would be good to add cases asserting: (1) a requested missing field (e.g. GQ) triggers a warning, (2) the corresponding ##FORMAT header line is not written, and (3) the FORMAT column/sample values reflect the filtered field list.

Copilot uses AI. Check for mistakes.

if "MQ" in fields:
try:
mq_data = ds["call_MQ"].data
except KeyError:
warnings.warn(
"Requested FORMAT field 'MQ' not found in dataset. "
"MQ values will be omitted from the output VCF.",
UserWarning,
stacklevel=2,
)
output_fields.remove("MQ")

include_gq = "GQ" in output_fields
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These should come before the if-statement.

include_ad = "AD" in output_fields
include_mq = "MQ" in output_fields
format_str = ":".join(output_fields)
Comment on lines +149 to +152
Copy link

Copilot AI Apr 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

format_str is built from output_fields (user-supplied order), but the per-sample parts list is assembled in a fixed order (GT, then GQ, then AD, then MQ). If a caller provides fields in a different order (e.g. ("GT","MQ","GQ")), the FORMAT column will not match the sample value ordering, producing an invalid/misleading VCF. Consider either enforcing a canonical field order when building output_fields/format_str, or (preferably) constructing per-sample values by iterating over output_fields so the ordering always stays consistent.

Copilot uses AI. Check for mistakes.

with opener(output_path, "wt") as f:
# Write VCF header.
Expand All @@ -108,7 +158,7 @@ def snp_calls_to_vcf(
f.write("##source=malariagen_data\n")
for contig in contigs:
f.write(f"##contig=<ID={contig}>\n")
for field in fields:
for field in output_fields:
f.write(_FORMAT_HEADERS[field] + "\n")
header_cols = [
"#CHROM",
Expand All @@ -123,32 +173,6 @@ def snp_calls_to_vcf(
]
f.write("\t".join(header_cols + list(sample_ids)) + "\n")

# Extract dask arrays.
gt_data = ds["call_genotype"].data
pos_data = ds["variant_position"].data
contig_data = ds["variant_contig"].data
allele_data = ds["variant_allele"].data

# Optional field arrays — may not exist in all datasets.
gq_data = None
ad_data = None
mq_data = None
if include_gq:
try:
gq_data = ds["call_GQ"].data
except KeyError:
pass
if include_ad:
try:
ad_data = ds["call_AD"].data
except KeyError:
pass
if include_mq:
try:
mq_data = ds["call_MQ"].data
except KeyError:
pass

chunk_sizes = gt_data.chunks[0]
offsets = np.cumsum((0,) + chunk_sizes)

Expand All @@ -170,17 +194,32 @@ def snp_calls_to_vcf(
try:
gq_chunk = gq_data[start:stop].compute()
except (FileNotFoundError, KeyError):
pass
warnings.warn(
"FORMAT field 'GQ' data could not be read for variants "
f"{start}-{stop}. Values will be written as '.' (missing).",
UserWarning,
stacklevel=2,
)
if ad_data is not None:
try:
ad_chunk = ad_data[start:stop].compute()
except (FileNotFoundError, KeyError):
pass
warnings.warn(
"FORMAT field 'AD' data could not be read for variants "
f"{start}-{stop}. Values will be written as '.' (missing).",
UserWarning,
stacklevel=2,
)
if mq_data is not None:
try:
mq_chunk = mq_data[start:stop].compute()
except (FileNotFoundError, KeyError):
pass
warnings.warn(
"FORMAT field 'MQ' data could not be read for variants "
f"{start}-{stop}. Values will be written as '.' (missing).",
UserWarning,
stacklevel=2,
)

for j in range(gt_chunk.shape[0]):
chrom = contigs[contig_chunk[j]]
Expand Down
Loading