diff --git a/malariagen_data/anoph/to_vcf.py b/malariagen_data/anoph/to_vcf.py index 14d3c1ee6..8b620a30b 100644 --- a/malariagen_data/anoph/to_vcf.py +++ b/malariagen_data/anoph/to_vcf.py @@ -3,6 +3,7 @@ from datetime import date from typing import Optional +import warnings import numpy as np from numpydoc_decorator import doc # type: ignore @@ -95,11 +96,60 @@ def snp_calls_to_vcf( compress = output_path.endswith(".gz") opener = gzip.open if compress else open - # Determine which extra fields to include. - include_gq = "GQ" in fields - include_ad = "AD" in fields - include_mq = "MQ" in fields - format_str = ":".join(fields) + # Extract dask arrays. + gt_data = ds["call_genotype"].data + pos_data = ds["variant_position"].data + contig_data = ds["variant_contig"].data + allele_data = ds["variant_allele"].data + + # Optional field arrays — may not exist in all datasets. + gq_data = None + ad_data = None + mq_data = None + + # Determine which extra fields to actually include in output. + output_fields = list(fields) + + if "GQ" in fields: + try: + gq_data = ds["call_GQ"].data + except KeyError: + warnings.warn( + "Requested FORMAT field 'GQ' not found in dataset. " + "GQ values will be omitted from the output VCF.", + UserWarning, + stacklevel=2, + ) + output_fields.remove("GQ") + + if "AD" in fields: + try: + ad_data = ds["call_AD"].data + except KeyError: + warnings.warn( + "Requested FORMAT field 'AD' not found in dataset. " + "AD values will be omitted from the output VCF.", + UserWarning, + stacklevel=2, + ) + output_fields.remove("AD") + + if "MQ" in fields: + try: + mq_data = ds["call_MQ"].data + except KeyError: + warnings.warn( + "Requested FORMAT field 'MQ' not found in dataset. " + "MQ values will be omitted from the output VCF.", + UserWarning, + stacklevel=2, + ) + output_fields.remove("MQ") + + include_gq = "GQ" in output_fields + include_ad = "AD" in output_fields + include_mq = "MQ" in output_fields + format_str = ":".join(output_fields) with opener(output_path, "wt") as f: # Write VCF header. @@ -108,7 +158,7 @@ def snp_calls_to_vcf( f.write("##source=malariagen_data\n") for contig in contigs: f.write(f"##contig=\n") - for field in fields: + for field in output_fields: f.write(_FORMAT_HEADERS[field] + "\n") header_cols = [ "#CHROM", @@ -123,32 +173,6 @@ def snp_calls_to_vcf( ] f.write("\t".join(header_cols + list(sample_ids)) + "\n") - # Extract dask arrays. - gt_data = ds["call_genotype"].data - pos_data = ds["variant_position"].data - contig_data = ds["variant_contig"].data - allele_data = ds["variant_allele"].data - - # Optional field arrays — may not exist in all datasets. - gq_data = None - ad_data = None - mq_data = None - if include_gq: - try: - gq_data = ds["call_GQ"].data - except KeyError: - pass - if include_ad: - try: - ad_data = ds["call_AD"].data - except KeyError: - pass - if include_mq: - try: - mq_data = ds["call_MQ"].data - except KeyError: - pass - chunk_sizes = gt_data.chunks[0] offsets = np.cumsum((0,) + chunk_sizes) @@ -170,17 +194,32 @@ def snp_calls_to_vcf( try: gq_chunk = gq_data[start:stop].compute() except (FileNotFoundError, KeyError): - pass + warnings.warn( + "FORMAT field 'GQ' data could not be read for variants " + f"{start}-{stop}. Values will be written as '.' (missing).", + UserWarning, + stacklevel=2, + ) if ad_data is not None: try: ad_chunk = ad_data[start:stop].compute() except (FileNotFoundError, KeyError): - pass + warnings.warn( + "FORMAT field 'AD' data could not be read for variants " + f"{start}-{stop}. Values will be written as '.' (missing).", + UserWarning, + stacklevel=2, + ) if mq_data is not None: try: mq_chunk = mq_data[start:stop].compute() except (FileNotFoundError, KeyError): - pass + warnings.warn( + "FORMAT field 'MQ' data could not be read for variants " + f"{start}-{stop}. Values will be written as '.' (missing).", + UserWarning, + stacklevel=2, + ) for j in range(gt_chunk.shape[0]): chrom = contigs[contig_chunk[j]]