-
Notifications
You must be signed in to change notification settings - Fork 174
Fix #1283: Warn and handle missing FORMAT fields in snp_calls_to_vcf() #1284
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,6 +3,7 @@ | |
| from datetime import date | ||
| from typing import Optional | ||
|
|
||
| import warnings | ||
| import numpy as np | ||
| from numpydoc_decorator import doc # type: ignore | ||
|
|
||
|
|
@@ -95,11 +96,60 @@ def snp_calls_to_vcf( | |
| compress = output_path.endswith(".gz") | ||
| opener = gzip.open if compress else open | ||
|
|
||
| # Determine which extra fields to include. | ||
| include_gq = "GQ" in fields | ||
| include_ad = "AD" in fields | ||
| include_mq = "MQ" in fields | ||
| format_str = ":".join(fields) | ||
| # Extract dask arrays. | ||
| gt_data = ds["call_genotype"].data | ||
| pos_data = ds["variant_position"].data | ||
| contig_data = ds["variant_contig"].data | ||
| allele_data = ds["variant_allele"].data | ||
|
|
||
| # Optional field arrays — may not exist in all datasets. | ||
| gq_data = None | ||
| ad_data = None | ||
| mq_data = None | ||
|
|
||
| # Determine which extra fields to actually include in output. | ||
| output_fields = list(fields) | ||
|
|
||
| if "GQ" in fields: | ||
| try: | ||
| gq_data = ds["call_GQ"].data | ||
| except KeyError: | ||
| warnings.warn( | ||
| "Requested FORMAT field 'GQ' not found in dataset. " | ||
| "GQ values will be omitted from the output VCF.", | ||
| UserWarning, | ||
| stacklevel=2, | ||
| ) | ||
| output_fields.remove("GQ") | ||
|
|
||
| if "AD" in fields: | ||
| try: | ||
| ad_data = ds["call_AD"].data | ||
| except KeyError: | ||
| warnings.warn( | ||
| "Requested FORMAT field 'AD' not found in dataset. " | ||
| "AD values will be omitted from the output VCF.", | ||
| UserWarning, | ||
| stacklevel=2, | ||
| ) | ||
| output_fields.remove("AD") | ||
|
|
||
| if "MQ" in fields: | ||
| try: | ||
| mq_data = ds["call_MQ"].data | ||
| except KeyError: | ||
| warnings.warn( | ||
| "Requested FORMAT field 'MQ' not found in dataset. " | ||
| "MQ values will be omitted from the output VCF.", | ||
| UserWarning, | ||
| stacklevel=2, | ||
| ) | ||
| output_fields.remove("MQ") | ||
|
|
||
| include_gq = "GQ" in output_fields | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These should come before the if-statement. |
||
| include_ad = "AD" in output_fields | ||
| include_mq = "MQ" in output_fields | ||
| format_str = ":".join(output_fields) | ||
|
Comment on lines
+149
to
+152
|
||
|
|
||
| with opener(output_path, "wt") as f: | ||
| # Write VCF header. | ||
|
|
@@ -108,7 +158,7 @@ def snp_calls_to_vcf( | |
| f.write("##source=malariagen_data\n") | ||
| for contig in contigs: | ||
| f.write(f"##contig=<ID={contig}>\n") | ||
| for field in fields: | ||
| for field in output_fields: | ||
| f.write(_FORMAT_HEADERS[field] + "\n") | ||
| header_cols = [ | ||
| "#CHROM", | ||
|
|
@@ -123,32 +173,6 @@ def snp_calls_to_vcf( | |
| ] | ||
| f.write("\t".join(header_cols + list(sample_ids)) + "\n") | ||
|
|
||
| # Extract dask arrays. | ||
| gt_data = ds["call_genotype"].data | ||
| pos_data = ds["variant_position"].data | ||
| contig_data = ds["variant_contig"].data | ||
| allele_data = ds["variant_allele"].data | ||
|
|
||
| # Optional field arrays — may not exist in all datasets. | ||
| gq_data = None | ||
| ad_data = None | ||
| mq_data = None | ||
| if include_gq: | ||
| try: | ||
| gq_data = ds["call_GQ"].data | ||
| except KeyError: | ||
| pass | ||
| if include_ad: | ||
| try: | ||
| ad_data = ds["call_AD"].data | ||
| except KeyError: | ||
| pass | ||
| if include_mq: | ||
| try: | ||
| mq_data = ds["call_MQ"].data | ||
| except KeyError: | ||
| pass | ||
|
|
||
| chunk_sizes = gt_data.chunks[0] | ||
| offsets = np.cumsum((0,) + chunk_sizes) | ||
|
|
||
|
|
@@ -170,17 +194,32 @@ def snp_calls_to_vcf( | |
| try: | ||
| gq_chunk = gq_data[start:stop].compute() | ||
| except (FileNotFoundError, KeyError): | ||
| pass | ||
| warnings.warn( | ||
| "FORMAT field 'GQ' data could not be read for variants " | ||
| f"{start}-{stop}. Values will be written as '.' (missing).", | ||
| UserWarning, | ||
| stacklevel=2, | ||
| ) | ||
| if ad_data is not None: | ||
| try: | ||
| ad_chunk = ad_data[start:stop].compute() | ||
| except (FileNotFoundError, KeyError): | ||
| pass | ||
| warnings.warn( | ||
| "FORMAT field 'AD' data could not be read for variants " | ||
| f"{start}-{stop}. Values will be written as '.' (missing).", | ||
| UserWarning, | ||
| stacklevel=2, | ||
| ) | ||
| if mq_data is not None: | ||
| try: | ||
| mq_chunk = mq_data[start:stop].compute() | ||
| except (FileNotFoundError, KeyError): | ||
| pass | ||
| warnings.warn( | ||
| "FORMAT field 'MQ' data could not be read for variants " | ||
| f"{start}-{stop}. Values will be written as '.' (missing).", | ||
| UserWarning, | ||
| stacklevel=2, | ||
| ) | ||
|
|
||
| for j in range(gt_chunk.shape[0]): | ||
| chrom = contigs[contig_chunk[j]] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
New behavior (omitting requested FORMAT fields when the backing arrays are missing, and emitting warnings) isn’t covered by tests. Since there are already VCF exporter tests, it would be good to add cases asserting: (1) a requested missing field (e.g. GQ) triggers a warning, (2) the corresponding ##FORMAT header line is not written, and (3) the FORMAT column/sample values reflect the filtered field list.