|
| 1 | +import gzip |
| 2 | +import os |
| 3 | +from datetime import date |
| 4 | +from typing import Optional |
| 5 | + |
| 6 | +import numpy as np |
| 7 | +from numpydoc_decorator import doc # type: ignore |
| 8 | + |
| 9 | +from .snp_data import AnophelesSnpData |
| 10 | +from . import base_params |
| 11 | +from . import plink_params |
| 12 | +from . import vcf_params |
| 13 | + |
| 14 | + |
def _as_text(value) -> str:
    """Return `value` as `str`, decoding it first when it is `bytes`."""
    return value.decode() if isinstance(value, bytes) else str(value)


def _ref_and_alt(alleles) -> tuple:
    """Split one row of alleles into the VCF REF and ALT column values.

    The first allele is the reference. The remaining alleles are the
    alternates; empty ones are dropped, and "." is used when none remain.
    """
    ref = _as_text(alleles[0])
    alt_alleles = [s for s in map(_as_text, alleles[1:]) if s]
    alt = ",".join(alt_alleles) if alt_alleles else "."
    return ref, alt


def _genotype_table(max_allele: int) -> np.ndarray:
    """Build a flat lookup table of diploid GT strings.

    Allele indices are expected to be shifted by one before lookup, so that
    slot 0 on either axis means "missing". With `n = max_allele + 2`, the
    entry at `(a0 + 1) * n + (a1 + 1)` is `"a0/a1"`, or `"./."` when either
    allele is missing.
    """
    n = max_allele + 2
    table = np.empty(n * n, dtype=object)
    for i in range(n):
        for j in range(n):
            table[i * n + j] = "./." if i == 0 or j == 0 else f"{i - 1}/{j - 1}"
    return table


class VcfExporter(
    AnophelesSnpData,
):
    """Adds export of SNP calls to VCF on top of `AnophelesSnpData`."""

    def __init__(
        self,
        **kwargs,
    ):
        # N.B., this class is designed to work cooperatively, and
        # so it's important that any remaining parameters are passed
        # to the superclass constructor.
        super().__init__(**kwargs)

    @doc(
        summary="""
            Export SNP calls to Variant Call Format (VCF).
        """,
        extended_summary="""
            This function writes SNP calls to a VCF file. Data is written
            in chunks to avoid loading the entire genotype matrix into
            memory. Supports optional gzip compression when the output
            path ends with `.gz`. If the output file already exists and
            `overwrite` is False, nothing is written and the existing path
            is returned. If writing fails part-way, the incomplete output
            file is removed before the error is re-raised.
        """,
        returns="""
            Path to the VCF output file.
        """,
    )
    def snp_calls_to_vcf(
        self,
        output_path: vcf_params.vcf_output_path,
        region: base_params.regions,
        sample_sets: Optional[base_params.sample_sets] = None,
        sample_query: Optional[base_params.sample_query] = None,
        sample_query_options: Optional[base_params.sample_query_options] = None,
        sample_indices: Optional[base_params.sample_indices] = None,
        site_mask: Optional[base_params.site_mask] = base_params.DEFAULT,
        inline_array: base_params.inline_array = base_params.inline_array_default,
        chunks: base_params.chunks = base_params.native_chunks,
        overwrite: plink_params.overwrite = False,
    ) -> str:
        base_params._validate_sample_selection_params(
            sample_query=sample_query, sample_indices=sample_indices
        )

        # Accept path-like objects as well as plain strings; everything
        # below (suffix check, return value) works on the string form.
        output_path = os.fspath(output_path)

        # An existing file is reused as-is unless the caller asks to overwrite.
        if os.path.exists(output_path) and not overwrite:
            return output_path

        ds = self.snp_calls(
            region=region,
            sample_sets=sample_sets,
            sample_query=sample_query,
            sample_query_options=sample_query_options,
            sample_indices=sample_indices,
            site_mask=site_mask,
            inline_array=inline_array,
            chunks=chunks,
        )

        sample_ids = [_as_text(s) for s in ds["sample_id"].values]
        contigs = ds.attrs.get("contigs", self.contigs)
        opener = gzip.open if output_path.endswith(".gz") else open

        # Resolve all inputs before the output file is created, so that a
        # missing variable cannot leave a header-only file behind.
        gt_data = ds["call_genotype"].data
        pos_data = ds["variant_position"].data
        contig_data = ds["variant_contig"].data
        allele_data = ds["variant_allele"].data

        # Variant-axis chunk boundaries: offsets[i]..offsets[i + 1] is chunk i.
        chunk_sizes = gt_data.chunks[0]
        offsets = np.cumsum((0,) + chunk_sizes)

        # Lookup table of GT strings, grown lazily to the largest allele
        # index seen so far. A table built for a larger index also serves
        # chunks with smaller indices.
        table = None
        table_max = -1
        n = 0

        # Only remove the output on failure if this call actually created
        # (or truncated) it; a failed open must not delete an existing file.
        opened = False
        try:
            # VCF 4.3 requires UTF-8; do not depend on the locale encoding.
            with opener(output_path, "wt", encoding="utf-8") as f:
                opened = True

                # Write VCF header.
                f.write("##fileformat=VCFv4.3\n")
                f.write(f"##fileDate={date.today().strftime('%Y%m%d')}\n")
                f.write("##source=malariagen_data\n")
                for contig in contigs:
                    f.write(f"##contig=<ID={contig}>\n")
                f.write(
                    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
                )
                header_cols = [
                    "#CHROM",
                    "POS",
                    "ID",
                    "REF",
                    "ALT",
                    "QUAL",
                    "FILTER",
                    "INFO",
                    "FORMAT",
                ]
                f.write("\t".join(header_cols + sample_ids) + "\n")

                # Write records in chunks.
                with self._spinner(f"Write VCF ({ds.sizes['variants']} variants)"):
                    for start, stop in zip(offsets[:-1], offsets[1:]):
                        gt_chunk = gt_data[start:stop].compute()
                        pos_chunk = pos_data[start:stop].compute()
                        contig_chunk = contig_data[start:stop].compute()
                        allele_chunk = allele_data[start:stop].compute()

                        # Make sure the lookup table covers every allele
                        # index present in this chunk.
                        chunk_max = (
                            max(int(gt_chunk.max()), 0) if gt_chunk.size else 0
                        )
                        if chunk_max > table_max:
                            table_max = chunk_max
                            n = table_max + 2
                            table = _genotype_table(table_max)

                        for j in range(gt_chunk.shape[0]):
                            chrom = contigs[contig_chunk[j]]
                            pos = str(pos_chunk[j])
                            ref, alt = _ref_and_alt(allele_chunk[j])

                            # Map any negative (missing) allele to slot 0
                            # and shift real alleles up by one, then look
                            # up the whole row of GT strings in one step.
                            # Done per row so only one row of strings is
                            # held in memory at a time.
                            shifted = (
                                np.maximum(gt_chunk[j].astype(np.intp), -1) + 1
                            )
                            sample_fields = table[shifted[:, 0] * n + shifted[:, 1]]

                            line = f"{chrom}\t{pos}\t.\t{ref}\t{alt}\t.\t.\t.\tGT\t"
                            line += "\t".join(sample_fields)
                            f.write(line + "\n")
        except BaseException:
            # A truncated file would otherwise be returned as a valid
            # result by a later call with overwrite=False.
            if opened:
                try:
                    os.remove(output_path)
                except OSError:
                    # Best effort only; the original error matters more.
                    pass
            raise

        return output_path
0 commit comments