Skip to content

Commit b6705e7

Browse files
authored
Merge branch 'master' into GH-1054-add-vcf-export
2 parents 1aa0de5 + 2f29890 commit b6705e7

15 files changed

Lines changed: 1032 additions & 30 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
.idea
22
.vscode
33
__pycache__
4+
.mypy_cache
45
*.pyc
56
dist
67
.venv/

malariagen_data/adar1.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ def __init__(
130130
tqdm_class=tqdm_class,
131131
taxon_colors=TAXON_COLORS,
132132
virtual_contigs=None,
133-
gene_names=None,
134133
inversion_tag_path=None,
135134
unrestricted_use_only=unrestricted_use_only,
136135
surveillance_use_only=surveillance_use_only,

malariagen_data/adir1.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ def __init__(
130130
tqdm_class=tqdm_class,
131131
taxon_colors=TAXON_COLORS,
132132
virtual_contigs=None,
133-
gene_names=None,
134133
inversion_tag_path=None,
135134
unrestricted_use_only=unrestricted_use_only,
136135
surveillance_use_only=surveillance_use_only,

malariagen_data/af1.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,6 @@ def __init__(
132132
tqdm_class=tqdm_class,
133133
taxon_colors=TAXON_COLORS,
134134
virtual_contigs=None,
135-
gene_names=None,
136135
inversion_tag_path=None,
137136
unrestricted_use_only=unrestricted_use_only,
138137
surveillance_use_only=surveillance_use_only,

malariagen_data/ag3.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,6 @@ def __init__(
212212
taxon_colors=TAXON_COLORS,
213213
aim_species_colors=AIM_SPECIES_COLORS,
214214
virtual_contigs=VIRTUAL_CONTIGS,
215-
gene_names=GENE_NAMES,
216215
inversion_tag_path=INVERSION_TAG_PATH,
217216
unrestricted_use_only=unrestricted_use_only,
218217
surveillance_use_only=surveillance_use_only,

malariagen_data/amin1.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ def __init__(
130130
tqdm_class=tqdm_class,
131131
taxon_colors=TAXON_COLORS,
132132
virtual_contigs=None,
133-
gene_names=None,
134133
inversion_tag_path=None,
135134
unrestricted_use_only=unrestricted_use_only,
136135
surveillance_use_only=surveillance_use_only,

malariagen_data/anoph/aim_data.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,24 @@ def __init__(
4040
# to the superclass constructor.
4141
super().__init__(**kwargs)
4242

43-
# Store possible values for the `aims` parameter.
44-
# TODO Consider moving this to data resource configuration.
45-
self._aim_ids = aim_ids
46-
self._aim_palettes = aim_palettes
43+
# Read AIM parameters from the JSON config, falling back to
44+
# constructor args for backward compatibility.
45+
config = self.config
46+
_aim_ids = config.get("AIM_IDS", None)
47+
if _aim_ids is not None:
48+
self._aim_ids: Optional[aim_params.aim_ids] = tuple(_aim_ids)
49+
else:
50+
self._aim_ids = aim_ids
51+
52+
_aim_palettes = config.get("AIM_PALETTES", None)
53+
if _aim_palettes is not None:
54+
# Convert lists to tuples for each palette entry.
55+
self._aim_palettes: Optional[aim_params.aim_palettes] = {
56+
k: tuple(v)
57+
for k, v in _aim_palettes.items() # type: ignore
58+
}
59+
else:
60+
self._aim_palettes = aim_palettes
4761

4862
# Set up caches.
4963
self._cache_aim_variants: Dict[str, xr.Dataset] = dict()

malariagen_data/anoph/base_params.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,14 @@ def _validate_sample_selection_params(
189189
"Random seed used for reproducible down-sampling.",
190190
]
191191

192+
gene: TypeAlias = Annotated[
193+
str,
194+
"""
195+
Gene identifier. Can be either a gene ID or gene name.
196+
Gene names are matched case-insensitively.
197+
""",
198+
]
199+
192200
transcript: TypeAlias = Annotated[
193201
str,
194202
"Gene transcript identifier.",

malariagen_data/anoph/frq_base.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,14 @@ def plot_frequencies_heatmap(
417417
`aa_allele_frequencies_advanced()` or
418418
`gene_cnv_frequencies_advanced()`.
419419
""",
420+
taxa="""
421+
Taxon or list of taxa to include in the plot. If None,
422+
all taxa are shown.
423+
""",
424+
areas="""
425+
Area or list of areas to include in the plot. If None,
426+
all areas are shown.
427+
""",
420428
kwargs="Passed through to `px.line()`.",
421429
),
422430
returns="""

malariagen_data/anoph/genome_features.py

Lines changed: 159 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ class AnophelesGenomeFeaturesData(AnophelesGenomeSequenceData):
2323
def __init__(
2424
self,
2525
*,
26-
gff_gene_type: str,
27-
gff_gene_name_attribute: str,
28-
gff_default_attributes: Tuple[str, ...],
26+
gff_gene_type: Optional[str] = None,
27+
gff_gene_name_attribute: Optional[str] = None,
28+
gff_default_attributes: Optional[Tuple[str, ...]] = None,
2929
gene_names: Optional[Mapping[str, str]] = None,
3030
**kwargs,
3131
):
@@ -34,16 +34,30 @@ def __init__(
3434
# to the superclass constructor.
3535
super().__init__(**kwargs)
3636

37-
# TODO Consider moving these parameters to configuration, as they could
38-
# change if the GFF ever changed.
39-
self._gff_gene_type = gff_gene_type
40-
self._gff_gene_name_attribute = gff_gene_name_attribute
41-
self._gff_default_attributes = gff_default_attributes
37+
# Read GFF parameters from the JSON config, falling back to
38+
# constructor args for backward compatibility.
39+
config = self.config
40+
self._gff_gene_type = config.get("GFF_GENE_TYPE", gff_gene_type or "gene")
41+
self._gff_gene_name_attribute = config.get(
42+
"GFF_GENE_NAME_ATTRIBUTE", gff_gene_name_attribute or "Name"
43+
)
44+
_default_attrs = config.get("GFF_DEFAULT_ATTRIBUTES", None)
45+
if _default_attrs is not None:
46+
self._gff_default_attributes = tuple(_default_attrs)
47+
elif gff_default_attributes is not None:
48+
self._gff_default_attributes = gff_default_attributes
49+
else:
50+
self._gff_default_attributes = ("ID", "Parent", "Name", "description")
4251

4352
# Allow manual override of gene names.
44-
if gene_names is None:
45-
gene_names = dict()
46-
self._gene_name_overrides = gene_names
53+
# Read from config if available, falling back to constructor arg.
54+
_gene_names = config.get("GENE_NAMES", None)
55+
if _gene_names is not None:
56+
self._gene_name_overrides = _gene_names
57+
elif gene_names is not None:
58+
self._gene_name_overrides = gene_names
59+
else:
60+
self._gene_name_overrides = dict()
4761

4862
# Setup caches.
4963
self._cache_genome_features: Dict[Tuple[str, ...], pd.DataFrame] = dict()
@@ -314,6 +328,140 @@ def plot_transcript(
314328
bokeh.plotting.show(fig)
315329
return fig
316330

331+
@_check_types
332+
@doc(
333+
summary="Get the canonical transcript for a gene.",
334+
returns="""
335+
The transcript ID for the canonical transcript of the specified gene.
336+
The canonical transcript is the one with the highest number of
337+
transcribed base pairs (sum of exon lengths).
338+
""",
339+
)
340+
def canonical_transcript(
341+
self,
342+
gene: base_params.gene,
343+
) -> str:
344+
"""
345+
Parameters
346+
----------
347+
gene : str
348+
A gene identifier. Can be either a gene ID or gene name.
349+
350+
Returns
351+
-------
352+
str
353+
The transcript ID of the canonical transcript.
354+
355+
Raises
356+
------
357+
ValueError
358+
If the gene identifier is not found or if the gene has no transcripts.
359+
360+
Examples
361+
--------
362+
Get the canonical transcript for a gene by ID:
363+
364+
>>> import malariagen_data
365+
>>> ag3 = malariagen_data.ag3(pre=False)
366+
>>> canonical = ag3.canonical_transcript("AGAP004707")
367+
368+
Get the canonical transcript for a gene by name:
369+
370+
>>> canonical = ag3.canonical_transcript("Pvr")
371+
"""
372+
debug = self._log.debug
373+
debug(f"Looking up canonical transcript for gene '{gene}'")
374+
375+
# Load genome features once with required attributes
376+
with self._spinner(desc="Load gene data"):
377+
# Load required attributes (ordered for consistency with GFF3)
378+
attributes = ("ID", "Parent", self._gff_gene_name_attribute)
379+
df_features = self.genome_features(attributes=attributes)
380+
debug(f"Loaded {len(df_features)} genome features")
381+
382+
# Filter for genes
383+
df_genes = df_features[df_features["type"] == self._gff_gene_type]
384+
name_attr = self._gff_gene_name_attribute
385+
386+
# Normalize input: strip whitespace
387+
gene_normalized = gene.strip()
388+
389+
# Reject empty identifiers after normalization to avoid ambiguous matches
390+
if not gene_normalized:
391+
raise ValueError(
392+
"Gene identifier is empty after stripping whitespace; please provide a valid gene ID or name."
393+
)
394+
# Try exact ID match first (case-sensitive)
395+
debug(f"Attempting ID match for '{gene_normalized}'")
396+
gene_id_match = df_genes[df_genes["ID"].str.strip() == gene_normalized]
397+
398+
if len(gene_id_match) == 1:
399+
gene_id = gene_id_match.iloc[0]["ID"]
400+
debug(f"Found ID match: {gene_id}")
401+
elif len(gene_id_match) > 1:
402+
# This should not happen (ID should be unique), but handling gracefully
403+
raise ValueError(
404+
f"Multiple features with ID '{gene}' found (data integrity issue)"
405+
)
406+
else:
407+
# Trying name match (case-insensitive with whitespace handling)
408+
debug("No ID match, attempting name match")
409+
gene_name_match = df_genes[
410+
df_genes[name_attr].fillna("").str.strip().str.lower()
411+
== gene_normalized.lower()
412+
]
413+
414+
if len(gene_name_match) == 0:
415+
raise ValueError(f"Gene '{gene}' not found (no matching ID or name)")
416+
elif len(gene_name_match) > 1:
417+
# Suggest which genes matched for better debugging
418+
matching_ids = ", ".join(gene_name_match["ID"].values)
419+
raise ValueError(
420+
f"Gene name '{gene}' is ambiguous (matches {len(gene_name_match)} genes: {matching_ids}). "
421+
f"Please use a specific gene ID instead."
422+
)
423+
424+
gene_id = gene_name_match.iloc[0]["ID"]
425+
debug(f"Found name match: {gene_id}")
426+
427+
# Get transcripts for the gene
428+
debug(f"Finding transcripts for gene '{gene_id}'")
429+
df_transcripts = self.genome_feature_children(
430+
parent=gene_id, attributes=("ID",)
431+
)
432+
df_transcripts = df_transcripts[df_transcripts["type"] == "mRNA"]
433+
434+
if len(df_transcripts) == 0:
435+
raise ValueError(f"Gene '{gene}' has no transcripts")
436+
437+
debug(f"Found {len(df_transcripts)} transcripts for gene {gene_id}")
438+
439+
# Calculate transcript lengths and find canonical
440+
debug("Calculating transcript lengths for each transcript")
441+
transcript_lengths = {}
442+
443+
for transcript_id in df_transcripts["ID"]:
444+
# Get all exon children (genome_feature_children handles multi-parent exons)
445+
df_exons = self.genome_feature_children(
446+
parent=transcript_id, attributes=None
447+
)
448+
# Filter for exons only (important: exclude other feature types)
449+
df_exons = df_exons[df_exons["type"] == "exon"].sort_values("start")
450+
451+
if len(df_exons) > 0:
452+
# Calculate total transcribed length (1-based inclusive coordinates)
453+
exon_lengths = (df_exons["end"] - df_exons["start"] + 1).sum()
454+
transcript_lengths[transcript_id] = exon_lengths
455+
debug(f" {transcript_id}: {len(df_exons)} exons, {exon_lengths} bp")
456+
if not transcript_lengths:
457+
raise ValueError(f"Gene '{gene}' has no transcripts with exons")
458+
459+
# Find canonical (maximum length)
460+
canonical = max(transcript_lengths, key=lambda tid: transcript_lengths[tid])
461+
canonical_length = transcript_lengths[canonical]
462+
debug(f"Canonical transcript: {canonical} with {canonical_length} bp")
463+
return canonical
464+
317465
@_check_types
318466
@doc(
319467
summary="Plot a genes track, using bokeh.",

0 commit comments

Comments
 (0)