@@ -23,9 +23,9 @@ class AnophelesGenomeFeaturesData(AnophelesGenomeSequenceData):
2323 def __init__ (
2424 self ,
2525 * ,
26- gff_gene_type : str ,
27- gff_gene_name_attribute : str ,
28- gff_default_attributes : Tuple [str , ...],
26+ gff_gene_type : Optional [ str ] = None ,
27+ gff_gene_name_attribute : Optional [ str ] = None ,
28+ gff_default_attributes : Optional [ Tuple [str , ...]] = None ,
2929 gene_names : Optional [Mapping [str , str ]] = None ,
3030 ** kwargs ,
3131 ):
@@ -34,16 +34,30 @@ def __init__(
3434 # to the superclass constructor.
3535 super ().__init__ (** kwargs )
3636
37- # TODO Consider moving these parameters to configuration, as they could
38- # change if the GFF ever changed.
39- self ._gff_gene_type = gff_gene_type
40- self ._gff_gene_name_attribute = gff_gene_name_attribute
41- self ._gff_default_attributes = gff_default_attributes
37+ # Read GFF parameters from the JSON config, falling back to
38+ # constructor args for backward compatibility.
39+ config = self .config
40+ self ._gff_gene_type = config .get ("GFF_GENE_TYPE" , gff_gene_type or "gene" )
41+ self ._gff_gene_name_attribute = config .get (
42+ "GFF_GENE_NAME_ATTRIBUTE" , gff_gene_name_attribute or "Name"
43+ )
44+ _default_attrs = config .get ("GFF_DEFAULT_ATTRIBUTES" , None )
45+ if _default_attrs is not None :
46+ self ._gff_default_attributes = tuple (_default_attrs )
47+ elif gff_default_attributes is not None :
48+ self ._gff_default_attributes = gff_default_attributes
49+ else :
50+ self ._gff_default_attributes = ("ID" , "Parent" , "Name" , "description" )
4251
4352 # Allow manual override of gene names.
44- if gene_names is None :
45- gene_names = dict ()
46- self ._gene_name_overrides = gene_names
53+ # Read from config if available, falling back to constructor arg.
54+ _gene_names = config .get ("GENE_NAMES" , None )
55+ if _gene_names is not None :
56+ self ._gene_name_overrides = _gene_names
57+ elif gene_names is not None :
58+ self ._gene_name_overrides = gene_names
59+ else :
60+ self ._gene_name_overrides = dict ()
4761
4862 # Setup caches.
4963 self ._cache_genome_features : Dict [Tuple [str , ...], pd .DataFrame ] = dict ()
@@ -314,6 +328,140 @@ def plot_transcript(
314328 bokeh .plotting .show (fig )
315329 return fig
316330
331+ @_check_types
332+ @doc (
333+ summary = "Get the canonical transcript for a gene." ,
334+ returns = """
335+ The transcript ID for the canonical transcript of the specified gene.
336+ The canonical transcript is the one with the highest number of
337+ transcribed base pairs (sum of exon lengths).
338+ """ ,
339+ )
340+ def canonical_transcript (
341+ self ,
342+ gene : base_params .gene ,
343+ ) -> str :
344+ """
345+ Parameters
346+ ----------
347+ gene : str
348+ A gene identifier. Can be either a gene ID or gene name.
349+
350+ Returns
351+ -------
352+ str
353+ The transcript ID of the canonical transcript.
354+
355+ Raises
356+ ------
357+ ValueError
358+ If the gene identifier is not found or if the gene has no transcripts.
359+
360+ Examples
361+ --------
362+ Get the canonical transcript for a gene by ID:
363+
364+ >>> import malariagen_data
365+ >>> ag3 = malariagen_data.ag3(pre=False)
366+ >>> canonical = ag3.canonical_transcript("AGAP004707")
367+
368+ Get the canonical transcript for a gene by name:
369+
370+ >>> canonical = ag3.canonical_transcript("Pvr")
371+ """
372+ debug = self ._log .debug
373+ debug (f"Looking up canonical transcript for gene '{ gene } '" )
374+
375+ # Load genome features once with required attributes
376+ with self ._spinner (desc = "Load gene data" ):
377+ # Load required attributes (ordered for consistency with GFF3)
378+ attributes = ("ID" , "Parent" , self ._gff_gene_name_attribute )
379+ df_features = self .genome_features (attributes = attributes )
380+ debug (f"Loaded { len (df_features )} genome features" )
381+
382+ # Filter for genes
383+ df_genes = df_features [df_features ["type" ] == self ._gff_gene_type ]
384+ name_attr = self ._gff_gene_name_attribute
385+
386+ # Normalize input: strip whitespace
387+ gene_normalized = gene .strip ()
388+
389+ # Reject empty identifiers after normalization to avoid ambiguous matches
390+ if not gene_normalized :
391+ raise ValueError (
392+ "Gene identifier is empty after stripping whitespace; please provide a valid gene ID or name."
393+ )
394+ # Try exact ID match first (case-sensitive)
395+ debug (f"Attempting ID match for '{ gene_normalized } '" )
396+ gene_id_match = df_genes [df_genes ["ID" ].str .strip () == gene_normalized ]
397+
398+ if len (gene_id_match ) == 1 :
399+ gene_id = gene_id_match .iloc [0 ]["ID" ]
400+ debug (f"Found ID match: { gene_id } " )
401+ elif len (gene_id_match ) > 1 :
402+ # This should not happen (ID should be unique), but handling gracefully
403+ raise ValueError (
404+ f"Multiple features with ID '{ gene } ' found (data integrity issue)"
405+ )
406+ else :
407+ # Trying name match (case-insensitive with whitespace handling)
408+ debug ("No ID match, attempting name match" )
409+ gene_name_match = df_genes [
410+ df_genes [name_attr ].fillna ("" ).str .strip ().str .lower ()
411+ == gene_normalized .lower ()
412+ ]
413+
414+ if len (gene_name_match ) == 0 :
415+ raise ValueError (f"Gene '{ gene } ' not found (no matching ID or name)" )
416+ elif len (gene_name_match ) > 1 :
417+ # Suggest which genes matched for better debugging
418+ matching_ids = ", " .join (gene_name_match ["ID" ].values )
419+ raise ValueError (
420+ f"Gene name '{ gene } ' is ambiguous (matches { len (gene_name_match )} genes: { matching_ids } ). "
421+ f"Please use a specific gene ID instead."
422+ )
423+
424+ gene_id = gene_name_match .iloc [0 ]["ID" ]
425+ debug (f"Found name match: { gene_id } " )
426+
427+ # Get transcripts for the gene
428+ debug (f"Finding transcripts for gene '{ gene_id } '" )
429+ df_transcripts = self .genome_feature_children (
430+ parent = gene_id , attributes = ("ID" ,)
431+ )
432+ df_transcripts = df_transcripts [df_transcripts ["type" ] == "mRNA" ]
433+
434+ if len (df_transcripts ) == 0 :
435+ raise ValueError (f"Gene '{ gene } ' has no transcripts" )
436+
437+ debug (f"Found { len (df_transcripts )} transcripts for gene { gene_id } " )
438+
439+ # Calculate transcript lengths and find canonical
440+ debug ("Calculating transcript lengths for each transcript" )
441+ transcript_lengths = {}
442+
443+ for transcript_id in df_transcripts ["ID" ]:
444+ # Get all exon children (genome_feature_children handles multi-parent exons)
445+ df_exons = self .genome_feature_children (
446+ parent = transcript_id , attributes = None
447+ )
448+ # Filter for exons only (important: exclude other feature types)
449+ df_exons = df_exons [df_exons ["type" ] == "exon" ].sort_values ("start" )
450+
451+ if len (df_exons ) > 0 :
452+ # Calculate total transcribed length (1-based inclusive coordinates)
453+ exon_lengths = (df_exons ["end" ] - df_exons ["start" ] + 1 ).sum ()
454+ transcript_lengths [transcript_id ] = exon_lengths
455+ debug (f" { transcript_id } : { len (df_exons )} exons, { exon_lengths } bp" )
456+ if not transcript_lengths :
457+ raise ValueError (f"Gene '{ gene } ' has no transcripts with exons" )
458+
459+ # Find canonical (maximum length)
460+ canonical = max (transcript_lengths , key = lambda tid : transcript_lengths [tid ])
461+ canonical_length = transcript_lengths [canonical ]
462+ debug (f"Canonical transcript: { canonical } with { canonical_length } bp" )
463+ return canonical
464+
317465 @_check_types
318466 @doc (
319467 summary = "Plot a genes track, using bokeh." ,
0 commit comments