@@ -252,3 +252,135 @@ def test_genome_features_virtual_contigs(ag3_sim_api, chrom):
252252 assert isinstance (df , pd .DataFrame )
253253 if len (df ) > 0 :
254254 assert df ["contig" ].unique () == region .split (":" )[0 ]
255+
256+
257+ # =============================================================================
258+ # Tests for canonical_transcript functionality
259+ # =============================================================================
260+
261+
@parametrize_with_cases("fixture,api", cases=".")
def test_canonical_transcript_by_id(fixture, api: AnophelesGenomeFeaturesData):
    """A gene ID taken from the fixture resolves to a non-empty string."""
    gene_query = f"type == '{api._gff_gene_type}'"
    gene_rows = api.genome_features().query(gene_query)
    if not len(gene_rows):
        pytest.skip("No genes available in fixture")

    # Use the first gene row and look it up by its "ID" value.
    first_gene = gene_rows.iloc[0]
    result = api.canonical_transcript(first_gene["ID"])

    assert isinstance(result, str)
    assert len(result) > 0
273+
274+
@parametrize_with_cases("fixture,api", cases=".")
def test_canonical_transcript_by_name(fixture, api: AnophelesGenomeFeaturesData):
    """A gene name taken from the fixture resolves to a non-empty string."""
    gene_query = f"type == '{api._gff_gene_type}'"
    gene_rows = api.genome_features().query(gene_query)
    if not len(gene_rows):
        pytest.skip("No genes available in fixture")

    # The column holding the gene name is configured on the API object.
    name_column = api._gff_gene_name_attribute
    first_gene = gene_rows.iloc[0]
    result = api.canonical_transcript(first_gene[name_column])

    assert isinstance(result, str)
    assert len(result) > 0
286+
287+
@parametrize_with_cases("fixture,api", cases=".")
def test_canonical_transcript_invalid_gene(fixture, api: AnophelesGenomeFeaturesData):
    """An identifier matching no gene raises ValueError mentioning "not found"."""
    unknown_gene = "NONEXISTENT_GENE_ID_12345"
    with pytest.raises(ValueError, match="not found"):
        api.canonical_transcript(unknown_gene)
293+
294+
@parametrize_with_cases("fixture,api", cases=".")
def test_canonical_transcript_empty_string(fixture, api: AnophelesGenomeFeaturesData):
    """Passing an empty gene identifier raises ValueError."""
    blank = ""
    with pytest.raises(ValueError):
        api.canonical_transcript(blank)
300+
301+
@parametrize_with_cases("fixture,api", cases=".")
def test_canonical_transcript_whitespace_handling(
    fixture, api: AnophelesGenomeFeaturesData
):
    """Look up the first gene by its ID and expect a string result.

    NOTE(review): the ID is passed exactly as stored in the fixture, with no
    leading or trailing whitespace added, so this test does not currently
    exercise any whitespace handling. Confirm the intended behaviour of
    ``canonical_transcript`` for padded input before extending it.
    """
    gene_query = f"type == '{api._gff_gene_type}'"
    gene_rows = api.genome_features().query(gene_query)
    if not len(gene_rows):
        pytest.skip("No genes available in fixture")

    first_gene = gene_rows.iloc[0]
    result = api.canonical_transcript(first_gene["ID"])

    assert isinstance(result, str)
314+
315+
@parametrize_with_cases("fixture,api", cases=".")
def test_canonical_transcript_case_insensitive(
    fixture, api: AnophelesGenomeFeaturesData
):
    """A lower-cased gene name still resolves to a string result."""
    gene_query = f"type == '{api._gff_gene_type}'"
    gene_rows = api.genome_features().query(gene_query)
    if not len(gene_rows):
        pytest.skip("No genes available in fixture")

    # Lower-case the first gene's name before handing it to the lookup.
    name_column = api._gff_gene_name_attribute
    lowered_name = gene_rows.iloc[0][name_column].lower()
    result = api.canonical_transcript(lowered_name)

    assert isinstance(result, str)
329+
330+
@parametrize_with_cases("fixture,api", cases=".")
def test_canonical_transcript_single_transcript_gene(
    fixture, api: AnophelesGenomeFeaturesData
):
    """A gene with exactly one mRNA child reports that child as canonical."""
    gene_query = f"type == '{api._gff_gene_type}'"
    gene_rows = api.genome_features().query(gene_query)
    if not len(gene_rows):
        pytest.skip("No genes available in fixture")

    # Scan genes in order and stop at the first one with a single mRNA child.
    found = None
    for candidate_id in gene_rows["ID"]:
        children = api.genome_feature_children(parent=candidate_id)
        mrna_rows = children[children["type"] == "mRNA"]
        if len(mrna_rows) != 1:
            continue
        found = (candidate_id, mrna_rows)
        break

    if found is None:
        pytest.skip("No gene with exactly one transcript available in fixture")

    gene_id, mrna_rows = found
    canonical = api.canonical_transcript(gene_id)
    assert canonical == mrna_rows.iloc[0]["ID"]
350+
351+
@parametrize_with_cases("fixture,api", cases=".")
def test_canonical_transcript_calculation_correctness(
    fixture, api: AnophelesGenomeFeaturesData
):
    """Test that the returned transcript has the highest exon length.

    The summed exon length of every mRNA child of the first gene is
    recomputed via ``genome_feature_children`` and the transcript returned
    by ``canonical_transcript`` is required to attain the maximum.

    The check compares lengths rather than one expected ID, so it does not
    depend on which transcript is chosen when several share the maximum.
    It also fails with an explicit message, instead of comparing against
    ``None``, when the returned ID is not among the gene's mRNA children.
    """
    genes = api.genome_features().query(f"type == '{api._gff_gene_type}'")
    if len(genes) == 0:
        pytest.skip("No genes available in fixture")

    gene_id = genes.iloc[0]["ID"]
    canonical = api.canonical_transcript(gene_id)

    # Collect the IDs of the gene's mRNA children.
    children = api.genome_feature_children(parent=gene_id)
    transcript_ids = children.loc[children["type"] == "mRNA", "ID"]

    # Recompute the summed exon length of each transcript by hand.
    exon_lengths = {}
    for transcript_id in transcript_ids:
        exons = api.genome_feature_children(parent=transcript_id)
        exons = exons[exons["type"] == "exon"]
        # end - start + 1: both endpoint coordinates count towards the length.
        exon_lengths[transcript_id] = int((exons["end"] - exons["start"] + 1).sum())

    assert canonical in exon_lengths, (
        f"canonical transcript {canonical!r} is not an mRNA child of {gene_id!r}"
    )

    # The canonical transcript must reach the maximum length (ties allowed).
    assert exon_lengths[canonical] == max(exon_lengths.values())
0 commit comments