@@ -888,14 +888,24 @@ def _gene_cnv(
888888 chunks ,
889889 inline_array ,
890890 ):
891- debug = self ._log .debug
892-
893- debug ("sanity check" )
891+ # Sanity check.
894892 assert isinstance (region , Region )
895893
896- debug ("access HMM data" )
894+ # Access genes within the region of interest.
895+ df_genome_features = self .genome_features (region = region )
896+ sample_query_options = sample_query_options or {}
897+ df_genes = df_genome_features .query (
898+ f"type == '{ self ._gff_gene_type } '" , ** sample_query_options
899+ )
900+
901+ # Refine the region for CNV data to ensure coverage of all requested genes.
902+ cnv_region = Region (
903+ region .contig , df_genes ["start" ].min (), df_genes ["end" ].max ()
904+ )
905+
906+ # Access HMM data.
897907 ds_hmm = self .cnv_hmm (
898- region = region . contig ,
908+ region = cnv_region ,
899909 sample_sets = sample_sets ,
900910 sample_query = sample_query ,
901911 sample_query_options = sample_query_options ,
@@ -909,45 +919,38 @@ def _gene_cnv(
909919 with self ._dask_progress (desc = "Load CNV HMM data" ):
910920 pos , end , cn = dask .compute (pos , end , cn )
911921
912- debug ("access genes" )
913- df_genome_features = self .genome_features (region = region )
914- sample_query_options = sample_query_options or {}
915- df_genes = df_genome_features .query (
916- f"type == '{ self ._gff_gene_type } '" , ** sample_query_options
917- )
918-
919- debug ("setup intermediates" )
922+ # Set up intermediates.
920923 windows = []
921924 modes = []
922925 counts = []
923926
924- debug ( "iterate over genes" )
927+ # Iterate over genes.
925928 genes_iterator = self ._progress (
926929 df_genes .itertuples (),
927930 desc = "Compute modal gene copy number" ,
928931 total = len (df_genes ),
929932 )
930933 for gene in genes_iterator :
931- # locate windows overlapping the gene
934+ # Locate windows overlapping the gene.
932935 loc_gene_start = bisect_left (end , gene .start )
933936 loc_gene_stop = bisect_right (pos , gene .end )
934937 w = loc_gene_stop - loc_gene_start
935938 windows .append (w )
936939
937- # slice out copy number data for the given gene
940+ # Slice out copy number data for the given gene.
938941 cn_gene = cn [loc_gene_start :loc_gene_stop ]
939942
940- # compute the modes
943+ # Compute the modes.
941944 m , c = _cn_mode (cn_gene , vmax = 12 )
942945 modes .append (m )
943946 counts .append (c )
944947
945- debug ( "combine results" )
948+ # Combine results.
946949 windows = np .array (windows )
947950 modes = np .vstack (modes )
948951 counts = np .vstack (counts )
949952
950- debug ( "build dataset" )
953+ # Build dataset.
951954 ds_out = xr .Dataset (
952955 coords = {
953956 "gene_id" : (["genes" ], df_genes ["ID" ].values ),
@@ -1182,6 +1185,11 @@ def _gene_cnv_frequencies(
11821185
11831186 freq_cols [f"frq_{ coh } " ] = np .concatenate ([amp_freq_coh , del_freq_coh ])
11841187
1188+ if len (coh_dict ) == 0 :
1189+ raise ValueError (
1190+ "No cohorts available for the given sample selection parameters and minimum cohort size."
1191+ )
1192+
11851193 debug ("build a dataframe with the frequency columns" )
11861194 df_freqs = pd .DataFrame (freq_cols )
11871195
0 commit comments