@@ -71,13 +71,15 @@ def pca(
7171 cohort_size : Optional [base_params .cohort_size ] = None ,
7272 min_cohort_size : Optional [base_params .min_cohort_size ] = None ,
7373 max_cohort_size : Optional [base_params .max_cohort_size ] = None ,
74+ exclude_samples : Optional [base_params .samples ] = None ,
75+ fit_exclude_samples : Optional [base_params .samples ] = None ,
7476 random_seed : base_params .random_seed = 42 ,
7577 inline_array : base_params .inline_array = base_params .inline_array_default ,
7678 chunks : base_params .chunks = base_params .chunks_default ,
7779 ) -> Tuple [pca_params .df_pca , pca_params .evr ]:
7880 # Change this name if you ever change the behaviour of this function, to
7981 # invalidate any previously cached data.
80- name = "pca_v3 "
82+ name = "pca_v4 "
8183
8284 # Normalize params for consistent hash value.
8385 (
@@ -104,6 +106,8 @@ def pca(
104106 cohort_size = cohort_size ,
105107 min_cohort_size = min_cohort_size ,
106108 max_cohort_size = max_cohort_size ,
109+ exclude_samples = exclude_samples ,
110+ fit_exclude_samples = fit_exclude_samples ,
107111 random_seed = random_seed ,
108112 )
109113
@@ -119,11 +123,11 @@ def pca(
119123 coords = results ["coords" ]
120124 evr = results ["evr" ]
121125 samples = results ["samples" ]
126+ loc_keep_fit = results ["loc_keep_fit" ]
122127
123128 # Load sample metadata.
124129 df_samples = self .sample_metadata (
125130 sample_sets = sample_sets ,
126- sample_indices = sample_indices_prepped ,
127131 )
128132
129133 # Ensure aligned with genotype data.
@@ -134,6 +138,8 @@ def pca(
134138 {f"PC{ i + 1 } " : coords [:, i ] for i in range (coords .shape [1 ])}
135139 )
136140 df_pca = df_samples .join (df_coords , how = "inner" )
141+ # Add a column for which samples were included in fitting.
142+ df_pca ["pca_fit" ] = loc_keep_fit
137143
138144 return df_pca , evr
139145
@@ -153,6 +159,8 @@ def _pca(
153159 cohort_size ,
154160 min_cohort_size ,
155161 max_cohort_size ,
162+ exclude_samples ,
163+ fit_exclude_samples ,
156164 random_seed ,
157165 chunks ,
158166 inline_array ,
@@ -177,12 +185,39 @@ def _pca(
177185 )
178186
179187 with self ._spinner (desc = "Compute PCA" ):
188+ # Exclude any samples prior to computing PCA.
189+ if exclude_samples is not None :
190+ x = np .array (exclude_samples , dtype = "U" )
191+ loc_keep = ~ np .isin (samples , x )
192+ samples = samples [loc_keep ]
193+ gn = gn [:, loc_keep ]
194+
195+ # Exclude any samples from fitting only.
196+ if fit_exclude_samples is not None :
197+ xf = np .array (fit_exclude_samples , dtype = "U" )
198+ loc_keep_fit = ~ np .isin (samples , xf )
199+ gn_fit = gn [:, loc_keep_fit ]
200+ else :
201+ loc_keep_fit = np .ones (len (samples ), dtype = bool )
202+ gn_fit = gn
203+
180204 # Remove any sites where all genotypes are identical.
181- loc_var = np .any (gn != gn [:, 0 , np .newaxis ], axis = 1 )
205+ loc_var = np .any (gn_fit != gn_fit [:, 0 , np .newaxis ], axis = 1 )
206+ gn_fit_var = np .compress (loc_var , gn_fit , axis = 0 )
182207 gn_var = np .compress (loc_var , gn , axis = 0 )
183208
184209 # Run the PCA.
185- coords , model = allel .pca (gn_var , n_components = n_components )
210+ if fit_exclude_samples is None :
211+ # Simple fit and transform on the same data.
212+ coords , model = allel .pca (gn_fit_var , n_components = n_components )
213+
214+ else :
215+ # Fit and transform separately.
216+ model = allel .stats .decomposition .GenotypePCA (
217+ n_components = n_components ,
218+ )
219+ model .fit (gn_fit_var )
220+ coords = model .transform (gn_var , copy = False )
186221
187222 # Work around sign indeterminacy.
188223 for i in range (coords .shape [1 ]):
@@ -191,7 +226,10 @@ def _pca(
191226 coords [:, i ] = c * - 1
192227
193228 results = dict (
194- samples = samples , coords = coords , evr = model .explained_variance_ratio_
229+ samples = samples ,
230+ coords = coords ,
231+ evr = model .explained_variance_ratio_ ,
232+ loc_keep_fit = loc_keep_fit ,
195233 )
196234 return results
197235
0 commit comments