@@ -253,6 +253,13 @@ pub struct SelectivityTracker {
253253 /// During collection, all filters go to post-scan for accurate measurement.
254254 /// Default: 10_000
255255 min_rows_for_collection : u64 ,
256+ /// Fraction of total dataset rows for collection phase (0.0 = disabled).
257+ /// When > 0 and dataset size is known, effective threshold =
258+ /// max(min_rows_for_collection, (fraction * total_rows) as u64).
259+ collection_fraction : f64 ,
260+ /// Resolved minimum rows after notify_dataset_rows() is called.
261+ /// None = not yet resolved (use min_rows_for_collection as-is).
262+ resolved_min_rows : Option < u64 > ,
256263}
257264
258265impl Default for SelectivityTracker {
@@ -275,6 +282,8 @@ impl SelectivityTracker {
275282 min_bytes_per_sec,
276283 correlation_threshold : 1.5 ,
277284 min_rows_for_collection : 10_000 ,
285+ collection_fraction : 0.0 ,
286+ resolved_min_rows : None ,
278287 }
279288 }
280289
@@ -283,13 +292,16 @@ impl SelectivityTracker {
283292 min_bytes_per_sec : f64 ,
284293 correlation_threshold : f64 ,
285294 min_rows_for_collection : u64 ,
295+ collection_fraction : f64 ,
286296 ) -> Self {
287297 Self {
288298 stats : HashMap :: new ( ) ,
289299 correlations : HashMap :: new ( ) ,
290300 min_bytes_per_sec,
291301 correlation_threshold,
292302 min_rows_for_collection,
303+ collection_fraction,
304+ resolved_min_rows : None ,
293305 }
294306 }
295307
@@ -298,6 +310,25 @@ impl SelectivityTracker {
298310 self . min_bytes_per_sec
299311 }
300312
313+ /// Returns the effective minimum rows for collection, taking into account
314+ /// the fraction-based threshold if it has been resolved.
315+ fn effective_min_rows ( & self ) -> u64 {
316+ self . resolved_min_rows . unwrap_or ( self . min_rows_for_collection )
317+ }
318+
319+ /// Notify the tracker of the total dataset row count so the fraction-based
320+ /// threshold can be resolved.
321+ ///
322+ /// When `collection_fraction > 0`, computes:
323+ /// `resolved_min_rows = max(min_rows_for_collection, (fraction * total_rows) as u64)`
324+ pub fn notify_dataset_rows ( & mut self , total_rows : u64 ) {
325+ if self . collection_fraction > 0.0 {
326+ let fraction_rows = ( self . collection_fraction * total_rows as f64 ) as u64 ;
327+ self . resolved_min_rows =
328+ Some ( self . min_rows_for_collection . max ( fraction_rows) ) ;
329+ }
330+ }
331+
301332 /// Get the effectiveness for a filter expression, if known.
302333 pub fn get_effectiveness ( & self , expr : & Arc < dyn PhysicalExpr > ) -> Option < f64 > {
303334 let key = ExprKey :: new ( expr) ;
@@ -314,17 +345,16 @@ impl SelectivityTracker {
314345 /// Returns false if no stats exist yet (no filters registered) or if
315346 /// min_rows_for_collection is 0 (collection disabled).
316347 pub fn in_collection_phase ( & self ) -> bool {
317- if self . min_rows_for_collection == 0 {
348+ let min_rows = self . effective_min_rows ( ) ;
349+ if min_rows == 0 {
318350 return false ;
319351 }
320352 if self . stats . is_empty ( ) {
321353 // No filters registered yet - treat as collection phase
322354 // so the first file's filters go to post-scan for measurement
323355 return true ;
324356 }
325- self . stats
326- . values ( )
327- . any ( |s| s. rows_total < self . min_rows_for_collection )
357+ self . stats . values ( ) . any ( |s| s. rows_total < min_rows)
328358 }
329359
330360 /// Partition filters into row_filters and post_scan based on bytes/sec throughput.
@@ -498,16 +528,15 @@ impl SelectivityTracker {
498528 let stats_b = self . stats . get ( & key_b) ?;
499529
500530 // Need sufficient data
501- if stats_a. rows_total < self . min_rows_for_collection
502- || stats_b. rows_total < self . min_rows_for_collection
503- {
531+ let min_rows = self . effective_min_rows ( ) ;
532+ if stats_a. rows_total < min_rows || stats_b. rows_total < min_rows {
504533 return None ;
505534 }
506535
507536 let pair_key = PairKey :: new ( & key_a, & key_b) ;
508537 let pair_stats = self . correlations . get ( & pair_key) ?;
509538
510- if pair_stats. rows_total < self . min_rows_for_collection {
539+ if pair_stats. rows_total < min_rows {
511540 return None ;
512541 }
513542
@@ -890,7 +919,7 @@ mod tests {
890919
891920 #[ test]
892921 fn test_correlation_stats_update ( ) {
893- let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 ) ;
922+ let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 , 0.0 ) ;
894923
895924 let filter_a = make_filter ( "a" , 5 ) ;
896925 let filter_b = make_filter ( "a" , 10 ) ;
@@ -916,7 +945,7 @@ mod tests {
916945
917946 #[ test]
918947 fn test_correlation_ratio_independent ( ) {
919- let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 ) ;
948+ let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 , 0.0 ) ;
920949
921950 let filter_a = make_filter ( "a" , 5 ) ;
922951 let filter_b = make_filter ( "a" , 10 ) ;
@@ -932,7 +961,7 @@ mod tests {
932961
933962 #[ test]
934963 fn test_correlation_ratio_insufficient_data ( ) {
935- let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 1000 ) ;
964+ let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 1000 , 0.0 ) ;
936965
937966 let filter_a = make_filter ( "a" , 5 ) ;
938967 let filter_b = make_filter ( "a" , 10 ) ;
@@ -947,7 +976,7 @@ mod tests {
947976
948977 #[ test]
949978 fn test_in_collection_phase ( ) {
950- let mut tracker = SelectivityTracker :: new_with_config ( 100.0 , 1.5 , 1000 ) ;
979+ let mut tracker = SelectivityTracker :: new_with_config ( 100.0 , 1.5 , 1000 , 0.0 ) ;
951980
952981 // No stats yet - in collection phase
953982 assert ! ( tracker. in_collection_phase( ) ) ;
@@ -966,15 +995,15 @@ mod tests {
966995
#[test]
fn test_in_collection_phase_disabled() {
    // A minimum of 0 rows, with the fraction left at 0.0, disables collection.
    let disabled = SelectivityTracker::new_with_config(100.0, 1.5, 0, 0.0);

    let collecting = disabled.in_collection_phase();
    assert!(!collecting);
}
9741003
9751004 #[ test]
9761005 fn test_partition_filters_grouped_collection_phase ( ) {
977- let tracker = SelectivityTracker :: new_with_config ( 100.0 , 1.5 , 10_000 ) ;
1006+ let tracker = SelectivityTracker :: new_with_config ( 100.0 , 1.5 , 10_000 , 0.0 ) ;
9781007
9791008 let filter_a = make_filter ( "a" , 5 ) ;
9801009 let filter_b = make_filter ( "a" , 10 ) ;
@@ -991,7 +1020,7 @@ mod tests {
9911020
9921021 #[ test]
9931022 fn test_partition_filters_grouped_all_independent ( ) {
994- let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 ) ;
1023+ let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 , 0.0 ) ;
9951024
9961025 let filter_a = make_filter ( "a" , 5 ) ;
9971026 let filter_b = make_filter ( "a" , 10 ) ;
@@ -1018,7 +1047,7 @@ mod tests {
10181047
10191048 #[ test]
10201049 fn test_partition_filters_grouped_correlated ( ) {
1021- let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 ) ;
1050+ let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 , 0.0 ) ;
10221051
10231052 let filter_a = make_filter ( "a" , 5 ) ;
10241053 let filter_b = make_filter ( "a" , 10 ) ;
@@ -1043,7 +1072,7 @@ mod tests {
10431072
10441073 #[ test]
10451074 fn test_partition_filters_grouped_mixed ( ) {
1046- let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 ) ;
1075+ let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 , 0.0 ) ;
10471076
10481077 let filter_a = make_filter ( "a" , 5 ) ;
10491078 let filter_b = make_filter ( "a" , 10 ) ;
@@ -1096,7 +1125,7 @@ mod tests {
10961125
10971126 #[ test]
10981127 fn test_partition_filters_grouped_single_filter ( ) {
1099- let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 ) ;
1128+ let mut tracker = SelectivityTracker :: new_with_config ( 0.0 , 1.5 , 100 , 0.0 ) ;
11001129
11011130 let filter_a = make_filter ( "a" , 5 ) ;
11021131 tracker. update ( & filter_a, 10 , 100 , 0 ) ;
@@ -1113,7 +1142,7 @@ mod tests {
11131142 #[ test]
11141143 fn test_partition_filters_grouped_with_low_throughput ( ) {
11151144 // Use a bytes/sec threshold: 100 bytes/sec
1116- let mut tracker = SelectivityTracker :: new_with_config ( 100.0 , 1.5 , 100 ) ;
1145+ let mut tracker = SelectivityTracker :: new_with_config ( 100.0 , 1.5 , 100 , 0.0 ) ;
11171146
11181147 let filter_a = make_filter ( "a" , 5 ) ;
11191148 let filter_b = make_filter ( "a" , 10 ) ;
@@ -1136,4 +1165,53 @@ mod tests {
11361165 assert_eq ! ( result. row_filter_groups[ 0 ] . len( ) , 1 ) ;
11371166 assert_eq ! ( result. post_scan. len( ) , 1 ) ;
11381167 }
1168+
#[test]
fn test_notify_dataset_rows_resolves_fraction() {
    // Configured minimum of 100 rows with a 5% collection fraction.
    let mut sel = SelectivityTracker::new_with_config(0.0, 1.5, 100, 0.05);

    // Until the dataset size is reported, the configured minimum applies.
    let before = sel.effective_min_rows();
    assert_eq!(before, 100);

    // 5% of 10_000 rows is 500, which exceeds the configured minimum of 100.
    sel.notify_dataset_rows(10_000);
    let after = sel.effective_min_rows();
    assert_eq!(after, 500);
}
1180+
#[test]
fn test_notify_dataset_rows_floor_behavior() {
    // The configured minimum (1000) acts as a floor: 5% of 10_000 rows is
    // only 500, so the larger configured value must win.
    let mut sel = SelectivityTracker::new_with_config(0.0, 1.5, 1000, 0.05);
    sel.notify_dataset_rows(10_000);

    let threshold = sel.effective_min_rows();
    assert_eq!(threshold, 1000);
}
1189+
#[test]
fn test_notify_dataset_rows_fraction_disabled() {
    // With a fraction of 0.0 the notification must not resolve a new
    // threshold, so the configured minimum of 100 stays in effect.
    let mut sel = SelectivityTracker::new_with_config(0.0, 1.5, 100, 0.0);
    sel.notify_dataset_rows(1_000_000);

    let threshold = sel.effective_min_rows();
    assert_eq!(threshold, 100);
}
1198+
#[test]
fn test_collection_phase_with_fraction() {
    let mut sel = SelectivityTracker::new_with_config(0.0, 1.5, 100, 0.05);
    let expr = make_filter("a", 5);

    // 5% of 100_000 rows resolves the threshold to 5000.
    sel.notify_dataset_rows(100_000);
    assert_eq!(sel.effective_min_rows(), 5000);

    // After recording 200 rows the tracker is still collecting (200 < 5000).
    sel.update(&expr, 80, 200, 10_000_000);
    let still_collecting = sel.in_collection_phase();
    assert!(still_collecting);

    // Recording another 5000 rows passes the threshold and ends collection.
    sel.update(&expr, 1000, 5000, 50_000_000);
    let still_collecting = sel.in_collection_phase();
    assert!(!still_collecting);
}
11391217}
0 commit comments