@@ -42,6 +42,7 @@ use datafusion_expr::{Expr, Volatility};
4242use datafusion_physical_expr:: create_physical_expr;
4343use object_store:: path:: Path ;
4444use object_store:: { ObjectMeta , ObjectStore } ;
45+ use percent_encoding:: percent_decode_str;
4546
4647/// Check whether the given expression can be resolved using only the columns `col_names`.
4748/// This means that if this function returns true:
@@ -418,12 +419,15 @@ pub async fn pruned_partition_list<'a>(
418419}
419420
420421/// Extract the partition values for the given `file_path` (in the given `table_path`)
421- /// associated to the partitions defined by `table_partition_cols`
422+ /// associated to the partitions defined by `table_partition_cols`.
423+ ///
424+ /// Partition values are URL-decoded, since object stores like S3 encode special
425+ /// characters (e.g., `/` becomes `%2F`) in path segments.
422426pub fn parse_partitions_for_path < ' a , I > (
423427 table_path : & ListingTableUrl ,
424428 file_path : & ' a Path ,
425429 table_partition_cols : I ,
426- ) -> Option < Vec < & ' a str > >
430+ ) -> Option < Vec < String > >
427431where
428432 I : IntoIterator < Item = & ' a str > ,
429433{
@@ -432,7 +436,10 @@ where
432436 let mut part_values = vec ! [ ] ;
433437 for ( part, expected_partition) in subpath. zip ( table_partition_cols) {
434438 match part. split_once ( '=' ) {
435- Some ( ( name, val) ) if name == expected_partition => part_values. push ( val) ,
439+ Some ( ( name, val) ) if name == expected_partition => {
440+ let decoded = percent_decode_str ( val) . decode_utf8 ( ) . ok ( ) ?;
441+ part_values. push ( decoded. into_owned ( ) ) ;
442+ }
436443 _ => {
437444 debug ! (
438445 "Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{expected_partition}'" ,
@@ -508,7 +515,7 @@ mod tests {
508515 #[ test]
509516 fn test_parse_partitions_for_path ( ) {
510517 assert_eq ! (
511- Some ( vec![ ] ) ,
518+ Some ( vec![ ] as Vec < String > ) ,
512519 parse_partitions_for_path(
513520 & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
514521 & Path :: from( "bucket/mytable/file.csv" ) ,
@@ -532,15 +539,25 @@ mod tests {
532539 )
533540 ) ;
534541 assert_eq ! (
535- Some ( vec![ "v1" ] ) ,
542+ Some ( vec![ "v1" . to_string ( ) ] ) ,
536543 parse_partitions_for_path(
537544 & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
538545 & Path :: from( "bucket/mytable/mypartition=v1/file.csv" ) ,
539546 vec![ "mypartition" ]
540547 )
541548 ) ;
549+ // URL-encoded partition values should be decoded
550+ // Use Path::parse to avoid double-encoding (Path::from encodes % as %25)
542551 assert_eq ! (
543- Some ( vec![ "v1" ] ) ,
552+ Some ( vec![ "v/1" . to_string( ) ] ) ,
553+ parse_partitions_for_path(
554+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
555+ & Path :: parse( "bucket/mytable/mypartition=v%2F1/file.csv" ) . unwrap( ) ,
556+ vec![ "mypartition" ]
557+ )
558+ ) ;
559+ assert_eq ! (
560+ Some ( vec![ "v1" . to_string( ) ] ) ,
544561 parse_partitions_for_path(
545562 & ListingTableUrl :: parse( "file:///bucket/mytable/" ) . unwrap( ) ,
546563 & Path :: from( "bucket/mytable/mypartition=v1/file.csv" ) ,
@@ -557,21 +574,61 @@ mod tests {
557574 )
558575 ) ;
559576 assert_eq ! (
560- Some ( vec![ "v1" , "v2" ] ) ,
577+ Some ( vec![ "v1" . to_string ( ) , "v2" . to_string ( ) ] ) ,
561578 parse_partitions_for_path(
562579 & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
563580 & Path :: from( "bucket/mytable/mypartition=v1/otherpartition=v2/file.csv" ) ,
564581 vec![ "mypartition" , "otherpartition" ]
565582 )
566583 ) ;
567584 assert_eq ! (
568- Some ( vec![ "v1" ] ) ,
585+ Some ( vec![ "v1" . to_string ( ) ] ) ,
569586 parse_partitions_for_path(
570587 & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
571588 & Path :: from( "bucket/mytable/mypartition=v1/otherpartition=v2/file.csv" ) ,
572589 vec![ "mypartition" ]
573590 )
574591 ) ;
592+ assert_eq ! (
593+ Some ( vec![ "John Doe" . to_string( ) ] ) ,
594+ parse_partitions_for_path(
595+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
596+ & Path :: parse( "bucket/mytable/name=John%20Doe/file.csv" ) . unwrap( ) ,
597+ vec![ "name" ]
598+ )
599+ ) ;
600+ assert_eq ! (
601+ Some ( vec![ "a/b" . to_string( ) , "c d" . to_string( ) ] ) ,
602+ parse_partitions_for_path(
603+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
604+ & Path :: parse( "bucket/mytable/p1=a%2Fb/p2=c%20d/file.csv" ) . unwrap( ) ,
605+ vec![ "p1" , "p2" ]
606+ )
607+ ) ;
608+ assert_eq ! (
609+ Some ( vec![ "Müller" . to_string( ) ] ) ,
610+ parse_partitions_for_path(
611+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
612+ & Path :: parse( "bucket/mytable/name=M%C3%BCller/file.csv" ) . unwrap( ) ,
613+ vec![ "name" ]
614+ )
615+ ) ;
616+ assert_eq ! (
617+ Some ( vec![ "invalid%XX" . to_string( ) ] ) ,
618+ parse_partitions_for_path(
619+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
620+ & Path :: parse( "bucket/mytable/p1=invalid%XX/file.csv" ) . unwrap( ) ,
621+ vec![ "p1" ]
622+ )
623+ ) ;
624+ assert_eq ! (
625+ None ,
626+ parse_partitions_for_path(
627+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
628+ & Path :: parse( "bucket/mytable/p1=%FF/file.csv" ) . unwrap( ) ,
629+ vec![ "p1" ]
630+ )
631+ ) ;
575632 }
576633
577634 #[ test]
0 commit comments