@@ -42,6 +42,7 @@ use datafusion_expr::{Expr, Volatility};
4242use datafusion_physical_expr:: create_physical_expr;
4343use object_store:: path:: Path ;
4444use object_store:: { ObjectMeta , ObjectStore } ;
45+ use percent_encoding:: percent_decode_str;
4546
4647/// Check whether the given expression can be resolved using only the columns `col_names`.
4748/// This means that if this function returns true:
@@ -417,12 +418,15 @@ pub async fn pruned_partition_list<'a>(
417418}
418419
419420/// Extract the partition values for the given `file_path` (in the given `table_path`)
420- /// associated to the partitions defined by `table_partition_cols`
421+ /// associated to the partitions defined by `table_partition_cols`.
422+ ///
423+ /// Partition values are URL-decoded, since object stores like S3 encode special
424+ /// characters (e.g., `/` becomes `%2F`) in path segments.
421425pub fn parse_partitions_for_path < ' a , I > (
422426 table_path : & ListingTableUrl ,
423427 file_path : & ' a Path ,
424428 table_partition_cols : I ,
425- ) -> Option < Vec < & ' a str > >
429+ ) -> Option < Vec < String > >
426430where
427431 I : IntoIterator < Item = & ' a str > ,
428432{
@@ -431,7 +435,10 @@ where
431435 let mut part_values = vec ! [ ] ;
432436 for ( part, expected_partition) in subpath. zip ( table_partition_cols) {
433437 match part. split_once ( '=' ) {
434- Some ( ( name, val) ) if name == expected_partition => part_values. push ( val) ,
438+ Some ( ( name, val) ) if name == expected_partition => {
439+ let decoded = percent_decode_str ( val) . decode_utf8 ( ) . ok ( ) ?;
440+ part_values. push ( decoded. into_owned ( ) ) ;
441+ }
435442 _ => {
436443 debug ! (
437444 "Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{expected_partition}'" ,
@@ -507,7 +514,7 @@ mod tests {
507514 #[ test]
508515 fn test_parse_partitions_for_path ( ) {
509516 assert_eq ! (
510- Some ( vec![ ] ) ,
517+ Some ( vec![ ] as Vec < String > ) ,
511518 parse_partitions_for_path(
512519 & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
513520 & Path :: from( "bucket/mytable/file.csv" ) ,
@@ -531,15 +538,25 @@ mod tests {
531538 )
532539 ) ;
533540 assert_eq ! (
534- Some ( vec![ "v1" ] ) ,
541+ Some ( vec![ "v1" . to_string ( ) ] ) ,
535542 parse_partitions_for_path(
536543 & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
537544 & Path :: from( "bucket/mytable/mypartition=v1/file.csv" ) ,
538545 vec![ "mypartition" ]
539546 )
540547 ) ;
548+ // URL-encoded partition values should be decoded
549+ // Use Path::parse to avoid double-encoding (Path::from encodes % as %25)
541550 assert_eq ! (
542- Some ( vec![ "v1" ] ) ,
551+ Some ( vec![ "v/1" . to_string( ) ] ) ,
552+ parse_partitions_for_path(
553+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
554+ & Path :: parse( "bucket/mytable/mypartition=v%2F1/file.csv" ) . unwrap( ) ,
555+ vec![ "mypartition" ]
556+ )
557+ ) ;
558+ assert_eq ! (
559+ Some ( vec![ "v1" . to_string( ) ] ) ,
543560 parse_partitions_for_path(
544561 & ListingTableUrl :: parse( "file:///bucket/mytable/" ) . unwrap( ) ,
545562 & Path :: from( "bucket/mytable/mypartition=v1/file.csv" ) ,
@@ -556,21 +573,61 @@ mod tests {
556573 )
557574 ) ;
558575 assert_eq ! (
559- Some ( vec![ "v1" , "v2" ] ) ,
576+ Some ( vec![ "v1" . to_string ( ) , "v2" . to_string ( ) ] ) ,
560577 parse_partitions_for_path(
561578 & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
562579 & Path :: from( "bucket/mytable/mypartition=v1/otherpartition=v2/file.csv" ) ,
563580 vec![ "mypartition" , "otherpartition" ]
564581 )
565582 ) ;
566583 assert_eq ! (
567- Some ( vec![ "v1" ] ) ,
584+ Some ( vec![ "v1" . to_string ( ) ] ) ,
568585 parse_partitions_for_path(
569586 & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
570587 & Path :: from( "bucket/mytable/mypartition=v1/otherpartition=v2/file.csv" ) ,
571588 vec![ "mypartition" ]
572589 )
573590 ) ;
591+ assert_eq ! (
592+ Some ( vec![ "John Doe" . to_string( ) ] ) ,
593+ parse_partitions_for_path(
594+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
595+ & Path :: parse( "bucket/mytable/name=John%20Doe/file.csv" ) . unwrap( ) ,
596+ vec![ "name" ]
597+ )
598+ ) ;
599+ assert_eq ! (
600+ Some ( vec![ "a/b" . to_string( ) , "c d" . to_string( ) ] ) ,
601+ parse_partitions_for_path(
602+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
603+ & Path :: parse( "bucket/mytable/p1=a%2Fb/p2=c%20d/file.csv" ) . unwrap( ) ,
604+ vec![ "p1" , "p2" ]
605+ )
606+ ) ;
607+ assert_eq ! (
608+ Some ( vec![ "Müller" . to_string( ) ] ) ,
609+ parse_partitions_for_path(
610+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
611+ & Path :: parse( "bucket/mytable/name=M%C3%BCller/file.csv" ) . unwrap( ) ,
612+ vec![ "name" ]
613+ )
614+ ) ;
615+ assert_eq ! (
616+ Some ( vec![ "invalid%XX" . to_string( ) ] ) ,
617+ parse_partitions_for_path(
618+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
619+ & Path :: parse( "bucket/mytable/p1=invalid%XX/file.csv" ) . unwrap( ) ,
620+ vec![ "p1" ]
621+ )
622+ ) ;
623+ assert_eq ! (
624+ None ,
625+ parse_partitions_for_path(
626+ & ListingTableUrl :: parse( "file:///bucket/mytable" ) . unwrap( ) ,
627+ & Path :: parse( "bucket/mytable/p1=%FF/file.csv" ) . unwrap( ) ,
628+ vec![ "p1" ]
629+ )
630+ ) ;
574631 }
575632
576633 #[ test]
0 commit comments