Skip to content

Commit 8d0403b

Browse files
committed
fix: Percent Encoding of paths for hive style partitioning
1 parent 47df535 commit 8d0403b

4 files changed

Lines changed: 68 additions & 8 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ log = "^0.4"
166166
num-traits = { version = "0.2" }
167167
object_store = { version = "0.12.4", default-features = false }
168168
parking_lot = "0.12"
169+
percent-encoding = "2.3"
169170
parquet = { version = "57.1.0", default-features = false, features = [
170171
"arrow",
171172
"async",

datafusion/catalog-listing/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ futures = { workspace = true }
4646
itertools = { workspace = true }
4747
log = { workspace = true }
4848
object_store = { workspace = true }
49+
percent-encoding = { workspace = true }
4950

5051
[dev-dependencies]
5152
datafusion-datasource-parquet = { workspace = true }

datafusion/catalog-listing/src/helpers.rs

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ use datafusion_expr::{Expr, Volatility};
4242
use datafusion_physical_expr::create_physical_expr;
4343
use object_store::path::Path;
4444
use object_store::{ObjectMeta, ObjectStore};
45+
use percent_encoding::percent_decode_str;
4546

4647
/// Check whether the given expression can be resolved using only the columns `col_names`.
4748
/// This means that if this function returns true:
@@ -417,12 +418,15 @@ pub async fn pruned_partition_list<'a>(
417418
}
418419

419420
/// Extract the partition values for the given `file_path` (in the given `table_path`)
420-
/// associated to the partitions defined by `table_partition_cols`
421+
/// associated to the partitions defined by `table_partition_cols`.
422+
///
423+
/// Partition values are URL-decoded, since object stores like S3 encode special
424+
/// characters (e.g., `/` becomes `%2F`) in path segments.
421425
pub fn parse_partitions_for_path<'a, I>(
422426
table_path: &ListingTableUrl,
423427
file_path: &'a Path,
424428
table_partition_cols: I,
425-
) -> Option<Vec<&'a str>>
429+
) -> Option<Vec<String>>
426430
where
427431
I: IntoIterator<Item = &'a str>,
428432
{
@@ -431,7 +435,10 @@ where
431435
let mut part_values = vec![];
432436
for (part, expected_partition) in subpath.zip(table_partition_cols) {
433437
match part.split_once('=') {
434-
Some((name, val)) if name == expected_partition => part_values.push(val),
438+
Some((name, val)) if name == expected_partition => {
439+
let decoded = percent_decode_str(val).decode_utf8().ok()?;
440+
part_values.push(decoded.into_owned());
441+
}
435442
_ => {
436443
debug!(
437444
"Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{expected_partition}'",
@@ -507,7 +514,7 @@ mod tests {
507514
#[test]
508515
fn test_parse_partitions_for_path() {
509516
assert_eq!(
510-
Some(vec![]),
517+
Some(vec![] as Vec<String>),
511518
parse_partitions_for_path(
512519
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
513520
&Path::from("bucket/mytable/file.csv"),
@@ -531,15 +538,25 @@ mod tests {
531538
)
532539
);
533540
assert_eq!(
534-
Some(vec!["v1"]),
541+
Some(vec!["v1".to_string()]),
535542
parse_partitions_for_path(
536543
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
537544
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
538545
vec!["mypartition"]
539546
)
540547
);
548+
// URL-encoded partition values should be decoded
549+
// Use Path::parse to avoid double-encoding (Path::from encodes % as %25)
541550
assert_eq!(
542-
Some(vec!["v1"]),
551+
Some(vec!["v/1".to_string()]),
552+
parse_partitions_for_path(
553+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
554+
&Path::parse("bucket/mytable/mypartition=v%2F1/file.csv").unwrap(),
555+
vec!["mypartition"]
556+
)
557+
);
558+
assert_eq!(
559+
Some(vec!["v1".to_string()]),
543560
parse_partitions_for_path(
544561
&ListingTableUrl::parse("file:///bucket/mytable/").unwrap(),
545562
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
@@ -556,21 +573,61 @@ mod tests {
556573
)
557574
);
558575
assert_eq!(
559-
Some(vec!["v1", "v2"]),
576+
Some(vec!["v1".to_string(), "v2".to_string()]),
560577
parse_partitions_for_path(
561578
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
562579
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
563580
vec!["mypartition", "otherpartition"]
564581
)
565582
);
566583
assert_eq!(
567-
Some(vec!["v1"]),
584+
Some(vec!["v1".to_string()]),
568585
parse_partitions_for_path(
569586
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
570587
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
571588
vec!["mypartition"]
572589
)
573590
);
591+
assert_eq!(
592+
Some(vec!["John Doe".to_string()]),
593+
parse_partitions_for_path(
594+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
595+
&Path::parse("bucket/mytable/name=John%20Doe/file.csv").unwrap(),
596+
vec!["name"]
597+
)
598+
);
599+
assert_eq!(
600+
Some(vec!["a/b".to_string(), "c d".to_string()]),
601+
parse_partitions_for_path(
602+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
603+
&Path::parse("bucket/mytable/p1=a%2Fb/p2=c%20d/file.csv").unwrap(),
604+
vec!["p1", "p2"]
605+
)
606+
);
607+
assert_eq!(
608+
Some(vec!["Müller".to_string()]),
609+
parse_partitions_for_path(
610+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
611+
&Path::parse("bucket/mytable/name=M%C3%BCller/file.csv").unwrap(),
612+
vec!["name"]
613+
)
614+
);
615+
assert_eq!(
616+
Some(vec!["invalid%XX".to_string()]),
617+
parse_partitions_for_path(
618+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
619+
&Path::parse("bucket/mytable/p1=invalid%XX/file.csv").unwrap(),
620+
vec!["p1"]
621+
)
622+
);
623+
assert_eq!(
624+
None,
625+
parse_partitions_for_path(
626+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
627+
&Path::parse("bucket/mytable/p1=%FF/file.csv").unwrap(),
628+
vec!["p1"]
629+
)
630+
);
574631
}
575632

576633
#[test]

0 commit comments

Comments
 (0)