Skip to content

Commit 6a71db3

Browse files
greedAuguriapotatokuka
authored andcommitted
fix: Percent Encoding of paths for hive style partitioning
1 parent 3d2e6b2 commit 6a71db3

4 files changed

Lines changed: 68 additions & 8 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ parquet = { version = "58.1.0", default-features = false, features = [
179179
] }
180180
pbjson = { version = "0.9.0" }
181181
pbjson-types = "0.9"
182+
percent-encoding = "2.3"
182183
# Should match arrow-flight's version of prost.
183184
prost = "0.14.1"
184185
rand = "0.9"

datafusion/catalog-listing/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ futures = { workspace = true }
4646
itertools = { workspace = true }
4747
log = { workspace = true }
4848
object_store = { workspace = true }
49+
percent-encoding = { workspace = true }
4950

5051
[dev-dependencies]
5152
datafusion-datasource-parquet = { workspace = true }

datafusion/catalog-listing/src/helpers.rs

Lines changed: 65 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ use datafusion_expr::{Expr, Volatility};
4242
use datafusion_physical_expr::create_physical_expr;
4343
use object_store::path::Path;
4444
use object_store::{ObjectMeta, ObjectStore};
45+
use percent_encoding::percent_decode_str;
4546

4647
/// Check whether the given expression can be resolved using only the columns `col_names`.
4748
/// This means that if this function returns true:
@@ -418,12 +419,15 @@ pub async fn pruned_partition_list<'a>(
418419
}
419420

420421
/// Extract the partition values for the given `file_path` (in the given `table_path`)
421-
/// associated to the partitions defined by `table_partition_cols`
422+
/// associated to the partitions defined by `table_partition_cols`.
423+
///
424+
/// Partition values are URL-decoded, since object stores like S3 encode special
425+
/// characters (e.g., `/` becomes `%2F`) in path segments.
422426
pub fn parse_partitions_for_path<'a, I>(
423427
table_path: &ListingTableUrl,
424428
file_path: &'a Path,
425429
table_partition_cols: I,
426-
) -> Option<Vec<&'a str>>
430+
) -> Option<Vec<String>>
427431
where
428432
I: IntoIterator<Item = &'a str>,
429433
{
@@ -432,7 +436,10 @@ where
432436
let mut part_values = vec![];
433437
for (part, expected_partition) in subpath.zip(table_partition_cols) {
434438
match part.split_once('=') {
435-
Some((name, val)) if name == expected_partition => part_values.push(val),
439+
Some((name, val)) if name == expected_partition => {
440+
let decoded = percent_decode_str(val).decode_utf8().ok()?;
441+
part_values.push(decoded.into_owned());
442+
}
436443
_ => {
437444
debug!(
438445
"Ignoring file: file_path='{file_path}', table_path='{table_path}', part='{part}', partition_col='{expected_partition}'",
@@ -508,7 +515,7 @@ mod tests {
508515
#[test]
509516
fn test_parse_partitions_for_path() {
510517
assert_eq!(
511-
Some(vec![]),
518+
Some(vec![] as Vec<String>),
512519
parse_partitions_for_path(
513520
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
514521
&Path::from("bucket/mytable/file.csv"),
@@ -532,15 +539,25 @@ mod tests {
532539
)
533540
);
534541
assert_eq!(
535-
Some(vec!["v1"]),
542+
Some(vec!["v1".to_string()]),
536543
parse_partitions_for_path(
537544
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
538545
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
539546
vec!["mypartition"]
540547
)
541548
);
549+
// URL-encoded partition values should be decoded
550+
// Use Path::parse to avoid double-encoding (Path::from encodes % as %25)
542551
assert_eq!(
543-
Some(vec!["v1"]),
552+
Some(vec!["v/1".to_string()]),
553+
parse_partitions_for_path(
554+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
555+
&Path::parse("bucket/mytable/mypartition=v%2F1/file.csv").unwrap(),
556+
vec!["mypartition"]
557+
)
558+
);
559+
assert_eq!(
560+
Some(vec!["v1".to_string()]),
544561
parse_partitions_for_path(
545562
&ListingTableUrl::parse("file:///bucket/mytable/").unwrap(),
546563
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
@@ -557,21 +574,61 @@ mod tests {
557574
)
558575
);
559576
assert_eq!(
560-
Some(vec!["v1", "v2"]),
577+
Some(vec!["v1".to_string(), "v2".to_string()]),
561578
parse_partitions_for_path(
562579
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
563580
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
564581
vec!["mypartition", "otherpartition"]
565582
)
566583
);
567584
assert_eq!(
568-
Some(vec!["v1"]),
585+
Some(vec!["v1".to_string()]),
569586
parse_partitions_for_path(
570587
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
571588
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
572589
vec!["mypartition"]
573590
)
574591
);
592+
assert_eq!(
593+
Some(vec!["John Doe".to_string()]),
594+
parse_partitions_for_path(
595+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
596+
&Path::parse("bucket/mytable/name=John%20Doe/file.csv").unwrap(),
597+
vec!["name"]
598+
)
599+
);
600+
assert_eq!(
601+
Some(vec!["a/b".to_string(), "c d".to_string()]),
602+
parse_partitions_for_path(
603+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
604+
&Path::parse("bucket/mytable/p1=a%2Fb/p2=c%20d/file.csv").unwrap(),
605+
vec!["p1", "p2"]
606+
)
607+
);
608+
assert_eq!(
609+
Some(vec!["Müller".to_string()]),
610+
parse_partitions_for_path(
611+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
612+
&Path::parse("bucket/mytable/name=M%C3%BCller/file.csv").unwrap(),
613+
vec!["name"]
614+
)
615+
);
616+
assert_eq!(
617+
Some(vec!["invalid%XX".to_string()]),
618+
parse_partitions_for_path(
619+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
620+
&Path::parse("bucket/mytable/p1=invalid%XX/file.csv").unwrap(),
621+
vec!["p1"]
622+
)
623+
);
624+
assert_eq!(
625+
None,
626+
parse_partitions_for_path(
627+
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
628+
&Path::parse("bucket/mytable/p1=%FF/file.csv").unwrap(),
629+
vec!["p1"]
630+
)
631+
);
575632
}
576633

577634
#[test]

0 commit comments

Comments
 (0)