Skip to content

Commit 41fe291

Browse files
committed
address review: use Cow<str> and fallback on invalid encoding
- Return `Cow<'a, str>` instead of `String` to avoid unnecessary allocations when partition values have no percent-encoded characters. - Fall back to the raw value instead of returning `None` when percent-decoding produces invalid UTF-8 (e.g. `%FF`). - Fix Cargo.toml alphabetical ordering for taplo check.
1 parent 6a71db3 commit 41fe291

1 file changed

Lines changed: 22 additions & 15 deletions

File tree

datafusion/catalog-listing/src/helpers.rs

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
//! Helper functions for the table implementation
1919
20+
use std::borrow::Cow;
2021
use std::mem;
2122
use std::sync::Arc;
2223

@@ -354,7 +355,7 @@ fn try_into_partitioned_file(
354355
.flatten()
355356
.zip(partition_cols)
356357
.map(|(parsed, (_, datatype))| {
357-
ScalarValue::try_from_string(parsed.to_string(), datatype)
358+
ScalarValue::try_from_string(parsed.into_owned(), datatype)
358359
})
359360
.collect::<Result<Vec<_>>>()?;
360361

@@ -427,7 +428,7 @@ pub fn parse_partitions_for_path<'a, I>(
427428
table_path: &ListingTableUrl,
428429
file_path: &'a Path,
429430
table_partition_cols: I,
430-
) -> Option<Vec<String>>
431+
) -> Option<Vec<Cow<'a, str>>>
431432
where
432433
I: IntoIterator<Item = &'a str>,
433434
{
@@ -437,8 +438,10 @@ where
437438
for (part, expected_partition) in subpath.zip(table_partition_cols) {
438439
match part.split_once('=') {
439440
Some((name, val)) if name == expected_partition => {
440-
let decoded = percent_decode_str(val).decode_utf8().ok()?;
441-
part_values.push(decoded.into_owned());
441+
let decoded = percent_decode_str(val)
442+
.decode_utf8()
443+
.unwrap_or(Cow::Borrowed(val));
444+
part_values.push(decoded);
442445
}
443446
_ => {
444447
debug!(
@@ -515,7 +518,7 @@ mod tests {
515518
#[test]
516519
fn test_parse_partitions_for_path() {
517520
assert_eq!(
518-
Some(vec![] as Vec<String>),
521+
Some(vec![] as Vec<Cow<'_, str>>),
519522
parse_partitions_for_path(
520523
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
521524
&Path::from("bucket/mytable/file.csv"),
@@ -539,7 +542,7 @@ mod tests {
539542
)
540543
);
541544
assert_eq!(
542-
Some(vec!["v1".to_string()]),
545+
Some(vec![Cow::Borrowed("v1")]),
543546
parse_partitions_for_path(
544547
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
545548
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
@@ -549,15 +552,15 @@ mod tests {
549552
// URL-encoded partition values should be decoded
550553
// Use Path::parse to avoid double-encoding (Path::from encodes % as %25)
551554
assert_eq!(
552-
Some(vec!["v/1".to_string()]),
555+
Some(vec![Cow::<str>::Owned("v/1".to_string())]),
553556
parse_partitions_for_path(
554557
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
555558
&Path::parse("bucket/mytable/mypartition=v%2F1/file.csv").unwrap(),
556559
vec!["mypartition"]
557560
)
558561
);
559562
assert_eq!(
560-
Some(vec!["v1".to_string()]),
563+
Some(vec![Cow::Borrowed("v1")]),
561564
parse_partitions_for_path(
562565
&ListingTableUrl::parse("file:///bucket/mytable/").unwrap(),
563566
&Path::from("bucket/mytable/mypartition=v1/file.csv"),
@@ -574,55 +577,59 @@ mod tests {
574577
)
575578
);
576579
assert_eq!(
577-
Some(vec!["v1".to_string(), "v2".to_string()]),
580+
Some(vec![Cow::Borrowed("v1"), Cow::Borrowed("v2")]),
578581
parse_partitions_for_path(
579582
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
580583
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
581584
vec!["mypartition", "otherpartition"]
582585
)
583586
);
584587
assert_eq!(
585-
Some(vec!["v1".to_string()]),
588+
Some(vec![Cow::Borrowed("v1")]),
586589
parse_partitions_for_path(
587590
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
588591
&Path::from("bucket/mytable/mypartition=v1/otherpartition=v2/file.csv"),
589592
vec!["mypartition"]
590593
)
591594
);
592595
assert_eq!(
593-
Some(vec!["John Doe".to_string()]),
596+
Some(vec![Cow::<str>::Owned("John Doe".to_string())]),
594597
parse_partitions_for_path(
595598
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
596599
&Path::parse("bucket/mytable/name=John%20Doe/file.csv").unwrap(),
597600
vec!["name"]
598601
)
599602
);
600603
assert_eq!(
601-
Some(vec!["a/b".to_string(), "c d".to_string()]),
604+
Some(vec![
605+
Cow::<str>::Owned("a/b".to_string()),
606+
Cow::<str>::Owned("c d".to_string()),
607+
]),
602608
parse_partitions_for_path(
603609
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
604610
&Path::parse("bucket/mytable/p1=a%2Fb/p2=c%20d/file.csv").unwrap(),
605611
vec!["p1", "p2"]
606612
)
607613
);
608614
assert_eq!(
609-
Some(vec!["Müller".to_string()]),
615+
Some(vec![Cow::<str>::Owned("Müller".to_string())]),
610616
parse_partitions_for_path(
611617
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
612618
&Path::parse("bucket/mytable/name=M%C3%BCller/file.csv").unwrap(),
613619
vec!["name"]
614620
)
615621
);
616622
assert_eq!(
617-
Some(vec!["invalid%XX".to_string()]),
623+
Some(vec![Cow::Borrowed("invalid%XX")]),
618624
parse_partitions_for_path(
619625
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
620626
&Path::parse("bucket/mytable/p1=invalid%XX/file.csv").unwrap(),
621627
vec!["p1"]
622628
)
623629
);
630+
// Invalid UTF-8 after percent-decoding falls back to raw value
624631
assert_eq!(
625-
None,
632+
Some(vec![Cow::Borrowed("%FF")]),
626633
parse_partitions_for_path(
627634
&ListingTableUrl::parse("file:///bucket/mytable").unwrap(),
628635
&Path::parse("bucket/mytable/p1=%FF/file.csv").unwrap(),

0 commit comments

Comments
 (0)