Skip to content

Commit a7e0941

Browse files
davidlghellin and alamb authored
fix(spark): return input string for PATH/FILE on schemeless URLs in parse_url (#20506)
## Which issue does this PR close?

- NA

## Rationale for this change

Spark's `java.net.URI` treats schemeless strings (e.g. `'notaurl'`) as relative URIs where the entire input becomes the path component. The Rust `url` crate rejects these with `RelativeUrlWithoutBase`, and the current implementation maps all such errors to `NULL` — but Spark returns the input string for `PATH` and `FILE`.

## What changes are included in this PR?

- In `parse_url.rs`, when catching `RelativeUrlWithoutBase` for schemeless URLs, return the input string for `PATH` and `FILE` parts instead of `NULL`
- Updated unit tests and sqllogictests for both `parse_url` and `try_parse_url`

## Are these changes tested?

Yes:

- Unit test `test_parse_schemeless_url` covers all 8 URL parts against a schemeless input
- sqllogictest coverage in `parse_url.slt` and `try_parse_url.slt`

## Are there any user-facing changes?

Yes — `parse_url('notaurl', 'PATH')` and `parse_url('notaurl', 'FILE')` now return `'notaurl'` instead of `NULL`, matching Spark behavior.

---------

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 26251bb commit a7e0941

3 files changed

Lines changed: 387 additions & 5 deletions

File tree

datafusion/spark/src/function/url/parse_url.rs

Lines changed: 121 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,35 @@ impl ParseUrl {
8484
let url: std::result::Result<Url, ParseError> = Url::parse(value);
8585
if let Err(ParseError::RelativeUrlWithoutBase) = url {
8686
return if !value.contains("://") {
87-
Ok(None)
87+
// Schemeless URLs are treated as relative URIs (like java.net.URI).
88+
// Manually parse path, query, and fragment components.
89+
let (without_fragment, fragment) = match value.split_once('#') {
90+
Some((before, frag)) => (before, Some(frag)),
91+
None => (value, None),
92+
};
93+
let (path, query) = match without_fragment.split_once('?') {
94+
Some((p, q)) => (p, Some(q)),
95+
None => (without_fragment, None),
96+
};
97+
Ok(match part {
98+
"PATH" => Some(path.to_string()),
99+
"QUERY" => match key {
100+
None => query.map(String::from),
101+
Some(key) => query.and_then(|q| {
102+
q.split('&')
103+
.filter_map(|pair| pair.split_once('='))
104+
.find(|(k, _)| *k == key)
105+
.map(|(_, v)| v.to_string())
106+
}),
107+
},
108+
"REF" => fragment.map(String::from),
109+
"FILE" => {
110+
// FILE = path + query (without fragment)
111+
Some(without_fragment.to_string())
112+
}
113+
// HOST, PROTOCOL, AUTHORITY, USERINFO → NULL
114+
_ => None,
115+
})
88116
} else {
89117
Err(exec_datafusion_err!(
90118
"The url is invalid: {value}. Use `try_parse_url` to tolerate invalid URL and return NULL instead. SQLSTATE: 22P02"
@@ -199,6 +227,7 @@ pub fn spark_handled_parse_url(
199227
as_string_array(part)?,
200228
as_string_array(key)?,
201229
handler_err,
230+
true,
202231
)
203232
}
204233
(DataType::Utf8View, DataType::Utf8View, DataType::Utf8View) => {
@@ -207,6 +236,7 @@ pub fn spark_handled_parse_url(
207236
as_string_view_array(part)?,
208237
as_string_view_array(key)?,
209238
handler_err,
239+
true,
210240
)
211241
}
212242
(DataType::LargeUtf8, DataType::LargeUtf8, DataType::LargeUtf8) => {
@@ -215,6 +245,7 @@ pub fn spark_handled_parse_url(
215245
as_large_string_array(part)?,
216246
as_large_string_array(key)?,
217247
handler_err,
248+
true,
218249
)
219250
}
220251
_ => exec_err!(
@@ -240,6 +271,7 @@ pub fn spark_handled_parse_url(
240271
as_string_array(part)?,
241272
&key,
242273
handler_err,
274+
false,
243275
)
244276
}
245277
(DataType::Utf8View, DataType::Utf8View) => {
@@ -248,6 +280,7 @@ pub fn spark_handled_parse_url(
248280
as_string_view_array(part)?,
249281
&key,
250282
handler_err,
283+
false,
251284
)
252285
}
253286
(DataType::LargeUtf8, DataType::LargeUtf8) => {
@@ -256,6 +289,7 @@ pub fn spark_handled_parse_url(
256289
as_large_string_array(part)?,
257290
&key,
258291
handler_err,
292+
false,
259293
)
260294
}
261295
_ => exec_err!(
@@ -272,6 +306,7 @@ fn process_parse_url<'a, A, B, C, T>(
272306
part_array: &'a B,
273307
key_array: &'a C,
274308
handle: impl Fn(Result<Option<String>>) -> Result<Option<String>>,
309+
has_key_arg: bool,
275310
) -> Result<ArrayRef>
276311
where
277312
&'a A: StringArrayType<'a>,
@@ -284,7 +319,11 @@ where
284319
.zip(part_array.iter())
285320
.zip(key_array.iter())
286321
.map(|((url, part), key)| {
287-
if let (Some(url), Some(part), key) = (url, part, key) {
322+
// Spark returns NULL when the third argument is explicitly NULL
323+
if has_key_arg && key.is_none() {
324+
return Ok(None);
325+
}
326+
if let (Some(url), Some(part)) = (url, part) {
288327
handle(ParseUrl::parse(url, part, key))
289328
} else {
290329
Ok(None)
@@ -357,9 +396,86 @@ mod tests {
357396
}
358397

359398
#[test]
360-
fn test_parse_malformed_url_returns_error() -> Result<()> {
361-
let got = ParseUrl::parse("notaurl", "HOST", None)?;
362-
assert_eq!(got, None);
399+
fn test_parse_schemeless_url() -> Result<()> {
400+
// Spark's java.net.URI treats schemeless strings as relative URIs.
401+
// Simple schemeless string: no query, no fragment.
402+
assert_eq!(
403+
ParseUrl::parse("notaurl", "PATH", None)?,
404+
Some("notaurl".to_string())
405+
);
406+
assert_eq!(
407+
ParseUrl::parse("notaurl", "FILE", None)?,
408+
Some("notaurl".to_string())
409+
);
410+
assert_eq!(ParseUrl::parse("notaurl", "HOST", None)?, None);
411+
assert_eq!(ParseUrl::parse("notaurl", "PROTOCOL", None)?, None);
412+
assert_eq!(ParseUrl::parse("notaurl", "QUERY", None)?, None);
413+
assert_eq!(ParseUrl::parse("notaurl", "REF", None)?, None);
414+
assert_eq!(ParseUrl::parse("notaurl", "AUTHORITY", None)?, None);
415+
assert_eq!(ParseUrl::parse("notaurl", "USERINFO", None)?, None);
416+
417+
// Schemeless URL with query string
418+
assert_eq!(
419+
ParseUrl::parse("notaurl?key=value", "PATH", None)?,
420+
Some("notaurl".to_string())
421+
);
422+
assert_eq!(
423+
ParseUrl::parse("notaurl?key=value", "FILE", None)?,
424+
Some("notaurl?key=value".to_string())
425+
);
426+
assert_eq!(
427+
ParseUrl::parse("notaurl?key=value", "QUERY", None)?,
428+
Some("key=value".to_string())
429+
);
430+
assert_eq!(
431+
ParseUrl::parse("notaurl?key=value", "QUERY", Some("key"))?,
432+
Some("value".to_string())
433+
);
434+
assert_eq!(
435+
ParseUrl::parse("notaurl?key=value", "QUERY", Some("missing"))?,
436+
None
437+
);
438+
assert_eq!(ParseUrl::parse("notaurl?key=value", "HOST", None)?, None);
439+
assert_eq!(
440+
ParseUrl::parse("notaurl?key=value", "PROTOCOL", None)?,
441+
None
442+
);
443+
444+
// Schemeless URL with fragment
445+
assert_eq!(
446+
ParseUrl::parse("notaurl#reference", "REF", None)?,
447+
Some("reference".to_string())
448+
);
449+
assert_eq!(
450+
ParseUrl::parse("notaurl#reference", "PATH", None)?,
451+
Some("notaurl".to_string())
452+
);
453+
assert_eq!(
454+
ParseUrl::parse("notaurl#reference", "FILE", None)?,
455+
Some("notaurl".to_string())
456+
);
457+
458+
// Schemeless URL with both query and fragment
459+
assert_eq!(
460+
ParseUrl::parse("notaurl?a=1&b=2#frag", "PATH", None)?,
461+
Some("notaurl".to_string())
462+
);
463+
assert_eq!(
464+
ParseUrl::parse("notaurl?a=1&b=2#frag", "QUERY", None)?,
465+
Some("a=1&b=2".to_string())
466+
);
467+
assert_eq!(
468+
ParseUrl::parse("notaurl?a=1&b=2#frag", "QUERY", Some("b"))?,
469+
Some("2".to_string())
470+
);
471+
assert_eq!(
472+
ParseUrl::parse("notaurl?a=1&b=2#frag", "REF", None)?,
473+
Some("frag".to_string())
474+
);
475+
assert_eq!(
476+
ParseUrl::parse("notaurl?a=1&b=2#frag", "FILE", None)?,
477+
Some("notaurl?a=1&b=2".to_string())
478+
);
363479
Ok(())
364480
}
365481

datafusion/sqllogictest/test_files/spark/url/parse_url.slt

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,96 @@ SELECT parse_url('notaurl', 'host');
140140
----
141141
NULL
142142

143+
# Schemeless URLs: Spark java.net.URI behavior
144+
# Simple schemeless string
145+
query T
146+
SELECT parse_url('notaurl', 'PATH');
147+
----
148+
notaurl
149+
150+
query T
151+
SELECT parse_url('notaurl', 'FILE');
152+
----
153+
notaurl
154+
155+
query T
156+
SELECT parse_url('notaurl', 'PROTOCOL');
157+
----
158+
NULL
159+
160+
query T
161+
SELECT parse_url('notaurl', 'QUERY');
162+
----
163+
NULL
164+
165+
# Schemeless URL with query string
166+
query T
167+
SELECT parse_url('notaurl?key=value', 'PATH');
168+
----
169+
notaurl
170+
171+
query T
172+
SELECT parse_url('notaurl?key=value', 'FILE');
173+
----
174+
notaurl?key=value
175+
176+
query T
177+
SELECT parse_url('notaurl?key=value', 'QUERY');
178+
----
179+
key=value
180+
181+
query T
182+
SELECT parse_url('notaurl?key=value', 'QUERY', 'key');
183+
----
184+
value
185+
186+
query T
187+
SELECT parse_url('notaurl?key=value', 'HOST');
188+
----
189+
NULL
190+
191+
# Schemeless URL with fragment
192+
query T
193+
SELECT parse_url('notaurl#reference', 'REF');
194+
----
195+
reference
196+
197+
query T
198+
SELECT parse_url('notaurl#reference', 'PATH');
199+
----
200+
notaurl
201+
202+
query T
203+
SELECT parse_url('notaurl#reference', 'FILE');
204+
----
205+
notaurl
206+
207+
# Schemeless URL with both query and fragment
208+
query T
209+
SELECT parse_url('notaurl?a=1&b=2#frag', 'PATH');
210+
----
211+
notaurl
212+
213+
query T
214+
SELECT parse_url('notaurl?a=1&b=2#frag', 'QUERY');
215+
----
216+
a=1&b=2
217+
218+
query T
219+
SELECT parse_url('notaurl?a=1&b=2#frag', 'QUERY', 'b');
220+
----
221+
2
222+
223+
query T
224+
SELECT parse_url('notaurl?a=1&b=2#frag', 'REF');
225+
----
226+
frag
227+
228+
query T
229+
SELECT parse_url('notaurl?a=1&b=2#frag', 'FILE');
230+
----
231+
notaurl?a=1&b=2
232+
143233
query T
144234
SELECT parse_url('https://example.com', 'PATH');
145235
----
@@ -175,3 +265,46 @@ SELECT parse_url();
175265

176266
query error DataFusion error: Execution error: The url is invalid: inva lid://spark\.apache\.org/path\?query=1\. Use `try_parse_url` to tolerate invalid URL and return NULL instead\. SQLSTATE: 22P02
177267
SELECT parse_url('inva lid://spark.apache.org/path?query=1', 'QUERY');
268+
269+
# NULL argument handling (Sail PR #1393)
270+
# NULL URL should return NULL
271+
query T
272+
SELECT parse_url(NULL, 'HOST');
273+
----
274+
NULL
275+
276+
# NULL part should return NULL
277+
query T
278+
SELECT parse_url('https://example.com/path?query=1', NULL);
279+
----
280+
NULL
281+
282+
# Both NULL should return NULL
283+
query T
284+
SELECT parse_url(NULL, NULL);
285+
----
286+
NULL
287+
288+
# NULL URL with 3 args
289+
query T
290+
SELECT parse_url(NULL, 'QUERY', 'key');
291+
----
292+
NULL
293+
294+
# NULL part with 3 args
295+
query T
296+
SELECT parse_url('https://example.com/path?query=1', NULL, 'key');
297+
----
298+
NULL
299+
300+
# NULL key with 3 args (valid URL and part) - Spark returns NULL when third arg is NULL
301+
query T
302+
SELECT parse_url('https://example.com/path?query=1', 'QUERY', NULL);
303+
----
304+
NULL
305+
306+
# All three NULL
307+
query T
308+
SELECT parse_url(NULL, NULL, NULL);
309+
----
310+
NULL

0 commit comments

Comments
 (0)