Skip to content

Commit 8d47fc6

Browse files
kazantsev-maksimKazantsev Maksim
andauthored
Spark soundex function implementation (#20725)
## Which issue does this PR close? N/A ## Rationale for this change Add new spark function: https://spark.apache.org/docs/latest/api/sql/index.html#soundex ## What changes are included in this PR? - Implementation - SLT tests ## Are these changes tested? Yes, tests added as part of this PR. ## Are there any user-facing changes? No, these are new function. --------- Co-authored-by: Kazantsev Maksim <mn.kazantsev@gmail.com>
1 parent aa9d819 commit 8d47fc6

3 files changed

Lines changed: 343 additions & 10 deletions

File tree

datafusion/spark/src/function/string/mod.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ pub mod ilike;
2525
pub mod length;
2626
pub mod like;
2727
pub mod luhn_check;
28+
pub mod soundex;
2829
pub mod space;
2930
pub mod substring;
3031

@@ -45,6 +46,7 @@ make_udf_function!(format_string::FormatStringFunc, format_string);
4546
make_udf_function!(space::SparkSpace, space);
4647
make_udf_function!(substring::SparkSubstring, substring);
4748
make_udf_function!(base64::SparkUnBase64, unbase64);
49+
make_udf_function!(soundex::SparkSoundex, soundex);
4850

4951
pub mod expr_fn {
5052
use datafusion_functions::export_functions;
@@ -110,6 +112,7 @@ pub mod expr_fn {
110112
"Decodes the input string `str` from a base64 string into binary data.",
111113
str
112114
));
115+
export_functions!((soundex, "Returns Soundex code of the string.", str));
113116
}
114117

115118
pub fn functions() -> Vec<Arc<ScalarUDF>> {
@@ -127,5 +130,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
127130
space(),
128131
substring(),
129132
unbase64(),
133+
soundex(),
130134
]
131135
}
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray};
19+
use arrow::datatypes::DataType;
20+
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
21+
use datafusion_common::utils::take_function_args;
22+
use datafusion_common::{Result, exec_err};
23+
use datafusion_expr::{ColumnarValue, Signature, Volatility};
24+
use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
25+
use datafusion_functions::utils::make_scalar_function;
26+
use std::any::Any;
27+
use std::sync::Arc;
28+
29+
/// Spark-compatible `soundex` expression
30+
/// <https://spark.apache.org/docs/latest/api/sql/index.html#soundex>
31+
#[derive(Debug, PartialEq, Eq, Hash)]
32+
pub struct SparkSoundex {
33+
signature: Signature,
34+
}
35+
36+
impl Default for SparkSoundex {
37+
fn default() -> Self {
38+
Self::new()
39+
}
40+
}
41+
42+
impl SparkSoundex {
43+
pub fn new() -> Self {
44+
Self {
45+
signature: Signature::string(1, Volatility::Immutable),
46+
}
47+
}
48+
}
49+
50+
impl ScalarUDFImpl for SparkSoundex {
51+
fn as_any(&self) -> &dyn Any {
52+
self
53+
}
54+
55+
fn name(&self) -> &str {
56+
"soundex"
57+
}
58+
59+
fn signature(&self) -> &Signature {
60+
&self.signature
61+
}
62+
63+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
64+
match &arg_types[0] {
65+
DataType::LargeUtf8 => Ok(DataType::LargeUtf8),
66+
_ => Ok(DataType::Utf8),
67+
}
68+
}
69+
70+
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
71+
make_scalar_function(spark_soundex_inner, vec![])(&args.args)
72+
}
73+
}
74+
75+
fn spark_soundex_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
76+
let [array] = take_function_args("soundex", arg)?;
77+
match &array.data_type() {
78+
DataType::Utf8 => soundex_array::<i32>(array),
79+
DataType::LargeUtf8 => soundex_array::<i64>(array),
80+
DataType::Utf8View => soundex_view(array),
81+
other => {
82+
exec_err!("unsupported data type {other:?} for function `soundex`")
83+
}
84+
}
85+
}
86+
87+
fn soundex_array<T: OffsetSizeTrait>(array: &ArrayRef) -> Result<ArrayRef> {
88+
let str_array = as_generic_string_array::<T>(array)?;
89+
let result = str_array
90+
.iter()
91+
.map(|s| s.map(compute_soundex))
92+
.collect::<StringArray>();
93+
Ok(Arc::new(result))
94+
}
95+
96+
fn soundex_view(str_view: &ArrayRef) -> Result<ArrayRef> {
97+
let str_array = as_string_view_array(str_view)?;
98+
let result = str_array
99+
.iter()
100+
.map(|opt_str| opt_str.map(compute_soundex))
101+
.collect::<StringArray>();
102+
Ok(Arc::new(result) as ArrayRef)
103+
}
104+
105+
fn classify_char(c: char) -> Option<char> {
106+
match c.to_ascii_uppercase() {
107+
'B' | 'F' | 'P' | 'V' => Some('1'),
108+
'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some('2'),
109+
'D' | 'T' => Some('3'),
110+
'L' => Some('4'),
111+
'M' | 'N' => Some('5'),
112+
'R' => Some('6'),
113+
_ => None,
114+
}
115+
}
116+
117+
fn is_ignored(c: char) -> bool {
118+
matches!(c.to_ascii_uppercase(), 'H' | 'W')
119+
}
120+
121+
fn compute_soundex(s: &str) -> String {
122+
let mut chars = s.chars();
123+
124+
let first_char = match chars.next() {
125+
Some(c) if c.is_ascii_alphabetic() => c.to_ascii_uppercase(),
126+
_ => return s.to_string(),
127+
};
128+
129+
let mut soundex_code = String::with_capacity(4);
130+
soundex_code.push(first_char);
131+
let mut last_code = classify_char(first_char);
132+
133+
for c in chars {
134+
if soundex_code.len() >= 4 {
135+
break;
136+
}
137+
138+
if is_ignored(c) {
139+
continue;
140+
}
141+
142+
match classify_char(c) {
143+
Some(code) => {
144+
if last_code != Some(code) {
145+
soundex_code.push(code);
146+
}
147+
last_code = Some(code);
148+
}
149+
None => {
150+
last_code = None;
151+
}
152+
}
153+
}
154+
format!("{soundex_code:0<4}")
155+
}

datafusion/sqllogictest/test_files/spark/string/soundex.slt

Lines changed: 184 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,187 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
# This file was originally created by a porting script from:
19-
# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
20-
# This file is part of the implementation of the datafusion-spark function library.
21-
# For more information, please see:
22-
# https://github.com/apache/datafusion/issues/15914
23-
24-
## Original Query: SELECT soundex('Miller');
25-
## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460', 'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'}
26-
#query
27-
#SELECT soundex('Miller'::string);
18+
query T
19+
SELECT soundex('Miller');
20+
----
21+
M460
22+
23+
query T
24+
SELECT soundex(NULL);
25+
----
26+
NULL
27+
28+
query T
29+
SELECT soundex('');
30+
----
31+
(empty)
32+
33+
query T
34+
SELECT soundex('Apache Spark');
35+
----
36+
A122
37+
38+
query T
39+
SELECT soundex('123');
40+
----
41+
123
42+
43+
query T
44+
SELECT soundex('a123');
45+
----
46+
A000
47+
48+
query T
49+
SELECT soundex('Datafusion');
50+
----
51+
D312
52+
53+
query T
54+
SELECT soundex('Ashcroft');
55+
----
56+
A261
57+
58+
query T
59+
SELECT soundex('B1B');
60+
----
61+
B100
62+
63+
query T
64+
SELECT soundex('B B');
65+
----
66+
B100
67+
68+
query T
69+
SELECT soundex('BAB');
70+
----
71+
B100
72+
73+
query T
74+
SELECT soundex('#hello');
75+
----
76+
#hello
77+
78+
query T
79+
SELECT soundex(' hello');
80+
----
81+
hello
82+
83+
query T
84+
SELECT soundex('\thello');
85+
----
86+
\thello
87+
88+
query T
89+
SELECT soundex('😀hello');
90+
----
91+
😀hello
92+
93+
query T
94+
SELECT soundex('123');
95+
----
96+
123
97+
98+
query T
99+
SELECT soundex('1abc');
100+
----
101+
1abc
102+
103+
query T
104+
SELECT soundex('A');
105+
----
106+
A000
107+
108+
query T
109+
SELECT soundex('BFPV');
110+
----
111+
B000
112+
113+
query T
114+
SELECT soundex('Robert');
115+
----
116+
R163
117+
118+
query T
119+
SELECT soundex('Rupert');
120+
----
121+
R163
122+
123+
query T
124+
SELECT soundex(NULL);
125+
----
126+
NULL
127+
128+
query T
129+
SELECT soundex('');
130+
----
131+
(empty)
132+
133+
query T
134+
SELECT soundex('robert');
135+
----
136+
R163
137+
138+
query T
139+
SELECT soundex('rObErT');
140+
----
141+
R163
142+
143+
query T
144+
SELECT soundex('Müller');
145+
----
146+
M460
147+
148+
query T
149+
SELECT soundex('Abcdefghijklmnop');
150+
----
151+
A123
152+
153+
query T
154+
SELECT soundex('Lloyd');
155+
----
156+
L300
157+
158+
query T
159+
SELECT soundex('BWB');
160+
----
161+
B000
162+
163+
query T
164+
SELECT soundex('BHB');
165+
----
166+
B000
167+
168+
query T
169+
SELECT soundex('Tymczak');
170+
----
171+
T522
172+
173+
query T
174+
SELECT soundex('Aeiou');
175+
----
176+
A000
177+
178+
query T
179+
SELECT soundex('1Robert');
180+
----
181+
1Robert
182+
183+
query T
184+
SELECT soundex('Smith-Jones');
185+
----
186+
S532
187+
188+
query T
189+
SELECT soundex('#');
190+
----
191+
#
192+
193+
query T
194+
SELECT soundex('\nhello');
195+
----
196+
\nhello
197+
198+
query T
199+
SELECT concat(soundex(' '), 'Spark')
200+
----
201+
Spark

0 commit comments

Comments
 (0)