perf: Optimize strpos() for ASCII-only inputs (#20295)

neilconway · web-flow · commit 4ad5c3dab7cb · 2026-02-13T17:05:14.000Z
The previous implementation had a fast path for ASCII-only inputs, but it was still relatively slow. Switch to using memchr::memchr() to find the first matching byte and then check the rest of the bytes by hand. This improves performance for ASCII inputs by 2x-4x on the built-in strpos benchmarks. ## Which issue does this PR close?  - Closes #20294. ## Are these changes tested? Yes, passes unit tests and SLT. ## Are there any user-facing changes? No.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
@@ -82,6 +82,7 @@ hex = { workspace = true, optional = true }
 itertools = { workspace = true }
 log = { workspace = true }
 md-5 = { version = "^0.10.0", optional = true }
+memchr = "2.8.0"
 num-traits = { workspace = true }
 rand = { workspace = true }
 regex = { workspace = true, optional = true }
diff --git a/datafusion/functions/benches/strpos.rs b/datafusion/functions/benches/strpos.rs
@@ -27,9 +27,12 @@ use std::hint::black_box;
 use std::str::Chars;
 use std::sync::Arc;
 
-/// gen_arr(4096, 128, 0.1, 0.1, true) will generate a StringViewArray with
-/// 4096 rows, each row containing a string with 128 random characters.
-/// around 10% of the rows are null, around 10% of the rows are non-ASCII.
+/// Returns a `Vec<ColumnarValue>` with two elements: a haystack array and a
+/// needle array. Each haystack is a random string of `str_len_chars`
+/// characters. Each needle is a random contiguous substring of its
+/// corresponding haystack (i.e., the needle is always present in the haystack).
+/// Around `null_density` fraction of rows are null and `utf8_density` fraction
+/// contain non-ASCII characters; the remaining rows are ASCII-only.
 fn gen_string_array(
     n_rows: usize,
     str_len_chars: usize,
diff --git a/datafusion/functions/src/unicode/strpos.rs b/datafusion/functions/src/unicode/strpos.rs
@@ -32,6 +32,7 @@ use datafusion_expr::{
     Volatility,
 };
 use datafusion_macros::user_doc;
+use memchr::memchr;
 
 #[user_doc(
     doc_section(label = "String Functions"),
@@ -179,6 +180,31 @@ fn strpos(args: &[ArrayRef]) -> Result<ArrayRef> {
     }
 }
 
+/// Find `needle` in `haystack` using `memchr` to quickly skip to positions
+/// where the first byte matches, then verify the remaining bytes. Using
+/// string::find is slower because it has significant per-call overhead that
+/// `memchr` does not, and strpos is often invoked many times on short inputs.
+/// Returns a 1-based position, or 0 if not found.
+/// Both inputs must be ASCII-only.
+fn find_ascii_substring(haystack: &[u8], needle: &[u8]) -> usize {
+    let needle_len = needle.len();
+    let first_byte = needle[0];
+    let mut offset = 0;
+
+    while let Some(pos) = memchr(first_byte, &haystack[offset..]) {
+        let start = offset + pos;
+        if start + needle_len > haystack.len() {
+            return 0;
+        }
+        if haystack[start..start + needle_len] == *needle {
+            return start + 1;
+        }
+        offset = start + 1;
+    }
+
+    0
+}
+
 /// Returns starting index of specified substring within string, or zero if it's not present. (Same as position(substring in string), but note the reversed argument order.)
 /// strpos('high', 'ig') = 2
 /// The implementation uses UTF-8 code points as characters
@@ -198,37 +224,25 @@ where
         .zip(substring_iter)
         .map(|(string, substring)| match (string, substring) {
             (Some(string), Some(substring)) => {
-                // If only ASCII characters are present, we can use the slide window method to find
-                // the sub vector in the main vector. This is faster than string.find() method.
+                if substring.is_empty() {
+                    return T::Native::from_usize(1);
+                }
+
+                let substring_bytes = substring.as_bytes();
+                let string_bytes = string.as_bytes();
+
+                if substring_bytes.len() > string_bytes.len() {
+                    return T::Native::from_usize(0);
+                }
+
                 if ascii_only {
-                    // If the substring is empty, the result is 1.
-                    if substring.is_empty() {
-                        T::Native::from_usize(1)
-                    } else {
-                        T::Native::from_usize(
-                            string
-                                .as_bytes()
-                                .windows(substring.len())
-                                .position(|w| w == substring.as_bytes())
-                                .map(|x| x + 1)
-                                .unwrap_or(0),
-                        )
-                    }
+                    T::Native::from_usize(find_ascii_substring(
+                        string_bytes,
+                        substring_bytes,
+                    ))
                 } else {
                     // For non-ASCII, use a single-pass search that tracks both
                     // byte position and character position simultaneously
-                    if substring.is_empty() {
-                        return T::Native::from_usize(1);
-                    }
-
-                    let substring_bytes = substring.as_bytes();
-                    let string_bytes = string.as_bytes();
-
-                    if substring_bytes.len() > string_bytes.len() {
-                        return T::Native::from_usize(0);
-                    }
-
-                    // Single pass: find substring while counting characters
                     let mut char_pos = 0;
                     for (byte_idx, _) in string.char_indices() {
                         char_pos += 1;