Skip to content

Commit ae666d9

Browse files
authored
feat: Improve encoding/decoding performance for ASCII strings (#224)
* feat: Improve benchmark (#222)

  Fixes a bug in the benchmark initialization and adds a `toLowerCase` benchmark.

* fix: Benchmark initialization

  The benchmark **must** be initialized in a `@Setup` method; otherwise, `nonAsciiProb` will always be `0.0`.

* fix: Improve encoding/decoding performance for ASCII strings

  Since strings that don't require **any** percent encoding are the rule in practice, the encoding/decoding code should be optimized for this case.
1 parent 576f2c6 commit ae666d9

2 files changed

Lines changed: 87 additions & 66 deletions

File tree

src/main/java/com/github/packageurl/internal/StringUtil.java

Lines changed: 74 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,10 @@
2121
*/
2222
package com.github.packageurl.internal;
2323

24+
import static java.lang.Byte.toUnsignedInt;
25+
2426
import com.github.packageurl.ValidationException;
25-
import java.nio.ByteBuffer;
2627
import java.nio.charset.StandardCharsets;
27-
import java.util.stream.IntStream;
2828

2929
/**
3030
* String utility for validation and encoding.
@@ -35,6 +35,24 @@ public final class StringUtil {
3535

3636
private static final byte PERCENT_CHAR = '%';
3737

38+
private static final boolean[] UNRESERVED_CHARS = new boolean[128];
39+
40+
static {
41+
for (char c = '0'; c <= '9'; c++) {
42+
UNRESERVED_CHARS[c] = true;
43+
}
44+
for (char c = 'A'; c <= 'Z'; c++) {
45+
UNRESERVED_CHARS[c] = true;
46+
}
47+
for (char c = 'a'; c <= 'z'; c++) {
48+
UNRESERVED_CHARS[c] = true;
49+
}
50+
UNRESERVED_CHARS['-'] = true;
51+
UNRESERVED_CHARS['.'] = true;
52+
UNRESERVED_CHARS['_'] = true;
53+
UNRESERVED_CHARS['~'] = true;
54+
}
55+
3856
private StringUtil() {
3957
throw new AssertionError("Cannot instantiate StringUtil");
4058
}
@@ -48,21 +66,16 @@ private StringUtil() {
4866
* @since 2.0.0
4967
*/
5068
public static String toLowerCase(String s) {
51-
if (s == null) {
52-
return null;
53-
}
54-
5569
int pos = indexOfFirstUpperCaseChar(s);
5670

5771
if (pos == -1) {
5872
return s;
5973
}
6074

6175
char[] chars = s.toCharArray();
62-
int length = chars.length;
6376

64-
for (int i = pos; i < length; i++) {
65-
chars[i] = (char) toLowerCase(chars[i]);
77+
for (int length = chars.length; pos < length; pos++) {
78+
chars[pos] = (char) toLowerCase(chars[pos]);
6679
}
6780

6881
return new String(chars);
@@ -77,26 +90,22 @@ public static String toLowerCase(String s) {
7790
* @since 2.0.0
7891
*/
7992
public static String percentDecode(final String source) {
80-
if (source == null || source.isEmpty()) {
93+
if (source.indexOf(PERCENT_CHAR) == -1) {
8194
return source;
8295
}
8396

8497
byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
85-
int i = indexOfFirstPercentChar(bytes);
86-
87-
if (i == -1) {
88-
return source;
89-
}
9098

99+
int readPos = indexOfFirstPercentChar(bytes);
100+
int writePos = readPos;
91101
int length = bytes.length;
92-
int writePos = i;
93-
while (i < length) {
94-
byte b = bytes[i];
102+
while (readPos < length) {
103+
byte b = bytes[readPos];
95104
if (b == PERCENT_CHAR) {
96-
bytes[writePos++] = percentDecode(bytes, i++);
97-
i += 2;
105+
bytes[writePos++] = percentDecode(bytes, readPos++);
106+
readPos += 2;
98107
} else {
99-
bytes[writePos++] = bytes[i++];
108+
bytes[writePos++] = bytes[readPos++];
100109
}
101110
}
102111

@@ -112,34 +121,29 @@ public static String percentDecode(final String source) {
112121
* @since 2.0.0
113122
*/
114123
public static String percentEncode(final String source) {
115-
if (source == null || source.isEmpty()) {
116-
return source;
117-
}
118-
byte[] bytes = source.getBytes(StandardCharsets.UTF_8);
119-
int start = indexOfFirstNonAsciiChar(bytes);
120-
if (start == -1) {
124+
if (!shouldEncode(source)) {
121125
return source;
122126
}
123-
int length = bytes.length;
124-
ByteBuffer buffer = ByteBuffer.allocate(start + ((length - start) * 3));
125-
if (start != 0) {
126-
buffer.put(bytes, 0, start);
127-
}
128127

129-
for (int i = start; i < length; i++) {
130-
byte b = bytes[i];
131-
if (shouldEncode(b)) {
132-
byte b1 = (byte) Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16));
133-
byte b2 = (byte) Character.toUpperCase(Character.forDigit(b & 0xF, 16));
134-
buffer.put(PERCENT_CHAR);
135-
buffer.put(b1);
136-
buffer.put(b2);
128+
byte[] src = source.getBytes(StandardCharsets.UTF_8);
129+
byte[] dest = new byte[3 * src.length];
130+
131+
int writePos = 0;
132+
for (byte b : src) {
133+
if (shouldEncode(toUnsignedInt(b))) {
134+
dest[writePos++] = PERCENT_CHAR;
135+
dest[writePos++] = toHexDigit(b >> 4);
136+
dest[writePos++] = toHexDigit(b);
137137
} else {
138-
buffer.put(b);
138+
dest[writePos++] = b;
139139
}
140140
}
141141

142-
return new String(buffer.array(), 0, buffer.position(), StandardCharsets.UTF_8);
142+
return new String(dest, 0, writePos, StandardCharsets.UTF_8);
143+
}
144+
145+
private static byte toHexDigit(int b) {
146+
return (byte) Character.toUpperCase(Character.forDigit(b & 0xF, 16));
143147
}
144148

145149
/**
@@ -178,14 +182,34 @@ public static boolean isValidCharForKey(int c) {
178182
return (isAlphaNumeric(c) || c == '.' || c == '_' || c == '-');
179183
}
180184

185+
/**
186+
* Returns {@code true} if the character is in the unreserved RFC 3986 set.
187+
* <p>
188+
* <strong>Warning</strong>: Profiling shows that the performance of {@link #percentEncode} relies heavily on this method.
189+
* Modify with care.
190+
* </p>
191+
* @param c non-negative integer.
192+
*/
181193
private static boolean isUnreserved(int c) {
182-
return (isValidCharForKey(c) || c == '~');
194+
return c < 128 && UNRESERVED_CHARS[c];
183195
}
184196

197+
/**
198+
* @param c non-negative integer
199+
*/
185200
private static boolean shouldEncode(int c) {
186201
return !isUnreserved(c);
187202
}
188203

204+
private static boolean shouldEncode(String s) {
205+
for (int i = 0, length = s.length(); i < length; i++) {
206+
if (shouldEncode(s.charAt(i))) {
207+
return true;
208+
}
209+
}
210+
return false;
211+
}
212+
189213
private static boolean isAlpha(int c) {
190214
return (isLowerCase(c) || isUpperCase(c));
191215
}
@@ -195,7 +219,7 @@ private static boolean isAlphaNumeric(int c) {
195219
}
196220

197221
private static boolean isUpperCase(int c) {
198-
return (c >= 'A' && c <= 'Z');
222+
return 'A' <= c && c <= 'Z';
199223
}
200224

201225
private static boolean isLowerCase(int c) {
@@ -207,34 +231,21 @@ private static int toLowerCase(int c) {
207231
}
208232

209233
private static int indexOfFirstUpperCaseChar(String s) {
210-
int length = s.length();
211-
212-
for (int i = 0; i < length; i++) {
234+
for (int i = 0, length = s.length(); i < length; i++) {
213235
if (isUpperCase(s.charAt(i))) {
214236
return i;
215237
}
216238
}
217-
218239
return -1;
219240
}
220241

221-
private static int indexOfFirstNonAsciiChar(byte[] bytes) {
222-
int length = bytes.length;
223-
int start = -1;
224-
for (int i = 0; i < length; i++) {
225-
if (shouldEncode(bytes[i])) {
226-
start = i;
227-
break;
242+
private static int indexOfFirstPercentChar(final byte[] bytes) {
243+
for (int i = 0, length = bytes.length; i < length; i++) {
244+
if (bytes[i] == PERCENT_CHAR) {
245+
return i;
228246
}
229247
}
230-
return start;
231-
}
232-
233-
private static int indexOfFirstPercentChar(final byte[] bytes) {
234-
return IntStream.range(0, bytes.length)
235-
.filter(i -> bytes[i] == PERCENT_CHAR)
236-
.findFirst()
237-
.orElse(-1);
248+
return -1;
238249
}
239250

240251
private static byte percentDecode(final byte[] bytes, final int start) {

src/test/java/com/github/packageurl/internal/StringUtilBenchmark.java

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.openjdk.jmh.annotations.OutputTimeUnit;
3232
import org.openjdk.jmh.annotations.Param;
3333
import org.openjdk.jmh.annotations.Scope;
34+
import org.openjdk.jmh.annotations.Setup;
3435
import org.openjdk.jmh.annotations.State;
3536
import org.openjdk.jmh.infra.Blackhole;
3637

@@ -62,8 +63,14 @@ public class StringUtilBenchmark {
6263
@Param({"0", "0.1", "0.5"})
6364
private double nonAsciiProb;
6465

65-
private final String[] decodedData = createDecodedData();
66-
private final String[] encodedData = encodeData(decodedData);
66+
private String[] decodedData;
67+
private String[] encodedData;
68+
69+
@Setup
70+
public void setup() {
71+
decodedData = createDecodedData();
72+
encodedData = encodeData(decodedData);
73+
}
6774

6875
private String[] createDecodedData() {
6976
Random random = new Random();
@@ -87,7 +94,10 @@ private static String[] encodeData(String[] decodedData) {
8794
for (int i = 0; i < encodedData.length; i++) {
8895
encodedData[i] = StringUtil.percentEncode(decodedData[i]);
8996
if (!StringUtil.percentDecode(encodedData[i]).equals(decodedData[i])) {
90-
throw new RuntimeException("Invalid implementation of `percentEncode` and `percentDecode`.");
97+
throw new RuntimeException(
98+
"Invalid implementation of `percentEncode` and `percentDecode`.\nOriginal data: "
99+
+ encodedData[i] + "\nEncoded and decoded data: "
100+
+ StringUtil.percentDecode(encodedData[i]));
91101
}
92102
}
93103
return encodedData;

0 commit comments

Comments
 (0)