Skip to content

Commit 2404a29

Browse files
committed
GH-3511: Add JMH encoding benchmarks and fix parquet-benchmarks shaded jar
The parquet-benchmarks pom is missing the JMH annotation-processor configuration and the AppendingTransformer entries for BenchmarkList / CompilerHints. As a result, the shaded jar built from master fails at runtime with "Unable to find the resource: /META-INF/BenchmarkList". This commit: - Fixes parquet-benchmarks/pom.xml so the shaded jar is runnable: adds jmh-generator-annprocess to maven-compiler-plugin's annotation processor paths, and adds AppendingTransformer entries for META-INF/BenchmarkList and META-INF/CompilerHints to the shade plugin. - Adds nine JMH benchmark classes covering the encode/decode paths used by the pending performance optimization PRs (#3494, #3496, #3500, #3504, #3506, #3510), so reviewers can reproduce the reported numbers and detect regressions: IntEncodingBenchmark, BinaryEncodingBenchmark, ByteStreamSplitEncodingBenchmark, ByteStreamSplitDecodingBenchmark, FixedLenByteArrayEncodingBenchmark, FileReadBenchmark, FileWriteBenchmark, RowGroupFlushBenchmark, and ConcurrentReadWriteBenchmark, plus two supporting helpers: BlackHoleOutputFile and TestDataFactory. After this change the shaded jar registers 87 benchmarks; before it, the default build produced a jar that could not run at all, and even a build with a manually repaired jar registered zero benchmarks.
1 parent d96c669 commit 2404a29

12 files changed

Lines changed: 1612 additions & 0 deletions

parquet-benchmarks/pom.xml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,18 @@
8989
<plugin>
9090
<groupId>org.apache.maven.plugins</groupId>
9191
<artifactId>maven-compiler-plugin</artifactId>
92+
<configuration>
93+
<annotationProcessorPaths>
94+
<path>
95+
<groupId>org.openjdk.jmh</groupId>
96+
<artifactId>jmh-generator-annprocess</artifactId>
97+
<version>${jmh.version}</version>
98+
</path>
99+
</annotationProcessorPaths>
100+
<annotationProcessors>
101+
<annotationProcessor>org.openjdk.jmh.generators.BenchmarkProcessor</annotationProcessor>
102+
</annotationProcessors>
103+
</configuration>
92104
</plugin>
93105
<plugin>
94106
<groupId>org.apache.maven.plugins</groupId>
@@ -107,6 +119,12 @@
107119
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
108120
<mainClass>org.openjdk.jmh.Main</mainClass>
109121
</transformer>
122+
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
123+
<resource>META-INF/BenchmarkList</resource>
124+
</transformer>
125+
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
126+
<resource>META-INF/CompilerHints</resource>
127+
</transformer>
110128
</transformers>
111129
<artifactSet>
112130
<includes>
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.benchmarks;
20+
21+
import java.io.IOException;
22+
import java.nio.ByteBuffer;
23+
import java.util.Random;
24+
import java.util.concurrent.TimeUnit;
25+
import org.apache.parquet.bytes.ByteBufferInputStream;
26+
import org.apache.parquet.bytes.HeapByteBufferAllocator;
27+
import org.apache.parquet.column.Encoding;
28+
import org.apache.parquet.column.values.ValuesWriter;
29+
import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesReader;
30+
import org.apache.parquet.column.values.deltalengthbytearray.DeltaLengthByteArrayValuesWriter;
31+
import org.apache.parquet.column.values.deltastrings.DeltaByteArrayReader;
32+
import org.apache.parquet.column.values.deltastrings.DeltaByteArrayWriter;
33+
import org.apache.parquet.column.values.dictionary.DictionaryValuesWriter;
34+
import org.apache.parquet.column.values.plain.BinaryPlainValuesReader;
35+
import org.apache.parquet.column.values.plain.PlainValuesWriter;
36+
import org.apache.parquet.io.api.Binary;
37+
import org.openjdk.jmh.annotations.Benchmark;
38+
import org.openjdk.jmh.annotations.BenchmarkMode;
39+
import org.openjdk.jmh.annotations.Fork;
40+
import org.openjdk.jmh.annotations.Level;
41+
import org.openjdk.jmh.annotations.Measurement;
42+
import org.openjdk.jmh.annotations.Mode;
43+
import org.openjdk.jmh.annotations.OperationsPerInvocation;
44+
import org.openjdk.jmh.annotations.OutputTimeUnit;
45+
import org.openjdk.jmh.annotations.Param;
46+
import org.openjdk.jmh.annotations.Scope;
47+
import org.openjdk.jmh.annotations.Setup;
48+
import org.openjdk.jmh.annotations.State;
49+
import org.openjdk.jmh.annotations.Warmup;
50+
import org.openjdk.jmh.infra.Blackhole;
51+
52+
/**
53+
* Encoding-level micro-benchmarks for BINARY values.
54+
* Compares PLAIN, DELTA_BYTE_ARRAY, DELTA_LENGTH_BYTE_ARRAY, and DICTIONARY encodings
55+
* across different string lengths and cardinality patterns.
56+
*
57+
* <p>Each benchmark invocation processes {@value #VALUE_COUNT} values. Throughput is
58+
* reported per-value using {@link OperationsPerInvocation}.
59+
*/
60+
@BenchmarkMode(Mode.Throughput)
61+
@OutputTimeUnit(TimeUnit.SECONDS)
62+
@Fork(1)
63+
@Warmup(iterations = 3, time = 1)
64+
@Measurement(iterations = 5, time = 1)
65+
@State(Scope.Thread)
66+
public class BinaryEncodingBenchmark {
67+
68+
static final int VALUE_COUNT = 100_000;
69+
private static final int INIT_SLAB_SIZE = 64 * 1024;
70+
private static final int PAGE_SIZE = 4 * 1024 * 1024;
71+
private static final int MAX_DICT_BYTE_SIZE = 4 * 1024 * 1024;
72+
73+
@Param({"10", "100", "1000"})
74+
public int stringLength;
75+
76+
/** LOW = 100 distinct values; HIGH = all unique. */
77+
@Param({"LOW", "HIGH"})
78+
public String cardinality;
79+
80+
private Binary[] data;
81+
private byte[] plainEncoded;
82+
private byte[] deltaLengthEncoded;
83+
private byte[] deltaStringsEncoded;
84+
85+
@Setup(Level.Trial)
86+
public void setup() throws IOException {
87+
Random random = new Random(42);
88+
int distinct = "LOW".equals(cardinality) ? TestDataFactory.LOW_CARDINALITY_DISTINCT : 0;
89+
data = TestDataFactory.generateBinaryData(VALUE_COUNT, stringLength, distinct, random);
90+
91+
// Pre-encode data for decode benchmarks
92+
plainEncoded = encodeBinaryWith(newPlainWriter());
93+
deltaLengthEncoded = encodeBinaryWith(newDeltaLengthWriter());
94+
deltaStringsEncoded = encodeBinaryWith(newDeltaStringsWriter());
95+
}
96+
97+
private byte[] encodeBinaryWith(ValuesWriter writer) throws IOException {
98+
for (Binary v : data) {
99+
writer.writeBytes(v);
100+
}
101+
byte[] bytes = writer.getBytes().toByteArray();
102+
writer.close();
103+
return bytes;
104+
}
105+
106+
// ---- Writer factories ----
107+
108+
private static PlainValuesWriter newPlainWriter() {
109+
return new PlainValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
110+
}
111+
112+
private static DeltaLengthByteArrayValuesWriter newDeltaLengthWriter() {
113+
return new DeltaLengthByteArrayValuesWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
114+
}
115+
116+
private static DeltaByteArrayWriter newDeltaStringsWriter() {
117+
return new DeltaByteArrayWriter(INIT_SLAB_SIZE, PAGE_SIZE, new HeapByteBufferAllocator());
118+
}
119+
120+
private static DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter newDictWriter() {
121+
return new DictionaryValuesWriter.PlainBinaryDictionaryValuesWriter(
122+
MAX_DICT_BYTE_SIZE, Encoding.PLAIN_DICTIONARY, Encoding.PLAIN, new HeapByteBufferAllocator());
123+
}
124+
125+
// ---- Encode benchmarks ----
126+
127+
@Benchmark
128+
@OperationsPerInvocation(VALUE_COUNT)
129+
public byte[] encodePlain() throws IOException {
130+
return encodeBinaryWith(newPlainWriter());
131+
}
132+
133+
@Benchmark
134+
@OperationsPerInvocation(VALUE_COUNT)
135+
public byte[] encodeDeltaLengthByteArray() throws IOException {
136+
return encodeBinaryWith(newDeltaLengthWriter());
137+
}
138+
139+
@Benchmark
140+
@OperationsPerInvocation(VALUE_COUNT)
141+
public byte[] encodeDeltaByteArray() throws IOException {
142+
return encodeBinaryWith(newDeltaStringsWriter());
143+
}
144+
145+
@Benchmark
146+
@OperationsPerInvocation(VALUE_COUNT)
147+
public byte[] encodeDictionary() throws IOException {
148+
return encodeBinaryWith(newDictWriter());
149+
}
150+
151+
// ---- Decode benchmarks ----
152+
153+
@Benchmark
154+
@OperationsPerInvocation(VALUE_COUNT)
155+
public void decodePlain(Blackhole bh) throws IOException {
156+
BinaryPlainValuesReader reader = new BinaryPlainValuesReader();
157+
reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(plainEncoded)));
158+
for (int i = 0; i < VALUE_COUNT; i++) {
159+
bh.consume(reader.readBytes());
160+
}
161+
}
162+
163+
@Benchmark
164+
@OperationsPerInvocation(VALUE_COUNT)
165+
public void decodeDeltaLengthByteArray(Blackhole bh) throws IOException {
166+
DeltaLengthByteArrayValuesReader reader = new DeltaLengthByteArrayValuesReader();
167+
reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaLengthEncoded)));
168+
for (int i = 0; i < VALUE_COUNT; i++) {
169+
bh.consume(reader.readBytes());
170+
}
171+
}
172+
173+
@Benchmark
174+
@OperationsPerInvocation(VALUE_COUNT)
175+
public void decodeDeltaByteArray(Blackhole bh) throws IOException {
176+
DeltaByteArrayReader reader = new DeltaByteArrayReader();
177+
reader.initFromPage(VALUE_COUNT, ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaStringsEncoded)));
178+
for (int i = 0; i < VALUE_COUNT; i++) {
179+
bh.consume(reader.readBytes());
180+
}
181+
}
182+
}
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.benchmarks;
20+
21+
import java.io.IOException;
22+
import org.apache.parquet.io.OutputFile;
23+
import org.apache.parquet.io.PositionOutputStream;
24+
25+
/**
26+
* A no-op {@link OutputFile} that discards all written data.
27+
* Useful for isolating CPU/encoding cost from filesystem I/O in write benchmarks.
28+
*/
29+
public final class BlackHoleOutputFile implements OutputFile {
30+
31+
public static final BlackHoleOutputFile INSTANCE = new BlackHoleOutputFile();
32+
33+
private BlackHoleOutputFile() {}
34+
35+
@Override
36+
public boolean supportsBlockSize() {
37+
return false;
38+
}
39+
40+
@Override
41+
public long defaultBlockSize() {
42+
return -1L;
43+
}
44+
45+
@Override
46+
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
47+
return create(blockSizeHint);
48+
}
49+
50+
@Override
51+
public PositionOutputStream create(long blockSizeHint) {
52+
return new PositionOutputStream() {
53+
private long pos;
54+
55+
@Override
56+
public long getPos() throws IOException {
57+
return pos;
58+
}
59+
60+
@Override
61+
public void write(int b) throws IOException {
62+
++pos;
63+
}
64+
65+
@Override
66+
public void write(byte[] b, int off, int len) throws IOException {
67+
pos += len;
68+
}
69+
};
70+
}
71+
72+
@Override
73+
public String getPath() {
74+
return "/dev/null";
75+
}
76+
}

0 commit comments

Comments
 (0)