Skip to content

Commit 0450e24

Browse files
authored
Merge branch 'main' into fix/binary-map-initial-map-size
2 parents 0873909 + c17c87c commit 0450e24

237 files changed

Lines changed: 8821 additions & 2777 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.asf.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,12 @@ github:
5151
main:
5252
required_pull_request_reviews:
5353
required_approving_review_count: 1
54+
required_status_checks:
55+
contexts:
56+
- "Check License Header"
57+
- "Use prettier to check formatting of documents"
58+
- "Validate required_status_checks in .asf.yaml"
59+
- "Spell Check with Typos"
5460
# needs to be updated as part of the release process
5561
# .asf.yaml doesn't support wildcard branch protection rules, only exact branch names
5662
# https://github.com/apache/infrastructure-asfyaml?tab=readme-ov-file#branch-protection

.github/workflows/dev.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,14 @@ jobs:
5151
# if you encounter an error, see the instructions inside the script
5252
run: ci/scripts/doc_prettier_check.sh
5353

54+
asf-yaml-check:
55+
name: Validate required_status_checks in .asf.yaml
56+
runs-on: ubuntu-latest
57+
steps:
58+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
59+
- run: pip install pyyaml
60+
- run: python3 ci/scripts/check_asf_yaml_status_checks.py
61+
5462
typos:
5563
name: Spell Check with Typos
5664
runs-on: ubuntu-latest

.github/workflows/extended.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ jobs:
6464
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=8,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
6565
# note: do not use the amd64/rust container here, in order to preserve disk space
6666
steps:
67-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
67+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
6868
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
6969
with:
7070
ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
@@ -91,7 +91,7 @@ jobs:
9191
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=32,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
9292
# note: do not use the amd64/rust container here, in order to preserve disk space
9393
steps:
94-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
94+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
9595
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
9696
with:
9797
ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
@@ -138,7 +138,7 @@ jobs:
138138
container:
139139
image: amd64/rust
140140
steps:
141-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
141+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
142142
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
143143
with:
144144
ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push
@@ -160,7 +160,7 @@ jobs:
160160
container:
161161
image: amd64/rust
162162
steps:
163-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
163+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
164164
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
165165
with:
166166
ref: ${{ github.event.inputs.pr_head_sha }} # will be empty if triggered by push

.github/workflows/rust.yml

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ on:
3838
- "**.md"
3939
- ".github/ISSUE_TEMPLATE/**"
4040
- ".github/pull_request_template.md"
41-
merge_group:
4241
# manual trigger
4342
# https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
4443
workflow_dispatch:
@@ -51,7 +50,7 @@ jobs:
5150
container:
5251
image: amd64/rust
5352
steps:
54-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
53+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
5554
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
5655
- name: Setup Rust toolchain
5756
uses: ./.github/actions/setup-builder
@@ -142,7 +141,7 @@ jobs:
142141
container:
143142
image: amd64/rust
144143
steps:
145-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
144+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
146145
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
147146
- name: Setup Rust toolchain
148147
uses: ./.github/actions/setup-builder
@@ -174,7 +173,7 @@ jobs:
174173
container:
175174
image: amd64/rust
176175
steps:
177-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
176+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
178177
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
179178
- name: Setup Rust toolchain
180179
uses: ./.github/actions/setup-builder
@@ -277,7 +276,7 @@ jobs:
277276
volumes:
278277
- /usr/local:/host/usr/local
279278
steps:
280-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
279+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
281280
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
282281
with:
283282
submodules: true
@@ -324,7 +323,7 @@ jobs:
324323
needs: linux-build-lib
325324
runs-on: ${{ github.repository_owner == 'apache' && format('runs-on={0},family=m8a+m7a+c8a,cpu=16,image=ubuntu24-full-x64,extras=s3-cache,disk=large,tag=datafusion', github.run_id) || 'ubuntu-latest' }}
326325
steps:
327-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
326+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
328327
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
329328
with:
330329
submodules: true
@@ -356,7 +355,7 @@ jobs:
356355
container:
357356
image: amd64/rust
358357
steps:
359-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
358+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
360359
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
361360
with:
362361
submodules: true
@@ -387,7 +386,7 @@ jobs:
387386
container:
388387
image: amd64/rust
389388
steps:
390-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
389+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
391390
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
392391
with:
393392
submodules: true
@@ -409,7 +408,7 @@ jobs:
409408
container:
410409
image: amd64/rust
411410
steps:
412-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
411+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
413412
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
414413
- name: Setup Rust toolchain
415414
uses: ./.github/actions/setup-builder
@@ -450,7 +449,7 @@ jobs:
450449
container:
451450
image: amd64/rust
452451
steps:
453-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
452+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
454453
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
455454
with:
456455
submodules: true
@@ -498,7 +497,7 @@ jobs:
498497
--health-timeout 5s
499498
--health-retries 5
500499
steps:
501-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
500+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
502501
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
503502
with:
504503
submodules: true
@@ -523,7 +522,7 @@ jobs:
523522
container:
524523
image: amd64/rust
525524
steps:
526-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
525+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
527526
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
528527
with:
529528
submodules: true
@@ -654,7 +653,7 @@ jobs:
654653
container:
655654
image: amd64/rust
656655
steps:
657-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
656+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
658657
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
659658
with:
660659
submodules: true
@@ -701,7 +700,7 @@ jobs:
701700
container:
702701
image: amd64/rust
703702
steps:
704-
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # v2.0.3
703+
- uses: runs-on/action@742bf56072eb4845a0f94b3394673e4903c90ff0 # v2.1.0
705704
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
706705
with:
707706
submodules: true

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,3 +78,6 @@ datafusion-examples/examples/datafusion-examples/
7878

7979
# Samply profile data
8080
profile.json.gz
81+
82+
# Claude Code personal settings
83+
.claude/settings.local.json

Cargo.lock

Lines changed: 9 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ liblzma = { version = "0.4.6", features = ["static"] }
170170
log = "^0.4"
171171
memchr = "2.8.0"
172172
num-traits = { version = "0.2" }
173-
object_store = { version = "0.13.1", default-features = false }
173+
object_store = { version = "0.13.2", default-features = false }
174174
parking_lot = "0.12"
175175
parquet = { version = "58.1.0", default-features = false, features = [
176176
"arrow",

benchmarks/bench.sh

Lines changed: 43 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -314,8 +314,7 @@ main() {
314314
data_tpch "1" "parquet"
315315
;;
316316
sort_pushdown|sort_pushdown_sorted)
317-
# same data as for tpch
318-
data_tpch "1" "parquet"
317+
data_sort_pushdown
319318
;;
320319
sort_tpch)
321320
# same data as for tpch
@@ -1085,19 +1084,57 @@ run_external_aggr() {
10851084
}
10861085

10871086
# Runs the sort pushdown benchmark (without WITH ORDER)
1087+
# Generates sort pushdown benchmark data: TPC-H lineitem with 3 parts,
1088+
# renamed so alphabetical order does NOT match sort key order.
1089+
# This forces the sort pushdown optimizer to reorder files by statistics.
1090+
#
1091+
# tpchgen produces 3 sorted, non-overlapping parquet files:
1092+
# lineitem.1.parquet: l_orderkey 1 ~ 2M (lowest keys)
1093+
# lineitem.2.parquet: l_orderkey 2M ~ 4M
1094+
# lineitem.3.parquet: l_orderkey 4M ~ 6M (highest keys)
1095+
#
1096+
# We rename them so alphabetical order is reversed:
1097+
# a_part3.parquet (highest keys, sorts first alphabetically)
1098+
# b_part2.parquet
1099+
# c_part1.parquet (lowest keys, sorts last alphabetically)
1100+
data_sort_pushdown() {
1101+
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown/lineitem"
1102+
if [ -d "${SORT_PUSHDOWN_DIR}" ] && [ "$(ls -A ${SORT_PUSHDOWN_DIR}/*.parquet 2>/dev/null)" ]; then
1103+
echo "Sort pushdown data already exists at ${SORT_PUSHDOWN_DIR}"
1104+
return
1105+
fi
1106+
1107+
echo "Generating sort pushdown benchmark data (3 parts with reversed naming)..."
1108+
1109+
TEMP_DIR="${DATA_DIR}/sort_pushdown_temp"
1110+
mkdir -p "${TEMP_DIR}" "${SORT_PUSHDOWN_DIR}"
1111+
1112+
tpchgen-cli --scale-factor 1 --format parquet --parquet-compression='ZSTD(1)' --parts=3 --output-dir "${TEMP_DIR}"
1113+
1114+
# Rename: reverse alphabetical order vs key order
1115+
mv "${TEMP_DIR}/lineitem/lineitem.3.parquet" "${SORT_PUSHDOWN_DIR}/a_part3.parquet"
1116+
mv "${TEMP_DIR}/lineitem/lineitem.2.parquet" "${SORT_PUSHDOWN_DIR}/b_part2.parquet"
1117+
mv "${TEMP_DIR}/lineitem/lineitem.1.parquet" "${SORT_PUSHDOWN_DIR}/c_part1.parquet"
1118+
1119+
rm -rf "${TEMP_DIR}"
1120+
1121+
echo "Sort pushdown data generated at ${SORT_PUSHDOWN_DIR}"
1122+
ls -la "${SORT_PUSHDOWN_DIR}"
1123+
}
1124+
10881125
run_sort_pushdown() {
1089-
TPCH_DIR="${DATA_DIR}/tpch_sf1"
1126+
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
10901127
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown.json"
10911128
echo "Running sort pushdown benchmark (no WITH ORDER)..."
1092-
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
1129+
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
10931130
}
10941131

10951132
# Runs the sort pushdown benchmark with WITH ORDER (enables sort elimination)
10961133
run_sort_pushdown_sorted() {
1097-
TPCH_DIR="${DATA_DIR}/tpch_sf1"
1134+
SORT_PUSHDOWN_DIR="${DATA_DIR}/sort_pushdown"
10981135
RESULTS_FILE="${RESULTS_DIR}/sort_pushdown_sorted.json"
10991136
echo "Running sort pushdown benchmark (with WITH ORDER)..."
1100-
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${TPCH_DIR}" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
1137+
debug_run $CARGO_COMMAND --bin dfbench -- sort-pushdown --sorted --iterations 5 --path "${SORT_PUSHDOWN_DIR}" --queries-path "${SCRIPT_DIR}/queries/sort_pushdown" -o "${RESULTS_FILE}" ${QUERY_ARG} ${LATENCY_ARG}
11011138
}
11021139

11031140
# Runs the sort integration benchmark

benchmarks/queries/clickbench/README.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,22 @@ Results look like
228228
Elapsed 30.195 seconds.
229229
```
230230

231+
232+
### Q9-Q12: FIRST_VALUE Aggregation Performance
233+
234+
These queries test the performance of the `FIRST_VALUE` aggregation function with different data types and grouping cardinalities.
235+
236+
| Query | `FIRST_VALUE` Column | Column Type | Group By Column | Group By Type | Number of Groups |
237+
|-------|----------------------|-------------|-----------------|---------------|------------------|
238+
| Q9 | `URL` | `Utf8` | `UserID` | `Int64` | 17,630,976 |
239+
| Q10 | `URL` | `Utf8` | `OS` | `Int16` | 91 |
240+
| Q11 | `WatchID` | `Int64` | `UserID` | `Int64` | 17,630,976 |
241+
| Q12 | `WatchID` | `Int64` | `OS` | `Int16` | 91 |
242+
243+
244+
245+
246+
231247
## Data Notes
232248

233249
Here are some interesting statistics about the data used in the queries
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
-- Must be set for the ClickBench hits_partitioned dataset. See https://github.com/apache/datafusion/issues/16591
2+
-- set datafusion.execution.parquet.binary_as_string = true
3+
4+
SELECT MAX(len) FROM (
5+
SELECT LENGTH(FIRST_VALUE("URL" ORDER BY "EventTime")) as len
6+
FROM hits
7+
GROUP BY "OS"
8+
);

0 commit comments

Comments
 (0)