Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions bench/ctable/bench_persistency.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,10 +182,10 @@ def bench_append_file():
t_ro = blosc2.CTable.open(path, mode="r")

def bench_read_mem(t=t_mem_table):
_ = t["id"].to_numpy()
_ = t["id"][:]

def bench_read_file(t=t_ro):
_ = t["id"].to_numpy()
_ = t["id"][:]

t_m = tmin(bench_read_mem)
t_f = tmin(bench_read_file)
Expand Down
2 changes: 1 addition & 1 deletion bench/ctable/compact.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# of varying fractions of the table.

from dataclasses import dataclass
from time import time
from time import perf_counter as time

import numpy as np

Expand Down
6 changes: 3 additions & 3 deletions bench/ctable/ctable_v_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# 4. Row iteration

from dataclasses import dataclass
from time import time
from time import perf_counter as time

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -75,7 +75,7 @@ class Row:

# 2.5 Column access (full column)
t0 = time()
arr = ct["score"].to_numpy()
arr = ct["score"][:]
t_ct_col = time() - t0

t0 = time()
Expand All @@ -86,7 +86,7 @@ class Row:

# 3. Filtering
t0 = time()
result_ct = ct.where((ct["id"] > 250_000) & (ct["id"] < 750_000))
result_ct = ct.where((ct.id > 250_000) & (ct.id < 750_000))
t_ct_filter = time() - t0

t0 = time()
Expand Down
2 changes: 1 addition & 1 deletion bench/ctable/delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# int, slice, and list — with varying sizes.

from dataclasses import dataclass
from time import time
from time import perf_counter as time

import numpy as np

Expand Down
2 changes: 1 addition & 1 deletion bench/ctable/expected_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# is too small (M rows) vs correctly sized (N rows) during extend().

from dataclasses import dataclass
from time import time
from time import perf_counter as time

import numpy as np

Expand Down
2 changes: 1 addition & 1 deletion bench/ctable/extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# 3. An existing CTable (previously created from Python lists, 1M rows)

from dataclasses import dataclass
from time import time
from time import perf_counter as time

import numpy as np

Expand Down
87 changes: 35 additions & 52 deletions bench/ctable/extend_vs_append.py
Original file line number Diff line number Diff line change
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

# Benchmark: append() row-by-row vs extend() bulk insert.
#
# Compares three strategies at increasing N to find where extend() wins:
#   1. append() x N — one call per row, Pydantic path
#   2. extend() x N — extend([row]) per row, one at a time
#   3. extend() x 1 — single bulk call with all N rows

from dataclasses import dataclass
from time import perf_counter

import blosc2


@dataclass
class Row:
    # Schema for the benchmark table: one validated int key plus two
    # defaulted columns, exercising the Pydantic validation path.
    id: int = blosc2.field(blosc2.int64(ge=0))
    score: float = blosc2.field(blosc2.float64(ge=0, le=100), default=0.0)
    active: bool = blosc2.field(blosc2.bool(), default=True)


# Row counts to benchmark; spans four orders of magnitude to expose the
# crossover point where bulk extend() dominates per-row append().
SIZES = [10, 100, 1_000, 10_000, 100_000]

print(f"append() vs extend() | sizes: {SIZES}")
print()
print(f"{'N':>10} {'append×N (s)':>14} {'extend×N (s)':>14} {'extend×1 (s)':>14} {'speedup bulk':>13}")
print(f"{'─'*10} {'─'*14} {'─'*14} {'─'*14} {'─'*13}")

for N in SIZES:
    data = [[i, float(i % 100), i % 2 == 0] for i in range(N)]

    # 1. append() x N — one call per row
    ct = blosc2.CTable(Row, expected_size=N)
    t0 = perf_counter()
    for row in data:
        ct.append(row)
    t_append = perf_counter() - t0

    # 2. extend() x N — one single-row extend() per row
    ct = blosc2.CTable(Row, expected_size=N)
    t0 = perf_counter()
    for row in data:
        ct.extend([row])
    t_extend_one = perf_counter() - t0

    # 3. extend() x 1 — single bulk call with all N rows
    ct = blosc2.CTable(Row, expected_size=N)
    t0 = perf_counter()
    ct.extend(data)
    t_extend_bulk = perf_counter() - t0

    # Guard against a 0.0 timing on tiny N to avoid ZeroDivisionError.
    speedup = t_append / t_extend_bulk if t_extend_bulk > 0 else float("inf")
    print(f"{N:>10,} {t_append:>14.6f} {t_extend_one:>14.6f} {t_extend_bulk:>14.6f} {speedup:>12.1f}×")

print()
print("speedup bulk = append×N time / extend×1 time (higher is better for extend)")
191 changes: 191 additions & 0 deletions bench/ctable/indexin.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
# CTable Index Benchmark | N=1,000,000 REPS=5

> Random data: sensor_id uniform random in [0, 100,000)
> Sorted data: sensor_id = 0,0,…,1,1,…,2,2,… (clustered, ~10 rows/value)


## BUCKET

> Stores min/max per chunk. Can skip chunks whose range doesn't overlap the
> query. Only effective when data is sorted/clustered. Useless on random data.

### Range query — random data
```
──────────────────────────────────────────────────────────────────────
Random data — BUCKET index
──────────────────────────────────────────────────────────────────────
SELECTIVITY ROWS SCAN(ms) IDX(ms) SPEEDUP
────────────── ───────── ───────── ───────── ────────
0.1% 922 12.8 10.1 1.3×
1% 9,879 13.7 14.7 0.9×
5% 49,991 17.1 17.9 1.0×
10% 99,775 19.8 21.0 0.9×
25% 249,376 24.0 25.0 1.0×
50% 499,826 24.0 27.2 0.9× (slower)
75% 749,665 23.2 27.5 0.8× (slower)
──────────────────────────────────────────────────────────────────────
```

### Range query — sorted data
```
──────────────────────────────────────────────────────────────────────
Sorted data — BUCKET index
──────────────────────────────────────────────────────────────────────
SELECTIVITY ROWS SCAN(ms) IDX(ms) SPEEDUP
────────────── ───────── ───────── ───────── ────────
0.1% 990 11.9 2.5 4.8× ←
1% 9,990 11.9 2.2 5.5× ←
5% 49,990 12.0 3.1 3.9× ←
10% 99,990 12.1 5.1 2.4× ←
25% 249,990 11.7 9.3 1.3×
50% 499,990 12.3 19.0 0.6× (slower)
75% 749,990 11.9 35.9 0.3× (slower)
──────────────────────────────────────────────────────────────────────
```


## PARTIAL

> Stores exact row positions. Works on any data layout.
> Smaller index than FULL; slightly less overhead to build.

### Range query — random data
```
──────────────────────────────────────────────────────────────────────
Random data — PARTIAL index
──────────────────────────────────────────────────────────────────────
SELECTIVITY ROWS SCAN(ms) IDX(ms) SPEEDUP
────────────── ───────── ───────── ───────── ────────
0.1% 922 12.4 1.9 6.4× ←
1% 9,879 14.4 2.5 5.8× ←
5% 49,991 17.3 5.3 3.3× ←
10% 99,775 20.1 8.8 2.3× ←
25% 249,376 23.6 21.4 1.1×
50% 499,826 26.2 46.4 0.6× (slower)
75% 749,665 22.8 75.2 0.3× (slower)
──────────────────────────────────────────────────────────────────────
```

### Range query — sorted data
```
──────────────────────────────────────────────────────────────────────
Sorted data — PARTIAL index
──────────────────────────────────────────────────────────────────────
SELECTIVITY ROWS SCAN(ms) IDX(ms) SPEEDUP
────────────── ───────── ───────── ───────── ────────
0.1% 990 13.2 2.4 5.5× ←
1% 9,990 12.8 2.0 6.4× ←
5% 49,990 12.5 2.6 4.9× ←
10% 99,990 12.7 4.0 3.1× ←
25% 249,990 12.0 8.1 1.5×
50% 499,990 11.9 18.5 0.6× (slower)
75% 749,990 13.1 33.4 0.4× (slower)
──────────────────────────────────────────────────────────────────────
```

### Equality query — random data
```
VALUE ROWS SCAN(ms) IDX(ms) SPEEDUP
──────────── ────── ───────── ───────── ────────
==0 12 12.6 2.0 6.3× ←
==25,000 13 14.2 1.9 7.5× ←
==50,000 9 12.6 1.9 6.7× ←
==99,999 4 12.4 1.9 6.7× ←
```

### Equality query — sorted data
```
VALUE ROWS SCAN(ms) IDX(ms) SPEEDUP
──────────── ────── ───────── ───────── ────────
==0 10 11.8 1.9 6.3× ←
==25,000 10 11.7 1.8 6.7× ←
==50,000 10 12.0 1.7 7.0× ←
==99,999 10 12.1 1.7 7.1× ←
```


## FULL

> Stores exact row positions with full chunk coverage.
> Best query performance; larger index than PARTIAL.

### Range query — random data
```
──────────────────────────────────────────────────────────────────────
Random data — FULL index
──────────────────────────────────────────────────────────────────────
SELECTIVITY ROWS SCAN(ms) IDX(ms) SPEEDUP
────────────── ───────── ───────── ───────── ────────
0.1% 922 13.2 2.1 6.4× ←
1% 9,879 15.3 2.8 5.5× ←
5% 49,991 18.1 5.1 3.5× ←
10% 99,775 20.5 11.0 1.9×
25% 249,376 23.5 21.5 1.1×
50% 499,826 25.4 46.1 0.6× (slower)
75% 749,665 23.2 86.9 0.3× (slower)
──────────────────────────────────────────────────────────────────────
```

### Range query — sorted data
```
──────────────────────────────────────────────────────────────────────
Sorted data — FULL index
──────────────────────────────────────────────────────────────────────
SELECTIVITY ROWS SCAN(ms) IDX(ms) SPEEDUP
────────────── ───────── ───────── ───────── ────────
0.1% 990 12.0 1.9 6.4× ←
1% 9,990 12.0 2.0 6.1× ←
5% 49,990 11.5 2.8 4.1× ←
10% 99,990 12.0 4.2 2.9× ←
25% 249,990 11.9 7.8 1.5×
50% 499,990 11.8 18.5 0.6× (slower)
75% 749,990 11.5 44.5 0.3× (slower)
──────────────────────────────────────────────────────────────────────
```

### Equality query — random data
```
VALUE ROWS SCAN(ms) IDX(ms) SPEEDUP
──────────── ────── ───────── ───────── ────────
==0 12 12.1 2.5 4.8× ←
==25,000 13 12.0 2.0 6.1× ←
==50,000 9 12.4 2.0 6.2× ←
==99,999 4 12.6 2.0 6.4× ←
```

### Equality query — sorted data
```
VALUE ROWS SCAN(ms) IDX(ms) SPEEDUP
──────────── ────── ───────── ───────── ────────
==0 10 11.7 1.8 6.5× ←
==25,000 10 11.5 1.7 6.6× ←
==50,000 10 12.4 1.7 7.1× ←
==99,999 10 12.3 1.8 7.0× ←
```

### Cardinality comparison — sorted data, FULL index

> Shows how repetition level affects speedup (data always sorted).
```
CARDINALITY 0.1% sel 1% sel 5% sel 10% sel
──────────────────────────────────────────────────────────────────────
High rep (10 uniq) 9.1× 9.6× 8.9× 10.1×
Med rep (1k uniq) 8.5× 6.2× 4.3× 3.5×
Low rep (1M uniq) 6.4× 5.9× 4.2× 3.2×
──────────────────────────────────────────────────────────────────────
(speedup — higher is better)
```

### Compound filter — sorted data, FULL index

> sensor_id > X AND region == Y | region in [0,8) → ~12.5% per value
```
────────────────────────────────────────────────────────────────────────────────
QUERY ROWS NO IDX IDX:sid IDX:reg 2 IDX BEST
────────────── ──────── ───────── ───────── ───────── ───────── ────────────
0.1%+12.5% 127 14.6ms 2.6ms 15.0ms 14.4ms sid(5.6×)
1%+12.5% 1,297 14.7ms 2.6ms 15.2ms 17.2ms sid(5.7×)
5%+12.5% 6,268 16.2ms 4.5ms 16.8ms 20.3ms sid(3.6×)
10%+12.5% 12,377 19.5ms 6.2ms 19.6ms 21.0ms sid(3.2×)
────────────────────────────────────────────────────────────────────────────────
```
Loading
Loading