Skip to content

Commit 7acbe03

Browse files
authored
test: add tests for spill file sizes to verify View GC (#21750)
## Summary This PR adds unit tests to verify that `StringView` and `BinaryView` arrays are correctly compacted (garbage collected) before being spilled to disk. The tests address [Issue #21683](#21683) by: 1. Creating "bloated" batches with large underlying buffers. 2. Slicing them to a small number of rows (1%). 3. Spilling them to disk. 4. Asserting that the resulting file size is small (proving that GC removed the unused "ghost" data). ## Test plan - Run `cargo test -p datafusion-physical-plan --lib spill::tests::test_spill_file_size_gc_verification_string_view` - Run `cargo test -p datafusion-physical-plan --lib spill::tests::test_spill_file_size_gc_verification_binary_view` Made with [Cursor](https://cursor.com)
1 parent a311d14 commit 7acbe03

File tree

1 file changed

+88
-0
lines changed
  • datafusion/physical-plan/src/spill

1 file changed

+88
-0
lines changed

datafusion/physical-plan/src/spill/mod.rs

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,4 +1421,92 @@ mod tests {
14211421

14221422
Ok(())
14231423
}
1424+
1425+
#[tokio::test]
1426+
async fn test_spill_file_size_gc_verification_string_view() -> Result<()> {
1427+
use arrow::array::StringViewArray;
1428+
use std::fs;
1429+
1430+
// 1. Setup bloated data (large buffers)
1431+
let num_rows = 1000;
1432+
let string_array: StringViewArray = (0..num_rows)
1433+
.map(|i| Some(format!("this_is_a_long_string_to_ensure_it_is_not_inlined_and_causes_waste_{i}")))
1434+
.collect();
1435+
let schema = Arc::new(Schema::new(vec![Field::new(
1436+
"s",
1437+
DataType::Utf8View,
1438+
false,
1439+
)]));
1440+
let batch = RecordBatch::try_new(
1441+
Arc::clone(&schema),
1442+
vec![Arc::new(string_array.clone()) as ArrayRef],
1443+
)?;
1444+
1445+
// 2. Slice it heavily (1% of the data)
1446+
let sliced_batch = batch.slice(0, 10);
1447+
1448+
// 3. Spill to disk using SpillManager
1449+
let env = Arc::new(RuntimeEnv::default());
1450+
let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
1451+
let spill_manager = SpillManager::new(env, metrics, schema);
1452+
let spill_file = spill_manager
1453+
.spill_record_batch_and_finish(&[sliced_batch], "TestGC")?
1454+
.unwrap();
1455+
1456+
// 4. Check file size on disk
1457+
let file_size = fs::metadata(spill_file.path())?.len();
1458+
1459+
// The original buffer size is around 70KB.
1460+
// Without GC, the spill file would be > 70KB.
1461+
// With GC, it should be much smaller (only 10 rows of ~70 bytes each + metadata).
1462+
assert!(
1463+
file_size < 10 * 1024,
1464+
"Spill file is too large ({file_size} bytes)! GC might not be working."
1465+
);
1466+
1467+
Ok(())
1468+
}
1469+
1470+
#[tokio::test]
1471+
async fn test_spill_file_size_gc_verification_binary_view() -> Result<()> {
1472+
use arrow::array::BinaryViewArray;
1473+
use std::fs;
1474+
1475+
// 1. Setup bloated data (large buffers)
1476+
let num_rows = 1000;
1477+
let binary_array: BinaryViewArray =
1478+
(0..num_rows).map(|i| Some(vec![i as u8; 100])).collect();
1479+
let schema = Arc::new(Schema::new(vec![Field::new(
1480+
"b",
1481+
DataType::BinaryView,
1482+
false,
1483+
)]));
1484+
let batch = RecordBatch::try_new(
1485+
Arc::clone(&schema),
1486+
vec![Arc::new(binary_array.clone()) as ArrayRef],
1487+
)?;
1488+
1489+
// 2. Slice it heavily (1% of the data)
1490+
let sliced_batch = batch.slice(0, 10);
1491+
1492+
// 3. Spill to disk using SpillManager
1493+
let env = Arc::new(RuntimeEnv::default());
1494+
let metrics = SpillMetrics::new(&ExecutionPlanMetricsSet::new(), 0);
1495+
let spill_manager = SpillManager::new(env, metrics, schema);
1496+
let spill_file = spill_manager
1497+
.spill_record_batch_and_finish(&[sliced_batch], "TestGCBinary")?
1498+
.unwrap();
1499+
1500+
// 4. Check file size on disk
1501+
let file_size = fs::metadata(spill_file.path())?.len();
1502+
1503+
// Original buffer is 100KB.
1504+
// With GC, it should be much smaller.
1505+
assert!(
1506+
file_size < 10 * 1024,
1507+
"Spill file is too large ({file_size} bytes)! GC might not be working."
1508+
);
1509+
1510+
Ok(())
1511+
}
14241512
}

0 commit comments

Comments
 (0)