Commit e3b5355

feat: add ndjson tests with multiple file ranges and object store chunk sizes
1 parent f686059 commit e3b5355

1 file changed

Lines changed: 328 additions & 0 deletions

File tree

datafusion/datasource-json/src/source.rs

@@ -667,10 +667,13 @@ pub async fn plan_to_json(
 #[cfg(test)]
 mod tests {
     use super::*;
+    use arrow::array::{Int64Array, StringArray};
+    use arrow::compute;
     use arrow::datatypes::{DataType, Field, Schema};
     use bytes::Bytes;
     use datafusion_datasource::FileRange;
     use futures::TryStreamExt;
+    use object_store::chunked::ChunkedStore;
     use object_store::memory::InMemory;
     use object_store::path::Path;
     use object_store::{ObjectStoreExt, PutPayload};
@@ -683,6 +686,24 @@ mod tests {
         ]))
     }
 
+    fn get_chunked_stores() -> Vec<Arc<ChunkedStore>> {
+        let object_store = Arc::new(InMemory::new());
+        let mut chunked_stores = Vec::new();
+        // use usize::MAX as the chunk_size to effectively keep the original store
+        for chunk_size in [usize::MAX, 1, 2, 3, 4, 8, 13, 16] {
+            chunked_stores.push(Arc::new(ChunkedStore::new(
+                Arc::new(Box::new(object_store.clone()) as Box<dyn ObjectStore>),
+                chunk_size,
+            )));
+        }
+
+        chunked_stores
+    }
+
+    fn get_partition_splits() -> Vec<usize> {
+        vec![1usize, 2, 3, 5, 7, 10]
+    }
+
     #[tokio::test]
     async fn test_json_array_from_file() -> Result<()> {
         // Test reading JSON array format from a file
@@ -922,6 +943,313 @@ mod tests {
         Ok(())
     }
 
+    /// Opens each byte-range partition of `path` in `store` and collects all
+    /// record batches produced across every partition.
+    async fn collect_partitioned_batches(
+        store: Arc<dyn ObjectStore>,
+        path: &Path,
+        file_size: u64,
+        num_partitions: usize,
+    ) -> Result<Vec<RecordBatch>> {
+        let mut all_batches = Vec::new();
+        for p in 0..num_partitions {
+            let start = (p as u64 * file_size) / num_partitions as u64;
+            let end = ((p as u64 + 1) * file_size) / num_partitions as u64;
+
+            let meta = store.head(path).await?;
+            let mut file = PartitionedFile::new(path.to_string(), meta.size);
+            file.range = Some(FileRange {
+                start: start as i64,
+                end: end as i64,
+            });
+
+            let opener = JsonOpener::new(
+                1024,
+                test_schema(),
+                FileCompressionType::UNCOMPRESSED,
+                Arc::clone(&store),
+                true, // NDJSON
+            );
+
+            let stream = opener.open(file)?.await?;
+            let batches: Vec<_> = stream.try_collect().await?;
+            all_batches.extend(batches);
+        }
+        Ok(all_batches)
+    }
+
+    /// Concatenates `batches` and returns a single batch sorted ascending by
+    /// the first (id) column.
+    fn concat_and_sort_by_id(batches: &[RecordBatch]) -> Result<RecordBatch> {
+        let schema = test_schema();
+        let combined = compute::concat_batches(&schema, batches)?;
+        let indices = compute::sort_to_indices(combined.column(0), None, None)?;
+        let sorted_cols: Vec<_> = combined
+            .columns()
+            .iter()
+            .map(|col| compute::take(col.as_ref(), &indices, None))
+            .collect::<std::result::Result<_, _>>()?;
+        Ok(RecordBatch::try_new(schema, sorted_cols)?)
+    }
+
+    #[tokio::test]
+    async fn test_ndjson_partitioned() -> Result<()> {
+        // Build an NDJSON file with a known number of rows.
+        let num_rows: usize = 20;
+        let mut ndjson = String::new();
+        for i in 0..num_rows {
+            ndjson.push_str(&format!("{{\"id\": {i}, \"name\": \"user{i}\"}}\n"));
+        }
+        let ndjson_bytes = Bytes::from(ndjson);
+        let file_size = ndjson_bytes.len() as u64;
+        let path = Path::from("test_partitioned.ndjson");
+
+        for store in get_chunked_stores() {
+            let store = Arc::clone(&store) as Arc<dyn ObjectStore>;
+            store
+                .put(&path, PutPayload::from(ndjson_bytes.clone()))
+                .await?;
+
+            for num_partitions in get_partition_splits() {
+                let batches = collect_partitioned_batches(
+                    Arc::clone(&store),
+                    &path,
+                    file_size,
+                    num_partitions,
+                )
+                .await?;
+
+                let total: usize = batches.iter().map(|b| b.num_rows()).sum();
+                assert_eq!(
+                    total, num_rows,
+                    "Expected {num_rows} rows with {num_partitions} partitions"
+                );
+
+                let result = concat_and_sort_by_id(&batches)?;
+                let ids = result
+                    .column(0)
+                    .as_any()
+                    .downcast_ref::<Int64Array>()
+                    .unwrap();
+                let names = result
+                    .column(1)
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .unwrap();
+                for i in 0..num_rows {
+                    assert_eq!(
+                        ids.value(i),
+                        i as i64,
+                        "id mismatch at row {i} with {num_partitions} partitions"
+                    );
+                    assert_eq!(
+                        names.value(i),
+                        format!("user{i}"),
+                        "name mismatch at row {i} with {num_partitions} partitions"
+                    );
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_ndjson_partitioned_uneven_lines() -> Result<()> {
+        // Lines of deliberately varying lengths so byte-range boundaries are
+        // more likely to land in the middle of a line.
+        let rows: &[(&str, &str)] = &[
+            ("1", "alice"),
+            ("2", "bob-with-a-longer-name"),
+            ("3", "charlie"),
+            ("4", "x"),
+            ("5", "diana-has-an-even-longer-name-here"),
+            ("6", "ed"),
+            ("7", "francesca"),
+            ("8", "g"),
+            ("9", "hector-the-magnificent"),
+            ("10", "isabella"),
+        ];
+        let num_rows = rows.len();
+
+        let mut ndjson = String::new();
+        for (id, name) in rows {
+            ndjson.push_str(&format!("{{\"id\": {id}, \"name\": \"{name}\"}}\n"));
+        }
+        let ndjson_bytes = Bytes::from(ndjson);
+        let file_size = ndjson_bytes.len() as u64;
+        let path = Path::from("test_partitioned_uneven.ndjson");
+
+        for store in get_chunked_stores() {
+            let store = Arc::clone(&store) as Arc<dyn ObjectStore>;
+            store
+                .put(&path, PutPayload::from(ndjson_bytes.clone()))
+                .await?;
+
+            for num_partitions in get_partition_splits() {
+                let batches = collect_partitioned_batches(
+                    Arc::clone(&store),
+                    &path,
+                    file_size,
+                    num_partitions,
+                )
+                .await?;
+
+                let total: usize = batches.iter().map(|b| b.num_rows()).sum();
+                assert_eq!(
+                    total, num_rows,
+                    "Expected {num_rows} rows with {num_partitions} partitions"
+                );
+
+                let result = concat_and_sort_by_id(&batches)?;
+                let ids = result
+                    .column(0)
+                    .as_any()
+                    .downcast_ref::<Int64Array>()
+                    .unwrap();
+                let names = result
+                    .column(1)
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .unwrap();
+                for (i, (expected_id, expected_name)) in rows.iter().enumerate() {
+                    assert_eq!(
+                        ids.value(i),
+                        expected_id.parse::<i64>().unwrap(),
+                        "id mismatch at row {i} with {num_partitions} partitions"
+                    );
+                    assert_eq!(
+                        names.value(i),
+                        *expected_name,
+                        "name mismatch at row {i} with {num_partitions} partitions"
+                    );
+                }
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_ndjson_partitioned_single_entry() -> Result<()> {
+        // A single JSON object with no trailing newline. No matter how many
+        // byte-range partitions the file is split into, exactly one row must
+        // be produced in total.
+        let ndjson = r#"{"id": 1, "name": "alice"}"#;
+        let ndjson_bytes = Bytes::from(ndjson);
+        let file_size = ndjson_bytes.len() as u64;
+        let path = Path::from("test_single_entry.ndjson");
+
+        for store in get_chunked_stores() {
+            let store = Arc::clone(&store) as Arc<dyn ObjectStore>;
+            store
+                .put(&path, PutPayload::from(ndjson_bytes.clone()))
+                .await?;
+
+            for num_partitions in get_partition_splits() {
+                let batches = collect_partitioned_batches(
+                    Arc::clone(&store),
+                    &path,
+                    file_size,
+                    num_partitions,
+                )
+                .await?;
+
+                let total: usize = batches.iter().map(|b| b.num_rows()).sum();
+                assert_eq!(
+                    total, 1,
+                    "Expected exactly 1 row with {num_partitions} partitions"
+                );
+
+                let result = concat_and_sort_by_id(&batches)?;
+                let ids = result
+                    .column(0)
+                    .as_any()
+                    .downcast_ref::<Int64Array>()
+                    .unwrap();
+                let names = result
+                    .column(1)
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .unwrap();
+                assert_eq!(ids.value(0), 1);
+                assert_eq!(names.value(0), "alice");
+            }
+        }
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_ndjson_partitioned_overflow_stream() -> Result<()> {
+        // Force the overflow_stream code path by making row 1's line longer
+        // than END_SCAN_LOOKAHEAD. When a partition boundary lands inside
+        // that line, the initial bounded fetch covers
+        //     [fetch_start, raw_end + END_SCAN_LOOKAHEAD)
+        // which does not reach the line's newline. overflow_stream then
+        // issues successive END_SCAN_LOOKAHEAD-sized GETs until the newline
+        // is found.
+        //
+        // With N=2 partitions:
+        //     raw_end           ≈ file_size / 2
+        //     initial_fetch_end = raw_end + END_SCAN_LOOKAHEAD
+        //     row-1 newline     ≈ 2 * END_SCAN_LOOKAHEAD + overhead
+        //                       > initial_fetch_end ✓
+        let long_name = "x".repeat(2 * END_SCAN_LOOKAHEAD as usize + 1000);
+        let mut ndjson = String::new();
+        ndjson.push_str(&format!("{{\"id\": 1, \"name\": \"{long_name}\"}}\n"));
+        for i in 2..=5 {
+            ndjson.push_str(&format!("{{\"id\": {i}, \"name\": \"short{i}\"}}\n"));
+        }
+        let ndjson_bytes = Bytes::from(ndjson);
+        let file_size = ndjson_bytes.len() as u64;
+        let path = Path::from("test_overflow.ndjson");
+        let num_rows = 5usize;
+
+        for store in get_chunked_stores() {
+            let store = Arc::clone(&store) as Arc<dyn ObjectStore>;
+            store
+                .put(&path, PutPayload::from(ndjson_bytes.clone()))
+                .await?;
+
+            for num_partitions in get_partition_splits() {
+                let batches = collect_partitioned_batches(
+                    Arc::clone(&store),
+                    &path,
+                    file_size,
+                    num_partitions,
+                )
+                .await?;
+
+                let total: usize = batches.iter().map(|b| b.num_rows()).sum();
+                assert_eq!(
+                    total, num_rows,
+                    "Expected {num_rows} rows with {num_partitions} partitions"
+                );
+
+                let result = concat_and_sort_by_id(&batches)?;
+                let ids = result
+                    .column(0)
+                    .as_any()
+                    .downcast_ref::<Int64Array>()
+                    .unwrap();
+                let names = result
+                    .column(1)
+                    .as_any()
+                    .downcast_ref::<StringArray>()
+                    .unwrap();
+                assert_eq!(ids.value(0), 1);
+                assert_eq!(names.value(0), long_name);
+                for i in 1..num_rows {
+                    assert_eq!(ids.value(i), (i + 1) as i64);
+                    assert_eq!(names.value(i), format!("short{}", i + 1));
+                }
+            }
+        }
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn test_json_array_stream_cancellation() -> Result<()> {
         // Test that cancellation works correctly (tasks are aborted when stream is dropped)
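For reference, a minimal standalone sketch (not part of the commit; `partition_ranges` is a hypothetical helper named here only for illustration) of the byte-range split that `collect_partitioned_batches` applies: partition `p` of `N` covers `[p * file_size / N, (p + 1) * file_size / N)`, so adjacent ranges meet at shared boundaries without overlapping and together cover the whole file.

    // Hypothetical standalone example; mirrors the range math used in
    // `collect_partitioned_batches` above.
    fn partition_ranges(file_size: u64, num_partitions: u64) -> Vec<(u64, u64)> {
        (0..num_partitions)
            .map(|p| {
                let start = p * file_size / num_partitions;
                let end = (p + 1) * file_size / num_partitions;
                (start, end)
            })
            .collect()
    }

    fn main() {
        let ranges = partition_ranges(100, 3);
        assert_eq!(ranges, vec![(0, 33), (33, 66), (66, 100)]);
        // Every byte of the file falls into exactly one half-open range.
        assert_eq!(ranges.iter().map(|(s, e)| e - s).sum::<u64>(), 100);
    }

The tests above exercise these ranges for every chunk size produced by `get_chunked_stores` and every split count from `get_partition_splits`.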
