Refactor range functionality to a testing-only helper in _testing.py

kosiew · kosiew · commit 02e39f85524d · 2025-09-01T21:40:52.000+08:00
diff --git a/python/datafusion/_testing.py b/python/datafusion/_testing.py
@@ -0,0 +1,42 @@
+"""Testing-only helpers for datafusion-python.
+
+This module contains utilities used by the test-suite that should not be
+exposed as part of the public API. Keep the implementation minimal and
+documented so reviewers can easily see it's test-only.
+"""
+from __future__ import annotations
+
+from typing import Any
+
+from .context import SessionContext
+
+
+def range_table(
+    ctx: SessionContext,
+    start: int,
+    stop: int | None = None,
+    step: int = 1,
+    partitions: int | None = None,
+) -> Any:
+    """Build a DataFrame of sequential integers via the SQL ``range`` function.
+
+    This mirrors the previous ``SessionContext.range`` convenience method but
+    lives in a testing-only module so it doesn't expand the public surface.
+
+    Args:
+        ctx: SessionContext instance to run the SQL against.
+        start: First value of the sequence, or the exclusive stop (with the
+            sequence starting at 0) when ``stop`` is ``None``.
+        stop: Exclusive upper bound of the sequence.
+        step: Increment between successive values.
+        partitions: Optional partition count; omitted from the SQL if ``None``.
+
+    Returns:
+        Result of ``ctx.sql`` for the generated ``SELECT * FROM range(...)``.
+    """
+    if stop is None:
+        start, stop = 0, start
+    # int() coercion keeps non-integer input out of the interpolated SQL.
+    parts = f", {int(partitions)}" if partitions is not None else ""
+    sql = f"SELECT * FROM range({int(start)}, {int(stop)}, {int(step)}{parts})"  # noqa: S608
+    return ctx.sql(sql)
diff --git a/python/datafusion/context.py b/python/datafusion/context.py
@@ -731,36 +731,6 @@ def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame:
         """
         return DataFrame(self.ctx.from_polars(data, name))
 
-    def range(
-        self,
-        start: int,
-        stop: int | None = None,
-        step: int = 1,
-        partitions: int | None = None,
-    ) -> DataFrame:
-        """Create a DataFrame containing a sequence of numbers.
-
-        This is backed by DataFusion's ``range`` table function, which generates
-        values lazily and therefore does not materialize the full range in
-        memory. When ``stop`` is omitted, ``start`` is treated as the stop value
-        and the sequence begins at zero.
-
-        Args:
-            start: Starting value for the sequence or the exclusive stop if
-                ``stop`` is ``None``.
-            stop: Exclusive upper bound of the sequence.
-            step: Increment between successive values.
-            partitions: Optional number of partitions for the generated data.
-
-        Returns:
-            DataFrame yielding the requested range of values.
-        """
-        if stop is None:
-            start, stop = 0, start
-
-        parts = f", {int(partitions)}" if partitions is not None else ""
-        sql = f"SELECT * FROM range({int(start)}, {int(stop)}, {int(step)}{parts})"  # noqa: S608
-        return self.sql(sql)
 
     # https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
     # is the discussion on how we arrived at adding register_view
diff --git a/python/tests/test_io.py b/python/tests/test_io.py
@@ -19,6 +19,7 @@
 import pyarrow as pa
 import pytest
 from datafusion import DataFrame, column
+from datafusion._testing import range_table
 from datafusion.io import read_avro, read_csv, read_json, read_parquet
 
 
@@ -104,7 +105,7 @@ def test_arrow_c_stream_large_dataset(ctx):
     handful of batches should not exhaust process memory.
     """
     # Create a very large DataFrame using range; this would be terabytes if collected
-    df = ctx.range(0, 1 << 40)
+    df = range_table(ctx, 0, 1 << 40)
 
     reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
 
@@ -123,7 +124,7 @@ def test_arrow_c_stream_large_dataset(ctx):
 
 
 def test_table_from_batches_stream(ctx, monkeypatch):
-    df = ctx.range(0, 10)
+    df = range_table(ctx, 0, 10)
 
     def fail_collect(self):  # pragma: no cover - failure path
         msg = "collect should not be called"