Skip to content

Commit 02e39f8

Browse files
committed
Refactor range functionality to a testing-only helper in _testing.py
1 parent b5f2d80 commit 02e39f8

3 files changed

Lines changed: 45 additions & 32 deletions

File tree

python/datafusion/_testing.py

Lines changed: 42 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,42 @@
1+
"""Testing-only helpers for datafusion-python.
2+
3+
This module contains utilities used by the test-suite that should not be
4+
exposed as part of the public API. Keep the implementation minimal and
5+
documented so reviewers can easily see it's test-only.
6+
"""
7+
from __future__ import annotations
8+
9+
from typing import Any
10+
11+
from .context import SessionContext
12+
13+
14+
def range_table(
    ctx: SessionContext,
    start: int,
    stop: int | None = None,
    step: int = 1,
    partitions: int | None = None,
) -> Any:
    """Create a DataFrame containing a sequence of numbers using SQL ``range``.

    This mirrors the previous ``SessionContext.range`` convenience method but
    lives in a testing-only module so it doesn't expand the public surface.
    The helper only builds a ``SELECT * FROM range(...)`` statement and hands
    it to ``ctx.sql``.

    Args:
        ctx: SessionContext instance to run the SQL against.
        start: Starting value for the sequence, or the exclusive stop when
            ``stop`` is ``None`` (the sequence then begins at zero).
        stop: Exclusive upper bound of the sequence.
        step: Increment between successive values. Must not be zero.
        partitions: Optional number of partitions for the generated data.
            When given it is appended as a fourth argument to ``range``.

    Returns:
        Whatever ``ctx.sql`` returns for the generated statement (a
        DataFrame for a real ``SessionContext``).

    Raises:
        ValueError: If ``step`` is zero.
    """
    if stop is None:
        # Single-argument form: range_table(ctx, n) counts from 0 up to n.
        start, stop = 0, start

    step = int(step)
    if step == 0:
        # Fail early with a clear message (like the built-in ``range``)
        # instead of sending a statement the engine cannot satisfy.
        msg = "range_table() step must not be zero"
        raise ValueError(msg)

    # Every value goes through int() before being formatted, so only integer
    # literals ever reach the SQL text.
    args = [int(start), int(stop), step]
    if partitions is not None:
        # NOTE(review): confirm the engine's ``range`` table function accepts
        # a fourth "partitions" argument -- TODO verify against DataFusion.
        args.append(int(partitions))

    rendered = ", ".join(str(value) for value in args)
    sql = f"SELECT * FROM range({rendered})"  # noqa: S608
    return ctx.sql(sql)

python/datafusion/context.py

Lines changed: 0 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -731,36 +731,6 @@ def from_polars(self, data: pl.DataFrame, name: str | None = None) -> DataFrame:
731731
"""
732732
return DataFrame(self.ctx.from_polars(data, name))
733733

734-
def range(
735-
self,
736-
start: int,
737-
stop: int | None = None,
738-
step: int = 1,
739-
partitions: int | None = None,
740-
) -> DataFrame:
741-
"""Create a DataFrame containing a sequence of numbers.
742-
743-
This is backed by DataFusion's ``range`` table function, which generates
744-
values lazily and therefore does not materialize the full range in
745-
memory. When ``stop`` is omitted, ``start`` is treated as the stop value
746-
and the sequence begins at zero.
747-
748-
Args:
749-
start: Starting value for the sequence or the exclusive stop if
750-
``stop`` is ``None``.
751-
stop: Exclusive upper bound of the sequence.
752-
step: Increment between successive values.
753-
partitions: Optional number of partitions for the generated data.
754-
755-
Returns:
756-
DataFrame yielding the requested range of values.
757-
"""
758-
if stop is None:
759-
start, stop = 0, start
760-
761-
parts = f", {int(partitions)}" if partitions is not None else ""
762-
sql = f"SELECT * FROM range({int(start)}, {int(stop)}, {int(step)}{parts})" # noqa: S608
763-
return self.sql(sql)
764734

765735
# https://github.com/apache/datafusion-python/pull/1016#discussion_r1983239116
766736
# is the discussion on how we arrived at adding register_view

python/tests/test_io.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
import pyarrow as pa
2020
import pytest
2121
from datafusion import DataFrame, column
22+
from datafusion._testing import range_table
2223
from datafusion.io import read_avro, read_csv, read_json, read_parquet
2324

2425

@@ -104,7 +105,7 @@ def test_arrow_c_stream_large_dataset(ctx):
104105
handful of batches should not exhaust process memory.
105106
"""
106107
# Create a very large DataFrame using range; this would be terabytes if collected
107-
df = ctx.range(0, 1 << 40)
108+
df = range_table(ctx, 0, 1 << 40)
108109

109110
reader = pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())
110111

@@ -123,7 +124,7 @@ def test_arrow_c_stream_large_dataset(ctx):
123124

124125

125126
def test_table_from_batches_stream(ctx, monkeypatch):
126-
df = ctx.range(0, 10)
127+
df = range_table(ctx, 0, 10)
127128

128129
def fail_collect(self): # pragma: no cover - failure path
129130
msg = "collect should not be called"

0 commit comments

Comments
 (0)