-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Expand file tree
/
Copy pathtest_progress_bar.py
More file actions
187 lines (139 loc) · 6.4 KB
/
test_progress_bar.py
File metadata and controls
187 lines (139 loc) · 6.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import tempfile
import numpy as np
import pandas as pd
import pytest
import bigframes as bf
import bigframes.formatting_helpers as formatting_helpers
from bigframes.session import MAX_INLINE_DF_BYTES
# Pattern searched for in each captured stdout line to detect the
# progress-bar message printed while a BigQuery query job runs.
job_load_message_regex = r"Query"
# Prefix of the repr emitted in "deferred" repr mode (dry run, no execution).
EXPECTED_DRY_RUN_MESSAGE = "Computation deferred. Computation will process"
def test_progress_bar_dataframe(
    penguins_df_default_index: bf.dataframe.DataFrame, capsys
):
    """Materializing a DataFrame in terminal progress-bar mode prints a job message."""
    capsys.readouterr()  # drop anything already captured
    with bf.option_context("display.progress_bar", "terminal"):
        penguins_df_default_index.to_pandas(allow_large_results=True)
    captured = capsys.readouterr().out
    assert_loading_msg_exist(captured)
    assert penguins_df_default_index.query_job is not None
def test_progress_bar_series(penguins_df_default_index: bf.dataframe.DataFrame, capsys):
    """Materializing a Series in terminal progress-bar mode prints a job message."""
    body_mass = penguins_df_default_index["body_mass_g"].head(10)
    capsys.readouterr()  # drop anything already captured
    with bf.option_context("display.progress_bar", "terminal"):
        body_mass.to_pandas(allow_large_results=True)
    captured = capsys.readouterr().out
    assert_loading_msg_exist(captured)
    assert body_mass.query_job is not None
def test_progress_bar_scalar(penguins_df_default_index: bf.dataframe.DataFrame, capsys):
    """Computing a scalar aggregate in terminal mode prints a progress message."""
    capsys.readouterr()  # drop anything already captured
    with bf.option_context("display.progress_bar", "terminal"):
        penguins_df_default_index["body_mass_g"].head(10).mean()
    captured = capsys.readouterr().out
    assert_loading_msg_exist(captured)
def test_progress_bar_scalar_allow_large_results(
    penguins_df_default_index: bf.dataframe.DataFrame, capsys
):
    """Scalar aggregation with allow_large_results enabled still prints a progress message."""
    capsys.readouterr()  # drop anything already captured
    options = ("display.progress_bar", "terminal", "compute.allow_large_results", "True")
    with bf.option_context(*options):
        penguins_df_default_index["body_mass_g"].head(10).mean()
    captured = capsys.readouterr().out
    assert_loading_msg_exist(captured)
def test_progress_bar_extract_jobs(
    penguins_df_default_index: bf.dataframe.DataFrame, gcs_folder, capsys
):
    """Exporting to GCS (an extract job) prints a progress message in terminal mode."""
    destination = gcs_folder + "test_read_csv_progress_bar*.csv"
    capsys.readouterr()  # drop anything already captured
    with bf.option_context("display.progress_bar", "terminal"):
        penguins_df_default_index.to_csv(destination)
    captured = capsys.readouterr().out
    assert_loading_msg_exist(captured)
def test_progress_bar_load_jobs(
    session: bf.Session, penguins_pandas_df_default_index: pd.DataFrame, capsys
):
    """Reading a CSV too large to inline triggers a load job and prints a "Load" message.

    The pandas DataFrame is doubled until it exceeds MAX_INLINE_DF_BYTES so that
    ``read_csv`` cannot inline the data and must issue a BigQuery load job.
    """
    # Repeat the DF to be big enough to trigger the load job.
    # NOTE(review): len(df) counts rows, not bytes, so this over-shoots the
    # byte threshold by a wide margin — harmless for this test's purpose.
    df = penguins_pandas_df_default_index
    while len(df) < MAX_INLINE_DF_BYTES:
        # np.repeat drops column labels/dtypes; only the data size matters here.
        df = pd.DataFrame(np.repeat(df.values, 2, axis=0))
    with (
        bf.option_context("display.progress_bar", "terminal"),
        # Renamed from `dir`, which shadowed the `dir` builtin.
        tempfile.TemporaryDirectory() as tmp_dir,
    ):
        path = tmp_dir + "/test_read_csv_progress_bar*.csv"
        df.to_csv(path, index=False)
        capsys.readouterr()  # clear output
        session.read_csv(path)

    assert_loading_msg_exist(capsys.readouterr().out, pattern="Load")
def test_progress_bar_uniqueness_check(session: bf.Session, capsys):
    """The index-uniqueness check on a non-unique index_col prints a progress message."""
    # The check only runs when the session is strictly ordered (the default).
    assert session._strictly_ordered
    capsys.readouterr()  # drop anything already captured
    with bf.option_context("display.progress_bar", "terminal"):
        # Read a table and specify a non-unique index_col to trigger the check.
        # We use a public table to make it a "real" test.
        session.read_gbq_table(
            "bigquery-public-data.ml_datasets.penguins",
            index_col="island",
        )
    captured = capsys.readouterr().out
    assert_loading_msg_exist(captured)
def assert_loading_msg_exist(capstdout: str, pattern=job_load_message_regex):
    """Assert captured stdout is non-empty and at least one line matches *pattern*."""
    nonempty_lines = [line for line in capstdout.split("\n") if line]
    assert len(nonempty_lines) > 0
    assert any(re.search(pattern, line) is not None for line in nonempty_lines)
def test_query_job_repr(penguins_df_default_index: bf.dataframe.DataFrame):
    """repr_query_job output includes the key job-statistics fields."""
    penguins_df_default_index.to_pandas(allow_large_results=True)
    rendered = formatting_helpers.repr_query_job(
        penguins_df_default_index.query_job
    )
    expected_fragments = (
        "Job",
        "Destination Table",
        "Slot Time",
        "Bytes Processed",
        "Cache hit",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
def test_query_job_dry_run_dataframe(penguins_df_default_index: bf.dataframe.DataFrame):
    """In deferred repr mode, a DataFrame repr shows the dry-run message."""
    with bf.option_context("display.repr_mode", "deferred"):
        rendered = repr(penguins_df_default_index)
        assert EXPECTED_DRY_RUN_MESSAGE in rendered
def test_query_job_dry_run_index(penguins_df_default_index: bf.dataframe.DataFrame):
    """In deferred repr mode, an Index repr shows the dry-run message."""
    with bf.option_context("display.repr_mode", "deferred"):
        rendered = repr(penguins_df_default_index.index)
        assert EXPECTED_DRY_RUN_MESSAGE in rendered
def test_query_job_dry_run_series(penguins_df_default_index: bf.dataframe.DataFrame):
    """In deferred repr mode, a Series repr shows the dry-run message."""
    with bf.option_context("display.repr_mode", "deferred"):
        rendered = repr(penguins_df_default_index["body_mass_g"])
        assert EXPECTED_DRY_RUN_MESSAGE in rendered
def test_repr_anywidget_dataframe(penguins_df_default_index: bf.dataframe.DataFrame):
    """With anywidget render mode, the DataFrame repr still contains a useful summary."""
    pytest.importorskip("anywidget")
    with bf.option_context("display.render_mode", "anywidget"):
        rendered = repr(penguins_df_default_index)
        for fragment in ("species", "island", "[344 rows x 7 columns]"):
            assert fragment in rendered
def test_repr_anywidget_index(penguins_df_default_index: bf.dataframe.DataFrame):
    """With anywidget render mode, the Index repr still contains a useful summary."""
    pytest.importorskip("anywidget")
    with bf.option_context("display.render_mode", "anywidget"):
        rendered = repr(penguins_df_default_index.index)
        # In non-interactive environments, should still get a useful summary.
        for fragment in ("Index", "0, 1, 2, 3, 4", "dtype='Int64'"):
            assert fragment in rendered