diff --git a/packages/bigframes/notebooks/dataframes/anywidget_mode.ipynb b/packages/bigframes/notebooks/dataframes/anywidget_mode.ipynb index a0efa571a7d7..403aec53d6ac 100644 --- a/packages/bigframes/notebooks/dataframes/anywidget_mode.ipynb +++ b/packages/bigframes/notebooks/dataframes/anywidget_mode.ipynb @@ -1,11 +1,8 @@ { "cells": [ { - "cell_type": "code", - "execution_count": 1, "id": "d10bfca4", - "metadata": {}, - "outputs": [], + "cell_type": "code", "source": [ "# Copyright 2025 Google LLC\n", "#\n", @@ -20,30 +17,33 @@ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", "# See the License for the specific language governing permissions and\n", "# limitations under the License." - ] + ], + "metadata": {}, + "execution_count": 1, + "outputs": [] }, { - "cell_type": "markdown", "id": "acca43ae", - "metadata": {}, + "cell_type": "markdown", "source": [ "# Demo to Show Anywidget mode" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 2, "id": "ca22f059", - "metadata": {}, - "outputs": [], + "cell_type": "code", "source": [ "import bigframes.pandas as bpd" - ] + ], + "metadata": {}, + "execution_count": 2, + "outputs": [] }, { - "cell_type": "markdown", "id": "04406a4d", - "metadata": {}, + "cell_type": "markdown", "source": [ "This notebook demonstrates the **anywidget** display mode for BigQuery DataFrames. This mode provides an interactive table experience for exploring your data directly within the notebook.\n", "\n", @@ -53,41 +53,49 @@ "- **Column Sorting:** Click column headers to toggle between ascending, descending, and unsorted views. Use **Shift + Click** to sort by multiple columns.\n", "- **Column Resizing:** Drag the dividers between column headers to adjust their width.\n", "- **Max Columns Control:** Limit the number of displayed columns to improve performance and readability for wide datasets." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 3, "id": "1bc5aaf3", - "metadata": {}, - "outputs": [], + "cell_type": "code", "source": [ "bpd.options.bigquery.ordering_mode = \"partial\"\n", "bpd.options.display.render_mode = \"anywidget\"" - ] + ], + "metadata": {}, + "execution_count": 3, + "outputs": [] }, { - "cell_type": "markdown", "id": "0a354c69", - "metadata": {}, + "cell_type": "markdown", "source": [ "Load Sample Data" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "markdown", "id": "interactive-df-header", - "metadata": {}, + "cell_type": "markdown", "source": [ "## 1. Interactive DataFrame Display\n", "Loading a dataset from BigQuery automatically renders the interactive widget." - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 4, "id": "f289d250", + "cell_type": "code", + "source": [ + "df = bpd.read_gbq(\"bigquery-public-data.usa_names.usa_1910_2013\")\n", + "print(df)" + ], "metadata": {}, + "execution_count": 4, "outputs": [ { "data": { @@ -123,17 +131,16 @@ "[5552452 rows x 5 columns]\n" ] } - ], - "source": [ - "df = bpd.read_gbq(\"bigquery-public-data.usa_names.usa_1910_2013\")\n", - "print(df)" ] }, { - "cell_type": "code", - "execution_count": 5, "id": "220340b0", + "cell_type": "code", + "source": [ + "df" + ], "metadata": {}, + "execution_count": 5, "outputs": [ { "data": { @@ -275,25 +282,28 @@ "metadata": {}, "output_type": "execute_result" } - ], - "source": [ - "df" ] }, { - "cell_type": "markdown", "id": "3a73e472", - "metadata": {}, + "cell_type": "markdown", "source": [ "## 2. Interactive Series Display\n", "BigQuery DataFrames `Series` objects now also support the full interactive widget experience, including pagination and formatting." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 6, "id": "42bb02ab", + "cell_type": "code", + "source": [ + "test_series = df[\"year\"]\n", + "# Displaying the series triggers the interactive widget\n", + "print(test_series)" + ], "metadata": {}, + "execution_count": 6, "outputs": [ { "data": { @@ -343,26 +353,25 @@ "[5552452 rows]\n" ] } - ], - "source": [ - "test_series = df[\"year\"]\n", - "# Displaying the series triggers the interactive widget\n", - "print(test_series)" ] }, { - "cell_type": "markdown", "id": "7bcf1bb7", - "metadata": {}, + "cell_type": "markdown", "source": [ "Display with Pagination" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 7, "id": "da23e0f3", + "cell_type": "code", + "source": [ + "test_series" + ], "metadata": {}, + "execution_count": 7, "outputs": [ { "data": { @@ -404,15 +413,11 @@ "metadata": {}, "output_type": "execute_result" } - ], - "source": [ - "test_series" ] }, { - "cell_type": "markdown", "id": "sorting-intro", - "metadata": {}, + "cell_type": "markdown", "source": [ "### Sorting by Column(s)\n", "You can sort the table by clicking on the headers of columns that have orderable data types (like numbers, strings, and dates). Non-orderable columns (like arrays or structs) do not have sorting controls.\n", @@ -428,42 +433,58 @@ "- **Shift + Click:** Hold the `Shift` key while clicking additional column headers to add them to the sort order. \n", "- Each column in a multi-sort also cycles through the three states (Ascending, Descending, Unsorted).\n", "- **Indicator visibility:** Sorting indicators (▲, ▼) are always visible for all columns currently included in the sort. The unsorted indicator (●) is only visible when you hover over an unsorted column header." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "markdown", "id": "adjustable-width-intro", - "metadata": {}, + "cell_type": "markdown", "source": [ "### Adjustable Column Widths\n", "You can easily adjust the width of any column in the table. Simply hover your mouse over the vertical dividers between column headers. When the cursor changes to a resize icon, click and drag to expand or shrink the column to your desired width. This allows for better readability and customization of your table view.\n", "\n", "### Control Maximum Columns\n", "You can control the number of columns displayed in the widget using the **Max columns** dropdown in the footer. This is useful for wide DataFrames where you want to focus on a subset of columns or improve rendering performance. Options include 3, 5, 7, 10, 20, or All." - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "markdown", "id": "bb15bab6", - "metadata": {}, + "cell_type": "markdown", "source": [ "Programmatic Navigation Demo" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "markdown", "id": "programmatic-header", - "metadata": {}, + "cell_type": "markdown", "source": [ "## 3. Programmatic Widget Control\n", "You can also instantiate the `TableWidget` directly for more control, such as checking page counts or driving navigation programmatically." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 8, "id": "6920d49b", + "cell_type": "code", + "source": [ + "from bigframes.display.anywidget import TableWidget\n", + "import math\n", + " \n", + "# Create widget programmatically \n", + "widget = TableWidget(df)\n", + "print(f\"Total pages: {math.ceil(widget.row_count / widget.page_size)}\")\n", + " \n", + "# Display the widget\n", + "widget" + ], "metadata": {}, + "execution_count": 8, "outputs": [ { "name": "stdout", @@ -487,43 +508,20 @@ "metadata": {}, "output_type": "execute_result" } - ], - "source": [ - "from bigframes.display.anywidget import TableWidget\n", - "import math\n", - " \n", - "# Create widget programmatically \n", - "widget = TableWidget(df)\n", - "print(f\"Total pages: {math.ceil(widget.row_count / widget.page_size)}\")\n", - " \n", - "# Display the widget\n", - "widget" ] }, { - "cell_type": "markdown", "id": "02cbd1be", - "metadata": {}, + "cell_type": "markdown", "source": [ "Test Navigation Programmatically" - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 9, "id": "12b68f15", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Current page: 0\n", - "After next: 1\n", - "After prev: 0\n" - ] - } - ], + "cell_type": "code", "source": [ "# Simulate button clicks programmatically\n", "print(\"Current page:\", widget.page)\n", @@ -535,22 +533,43 @@ "# Go to previous page\n", "widget.page = 0\n", "print(\"After prev:\", widget.page)" + ], + "metadata": {}, + "execution_count": 9, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current page: 0\n", + "After next: 1\n", + "After prev: 0\n" + ] + } ] }, { - "cell_type": "markdown", "id": "9d310138", - "metadata": {}, + "cell_type": "markdown", "source": [ "## 4. Edge Cases\n", "The widget handles small datasets gracefully, disabling unnecessary pagination controls." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 10, "id": "a9d5d13a", + "cell_type": "code", + "source": [ + "# Test with very small dataset\n", + "small_df = df.sort_values([\"name\", \"year\", \"state\"]).head(5)\n", + "small_widget = TableWidget(small_df)\n", + "print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n", + "small_widget" + ], "metadata": {}, + "execution_count": 10, "outputs": [ { "name": "stdout", @@ -574,38 +593,45 @@ "metadata": {}, "output_type": "execute_result" } - ], - "source": [ - "# Test with very small dataset\n", - "small_df = df.sort_values([\"name\", \"year\", \"state\"]).head(5)\n", - "small_widget = TableWidget(small_df)\n", - "print(f\"Small dataset pages: {math.ceil(small_widget.row_count / small_widget.page_size)}\")\n", - "small_widget" ] }, { - "cell_type": "markdown", "id": "added-cell-2", - "metadata": {}, + "cell_type": "markdown", "source": [ "### Displaying Generative AI results containing JSON\n", "The `AI.GENERATE` function in BigQuery returns results in a JSON column. While BigQuery's JSON type is not natively supported by the underlying Arrow `to_pandas_batches()` method used in anywidget mode ([Apache Arrow issue #45262](https://github.com/apache/arrow/issues/45262)), BigQuery Dataframes automatically converts JSON columns to strings for display. This allows you to view the results of generative AI functions seamlessly." - ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "markdown", "id": "ai-header", - "metadata": {}, + "cell_type": "markdown", "source": [ "## 5. Advanced Data Types (JSON/Structs)\n", "The `AI.GENERATE` function in BigQuery returns results in a JSON column. BigQuery Dataframes automatically handles complex types like JSON strings for display, allowing you to view generative AI results seamlessly." 
- ] + ], + "metadata": {}, + "execution_count": null }, { - "cell_type": "code", - "execution_count": 11, "id": "added-cell-1", + "cell_type": "code", + "source": [ + "bpd.read_gbq(\"\"\"\n", + " SELECT\n", + " AI.GENERATE(\n", + " prompt=>(\"Extract the values.\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \"us.conn\")), \"r\")),\n", + " connection_id=>\"your-project-id.your-location.your-connection\",\n", + " output_schema=>\"publication_date string, class_international string, application_number string, filing_date string\") AS result,\n", + " *\n", + " FROM `bigquery-public-data.labeled_patents.extracted_data`\n", + " LIMIT 5;\n", + "\"\"\")" + ], "metadata": {}, + "execution_count": 11, "outputs": [ { "data": { @@ -810,18 +836,6 @@ "metadata": {}, "output_type": "execute_result" } - ], - "source": [ - "bpd._read_gbq_colab(\"\"\"\n", - " SELECT\n", - " AI.GENERATE(\n", - " prompt=>(\\\"Extract the values.\\\", OBJ.GET_ACCESS_URL(OBJ.FETCH_METADATA(OBJ.MAKE_REF(gcs_path, \\\"us.conn\\\")), \\\"r\\\")),\n", - " connection_id=>\\\"bigframes-dev.us.bigframes-default-connection\\\",\n", - " output_schema=>\\\"publication_date string, class_international string, application_number string, filing_date string\\\") AS result,\n", - " *\n", - " FROM `bigquery-public-data.labeled_patents.extracted_data`\n", - " LIMIT 5;\n", - "\"\"\")" ] } ], @@ -844,6 +858,6 @@ "version": "3.13.0" } }, - "nbformat": 4, - "nbformat_minor": 5 + "nbformat_minor": 5, + "nbformat": 4 } diff --git a/packages/bigframes/notebooks/generative_ai/ai_movie_poster.ipynb b/packages/bigframes/notebooks/generative_ai/ai_movie_poster.ipynb index b25e2b556e65..b6d9e86e6104 100644 --- a/packages/bigframes/notebooks/generative_ai/ai_movie_poster.ipynb +++ b/packages/bigframes/notebooks/generative_ai/ai_movie_poster.ipynb @@ -1,732 +1,783 @@ { - "cells": [ + "cells": [ + { + "id": "7add2e44", + "cell_type": "code", + "source": [ + "# Copyright 2026 Google LLC\n", + "#\n", + "# Licensed 
under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ], + "metadata": { + "id": "XZpKUoHjXw3_" + }, + "execution_count": 1, + "outputs": [] + }, + { + "id": "ee509844", + "cell_type": "markdown", + "source": [ + "# Analyzing movie posters with BigQuery Dataframe AI functions" + ], + "metadata": { + "id": "SEKzWP6jW9Oj" + }, + "execution_count": null + }, + { + "id": "81b8de8d", + "cell_type": "markdown", + "source": [ + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"BQ\n", + " Open in BQ Studio\n", + " \n", + "
" + ], + "metadata": {}, + "execution_count": null + }, + { + "id": "256b6c02", + "cell_type": "markdown", + "source": [ + "BigQuery Dataframe provides a Pythonic way to use AI functions directly with your dataframes. In this notebook, you will use these functions to analyze old\n", + "movie posters. These posters are images stored in a public Google Cloud Storage bucket: `gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters`" + ], + "metadata": { + "id": "c9CCKXG5XTb-" + }, + "execution_count": null + }, + { + "id": "3f71d3cb", + "cell_type": "markdown", + "source": [ + "## Set up" + ], + "metadata": { + "id": "CUJDa_7MPbL9" + }, + "execution_count": null + }, + { + "id": "547145f5", + "cell_type": "markdown", + "source": [ + "Before you begin, you need to\n", + "\n", + "* Set up your permissions for generative AI functions with [these instructions](https://docs.cloud.google.com/bigquery/docs/permissions-for-ai-functions)\n", + "* Set up your Cloud Resource connection by following [these instructions](https://docs.cloud.google.com/bigquery/docs/create-cloud-resource-connection)\n", + "\n", + "Once you have the permissions set up, import the `bigframes.pandas` package, and\n", + "set your cloud project ID." 
+ ], + "metadata": { + "id": "D3iYtBSkYpCK" + }, + "execution_count": null + }, + { + "id": "d9cd6da8", + "cell_type": "code", + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "MY_PROJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n", + "LOCATION = \"us\" # @param {type:\"string\"}\n", + "\n", + "bpd.options.bigquery.project = MY_PROJECT_ID\n", + "bpd.options.bigquery.location = LOCATION" + ], + "metadata": { + "id": "6nqoRHYbPAx3" + }, + "execution_count": 2, + "outputs": [] + }, + { + "id": "015a63c1", + "cell_type": "markdown", + "source": [ + "## Load data" + ], + "metadata": { + "id": "2XHcNHtvPhNW" + }, + "execution_count": null + }, + { + "id": "254561e0", + "cell_type": "markdown", + "source": [ + "First, you load the data from the GCS bucket to a BigQuery Dataframe:" + ], + "metadata": { + "id": "eS-9A7DijfoQ" + }, + "execution_count": null + }, + { + "id": "47acbbfe", + "cell_type": "code", + "source": [ + "# Replace with your own connection name.\n", + "MY_CONNECTION = 'bigframes-default-connection' # @param {type:\"string\"}\n", + "FULL_CONNECTION_ID = f\"{MY_PROJECT_ID}.{LOCATION}.{MY_CONNECTION}\"\n", + "\n", + "import gcsfs\n", + "import bigframes\n", + "import bigframes.pandas as bpd\n", + "import bigframes.bigquery as bbq\n", + "import json\n", + "from IPython.display import HTML, display\n", + "\n", + "session = bpd.get_global_session()\n", + "\n", + "# Configure global display parameters \n", + "bigframes.options.display.blob_display_width = 200\n", + "\n", + "def get_runtime_json_str(series, mode=\"R\", with_metadata=False):\n", + " s = bbq.obj.fetch_metadata(series) if with_metadata else series\n", + " runtime = bbq.obj.get_access_url(s, mode=mode)\n", + " return bbq.to_json_string(runtime)\n", + "\n", + "def get_read_url(series):\n", + " runtime = bbq.obj.get_access_url(series, mode=\"R\")\n", + " return bbq.json_value(runtime, \"$.access_urls.read_url\")\n", + "\n", + "def render_images(df):\n", + " \"\"\"Helper to display 
BigFrames DataFrame with rendered image previews.\"\"\"\n", + " from bigframes import dtypes\n", + " if isinstance(df, bpd.Series):\n", + " df = df.to_frame()\n", + " \n", + " object_cols = [col for col, dtype in zip(df.columns, df.dtypes) if dtype == dtypes.OBJ_REF_DTYPE]\n", + " if not object_cols:\n", + " display(df)\n", + " return\n", + "\n", + " limit = bigframes.options.display.max_rows or 10\n", + " view_df = df.head(limit)\n", + " runtime_cols = {\n", + " col: get_runtime_json_str(view_df[col], mode=\"R\", with_metadata=False) \n", + " for col in object_cols\n", + " }\n", + " \n", + " pandas_json_df = bpd.DataFrame(runtime_cols).to_pandas()\n", + " final_pd = view_df.to_pandas()\n", + " width = bigframes.options.display.blob_display_width or 200\n", + " \n", + " def format_cell_html(raw_json):\n", + " if not raw_json: return \"\"\n", + " try:\n", + " obj_rt = json.loads(raw_json)\n", + " if \"access_urls\" not in obj_rt: return \"Error fetching URL\"\n", + " uri = obj_rt.get(\"objectref\", {}).get(\"uri\", \"\")\n", + " url = obj_rt[\"access_urls\"][\"read_url\"]\n", + " if str(uri).lower().endswith((\".png\", \".jpg\", \".jpeg\", \".webp\")):\n", + " return f''\n", + " return f'{uri}'\n", + " except: return \"Format Error\"\n", + "\n", + " for col in object_cols:\n", + " final_pd[col] = pandas_json_df[col].map(format_cell_html)\n", + " display(HTML(final_pd.to_html(escape=False)))\n", + "\n", + "# List files using gcsfs\n", + "fs = gcsfs.GCSFileSystem(anon=True)\n", + "uris = fs.glob(\"gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/*\")\n", + "\n", + "# Ensure URIs have gs:// prefix\n", + "uris = [u if u.startswith(\"gs://\") else f\"gs://{u}\" for u in uris]\n", + "\n", + "# Read the URIs into a BigQuery DataFrame\n", + "movies = bpd.read_gbq(f\"SELECT uri FROM UNNEST({uris[:5]}) as uri\")\n", + "\n", + "# Create the object reference column using the fully qualified connection ID\n", + "movies['poster'] = 
bbq.obj.make_ref(movies['uri'], authorizer=FULL_CONNECTION_ID)\n", + "movies = movies[['poster']]\n", + "render_images(movies.head(1))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "ZNPzFjCyPap0", + "outputId": "346d20b2-d615-4094-d24e-2d40e5c90ee2" + }, + "execution_count": 3, + "outputs": [ { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "XZpKUoHjXw3_" - }, - "outputs": [], - "source": [ - "# Copyright 2026 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." 
+ "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "SEKzWP6jW9Oj" - }, - "source": [ - "# Analyzing movie posters with BigQuery Dataframe AI functions" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", - " View on GitHub\n", - " \n", - " \n", - " \n", - " \"BQ\n", - " Open in BQ Studio\n", - " \n", - "
" + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in 16 seconds of slot time.\n", + " " + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "c9CCKXG5XTb-" - }, - "source": [ - "BigQuery Dataframe provides a Pythonic way to use AI functions directly with your dataframes. In this notebook, you will use these functions to analyze old\n", - "movie posters. These posters are images stored in a public Google Cloud Storage bucket: `gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters`" + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in 9 seconds of slot time.\n", + " " + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "CUJDa_7MPbL9" - }, - "source": [ - "## Set up" + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
poster
0
" + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" + } + ] + }, + { + "id": "f1096d2f", + "cell_type": "markdown", + "source": [ + "## Extract titles from posters" + ], + "metadata": { + "id": "EfkdDH08QnYw" + }, + "execution_count": null + }, + { + "id": "bb30d47c", + "cell_type": "code", + "source": [ + "import bigframes.bigquery as bbq\n", + "\n", + "movies['title'] = bbq.ai.generate(\n", + " (\"What is the movie title for this poster image?\", get_read_url(movies['poster']))\n", + ").struct.field(\"result\")\n", + "render_images(movies.head(1))" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "id": "6CoZZ5tSQm1r", + "outputId": "1b3915ce-eb83-4be9-b1c1-d9a326dc9408" + }, + "execution_count": 4, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "D3iYtBSkYpCK" - }, - "source": [ - "Before you begin, you need to\n", - "\n", - "* Set up your permissions for generative AI functions with [these instructions](https://docs.cloud.google.com/bigquery/docs/permissions-for-ai-functions)\n", - "* Set up your Cloud Resource connection by following [these instructions](https://docs.cloud.google.com/bigquery/docs/create-cloud-resource-connection)\n", - "\n", - "Once you have the permissions set up, import the `bigframes.pandas` package, and\n", - "set your cloud project ID." 
- ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6nqoRHYbPAx3" - }, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd\n", - "\n", - "MY_RPOJECT_ID = \"bigframes-dev\" # @param {type:\"string\"}\n", - "\n", - "bpd.options.bigquery.project = MY_RPOJECT_ID" + "data": { + "text/html": [ + "\n", + " Query started with request ID bigframes-dev:US.dc0385a0-1910-4dc4-b090-19d92db9bbcb.
SQL
WITH `bfcte_0` AS (\n",
+       "  SELECT\n",
+       "    *\n",
+       "  FROM UNNEST(ARRAY<STRUCT<`bfcol_0` STRING, `bfcol_1` INT64, `bfcol_2` INT64>>[STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/au_secours.jpeg',\n",
+       "    0,\n",
+       "    0\n",
+       "  ), STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/barque_sortant_du_port.jpeg',\n",
+       "    1,\n",
+       "    1\n",
+       "  ), STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/battling_butler.jpg',\n",
+       "    2,\n",
+       "    2\n",
+       "  ), STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/brown_of_harvard.jpeg',\n",
+       "    3,\n",
+       "    3\n",
+       "  ), STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/der_student_von_prag.jpg',\n",
+       "    4,\n",
+       "    4\n",
+       "  )])\n",
+       ")\n",
+       "SELECT\n",
+       "  `bfcol_1` AS `bfuid_col_20`,\n",
+       "  TO_JSON_STRING(\n",
+       "    OBJ.GET_ACCESS_URL(OBJ.MAKE_REF(`bfcol_0`, 'bigframes-dev.us.bigframes-default-connection'), 'R')\n",
+       "  ) AS `bfuid_col_24`\n",
+       "FROM `bfcte_0`\n",
+       "ORDER BY\n",
+       "  `bfcol_2` ASC NULLS LAST\n",
+       "LIMIT 1
\n", + " " + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "2XHcNHtvPhNW" - }, - "source": [ - "## Load data" + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in 44 seconds of slot time. [Job bigframes-dev:US.job_3KY0bZD8ZOVtXa1mDZrw6FBieAZk details]\n", + " " + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "eS-9A7DijfoQ" - }, - "source": [ - "First, you load the data from the GCS bucket to a BigQuery Dataframe with the `from_glob_path` method:" + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitle
0The movie title for this poster image is **Au secours!**
" + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" + } + ] + }, + { + "id": "eb9eb261", + "cell_type": "markdown", + "source": [ + "Notice that `ai.generate()` has a `struct` return type, which holds not only the LLM response, but also the status. If you do not provide a field name for your answer, `\"result\"` will be the default name. You can access LLM response content with the struct accessor (e.g. `my_response.struct.filed(\"result\")`);." + ], + "metadata": { + "id": "cFQHQ9S2lr6t" + }, + "execution_count": null + }, + { + "id": "ea29eb21", + "cell_type": "markdown", + "source": [ + "## Get movie release year\n", + "\n", + "In the example below, you will use `ai.generate_int()` to find the release year for each movie poster:" + ], + "metadata": { + "id": "R8kkUhgoS5Xz" + }, + "execution_count": null + }, + { + "id": "bf426247", + "cell_type": "code", + "source": [ + "movies['year'] = bbq.ai.generate_int(\n", + " (\"What is the release year for this movie?\", movies['title']),\n", + " endpoint='gemini-2.5-pro'\n", + ").struct.field(\"result\")\n", + "\n", + "movies.head(1)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 976 }, + "id": "cKZdHq0XS1iW", + "outputId": "72cbad57-4518-4e1e-97bb-333d424dba73" + }, + "execution_count": 5, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "ZNPzFjCyPap0", - "outputId": "346d20b2-d615-4094-d24e-2d40e5c90ee2" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.12/dist-packages/bigframes/core/global_session.py:113: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n", - " _global_session = bigframes.session.connect(\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " Query processed 0 Bytes in a moment of slot time. 
[Job bigframes-dev:US.48a27954-7a4a-4b9e-8176-ea227fd188ad details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " Query processed 1.3 kB in a minute of slot time. [Job bigframes-dev:US.09c48ecb-e041-4c18-a390-ca5a36fd07c3 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " Query processed 1.2 kB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
poster
0
\n", - "

1 rows × 1 columns

\n", - "
[1 rows x 1 columns in total]" - ], - "text/plain": [ - " poster\n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0...\n", - "\n", - "[1 rows x 1 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in a minute of slot time. [Job bigframes-dev:US.cdbe8ee8-3e39-4cb3-aaf8-060419f5b58a details]\n", + " " ], - "source": [ - "# Replace with your own connection name.\n", - "MY_CONNECTION = 'bigframes-default-connection' # @param {type:\"string\"}\n", - "\n", - "movies = bpd.from_glob_path(\n", - " \"gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/*\",\n", - " connection = MY_CONNECTION,\n", - " name='poster')\n", - "movies.head(1)" + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "EfkdDH08QnYw" - }, - "source": [ - "## Extract titles from posters" + "data": { + "text/html": [ + "\n", + " Query processed 347 Bytes in a moment of slot time.\n", + " " + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "6CoZZ5tSQm1r", - "outputId": "1b3915ce-eb83-4be9-b1c1-d9a326dc9408" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is 
deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n", - "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " Query processed 1.3 kB in 2 minutes of slot time. [Job bigframes-dev:US.4a08a15f-5a2f-463b-bba8-734858ec992b details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " Query processed 1.2 kB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
postertitle
0Der Student von Prag
\n", - "

1 rows × 2 columns

\n", - "
[1 rows x 2 columns in total]" - ], - "text/plain": [ - " poster title\n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Der Student von Prag\n", - "\n", - "[1 rows x 2 columns]" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitleyear
0{'uri': 'gs://cloud-samples-data/vertex-ai/dat...The movie title for the poster image is **Au S...1924
\n", + "

1 rows \u00d7 3 columns

\n", + "
[1 rows x 3 columns in total]" ], - "source": [ - "import bigframes.bigquery as bbq\n", - "\n", - "movies['title'] = bbq.ai.generate(\n", - " (\"What is the movie title for this poster? Name only\", movies['poster']),\n", - " endpoint='gemini-2.5-pro'\n", - ").struct.field(\"result\")\n", - "movies.head(1)" + "text/plain": [ + " poster \\\n", + "0 {'uri': 'gs://cloud-samples-data/vertex-ai/dat... \n", + "\n", + " title year \n", + "0 The movie title for the poster image is **Au S... 1924 \n", + "\n", + "[1 rows x 3 columns]" ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ] + }, + { + "id": "8bf12352", + "cell_type": "code", + "source": [ + "movies.dtypes" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 250 }, + "id": "yqRiNRY8_8fs", + "outputId": "efa60107-6883-4f5c-8e40-43c7287ea7fb" + }, + "execution_count": 6, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "cFQHQ9S2lr6t" - }, - "source": [ - "Notice that `ai.generate()` has a `struct` return type, which holds not only the LLM response, but also the status. If you do not provide a field name for your answer, `\"result\"` will be the default name. You can access LLM response content with the struct accessor (e.g. `my_response.struct.filed(\"result\")`);." 
- ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "R8kkUhgoS5Xz" - }, - "source": [ - "## Get movie release year\n", - "\n", - "In the example below, you will use `ai.generate_int()` to find the release year for each movie poster:" + "data": { + "text/plain": [ + "poster structJob bigframes-dev:US.b60a151a-6cbc-405e-9c40-8a7461981a00 details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " Query processed 1.3 kB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
postertitleyear
0Der Student von Prag1913
\n", - "

1 rows × 3 columns

\n", - "
[1 rows x 3 columns in total]" - ], - "text/plain": [ - " poster title \\\n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Der Student von Prag \n", - "\n", - " year \n", - "0 1913 \n", - "\n", - "[1 rows x 3 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "movies['year'] = bbq.ai.generate_int(\n", - " (\"What is the release year for this movie?\", movies['title']),\n", - " endpoint='gemini-2.5-pro'\n", - ").struct.field(\"result\")\n", - "\n", - "movies.head(1)" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] }, { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 250 - }, - "id": "yqRiNRY8_8fs", - "outputId": "efa60107-6883-4f5c-8e40-43c7287ea7fb" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0
posterstruct<uri: string, version: string, authorize...
titlestring[pyarrow]
yearInt64
\n", - "

" - ], - "text/plain": [ - "poster structSQL
WITH `bfcte_0` AS (\n",
+       "  SELECT\n",
+       "    *\n",
+       "  FROM UNNEST(ARRAY<STRUCT<`bfcol_0` STRING, `bfcol_1` INT64, `bfcol_2` INT64>>[STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/au_secours.jpeg',\n",
+       "    0,\n",
+       "    0\n",
+       "  ), STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/barque_sortant_du_port.jpeg',\n",
+       "    1,\n",
+       "    1\n",
+       "  ), STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/battling_butler.jpg',\n",
+       "    2,\n",
+       "    2\n",
+       "  ), STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/brown_of_harvard.jpeg',\n",
+       "    3,\n",
+       "    3\n",
+       "  ), STRUCT(\n",
+       "    'gs://cloud-samples-data/vertex-ai/dataset-management/datasets/classic-movie-posters/der_student_von_prag.jpg',\n",
+       "    4,\n",
+       "    4\n",
+       "  )])\n",
+       ")\n",
+       "SELECT\n",
+       "  `bfcol_1` AS `bfuid_col_52`,\n",
+       "  TO_JSON_STRING(\n",
+       "    OBJ.GET_ACCESS_URL(OBJ.MAKE_REF(`bfcol_0`, 'bigframes-dev.us.bigframes-default-connection'), 'R')\n",
+       "  ) AS `bfuid_col_58`\n",
+       "FROM `bfcte_0`\n",
+       "WHERE\n",
+       "  AI.IF(\n",
+       "    prompt => (\n",
+       "      'The movie ',\n",
+       "      AI.GENERATE(\n",
+       "        prompt => (\n",
+       "          'What is the movie title for this poster image?',\n",
+       "          JSON_VALUE(\n",
+       "            OBJ.GET_ACCESS_URL(OBJ.MAKE_REF(`bfcol_0`, 'bigframes-dev.us.bigframes-default-connection'), 'R'),\n",
+       "            '$.access_urls.read_url'\n",
+       "          )\n",
+       "        ),\n",
+       "        request_type => 'UNSPECIFIED'\n",
+       "      ).`result`,\n",
+       "      ' was made in US'\n",
+       "    )\n",
+       "  )\n",
+       "ORDER BY\n",
+       "  `bfcol_2` ASC NULLS LAST\n",
+       "LIMIT 1
\n", + " " ], - "source": [ - "movies.dtypes" + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "markdown", - "metadata": { - "id": "0WwbiMtdTXt5" - }, - "source": [ - "## Filter movie by production country\n", - "\n", - "In the next example, you will use `ai.if_()` to find the movies that were produced in the USA." + "data": { + "text/html": [ + "\n", + " Query processed 0 Bytes in 3 minutes of slot time. [Job bigframes-dev:US.job_pEC4qGIM1vr98oTcLjp-HYQ6R9h_ details]\n", + " " + ], + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "xTE8dj3LThy6", - "outputId": "941e04d8-9f24-4309-a59e-35e8740c9c54" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.12/dist-packages/bigframes/dtypes.py:1010: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/lib/python3.12/dist-packages/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - " Query processed 1.3 kB in 6 minutes of slot time. 
[Job bigframes-dev:US.c9bb23f0-5ceb-4d6c-8241-960c496274ae details]\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "\n", - " Query processed 1.2 kB in a moment of slot time.\n", - " " - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
postertitleyear
8Shoulder Arms1918
\n", - "

1 rows × 3 columns

\n", - "
[1 rows x 3 columns in total]" - ], - "text/plain": [ - " poster title year\n", - "8 {\"access_urls\":{\"expiry_time\":\"2026-03-27T02:0... Shoulder Arms 1918\n", - "\n", - "[1 rows x 3 columns]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
postertitleyear
3NaNThe movie title is **Brown of Harvard**.1926
" ], - "source": [ - "us_movies = movies[bbq.ai.if_(\n", - " (\"The movie \", movies['title'], \" was made in US\")\n", - ")]\n", - "us_movies.head(1)" + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" } - ], - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 -} + "language_info": { + "name": "python" + } + }, + "nbformat_minor": 0, + "nbformat": 4 +} \ No newline at end of file diff --git a/packages/bigframes/notebooks/kaggle/describe-product-images-with-bigframes-multimodal.ipynb b/packages/bigframes/notebooks/kaggle/describe-product-images-with-bigframes-multimodal.ipynb index 1c2e2b53a830..1a7de9b837f9 100644 --- a/packages/bigframes/notebooks/kaggle/describe-product-images-with-bigframes-multimodal.ipynb +++ b/packages/bigframes/notebooks/kaggle/describe-product-images-with-bigframes-multimodal.ipynb @@ -1 +1,1131 @@ -{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.13","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":110281,"databundleVersionId":13391012,"sourceType":"competition"}],"dockerImageVersionId":31089,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Describe product images with BigFrames multimodal DataFrames\n\nBased on notebook at https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/multimodal/multimodal_dataframe.ipynb\n\nThis 
notebook is introducing BigFrames Multimodal features:\n\n1. Create Multimodal DataFrame\n2. Combine unstructured data with structured data\n3. Conduct image transformations\n4. Use LLM models to ask questions and generate embeddings on images\n5. PDF chunking function\n\nInstall the bigframes package and upgrade other packages that are already included in Kaggle but have versions incompatible with bigframes.","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19"}},{"cell_type":"code","source":"%pip install --upgrade bigframes google-cloud-automl google-cloud-translate google-ai-generativelanguage tensorflow ","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"**Important:** restart the kernel by going to \"Run -> Restart & clear cell outputs\" before continuing.\n\nConfigure bigframes to use your GCP project. First, go to \"Add-ons -> Google Cloud SDK\" and click the \"Attach\" button. Then,","metadata":{}},{"cell_type":"code","source":"from kaggle_secrets import UserSecretsClient\nuser_secrets = UserSecretsClient()\nuser_credential = user_secrets.get_gcloud_credential()\nuser_secrets.set_tensorflow_credential(user_credential)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:17:14.872905Z","iopub.execute_input":"2025-08-18T20:17:14.873201Z","iopub.status.idle":"2025-08-18T20:17:14.946971Z","shell.execute_reply.started":"2025-08-18T20:17:14.873171Z","shell.execute_reply":"2025-08-18T20:17:14.945996Z"}},"outputs":[],"execution_count":2},{"cell_type":"code","source":"PROJECT = \"bigframes-dev\" # replace with your project. \n# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n\nOUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. 
\n# The connection (or bigframes-default-connection of the project) must have read/write permission to the bucket. \n# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n# In this Notebook it uses bigframes-default-connection by default. You can also bring in your own connections in each method.\n\nimport bigframes\n# Setup project\nbigframes.options.bigquery.project = PROJECT\n\n# Display options\nbigframes.options.display.blob_display_width = 300\nbigframes.options.display.progress_bar = None\n\nimport bigframes.pandas as bpd","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:17:25.573874Z","iopub.execute_input":"2025-08-18T20:17:25.574192Z","iopub.status.idle":"2025-08-18T20:17:45.102002Z","shell.execute_reply.started":"2025-08-18T20:17:25.574168Z","shell.execute_reply":"2025-08-18T20:17:45.101140Z"}},"outputs":[],"execution_count":3},{"cell_type":"code","source":"# Create blob columns from wildcard path.\ndf_image = bpd.from_glob_path(\n \"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*\", name=\"image\"\n)\n# Other ways are: from string uri column\n# df = bpd.DataFrame({\"uri\": [\"gs:///\", \"gs:///\"]})\n# df[\"blob_col\"] = df[\"uri\"].str.to_blob()\n\n# From an existing object table\n# df = bpd.read_gbq_object_table(\"\", name=\"blob_col\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:17:45.103249Z","iopub.execute_input":"2025-08-18T20:17:45.103530Z","iopub.status.idle":"2025-08-18T20:17:47.424586Z","shell.execute_reply.started":"2025-08-18T20:17:45.103499Z","shell.execute_reply":"2025-08-18T20:17:47.423762Z"}},"outputs":[{"name":"stderr","text":"/usr/local/lib/python3.11/dist-packages/bigframes/core/global_session.py:103: DefaultLocationWarning: No explicit location is set, so using location US for the session.\n _global_session = 
bigframes.session.connect(\n","output_type":"stream"},{"name":"stdout","text":"Please ensure you have selected a BigQuery account in the Notebook Add-ons menu.\n","output_type":"stream"}],"execution_count":4},{"cell_type":"code","source":"# Take only the 5 images to deal with. Preview the content of the Mutimodal DataFrame\ndf_image = df_image.head(5)\ndf_image","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:17:47.425578Z","iopub.execute_input":"2025-08-18T20:17:47.425873Z","iopub.status.idle":"2025-08-18T20:18:07.919961Z","shell.execute_reply.started":"2025-08-18T20:17:47.425844Z","shell.execute_reply":"2025-08-18T20:18:07.918942Z"}},"outputs":[{"execution_count":5,"output_type":"execute_result","data":{"text/plain":" image\n0 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n1 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n2 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n3 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n4 {'uri': 'gs://cloud-samples-data/bigquery/tuto...\n\n[5 rows x 1 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
image
0
1
2
3
4
\n

5 rows × 1 columns

\n
[5 rows x 1 columns in total]"},"metadata":{}}],"execution_count":5},{"cell_type":"markdown","source":"# 2. Combine unstructured data with structured data\n\nNow you can put more information into the table to describe the files. Such as author info from inputs, or other metadata from the gcs object itself.","metadata":{}},{"cell_type":"code","source":"# Combine unstructured data with structured data\ndf_image[\"author\"] = [\"alice\", \"bob\", \"bob\", \"alice\", \"bob\"] # type: ignore\ndf_image[\"content_type\"] = df_image[\"image\"].blob.content_type()\ndf_image[\"size\"] = df_image[\"image\"].blob.size()\ndf_image[\"updated\"] = df_image[\"image\"].blob.updated()\ndf_image","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:18:07.921884Z","iopub.execute_input":"2025-08-18T20:18:07.922593Z","iopub.status.idle":"2025-08-18T20:18:35.549725Z","shell.execute_reply.started":"2025-08-18T20:18:07.922551Z","shell.execute_reply":"2025-08-18T20:18:35.548942Z"}},"outputs":[{"name":"stderr","text":"/usr/local/lib/python3.11/dist-packages/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\nversion. Use `json_query` instead.\n warnings.warn(bfe.format_message(msg), category=UserWarning)\n/usr/local/lib/python3.11/dist-packages/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\nversion. Use `json_query` instead.\n warnings.warn(bfe.format_message(msg), category=UserWarning)\n/usr/local/lib/python3.11/dist-packages/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\nversion. Use `json_query` instead.\n warnings.warn(bfe.format_message(msg), category=UserWarning)\n","output_type":"stream"},{"execution_count":6,"output_type":"execute_result","data":{"text/plain":" image author content_type \\\n0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... 
alice image/png \n1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n2 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n3 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n4 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n\n size updated \n0 1591240 2025-03-20 17:45:04+00:00 \n1 1182951 2025-03-20 17:45:02+00:00 \n2 1520884 2025-03-20 17:44:55+00:00 \n3 1235401 2025-03-20 17:45:19+00:00 \n4 1591923 2025-03-20 17:44:47+00:00 \n\n[5 rows x 5 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
imageauthorcontent_typesizeupdated
0aliceimage/png15912402025-03-20 17:45:04+00:00
1bobimage/png11829512025-03-20 17:45:02+00:00
2bobimage/png15208842025-03-20 17:44:55+00:00
3aliceimage/png12354012025-03-20 17:45:19+00:00
4bobimage/png15919232025-03-20 17:44:47+00:00
\n

5 rows × 5 columns

\n
[5 rows x 5 columns in total]"},"metadata":{}}],"execution_count":6},{"cell_type":"markdown","source":"Then you can filter the rows based on the structured data. And for different content types, you can display them respectively or together.","metadata":{}},{"cell_type":"code","source":"# filter images and display, you can also display audio and video types\ndf_image[df_image[\"author\"] == \"alice\"][\"image\"].blob.display()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:18:55.299993Z","iopub.execute_input":"2025-08-18T20:18:55.300314Z","iopub.status.idle":"2025-08-18T20:19:09.154492Z","shell.execute_reply.started":"2025-08-18T20:18:55.300289Z","shell.execute_reply":"2025-08-18T20:19:09.153315Z"}},"outputs":[{"name":"stderr","text":"/usr/local/lib/python3.11/dist-packages/bigframes/bigquery/_operations/json.py:124: UserWarning: The `json_extract` is deprecated and will be removed in a future\nversion. Use `json_query` instead.\n warnings.warn(bfe.format_message(msg), category=UserWarning)\n","output_type":"stream"},{"output_type":"display_data","data":{"text/html":"","text/plain":""},"metadata":{}},{"output_type":"display_data","data":{"text/html":"","text/plain":""},"metadata":{}}],"execution_count":7},{"cell_type":"markdown","source":"# 3. Conduct image transformations\n\nBigFrames Multimodal DataFrame provides image(and other) transformation functions. Such as image_blur, image_resize and image_normalize. 
The output can be saved to GCS folders or to BQ as bytes.","metadata":{}},{"cell_type":"code","source":"df_image[\"blurred\"] = df_image[\"image\"].blob.image_blur(\n (20, 20), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\", engine=\"opencv\"\n)\ndf_image[\"resized\"] = df_image[\"image\"].blob.image_resize(\n (300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_resize_transformed/\", engine=\"opencv\"\n)\ndf_image[\"normalized\"] = df_image[\"image\"].blob.image_normalize(\n alpha=50.0,\n beta=150.0,\n norm_type=\"minmax\",\n dst=f\"gs://{OUTPUT_BUCKET}/image_normalize_transformed/\",\n engine=\"opencv\",\n)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:19:22.950277Z","iopub.execute_input":"2025-08-18T20:19:22.950652Z","iopub.status.idle":"2025-08-18T20:31:51.799997Z","shell.execute_reply.started":"2025-08-18T20:19:22.950625Z","shell.execute_reply":"2025-08-18T20:31:51.798840Z"}},"outputs":[{"name":"stderr","text":"/usr/local/lib/python3.11/dist-packages/bigframes/core/log_adapter.py:175: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n return method(*args, **kwargs)\n/usr/local/lib/python3.11/dist-packages/bigframes/core/log_adapter.py:175: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n return method(*args, **kwargs)\n/usr/local/lib/python3.11/dist-packages/bigframes/core/log_adapter.py:175: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n return method(*args, **kwargs)\n","output_type":"stream"}],"execution_count":8},{"cell_type":"code","source":"# You can also chain functions together\ndf_image[\"blur_resized\"] = df_image[\"blurred\"].blob.image_resize((300, 200), dst=f\"gs://{OUTPUT_BUCKET}/image_blur_resize_transformed/\", 
engine=\"opencv\")\ndf_image","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:31:51.802219Z","iopub.execute_input":"2025-08-18T20:31:51.802745Z","iopub.status.idle":"2025-08-18T20:36:13.953258Z","shell.execute_reply.started":"2025-08-18T20:31:51.802700Z","shell.execute_reply":"2025-08-18T20:36:13.951930Z"}},"outputs":[{"name":"stderr","text":"/usr/local/lib/python3.11/dist-packages/bigframes/core/log_adapter.py:175: FunctionAxisOnePreviewWarning: Blob Functions use bigframes DataFrame Managed function with axis=1 senario, which is a preview feature.\n return method(*args, **kwargs)\n","output_type":"stream"},{"execution_count":9,"output_type":"execute_result","data":{"text/plain":" image author content_type \\\n0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n2 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n3 {'uri': 'gs://cloud-samples-data/bigquery/tuto... alice image/png \n4 {'uri': 'gs://cloud-samples-data/bigquery/tuto... bob image/png \n\n size updated \\\n0 1591240 2025-03-20 17:45:04+00:00 \n1 1182951 2025-03-20 17:45:02+00:00 \n2 1520884 2025-03-20 17:44:55+00:00 \n3 1235401 2025-03-20 17:45:19+00:00 \n4 1591923 2025-03-20 17:44:47+00:00 \n\n blurred \\\n0 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n1 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n2 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n3 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n4 {'uri': 'gs://bigframes_blob_test/image_blur_t... \n\n resized \\\n0 {'uri': 'gs://bigframes_blob_test/image_resize... \n1 {'uri': 'gs://bigframes_blob_test/image_resize... \n2 {'uri': 'gs://bigframes_blob_test/image_resize... \n3 {'uri': 'gs://bigframes_blob_test/image_resize... \n4 {'uri': 'gs://bigframes_blob_test/image_resize... \n\n normalized \\\n0 {'uri': 'gs://bigframes_blob_test/image_normal... \n1 {'uri': 'gs://bigframes_blob_test/image_normal... 
\n2 {'uri': 'gs://bigframes_blob_test/image_normal... \n3 {'uri': 'gs://bigframes_blob_test/image_normal... \n4 {'uri': 'gs://bigframes_blob_test/image_normal... \n\n blur_resized \n0 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n1 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n2 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n3 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n4 {'uri': 'gs://bigframes_blob_test/image_blur_r... \n\n[5 rows x 9 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
imageauthorcontent_typesizeupdatedblurredresizednormalizedblur_resized
0aliceimage/png15912402025-03-20 17:45:04+00:00
1bobimage/png11829512025-03-20 17:45:02+00:00
2bobimage/png15208842025-03-20 17:44:55+00:00
3aliceimage/png12354012025-03-20 17:45:19+00:00
4bobimage/png15919232025-03-20 17:44:47+00:00
\n

5 rows × 9 columns

\n
[5 rows x 9 columns in total]"},"metadata":{}}],"execution_count":9},{"cell_type":"markdown","source":"# 4. Use LLM models to ask questions and generate embeddings on images","metadata":{}},{"cell_type":"code","source":"from bigframes.ml import llm\ngemini = llm.GeminiTextGenerator()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:36:13.954340Z","iopub.execute_input":"2025-08-18T20:36:13.954686Z","iopub.status.idle":"2025-08-18T20:36:43.225449Z","shell.execute_reply.started":"2025-08-18T20:36:13.954661Z","shell.execute_reply":"2025-08-18T20:36:43.224579Z"}},"outputs":[{"name":"stderr","text":"/usr/local/lib/python3.11/dist-packages/bigframes/core/log_adapter.py:175: FutureWarning: Since upgrading the default model can cause unintended breakages, the\ndefault model will be removed in BigFrames 3.0. Please supply an\nexplicit model to avoid this message.\n return method(*args, **kwargs)\n","output_type":"stream"}],"execution_count":10},{"cell_type":"code","source":"# Ask the same question on the images\ndf_image = df_image.head(2)\nanswer = gemini.predict(df_image, prompt=[\"what item is it?\", df_image[\"image\"]])\nanswer[[\"ml_generate_text_llm_result\", \"image\"]]","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:36:43.227457Z","iopub.execute_input":"2025-08-18T20:36:43.227798Z","iopub.status.idle":"2025-08-18T20:37:25.238649Z","shell.execute_reply.started":"2025-08-18T20:36:43.227764Z","shell.execute_reply":"2025-08-18T20:37:25.237623Z"}},"outputs":[{"name":"stderr","text":"/usr/local/lib/python3.11/dist-packages/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n`db_dtypes` is a preview feature and subject to change.\n warnings.warn(msg, bfe.PreviewWarning)\n","output_type":"stream"},{"execution_count":11,"output_type":"execute_result","data":{"text/plain":" ml_generate_text_llm_result \\\n0 The item is a tin of K9 Guard Dog Paw Balm. 
\n1 The item is a bottle of K9 Guard Dog Hot Spot ... \n\n image \n0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n\n[2 rows x 2 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ml_generate_text_llm_resultimage
0The item is a tin of K9 Guard Dog Paw Balm.
1The item is a bottle of K9 Guard Dog Hot Spot Spray.
\n

2 rows × 2 columns

\n
[2 rows x 2 columns in total]"},"metadata":{}}],"execution_count":11},{"cell_type":"code","source":"# Ask different questions\ndf_image[\"question\"] = [\"what item is it?\", \"what color is the picture?\"]","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:37:25.239607Z","iopub.execute_input":"2025-08-18T20:37:25.239875Z","iopub.status.idle":"2025-08-18T20:37:25.263034Z","shell.execute_reply.started":"2025-08-18T20:37:25.239847Z","shell.execute_reply":"2025-08-18T20:37:25.262002Z"}},"outputs":[],"execution_count":12},{"cell_type":"code","source":"answer_alt = gemini.predict(df_image, prompt=[df_image[\"question\"], df_image[\"image\"]])\nanswer_alt[[\"ml_generate_text_llm_result\", \"image\"]]","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:37:25.264072Z","iopub.execute_input":"2025-08-18T20:37:25.264585Z","iopub.status.idle":"2025-08-18T20:38:10.129667Z","shell.execute_reply.started":"2025-08-18T20:37:25.264518Z","shell.execute_reply":"2025-08-18T20:38:10.128677Z"}},"outputs":[{"name":"stderr","text":"/usr/local/lib/python3.11/dist-packages/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n`db_dtypes` is a preview feature and subject to change.\n warnings.warn(msg, bfe.PreviewWarning)\n","output_type":"stream"},{"execution_count":13,"output_type":"execute_result","data":{"text/plain":" ml_generate_text_llm_result \\\n0 The item is a tin of K9 Guard Dog Paw Balm. \n1 The picture has colors such as white, gray, an... \n\n image \n0 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n1 {'uri': 'gs://cloud-samples-data/bigquery/tuto... \n\n[2 rows x 2 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ml_generate_text_llm_resultimage
0The item is a tin of K9 Guard Dog Paw Balm.
1The picture has colors such as white, gray, and a light blue (cyan).
\n

2 rows × 2 columns

\n
[2 rows x 2 columns in total]"},"metadata":{}}],"execution_count":13},{"cell_type":"code","source":"# Generate embeddings.\nembed_model = llm.MultimodalEmbeddingGenerator()\nembeddings = embed_model.predict(df_image[\"image\"])\nembeddings","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-08-18T20:38:10.130617Z","iopub.execute_input":"2025-08-18T20:38:10.130851Z","iopub.status.idle":"2025-08-18T20:39:04.790416Z","shell.execute_reply.started":"2025-08-18T20:38:10.130833Z","shell.execute_reply":"2025-08-18T20:39:04.789398Z"}},"outputs":[{"name":"stderr","text":"/usr/local/lib/python3.11/dist-packages/bigframes/core/log_adapter.py:175: FutureWarning: Since upgrading the default model can cause unintended breakages, the\ndefault model will be removed in BigFrames 3.0. Please supply an\nexplicit model to avoid this message.\n return method(*args, **kwargs)\n/usr/local/lib/python3.11/dist-packages/bigframes/core/array_value.py:108: PreviewWarning: JSON column interpretation as a custom PyArrow extention in\n`db_dtypes` is a preview feature and subject to change.\n warnings.warn(msg, bfe.PreviewWarning)\n","output_type":"stream"},{"execution_count":14,"output_type":"execute_result","data":{"text/plain":" ml_generate_embedding_result \\\n0 [ 0.00638822 0.01666385 0.00451817 ... -0.02... \n1 [ 0.00973672 0.02148364 0.00244308 ... 0.00... \n\n ml_generate_embedding_status ml_generate_embedding_start_sec \\\n0 \n1 \n\n ml_generate_embedding_end_sec \\\n0 \n1 \n\n content \n0 {\"access_urls\":{\"expiry_time\":\"2025-08-19T02:3... \n1 {\"access_urls\":{\"expiry_time\":\"2025-08-19T02:3... \n\n[2 rows x 5 columns]","text/html":"
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
ml_generate_embedding_resultml_generate_embedding_statusml_generate_embedding_start_secml_generate_embedding_end_seccontent
0[ 0.00638822 0.01666385 0.00451817 ... -0.02...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2025-08-19T02:3...
1[ 0.00973672 0.02148364 0.00244308 ... 0.00...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2025-08-19T02:3...
\n

2 rows × 5 columns

\n
[2 rows x 5 columns in total]"},"metadata":{}}],"execution_count":14},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]} +{ + "cells": [ + { + "cell_type": "markdown", + "id": "876eb80c", + "metadata": { + "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", + "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" + }, + "source": [ + "# Describe product images with BigFrames multimodal DataFrames\n", + "\n", + "Based on notebook at https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/multimodal/multimodal_dataframe.ipynb\n", + "\n", + "This notebook is introducing BigFrames Multimodal features:\n", + "\n", + "1. Create Multimodal DataFrame\n", + "2. Combine unstructured data with structured data\n", + "3. Conduct image transformations\n", + "4. Use LLM models to ask questions and generate embeddings on images\n", + "5. PDF chunking function\n", + "\n", + "Install the bigframes package and upgrade other packages that are already included in Kaggle but have versions incompatible with bigframes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "0506e15e", + "metadata": { + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: bigframes in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (2.39.0)\n", + "Requirement already satisfied: google-cloud-automl in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (2.19.0)\n", + "Requirement already satisfied: google-cloud-translate in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (3.26.0)\n", + "Requirement already satisfied: google-ai-generativelanguage in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (0.11.0)\n", + "Requirement already satisfied: tensorflow in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (2.21.0)\n", + "Requirement already satisfied: cloudpickle>=2.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (3.1.2)\n", + "Requirement already satisfied: fsspec>=2023.3.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2026.1.0)\n", + "Requirement already satisfied: gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2026.1.0)\n", + "Requirement already satisfied: geopandas>=0.12.2 in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (1.1.3)\n", + "Requirement already satisfied: google-auth<3.0,>=2.15.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2.49.1)\n", + "Requirement already satisfied: google-cloud-bigquery>=3.36.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-cloud-bigquery[bqstorage,pandas]>=3.36.0->bigframes) (3.41.0)\n", + "Requirement already satisfied: google-cloud-bigquery-storage<3.0.0,>=2.30.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2.37.0)\n", + "Requirement already satisfied: google-cloud-functions>=1.12.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (1.23.0)\n", + "Requirement already satisfied: google-cloud-bigquery-connection>=1.12.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (1.21.0)\n", + "Requirement already satisfied: google-cloud-resource-manager>=1.10.3 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (1.17.0)\n", + "Requirement already satisfied: google-cloud-storage>=2.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (3.10.1)\n", + "Requirement already satisfied: google-crc32c<2.0.0,>=1.0.0 in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (1.8.0)\n", + "Requirement already satisfied: grpc-google-iam-v1>=0.14.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (0.14.4)\n", + "Requirement already satisfied: numpy>=1.24.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2.4.4)\n", + "Requirement already satisfied: pandas>=1.5.3 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2.3.3)\n", + "Requirement already satisfied: pandas-gbq>=0.26.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (0.34.1)\n", + "Requirement already satisfied: pyarrow>=15.0.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (21.0.0)\n", + "Requirement already satisfied: pydata-google-auth>=1.8.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (1.9.1)\n", + "Requirement already satisfied: requests>=2.27.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2.33.1)\n", + "Requirement already satisfied: shapely>=1.8.5 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2.1.2)\n", + "Requirement already satisfied: tabulate>=0.9 in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (0.10.0)\n", + "Requirement already satisfied: humanize>=4.6.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (4.15.0)\n", + "Requirement already satisfied: matplotlib>=3.7.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (3.10.8)\n", + "Requirement already satisfied: db-dtypes>=1.4.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (1.5.1)\n", + "Requirement already satisfied: pyiceberg>=0.7.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (0.11.1)\n", + "Requirement already satisfied: atpublic<6,>=2.3 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (5.1)\n", + "Requirement already satisfied: python-dateutil<3,>=2.8.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2022.7 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (2026.1.post1)\n", + "Requirement already satisfied: toolz<2,>=0.11 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (1.1.0)\n", + "Requirement already satisfied: typing-extensions<5,>=4.5.0 in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (4.15.0)\n", + "Requirement already satisfied: rich<14,>=12.4.4 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from bigframes) (13.9.4)\n", + "Requirement already satisfied: google-api-core<3.0.0,>=2.11.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-api-core[grpc]<3.0.0,>=2.11.0->google-cloud-automl) (2.30.2)\n", + "Requirement already satisfied: grpcio<2.0.0,>=1.33.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-cloud-automl) (1.80.0)\n", + "Requirement already satisfied: proto-plus<2.0.0,>=1.22.3 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-cloud-automl) (1.27.2)\n", + "Requirement already satisfied: protobuf<8.0.0,>=4.25.8 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-cloud-automl) (6.33.6)\n", + "Requirement already satisfied: google-cloud-core<3.0.0,>=2.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-cloud-translate) (2.5.1)\n", + "Requirement already satisfied: absl-py>=1.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (2.4.0)\n", + "Requirement already satisfied: astunparse>=1.6.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) 
(1.6.3)\n", + "Requirement already satisfied: flatbuffers>=25.9.23 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (25.12.19)\n", + "Requirement already satisfied: gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (0.7.0)\n", + "Requirement already satisfied: google_pasta>=0.1.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (0.2.0)\n", + "Requirement already satisfied: libclang>=13.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (18.1.1)\n", + "Requirement already satisfied: opt_einsum>=2.3.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (3.4.0)\n", + "Requirement already satisfied: packaging in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (26.0)\n", + "Requirement already satisfied: setuptools in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (82.0.1)\n", + "Requirement already satisfied: six>=1.12.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (1.17.0)\n", + "Requirement already satisfied: termcolor>=1.1.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (3.3.0)\n", + "Requirement already satisfied: wrapt>=1.11.0 in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (2.1.2)\n", + "Requirement already satisfied: keras>=3.12.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (3.14.0)\n", + "Requirement already satisfied: h5py<3.15.0,>=3.11.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (3.14.0)\n", + "Requirement already satisfied: ml_dtypes<1.0.0,>=0.5.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from tensorflow) (0.5.4)\n", + "Requirement already satisfied: wheel<1.0,>=0.23.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from astunparse>=1.6.0->tensorflow) (0.47.0)\n", + "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (3.13.5)\n", + "Requirement already satisfied: decorator>4.1.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (5.2.1)\n", + "Requirement already satisfied: google-auth-oauthlib in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (1.3.1)\n", + "Requirement already satisfied: google-cloud-storage-control in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (1.11.0)\n", + "Requirement already satisfied: pyogrio>=0.7.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from geopandas>=0.12.2->bigframes) (0.12.1)\n", + "Requirement already satisfied: pyproj>=3.5.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from geopandas>=0.12.2->bigframes) (3.7.2)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.0,>=1.63.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-api-core<3.0.0,>=2.11.0->google-api-core[grpc]<3.0.0,>=2.11.0->google-cloud-automl) (1.74.0)\n", + "Requirement already satisfied: grpcio-status<2.0.0,>=1.33.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-api-core[grpc]<3.0.0,>=2.11.0->google-cloud-automl) (1.80.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-auth<3.0,>=2.15.0->bigframes) (0.4.2)\n", + "Requirement already satisfied: cryptography>=38.0.3 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-auth<3.0,>=2.15.0->bigframes) (46.0.7)\n", + "Requirement already satisfied: google-resumable-media<3.0.0,>=2.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from 
google-cloud-bigquery>=3.36.0->google-cloud-bigquery[bqstorage,pandas]>=3.36.0->bigframes) (2.8.2)\n", + "Requirement already satisfied: namex in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from keras>=3.12.0->tensorflow) (0.1.0)\n", + "Requirement already satisfied: optree in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from keras>=3.12.0->tensorflow) (0.19.0)\n", + "Requirement already satisfied: contourpy>=1.0.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from matplotlib>=3.7.1->bigframes) (1.3.3)\n", + "Requirement already satisfied: cycler>=0.10 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from matplotlib>=3.7.1->bigframes) (0.12.1)\n", + "Requirement already satisfied: fonttools>=4.22.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from matplotlib>=3.7.1->bigframes) (4.62.1)\n", + "Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from matplotlib>=3.7.1->bigframes) (1.5.0)\n", + "Requirement already satisfied: pillow>=8 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from matplotlib>=3.7.1->bigframes) (12.2.0)\n", + "Requirement already satisfied: pyparsing>=3 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from matplotlib>=3.7.1->bigframes) (3.3.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pandas>=1.5.3->bigframes) (2026.1)\n", + "Requirement already satisfied: psutil>=5.9.8 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pandas-gbq>=0.26.1->bigframes) (7.2.2)\n", + "Requirement already satisfied: mmh3<6.0.0,>=4.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pyiceberg>=0.7.1->bigframes) (5.2.1)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pyiceberg>=0.7.1->bigframes) (8.3.2)\n", + "Requirement already satisfied: strictyaml<2.0.0,>=1.7.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pyiceberg>=0.7.1->bigframes) (1.7.3)\n", + "Requirement already satisfied: pydantic!=2.12.0,!=2.12.1,!=2.4.0,!=2.4.1,<3.0,>=2.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pyiceberg>=0.7.1->bigframes) (2.12.5)\n", + "Requirement already satisfied: tenacity<10.0.0,>=8.2.3 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pyiceberg>=0.7.1->bigframes) (9.1.4)\n", + "Requirement already satisfied: pyroaring<2.0.0,>=1.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pyiceberg>=0.7.1->bigframes) (1.0.4)\n", + "Requirement already satisfied: cachetools<7.0,>=5.5 in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pyiceberg>=0.7.1->bigframes) (6.2.6)\n", + "Requirement already satisfied: zstandard<1.0.0,>=0.13.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pyiceberg>=0.7.1->bigframes) (0.25.0)\n", + "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from requests>=2.27.1->bigframes) (3.4.7)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from requests>=2.27.1->bigframes) (3.11)\n", + "Requirement already satisfied: urllib3<3,>=1.26 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from requests>=2.27.1->bigframes) (2.6.3)\n", + "Requirement already satisfied: certifi>=2023.5.7 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from requests>=2.27.1->bigframes) (2026.2.25)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from rich<14,>=12.4.4->bigframes) (4.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from rich<14,>=12.4.4->bigframes) (2.20.0)\n", + "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (2.6.1)\n", + "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (1.4.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (26.1.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (1.8.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (6.7.1)\n", + "Requirement already satisfied: propcache>=0.2.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (0.4.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (1.23.0)\n", + "Requirement 
already satisfied: cffi>=2.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from cryptography>=38.0.3->google-auth<3.0,>=2.15.0->bigframes) (2.0.0)\n", + "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from google-auth-oauthlib->gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (2.0.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from markdown-it-py>=2.2.0->rich<14,>=12.4.4->bigframes) (0.1.2)\n", + "Requirement already satisfied: pyasn1<0.7.0,>=0.6.1 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0,>=2.15.0->bigframes) (0.6.3)\n", + "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pydantic!=2.12.0,!=2.12.1,!=2.4.0,!=2.4.1,<3.0,>=2.0->pyiceberg>=0.7.1->bigframes) (0.7.0)\n", + "Requirement already satisfied: pydantic-core==2.41.5 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pydantic!=2.12.0,!=2.12.1,!=2.4.0,!=2.4.1,<3.0,>=2.0->pyiceberg>=0.7.1->bigframes) (2.41.5)\n", + "Requirement already satisfied: typing-inspection>=0.4.2 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from pydantic!=2.12.0,!=2.12.1,!=2.4.0,!=2.4.1,<3.0,>=2.0->pyiceberg>=0.7.1->bigframes) (0.4.2)\n", + "Requirement already satisfied: pycparser in 
/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from cffi>=2.0.0->cryptography>=38.0.3->google-auth<3.0,>=2.15.0->bigframes) (3.0)\n", + "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/.venv/lib/python3.13/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib->gcsfs!=2025.5.0,!=2026.2.0,!=2026.3.0,>=2023.3.0->bigframes) (3.3.1)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install --upgrade bigframes google-cloud-automl google-cloud-translate google-ai-generativelanguage tensorflow " + ] + }, + { + "cell_type": "markdown", + "id": "c749e07c", + "metadata": {}, + "source": [ + "**Important:** restart the kernel by going to \"Run -> Restart & clear cell outputs\" before continuing.\n", + "\n", + "Configure bigframes to use your GCP project. First, go to \"Add-ons -> Google Cloud SDK\" and click the \"Attach\" button. 
Then," + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5e00777d", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:17:14.873201Z", + "iopub.status.busy": "2025-08-18T20:17:14.872905Z", + "iopub.status.idle": "2025-08-18T20:17:14.946971Z", + "shell.execute_reply": "2025-08-18T20:17:14.945996Z", + "shell.execute_reply.started": "2025-08-18T20:17:14.873171Z" + }, + "trusted": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Not running on Kaggle, skipping Kaggle secrets initialization.\n" + ] + } + ], + "source": [ + "try:\n", + " from kaggle_secrets import UserSecretsClient\n", + " user_secrets = UserSecretsClient()\n", + " user_credential = user_secrets.get_gcloud_credential()\n", + " user_secrets.set_tensorflow_credential(user_credential)\n", + " print(\"Successfully authenticated using Kaggle secrets.\")\n", + "except ImportError:\n", + " print(\"Not running on Kaggle, skipping Kaggle secrets initialization.\")\n", + "except Exception as e:\n", + " print(f\"Could not initialize Kaggle secrets: {e}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b2e171de", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:17:25.574192Z", + "iopub.status.busy": "2025-08-18T20:17:25.573874Z", + "iopub.status.idle": "2025-08-18T20:17:45.102002Z", + "shell.execute_reply": "2025-08-18T20:17:45.101140Z", + "shell.execute_reply.started": "2025-08-18T20:17:25.574168Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "PROJECT = \"bigframes-dev\" # replace with your project. \n", + "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#required_roles for your required permissions\n", + "\n", + "LOCATION = \"us\" # replace with your location.\n", + "DATASET_ID = \"bigframes_samples\" # replace with your dataset ID.\n", + "OUTPUT_BUCKET = \"bigframes_blob_test\" # replace with your GCS bucket. 
\n", + "\n", + "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", + "\n", + "import bigframes\n", + "# Setup project\n", + "bigframes.options.bigquery.project = PROJECT\n", + "bigframes.options.bigquery.location = LOCATION\n", + "\n", + "# Display options\n", + "bigframes.options.display.blob_display_width = 300\n", + "bigframes.options.display.progress_bar = None\n", + "\n", + "import bigframes.pandas as bpd\n", + "import bigframes.bigquery as bbq\n", + "\n", + "def get_runtime_json_str(series, mode=\"R\", with_metadata=False):\n", + " \"\"\"Get runtime JSON from objectref.\"\"\"\n", + " s = bbq.obj.fetch_metadata(series) if with_metadata else series\n", + " runtime = bbq.obj.get_access_url(s, mode=mode)\n", + " return bbq.to_json_string(runtime)\n", + "\n", + "def get_metadata(series):\n", + " metadata_obj = bbq.obj.fetch_metadata(series)\n", + " return bbq.json_query(metadata_obj.struct.field(\"details\"), \"$.gcs_metadata\")\n", + "\n", + "def get_content_type(series):\n", + " return bbq.json_value(get_metadata(series), \"$.content_type\")\n", + "\n", + "def get_size(series):\n", + " return bbq.json_value(get_metadata(series), \"$.size\").astype(\"Int64\")\n", + "\n", + "def get_updated(series):\n", + " return bpd.to_datetime(bbq.json_value(get_metadata(series), \"$.updated\").astype(\"Int64\"), unit=\"us\", utc=True)\n", + "\n", + "from IPython.display import HTML, display\n", + "\n", + "def render_images(df):\n", + " \"\"\"Helper to display BigFrames DataFrame with rendered image previews.\"\"\"\n", + " import bigframes.pandas as bpd\n", + " import bigframes.bigquery as bbq\n", + " import bigframes\n", + " from bigframes import dtypes\n", + " import json\n", + " \n", + " if isinstance(df, bpd.Series):\n", + " df = df.to_frame()\n", + " \n", + " object_cols = [\n", + " col for col, dtype in zip(df.columns, df.dtypes)\n", + " if dtype == dtypes.OBJ_REF_DTYPE\n", + " ]\n", + " \n", + " if not object_cols:\n", + " display(df)\n", + " 
return\n", + "\n", + " limit = bigframes.options.display.max_rows or 10\n", + " view_df = df.head(limit)\n", + " \n", + " runtime_cols = {\n", + " col: get_runtime_json_str(view_df[col], mode=\"R\", with_metadata=False) \n", + " for col in object_cols\n", + " }\n", + " \n", + " pandas_json_df = bpd.DataFrame(runtime_cols).to_pandas()\n", + " final_pd = view_df.to_pandas()\n", + " \n", + " width = bigframes.options.display.blob_display_width or 300\n", + " IMAGE_EXTENSIONS = (\".png\", \".jpg\", \".jpeg\", \".gif\", \".webp\")\n", + " \n", + " def format_cell_html(raw_json):\n", + " if not raw_json:\n", + " return \"\"\n", + " try:\n", + " obj_rt = json.loads(raw_json)\n", + " if \"access_urls\" not in obj_rt:\n", + " err = obj_rt.get(\"errors\", [{\"message\": \"URL Generation Failed\"}])[0].get(\"message\")\n", + " return f'Error: {err}'\n", + " \n", + " uri = obj_rt.get(\"objectref\", {}).get(\"uri\", \"\")\n", + " url = obj_rt[\"access_urls\"][\"read_url\"]\n", + " \n", + " if uri and str(uri).lower().endswith(IMAGE_EXTENSIONS):\n", + " return f''\n", + " \n", + " return f'{uri if uri else \"view\"}'\n", + " except:\n", + " return \"Format Error\"\n", + "\n", + " for col in object_cols:\n", + " final_pd[col] = pandas_json_df[col].map(format_cell_html)\n", + " \n", + " display(HTML(final_pd.to_html(escape=False)))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d17afaf1", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:17:45.103530Z", + "iopub.status.busy": "2025-08-18T20:17:45.103249Z", + "iopub.status.idle": "2025-08-18T20:17:47.424586Z", + "shell.execute_reply": "2025-08-18T20:17:47.423762Z", + "shell.execute_reply.started": "2025-08-18T20:17:45.103499Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "import gcsfs\n", + "import bigframes.bigquery as bbq\n", + "\n", + "# List files using gcsfs (public bucket)\n", + "fs = gcsfs.GCSFileSystem(anon=True)\n", + "uris = 
fs.glob(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*\")\n", + "\n", + "# Ensure URIs have gs:// prefix\n", + "uris = [u if u.startswith(\"gs://\") else f\"gs://{u}\" for u in uris]\n", + "\n", + "# Read the URIs into a BigQuery DataFrame using UNNEST\n", + "# We take the first 5 for this example\n", + "df_image = bpd.read_gbq(f\"SELECT uri FROM UNNEST({uris[:5]}) as uri\")\n", + "\n", + "# Create the object reference column\n", + "df_image['image'] = bbq.obj.make_ref(df_image['uri'], authorizer=FULL_CONNECTION_ID)\n", + "df_image = df_image[['image']]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "3e84b922", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:17:47.425873Z", + "iopub.status.busy": "2025-08-18T20:17:47.425578Z", + "iopub.status.idle": "2025-08-18T20:18:07.919961Z", + "shell.execute_reply": "2025-08-18T20:18:07.918942Z", + "shell.execute_reply.started": "2025-08-18T20:17:47.425844Z" + }, + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
image
0
1
2
3
4
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Take only the 5 images to deal with. Preview the content of the Mutimodal DataFrame\n", + "df_image = df_image.head(5)\n", + "render_images(df_image)" + ] + }, + { + "cell_type": "markdown", + "id": "b0eaa73c", + "metadata": {}, + "source": [ + "# 2. Combine unstructured data with structured data\n", + "\n", + "Now you can put more information into the table to describe the files. Such as author info from inputs, or other metadata from the gcs object itself." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7d64fb54", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:18:07.922593Z", + "iopub.status.busy": "2025-08-18T20:18:07.921884Z", + "iopub.status.idle": "2025-08-18T20:18:35.549725Z", + "shell.execute_reply": "2025-08-18T20:18:35.548942Z", + "shell.execute_reply.started": "2025-08-18T20:18:07.922551Z" + }, + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + "
imageauthorcontent_typesizeupdated
0aliceimage/png7157662025-03-20 17:44:38+00:00
1bobimage/png11674062025-03-20 17:44:38+00:00
2bobimage/png11508922025-03-20 17:44:39+00:00
3aliceimage/png17365332025-03-20 17:44:39+00:00
4bobimage/png4397402025-03-20 17:44:39+00:00
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Combine unstructured data with structured data\n", + "df_image[\"author\"] = [\"alice\", \"bob\", \"bob\", \"alice\", \"bob\"] # type: ignore\n", + "df_image[\"content_type\"] = get_content_type(df_image[\"image\"])\n", + "df_image[\"size\"] = get_size(df_image[\"image\"])\n", + "df_image[\"updated\"] = get_updated(df_image[\"image\"])\n", + "render_images(df_image)" + ] + }, + { + "cell_type": "markdown", + "id": "a23ef0e4", + "metadata": {}, + "source": [ + "Then you can filter the rows based on the structured data. And for different content types, you can display them respectively or together." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ce102df0", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:18:55.300314Z", + "iopub.status.busy": "2025-08-18T20:18:55.299993Z", + "iopub.status.idle": "2025-08-18T20:19:09.154492Z", + "shell.execute_reply": "2025-08-18T20:19:09.153315Z", + "shell.execute_reply.started": "2025-08-18T20:18:55.300289Z" + }, + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imageauthorcontent_typesizeupdated
0aliceimage/png7157662025-03-20 17:44:38+00:00
3aliceimage/png17365332025-03-20 17:44:39+00:00
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# filter images and display, you can also display audio and video types\n", + "filtered_df = df_image[df_image[\"author\"] == \"alice\"]\n", + "render_images(filtered_df)" + ] + }, + { + "cell_type": "markdown", + "id": "db2b3b12", + "metadata": {}, + "source": [ + "# 3. Conduct image transformations\n", + "\n", + "BigFrames Multimodal DataFrame provides image(and other) transformation functions. Such as image_blur, image_resize and image_normalize. The output can be saved to GCS folders or to BQ as bytes." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "283036f5", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:19:22.950652Z", + "iopub.status.busy": "2025-08-18T20:19:22.950277Z", + "iopub.status.idle": "2025-08-18T20:31:51.799997Z", + "shell.execute_reply": "2025-08-18T20:31:51.798840Z", + "shell.execute_reply.started": "2025-08-18T20:19:22.950625Z" + }, + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/pandas/__init__.py:211: PreviewWarning: udf is in preview.\n", + " return global_session.with_default_session(\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dataframe.py:4695: FunctionAxisOnePreviewWarning: DataFrame.apply with parameter axis=1 scenario is in preview.\n", + " warnings.warn(msg, category=bfe.FunctionAxisOnePreviewWarning)\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and 
pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
imageblurred
0
1
2
3
4
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "@bpd.udf(\n", + " input_types=[str, str, int, int],\n", + " output_type=str,\n", + " dataset=DATASET_ID,\n", + " name=\"image_blur_kaggle\",\n", + " bigquery_connection=FULL_CONNECTION_ID,\n", + " packages=[\"opencv-python-headless\", \"numpy\", \"requests\"],\n", + ")\n", + "def image_blur(src_rt: str, dst_rt: str, kx: int, ky: int) -> str:\n", + " import json\n", + " import cv2 as cv\n", + " import numpy as np\n", + " import requests\n", + " \n", + " src_obj = json.loads(src_rt)\n", + " if \"access_urls\" not in src_obj:\n", + " raise ValueError(f\"Missing 'access_urls' in source object. Response: {src_obj}\")\n", + " src_url = src_obj[\"access_urls\"][\"read_url\"]\n", + " \n", + " response = requests.get(src_url, timeout=30)\n", + " response.raise_for_status()\n", + " \n", + " img = cv.imdecode(np.frombuffer(response.content, np.uint8), cv.IMREAD_UNCHANGED)\n", + " if img is None:\n", + " raise ValueError(\"cv.imdecode failed\")\n", + " \n", + " img_blurred = cv.blur(img, ksize=(int(kx), int(ky)))\n", + " success, encoded = cv.imencode(\".jpeg\", img_blurred)\n", + " \n", + " if not success:\n", + " raise ValueError(\"cv.imencode failed\")\n", + " \n", + " if dst_rt: # GCS Output Mode\n", + " dst_obj = json.loads(dst_rt)\n", + " if \"access_urls\" not in dst_obj:\n", + " raise ValueError(f\"Missing 'access_urls' in destination object. 
Response: {dst_obj}\")\n", + " dst_url = dst_obj[\"access_urls\"][\"write_url\"]\n", + " \n", + " requests.put(dst_url, data=encoded.tobytes(), headers={\"Content-Type\": \"image/jpeg\"}, timeout=30).raise_for_status()\n", + " return dst_obj[\"objectref\"][\"uri\"]\n", + " return \"\"\n", + "\n", + "def apply_transformation(series, dst_folder, udf, *args, verbose=False):\n", + " import os\n", + " dst_folder = os.path.join(dst_folder, \"\")\n", + " metadata = bbq.obj.fetch_metadata(series)\n", + " current_uri = metadata.struct.field(\"uri\")\n", + " dst_uri = current_uri.str.replace(r\"^.*\\/(.*)$\", rf\"{dst_folder}\\1\", regex=True)\n", + " \n", + " # Bypass synchronous validation via JSON initialization\n", + " dst_blob_df = bpd.DataFrame({\"uri\": dst_uri})\n", + " dst_blob_df[\"authorizer\"] = FULL_CONNECTION_ID\n", + " dst_blob = bbq.obj.make_ref(bbq.to_json(bbq.struct(dst_blob_df)))\n", + "\n", + " df_transform = bpd.DataFrame({\n", + " \"src_rt\": get_runtime_json_str(series, mode=\"R\"),\n", + " \"dst_rt\": get_runtime_json_str(dst_blob, mode=\"RW\"),\n", + " })\n", + " res = df_transform[[\"src_rt\", \"dst_rt\"]].apply(udf, axis=1, args=args)\n", + " \n", + " if verbose:\n", + " return res\n", + " \n", + " res_df = bpd.DataFrame({\"uri\": res})\n", + " res_df[\"authorizer\"] = FULL_CONNECTION_ID\n", + " return bbq.obj.make_ref(bbq.to_json(bbq.struct(res_df)))\n", + "\n", + "# Apply Blur Transformation\n", + "df_image[\"blurred\"] = apply_transformation(\n", + " df_image[\"image\"], f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\",\n", + " image_blur, 20, 20\n", + ")\n", + "render_images(df_image[[\"image\", \"blurred\"]])" + ] + }, + { + "cell_type": "markdown", + "id": "2d68a468", + "metadata": {}, + "source": [ + "# 4. 
Use LLM models to ask questions and generate embeddings on images" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "662054a0", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:36:13.954686Z", + "iopub.status.busy": "2025-08-18T20:36:13.954340Z", + "iopub.status.idle": "2025-08-18T20:36:43.225449Z", + "shell.execute_reply": "2025-08-18T20:36:43.224579Z", + "shell.execute_reply.started": "2025-08-18T20:36:13.954661Z" + }, + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/core/logging/log_adapter.py:183: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "default model will be removed in BigFrames 3.0. Please supply an\n", + "explicit model to avoid this message.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/session/__init__.py:437: FutureWarning: You are using the BigFrames session default connection: bigframes-\n", + "default-connection, which can be different from the\n", + "BigQuery project default connection. 
This default\n", + "connection may change in the future.\n", + " warnings.warn(msg, category=FutureWarning)\n" + ] + } + ], + "source": [ + "from bigframes.ml import llm\n", + "gemini = llm.GeminiTextGenerator()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a31730ff", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:36:43.227798Z", + "iopub.status.busy": "2025-08-18T20:36:43.227457Z", + "iopub.status.idle": "2025-08-18T20:37:25.238649Z", + "shell.execute_reply": "2025-08-18T20:37:25.237623Z", + "shell.execute_reply.started": "2025-08-18T20:36:43.227764Z" + }, + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_text_llm_resultimage
0Please provide me with the picture! I need to see the image to tell you what the item is and what color the picture is.\\n
1To answer your question accurately, I need you to provide me with the picture you are referring to. Once you provide the picture, I can analyze it and tell you what item is in the picture and what color the picture is.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Ask the same question on the images\n", + "df_image = df_image.head(2)\n", + "answer = gemini.predict(df_image, prompt=[\"what item is it?\", \"what color is the picture?\"])\n", + "render_images(answer[[\"ml_generate_text_llm_result\", \"image\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "f5d2a1ed", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:37:25.239875Z", + "iopub.status.busy": "2025-08-18T20:37:25.239607Z", + "iopub.status.idle": "2025-08-18T20:37:25.263034Z", + "shell.execute_reply": "2025-08-18T20:37:25.262002Z", + "shell.execute_reply.started": "2025-08-18T20:37:25.239847Z" + }, + "trusted": true + }, + "outputs": [], + "source": [ + "# Ask different questions\n", + "df_image[\"question\"] = [\"what item is it?\", \"what color is the picture?\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "fb67bf8e", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:37:25.264585Z", + "iopub.status.busy": "2025-08-18T20:37:25.264072Z", + "iopub.status.idle": "2025-08-18T20:38:10.129667Z", + "shell.execute_reply": "2025-08-18T20:38:10.128677Z", + "shell.execute_reply.started": "2025-08-18T20:37:25.264518Z" + }, + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: 
JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_text_llm_resultimage
0The item is a glass aquarium.
1Dark brown
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "answer_alt = gemini.predict(df_image, prompt=[df_image[\"question\"], df_image[\"image\"]])\n", + "render_images(answer_alt[[\"ml_generate_text_llm_result\", \"image\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "0cf33170", + "metadata": { + "execution": { + "iopub.execute_input": "2025-08-18T20:38:10.130851Z", + "iopub.status.busy": "2025-08-18T20:38:10.130617Z", + "iopub.status.idle": "2025-08-18T20:39:04.790416Z", + "shell.execute_reply": "2025-08-18T20:39:04.789398Z", + "shell.execute_reply.started": "2025-08-18T20:38:10.130833Z" + }, + "trusted": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/core/logging/log_adapter.py:183: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", + "default model will be removed in BigFrames 3.0. Please supply an\n", + "explicit model to avoid this message.\n", + " return method(*args, **kwargs)\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/session/__init__.py:437: FutureWarning: You are using the BigFrames session default connection: bigframes-\n", + "default-connection, which can be different from the\n", + "BigQuery project default connection. 
This default\n", + "connection may change in the future.\n", + " warnings.warn(msg, category=FutureWarning)\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "instead of using `db_dtypes` in the future when available in pandas\n", + "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ml_generate_embedding_resultml_generate_embedding_statusml_generate_embedding_start_secml_generate_embedding_end_seccontent
0[ 0.03416207 0.0419732 -0.0227391 ... -0.03...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2026-05-02T03:3...
1[ 0.01908903 0.0193082 -0.00221754 ... 0.00...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2026-05-02T03:3...
\n", + "

2 rows × 5 columns

\n", + "
[2 rows x 5 columns in total]" + ], + "text/plain": [ + " ml_generate_embedding_result \\\n", + "0 [ 0.03416207 0.0419732 -0.0227391 ... -0.03... \n", + "1 [ 0.01908903 0.0193082 -0.00221754 ... 0.00... \n", + "\n", + " ml_generate_embedding_status ml_generate_embedding_start_sec \\\n", + "0 \n", + "1 \n", + "\n", + " ml_generate_embedding_end_sec \\\n", + "0 \n", + "1 \n", + "\n", + " content \n", + "0 {\"access_urls\":{\"expiry_time\":\"2026-05-02T03:3... \n", + "1 {\"access_urls\":{\"expiry_time\":\"2026-05-02T03:3... \n", + "\n", + "[2 rows x 5 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Generate embeddings.\n", + "embed_model = llm.MultimodalEmbeddingGenerator()\n", + "embeddings = embed_model.predict(df_image[\"image\"])\n", + "embeddings" + ] + } + ], + "metadata": { + "kaggle": { + "accelerator": "none", + "dataSources": [ + { + "databundleVersionId": 13391012, + "sourceId": 110281, + "sourceType": "competition" + } + ], + "dockerImageVersionId": 31089, + "isGpuEnabled": false, + "isInternetEnabled": true, + "language": "python", + "sourceType": "notebook" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/packages/bigframes/notebooks/kaggle/vector-search-with-bigframes-over-national-jukebox.ipynb b/packages/bigframes/notebooks/kaggle/vector-search-with-bigframes-over-national-jukebox.ipynb index 4faff4b8e768..317ba0f1adba 100644 --- a/packages/bigframes/notebooks/kaggle/vector-search-with-bigframes-over-national-jukebox.ipynb +++ 
b/packages/bigframes/notebooks/kaggle/vector-search-with-bigframes-over-national-jukebox.ipynb @@ -1,23 +1,8 @@ { "cells": [ { + "id": "f4ece66a", "cell_type": "markdown", - "metadata": { - "@deathbeds/jupyterlab-fonts": { - "styles": { - "": { - "body[data-jp-deck-mode='presenting'] &": { - "zoom": "194%" - } - } - } - }, - "editable": true, - "slideshow": { - "slide_type": "subslide" - }, - "tags": [] - }, "source": [ "# Creating a searchable index of the National Jukebox\n", "\n", @@ -35,42 +20,42 @@ "To follow along, you'll need a Google Cloud project\n", "\n", "* Go to https://cloud.google.com/free to start a free trial." - ] - }, - { - "cell_type": "markdown", + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { "": { "body[data-jp-deck-mode='presenting'] &": { - "z-index": "0", - "zoom": "216%" + "zoom": "194%" } } } }, + "editable": true, "slideshow": { - "slide_type": "slide" - } + "slide_type": "subslide" + }, + "tags": [] }, + "execution_count": null + }, + { + "id": "bc01a1d3", + "cell_type": "markdown", "source": [ "The National Jukebox is a project of the USA Library of Congress to provide access to thousands of acoustic sound recordings from the very earliest days of the commercial record industry.\n", "\n", "* Learn more at https://www.loc.gov/collections/national-jukebox/about-this-collection/\n", "\n", "\"recording" - ] - }, - { - "cell_type": "markdown", + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { "": { "body[data-jp-deck-mode='presenting'] &": { "z-index": "0", - "zoom": "181%" + "zoom": "216%" } } } @@ -79,6 +64,11 @@ "slide_type": "slide" } }, + "execution_count": null + }, + { + "id": "4fc7c468", + "cell_type": "markdown", "source": [ "\n", "To search the National Jukebox, we combine powerful features of BigQuery:\n", @@ -96,16 +86,14 @@ "3. 
BigQuery DataFrames to use Python instead of SQL.\n", "\n", " https://cloud.google.com/bigquery/docs/bigquery-dataframes-introduction" - ] - }, - { - "cell_type": "markdown", + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { "": { "body[data-jp-deck-mode='presenting'] &": { - "zoom": "275%" + "z-index": "0", + "zoom": "181%" } } } @@ -114,15 +102,38 @@ "slide_type": "slide" } }, + "execution_count": null + }, + { + "id": "90f2e543", + "cell_type": "markdown", "source": [ "## Getting started with BigQuery DataFrames (bigframes)\n", "\n", "Install the bigframes package." - ] + ], + "metadata": { + "@deathbeds/jupyterlab-fonts": { + "styles": { + "": { + "body[data-jp-deck-mode='presenting'] &": { + "zoom": "275%" + } + } + } + }, + "slideshow": { + "slide_type": "slide" + } + }, + "execution_count": null }, { + "id": "56694cb4", "cell_type": "code", - "execution_count": null, + "source": [ + "%pip install --upgrade bigframes google-cloud-automl google-cloud-translate google-ai-generativelanguage tensorflow " + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -142,13 +153,17 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "%pip install --upgrade bigframes google-cloud-automl google-cloud-translate google-ai-generativelanguage tensorflow " - ] + "execution_count": null, + "outputs": [] }, { + "id": "fa84ad03", "cell_type": "markdown", + "source": [ + "**Important:** restart the kernel by going to \"Run -> Restart & clear cell outputs\" before continuing.\n", + "\n", + "Configure bigframes to use your GCP project. First, go to \"Add-ons -> Google Cloud SDK\" and click the \"Attach\" button. Then," + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -161,15 +176,17 @@ } } }, - "source": [ - "**Important:** restart the kernel by going to \"Run -> Restart & clear cell outputs\" before continuing.\n", - "\n", - "Configure bigframes to use your GCP project. 
First, go to \"Add-ons -> Google Cloud SDK\" and click the \"Attach\" button. Then," - ] + "execution_count": null }, { + "id": "1fbd4f9e", "cell_type": "code", - "execution_count": null, + "source": [ + "from kaggle_secrets import UserSecretsClient\n", + "user_secrets = UserSecretsClient()\n", + "user_credential = user_secrets.get_gcloud_credential()\n", + "user_secrets.set_tensorflow_credential(user_credential)" + ], "metadata": { "execution": { "iopub.execute_input": "2025-08-14T15:53:08.494636Z", @@ -180,17 +197,22 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "from kaggle_secrets import UserSecretsClient\n", - "user_secrets = UserSecretsClient()\n", - "user_credential = user_secrets.get_gcloud_credential()\n", - "user_secrets.set_tensorflow_credential(user_credential)" - ] + "execution_count": null, + "outputs": [] }, { + "id": "0b0b1cd8", "cell_type": "code", - "execution_count": null, + "source": [ + "import bigframes._config\n", + "import bigframes.pandas as bpd\n", + "\n", + "PROJECT_ID = \"your-project-id\" # @param {type:\"string\"}\n", + "bpd.options.bigquery.location = \"US\"\n", + "\n", + "# Set to your GCP project ID.\n", + "bpd.options.bigquery.project = PROJECT_ID" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -210,19 +232,17 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "import bigframes._config\n", - "import bigframes.pandas as bpd\n", - "\n", - "bpd.options.bigquery.location = \"US\"\n", - "\n", - "# Set to your GCP project ID.\n", - "bpd.options.bigquery.project = \"swast-scratch\"" - ] + "execution_count": null, + "outputs": [] }, { + "id": "32e58a7f", "cell_type": "markdown", + "source": [ + "## Reading data\n", + "\n", + "BigQuery DataFrames can read data from BigQuery, GCS, or even local sources. With `engine=\"bigquery\"`, BigQuery's distributed processing reads the file without it ever having to reach your local Python environment." 
+ ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -237,15 +257,19 @@ "slide_type": "slide" } }, - "source": [ - "## Reading data\n", - "\n", - "BigQuery DataFrames can read data from BigQuery, GCS, or even local sources. With `engine=\"bigquery\"`, BigQuery's distributed processing reads the file without it ever having to reach your local Python environment." - ] + "execution_count": null }, { + "id": "e52aa9e8", "cell_type": "code", - "execution_count": null, + "source": [ + "df = bpd.read_json(\n", + " \"gs://cloud-samples-data/third-party/usa-loc-national-jukebox/jukebox.jsonl\",\n", + " engine=\"bigquery\",\n", + " orient=\"records\",\n", + " lines=True,\n", + ")" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -265,19 +289,16 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "df = bpd.read_json(\n", - " \"gs://cloud-samples-data/third-party/usa-loc-national-jukebox/jukebox.jsonl\",\n", - " engine=\"bigquery\",\n", - " orient=\"records\",\n", - " lines=True,\n", - ")" - ] + "execution_count": null, + "outputs": [] }, { + "id": "0c1fca97", "cell_type": "code", - "execution_count": null, + "source": [ + "# Use `peek()` instead of `head()` to see arbitrary rows rather than the \"first\" rows.\n", + "df.peek()" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -300,15 +321,15 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "# Use `peek()` instead of `head()` to see arbitrary rows rather than the \"first\" rows.\n", - "df.peek()" - ] + "execution_count": null, + "outputs": [] }, { + "id": "4a13e789", "cell_type": "code", - "execution_count": null, + "source": [ + "df.shape" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -328,14 +349,18 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "df.shape" - ] + "execution_count": null, + "outputs": [] }, { + "id": "26b8baba", "cell_type": "code", - "execution_count": null, + "source": [ + "# For the purposes of a demo, select 
only a subset of rows.\n", + "df = df.sample(n=250)\n", + "df.cache()\n", + "df.shape" + ], "metadata": { "execution": { "iopub.execute_input": "2025-08-14T15:55:55.448664Z", @@ -346,17 +371,32 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "# For the purposes of a demo, select only a subset of rows.\n", - "df = df.sample(n=250)\n", - "df.cache()\n", - "df.shape" - ] + "execution_count": null, + "outputs": [] }, { + "id": "af84cb21", "cell_type": "code", - "execution_count": null, + "source": [ + "# As a side effect of how I extracted the song information from the HTML DOM,\n", + "# we ended up with lists in places where we only expect one item.\n", + "#\n", + "# We can \"explode\" to flatten these lists.\n", + "flattened = df.explode([\n", + " \"Recording Repository\",\n", + " \"Recording Label\",\n", + " \"Recording Take Number\",\n", + " \"Recording Date\",\n", + " \"Recording Matrix Number\",\n", + " \"Recording Catalog Number\",\n", + " \"Media Size\",\n", + " \"Recording Location\",\n", + " \"Summary\",\n", + " \"Rights Advisory\",\n", + " \"Title\",\n", + "])\n", + "flattened.peek()" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -379,31 +419,15 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "# As a side effect of how I extracted the song information from the HTML DOM,\n", - "# we ended up with lists in places where we only expect one item.\n", - "#\n", - "# We can \"explode\" to flatten these lists.\n", - "flattened = df.explode([\n", - " \"Recording Repository\",\n", - " \"Recording Label\",\n", - " \"Recording Take Number\",\n", - " \"Recording Date\",\n", - " \"Recording Matrix Number\",\n", - " \"Recording Catalog Number\",\n", - " \"Media Size\",\n", - " \"Recording Location\",\n", - " \"Summary\",\n", - " \"Rights Advisory\",\n", - " \"Title\",\n", - "])\n", - "flattened.peek()" - ] + "execution_count": null, + "outputs": [] }, { + "id": "085deffd", "cell_type": "code", - "execution_count": null, + "source": [ 
+ "flattened.shape" + ], "metadata": { "execution": { "iopub.execute_input": "2025-08-14T15:56:06.546531Z", @@ -414,13 +438,15 @@ "trusted": true }, - "outputs": [], - "source": [ - "flattened.shape" - ] + "execution_count": null, + "outputs": [] }, { + "id": "f8e653ee", "cell_type": "markdown", + "source": [ + "To access unstructured data from BigQuery, create a URI pointing to a file in Google Cloud Storage (GCS). Then, construct a \"blob\" (also known as an \"Object Ref\" in BigQuery terms) so that BigQuery can read from GCS." + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -437,13 +463,14 @@ }, "tags": [] }, - "source": [ - "To access unstructured data from BigQuery, create a URI pointing to a file in Google Cloud Storage (GCS). Then, construct a \"blob\" (also known as an \"Object Ref\" in BigQuery terms) so that BigQuery can read from GCS." - ] + "execution_count": null }, { + "id": "dbd1a844", "cell_type": "code", - "execution_count": null, + "source": [ + "flattened = flattened.assign(**{\n    \"GCS Prefix\": \"gs://cloud-samples-data/third-party/usa-loc-national-jukebox/\",\n    \"GCS Stub\": flattened['URL'].str.extract(r'/(jukebox-[0-9]+)/'),\n})\nflattened[\"GCS URI\"] = flattened[\"GCS Prefix\"] + flattened[\"GCS Stub\"] + \".mp3\"" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -468,18 +495,15 @@ "tags": [], "trusted": true }, - "outputs": [], - "source": [ - "flattened = flattened.assign(**{\n", - " \"GCS Prefix\": \"gs://cloud-samples-data/third-party/usa-loc-national-jukebox/\",\n", - " \"GCS Stub\": flattened['URL'].str.extract(r'/(jukebox-[0-9]+)/'),\n", - "})\n", - "flattened[\"GCS URI\"] = flattened[\"GCS Prefix\"] + flattened[\"GCS Stub\"] + \".mp3\"\n", - "flattened[\"GCS Blob\"] = flattened[\"GCS URI\"].str.to_blob()" - ] + "execution_count": null, + "outputs": [] }, { + "id": "fae13ec5", "cell_type": "markdown", + "source": [ + "BigQuery (and BigQuery DataFrames) provide access to powerful models and 
multimodal capabilities. Here, we transcribe audio to text." + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -496,13 +520,35 @@ }, "tags": [] }, - "source": [ - "BigQuery (and BigQuery DataFrames) provide access to powerful models and multimodal capabilities. Here, we transcribe audio to text." - ] + "execution_count": null }, { + "id": "f08f92b1", "cell_type": "code", - "execution_count": null, + "source": [ + "import bigframes.bigquery as bbq\n", + "\n", + "# Replace with your own connection name.\n", + "CONNECTION_ID = 'your-project-id.your-location.your-connection' # @param {type:\"string\"}\n", + "\n", + "# Convert the audio URI to the runtime representation required by the model.\n", + "audio_ref = bbq.obj.make_ref(flattened[\"GCS URI\"], authorizer=CONNECTION_ID)\n", + "audio_metadata = bbq.obj.fetch_metadata(audio_ref)\n", + "audio_runtime = bbq.obj.get_access_url(audio_metadata, mode=\"R\")\n", + "\n", + "# Call GenAI model to perform audio transcription\n", + "raw_results = bbq.ai.generate(\n", + " prompt=(\"Transcribe the provided audio.\", audio_runtime),\n", + " endpoint=\"gemini-2.5-flash\"\n", + ")\n", + "\n", + "# Package result struct to contain 'content' and 'status' expected by downstream cells\n", + "transcription_df = bpd.DataFrame({\n", + " \"content\": raw_results.struct.field(\"result\"),\n", + " \"status\": raw_results.struct.field(\"status\")\n", + "})\n", + "flattened[\"Transcription\"] = bbq.struct(transcription_df)" + ], "metadata": { "editable": true, "execution": { @@ -518,17 +564,15 @@ "tags": [], "trusted": true }, - "outputs": [], - "source": [ - "flattened[\"Transcription\"] = flattened[\"GCS Blob\"].blob.audio_transcribe(\n", - " model_name=\"gemini-2.5-flash\",\n", - " verbose=True,\n", - ")\n", - "flattened[\"Transcription\"]" - ] + "execution_count": null, + "outputs": [] }, { + "id": "30969ae1", "cell_type": "markdown", + "source": [ + "Sometimes the model has transient errors. 
 Check the status column to see if there are errors." + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -543,13 +587,16 @@ "slide_type": "slide" } }, - "source": [ - "Sometimes the model has transient errors. Check the status column to see if there are errors." - ] + "execution_count": null }, { + "id": "7d0dbc38", "cell_type": "code", - "execution_count": null, + "source": [ + "print(f\"Successful rows: {(flattened['Transcription'].struct.field('status') == '').sum()}\")\n", + "print(f\"Failed rows: {(flattened['Transcription'].struct.field('status') != '').sum()}\")\n", + "flattened.shape" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -574,16 +621,16 @@ "tags": [], "trusted": true }, - "outputs": [], - "source": [ - "print(f\"Successful rows: {(flattened['Transcription'].struct.field('status') == '').sum()}\")\n", - "print(f\"Failed rows: {(flattened['Transcription'].struct.field('status') != '').sum()}\")\n", - "flattened.shape" - ] + "execution_count": null, + "outputs": [] }, { + "id": "6cddf53b", "cell_type": "code", - "execution_count": null, + "source": [ + "# Show transcribed lyrics.\n", + "flattened[\"Transcription\"].struct.field(\"content\")" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -603,15 +650,19 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "# Show transcribed lyrics.\n", - "flattened[\"Transcription\"].struct.field(\"content\")" - ] + "execution_count": null, + "outputs": [] }, { + "id": "ba0386cc", "cell_type": "code", - "execution_count": null, + "source": [ + "# Find all instrumental songs\n", + "instrumental = flattened[flattened[\"Transcription\"].struct.field(\"content\") == \"\"]\n", + "print(instrumental.shape)\n", + "song = instrumental.peek(1)\n", + "song" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -634,18 +685,22 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "# Find all instrumentatal songs\n", - "instrumental = 
flattened[flattened[\"Transcription\"].struct.field(\"content\") == \"\"]\n", - "print(instrumental.shape)\n", - "song = instrumental.peek(1)\n", - "song" - ] + "execution_count": null, + "outputs": [] }, { + "id": "61a883b2", "cell_type": "code", - "execution_count": null, + "source": [ + "import gcsfs\n", + "import IPython.display\n", + "\n", + "fs = gcsfs.GCSFileSystem(project='bigframes-dev')\n", + "with fs.open(song[\"GCS URI\"].iloc[0]) as song_file:\n", + " song_bytes = song_file.read()\n", + "\n", + "IPython.display.Audio(song_bytes)" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -670,20 +725,19 @@ "tags": [], "trusted": true }, - "outputs": [], - "source": [ - "import gcsfs\n", - "import IPython.display\n", - "\n", - "fs = gcsfs.GCSFileSystem(project='bigframes-dev')\n", - "with fs.open(song[\"GCS URI\"].iloc[0]) as song_file:\n", - " song_bytes = song_file.read()\n", - "\n", - "IPython.display.Audio(song_bytes)" - ] + "execution_count": null, + "outputs": [] }, { + "id": "e8a25c46", "cell_type": "markdown", + "source": [ + "## Creating a searchable index\n", + "\n", + "To be able to search by semantics rather than just text, generate embeddings and then create an index to efficiently search these.\n", + "\n", + "See also, this example: https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -698,17 +752,16 @@ "slide_type": "slide" } }, - "source": [ - "## Creating a searchable index\n", - "\n", - "To be able to search by semantics rather than just text, generate embeddings and then create an index to efficiently search these.\n", - "\n", - "See also, this example: https://github.com/googleapis/python-bigquery-dataframes/blob/main/notebooks/generative_ai/bq_dataframes_llm_vector_search.ipynb" - ] + "execution_count": null }, { + "id": "ead0fa8c", "cell_type": "code", - "execution_count": null, + 
"source": [ + "from bigframes.ml.llm import TextEmbeddingGenerator\n", + "\n", + "text_model = TextEmbeddingGenerator(model_name=\"text-multilingual-embedding-002\")" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -728,16 +781,21 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "from bigframes.ml.llm import TextEmbeddingGenerator\n", - "\n", - "text_model = TextEmbeddingGenerator(model_name=\"text-multilingual-embedding-002\")" - ] + "execution_count": null, + "outputs": [] }, { + "id": "5ed7776d", "cell_type": "code", - "execution_count": null, + "source": [ + "df_to_index = (\n", + " flattened\n", + " .assign(content=flattened[\"Transcription\"].struct.field(\"content\"))\n", + " [flattened[\"Transcription\"].struct.field(\"content\") != \"\"]\n", + ")\n", + "embedding = text_model.predict(df_to_index)\n", + "embedding.peek(1)" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -757,20 +815,18 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "df_to_index = (\n", - " flattened\n", - " .assign(content=flattened[\"Transcription\"].struct.field(\"content\"))\n", - " [flattened[\"Transcription\"].struct.field(\"content\") != \"\"]\n", - ")\n", - "embedding = text_model.predict(df_to_index)\n", - "embedding.peek(1)" - ] + "execution_count": null, + "outputs": [] }, { + "id": "c96e9832", "cell_type": "code", - "execution_count": null, + "source": [ + "# Check the status column to look for errors.\n", + "print(f\"Successful rows: {(embedding['ml_generate_embedding_status'] == '').sum()}\")\n", + "print(f\"Failed rows: {(embedding['ml_generate_embedding_status'] != '').sum()}\")\n", + "embedding.shape" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -795,16 +851,15 @@ "tags": [], "trusted": true }, - "outputs": [], - "source": [ - "# Check the status column to look for errors.\n", - "print(f\"Successful rows: {(embedding['ml_generate_embedding_status'] == '').sum()}\")\n", - "print(f\"Failed rows: 
{(embedding['ml_generate_embedding_status'] != '').sum()}\")\n", - "embedding.shape" - ] + "execution_count": null, + "outputs": [] }, { + "id": "0e2a5d7b", "cell_type": "markdown", + "source": [ + "We're now ready to save this to a table." + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -816,13 +871,15 @@ } } }, - "source": [ - "We're now ready to save this to a table." - ] + "execution_count": null }, { + "id": "51819a0c", "cell_type": "code", - "execution_count": null, + "source": [ + "embedding_table_id = f\"{bpd.options.bigquery.project}.kaggle.national_jukebox\"\n", + "embedding.to_gbq(embedding_table_id, if_exists=\"replace\")" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -842,14 +899,20 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "embedding_table_id = f\"{bpd.options.bigquery.project}.kaggle.national_jukebox\"\n", - "embedding.to_gbq(embedding_table_id, if_exists=\"replace\")" - ] + "execution_count": null, + "outputs": [] }, { + "id": "5e16fb14", "cell_type": "markdown", + "source": [ + "## Searching the database\n", + "\n", + "To search by semantics, we:\n", + "\n", + "1. Turn our search string into an embedding using the same model as our index.\n", + "2. Find the closest matches to the search string." + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -864,18 +927,17 @@ "slide_type": "slide" } }, - "source": [ - "## Searching the database\n", - "\n", - "To search by semantics, we:\n", - "\n", - "1. Turn our search string into an embedding using the same model as our index.\n", - "2. Find the closest matches to the search string." 
- ] + "execution_count": null }, { + "id": "1bad3317", "cell_type": "code", - "execution_count": null, + "source": [ + "import bigframes.pandas as bpd\n", + "\n", + "df_written = bpd.read_gbq(embedding_table_id)\n", + "df_written.peek(1)" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -898,17 +960,22 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "import bigframes.pandas as bpd\n", - "\n", - "df_written = bpd.read_gbq(embedding_table_id)\n", - "df_written.peek(1)" - ] + "execution_count": null, + "outputs": [] }, { + "id": "8aaaef1f", "cell_type": "code", - "execution_count": null, + "source": [ + "from bigframes.ml.llm import TextEmbeddingGenerator\n", + "\n", + "search_string = \"walking home\"\n", + "\n", + "text_model = TextEmbeddingGenerator(model_name=\"text-multilingual-embedding-002\")\n", + "search_df = bpd.DataFrame([search_string], columns=['search_string'])\n", + "search_embedding = text_model.predict(search_df)\n", + "search_embedding" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -928,21 +995,24 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "from bigframes.ml.llm import TextEmbeddingGenerator\n", - "\n", - "search_string = \"walking home\"\n", - "\n", - "text_model = TextEmbeddingGenerator(model_name=\"text-multilingual-embedding-002\")\n", - "search_df = bpd.DataFrame([search_string], columns=['search_string'])\n", - "search_embedding = text_model.predict(search_df)\n", - "search_embedding" - ] + "execution_count": null, + "outputs": [] }, { + "id": "908a2340", "cell_type": "code", - "execution_count": null, + "source": [ + "import bigframes.bigquery as bbq\n", + "\n", + "vector_search_results = bbq.vector_search(\n", + " base_table=embedding_table_id,\n", + " column_to_search=\"ml_generate_embedding_result\",\n", + " query=search_embedding,\n", + " distance_type=\"COSINE\",\n", + " query_column_to_search=\"ml_generate_embedding_result\",\n", + " top_k=5,\n", + ")" + ], "metadata": { 
"@deathbeds/jupyterlab-fonts": { "styles": { @@ -967,23 +1037,15 @@ "tags": [], "trusted": true }, - "outputs": [], - "source": [ - "import bigframes.bigquery as bbq\n", - "\n", - "vector_search_results = bbq.vector_search(\n", - " base_table=f\"swast-scratch.scipy2025.national_jukebox\",\n", - " column_to_search=\"ml_generate_embedding_result\",\n", - " query=search_embedding,\n", - " distance_type=\"COSINE\",\n", - " query_column_to_search=\"ml_generate_embedding_result\",\n", - " top_k=5,\n", - ")" - ] + "execution_count": null, + "outputs": [] }, { + "id": "f84ebe70", "cell_type": "code", - "execution_count": null, + "source": [ + "vector_search_results.dtypes" + ], "metadata": { "execution": { "iopub.execute_input": "2025-08-14T16:05:50.566930Z", @@ -994,14 +1056,16 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "vector_search_results.dtypes" - ] + "execution_count": null, + "outputs": [] }, { + "id": "eeff1c72", "cell_type": "code", - "execution_count": null, + "source": [ + "results = vector_search_results[[\"Title\", \"Summary\", \"Names\", \"GCS URI\", \"Transcription\", \"distance\"]].sort_values(\"distance\").to_pandas()\n", + "results" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -1024,15 +1088,15 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "results = vector_search_results[[\"Title\", \"Summary\", \"Names\", \"GCS URI\", \"Transcription\", \"distance\"]].sort_values(\"distance\").to_pandas()\n", - "results" - ] + "execution_count": null, + "outputs": [] }, { + "id": "7ec53675", "cell_type": "code", - "execution_count": null, + "source": [ + "print(results[\"Transcription\"].struct.field(\"content\").iloc[0])" + ], "metadata": { "@deathbeds/jupyterlab-fonts": { "styles": { @@ -1052,14 +1116,22 @@ }, "trusted": true }, - "outputs": [], - "source": [ - "print(results[\"Transcription\"].struct.field(\"content\").iloc[0])" - ] + "execution_count": null, + "outputs": [] }, { + "id": "a96552fb", "cell_type": 
"code", - "execution_count": null, + "source": [ + "import gcsfs\n", + "import IPython.display\n", + "\n", + "fs = gcsfs.GCSFileSystem(project='bigframes-dev')\n", + "with fs.open(results[\"GCS URI\"].iloc[0]) as song_file:\n", + " song_bytes = song_file.read()\n", + "\n", + "IPython.display.Audio(song_bytes)" + ], "metadata": { "editable": true, "execution": { @@ -1076,26 +1148,18 @@ "tags": [], "trusted": true }, - "outputs": [], - "source": [ - "import gcsfs\n", - "import IPython.display\n", - "\n", - "fs = gcsfs.GCSFileSystem(project='bigframes-dev')\n", - "with fs.open(results[\"GCS URI\"].iloc[0]) as song_file:\n", - " song_bytes = song_file.read()\n", - "\n", - "IPython.display.Audio(song_bytes)" - ] + "execution_count": null, + "outputs": [] }, { + "id": "72af7c7f", "cell_type": "code", - "execution_count": null, + "source": [], "metadata": { "trusted": true }, - "outputs": [], - "source": [] + "execution_count": null, + "outputs": [] } ], "metadata": { @@ -1132,6 +1196,6 @@ "version": "3.11.13" } }, - "nbformat": 4, - "nbformat_minor": 4 -} + "nbformat_minor": 4, + "nbformat": 4 +} \ No newline at end of file diff --git a/packages/bigframes/notebooks/multimodal/multimodal_dataframe.ipynb b/packages/bigframes/notebooks/multimodal/multimodal_dataframe.ipynb index 8f3241259d5f..cd363db6f362 100644 --- a/packages/bigframes/notebooks/multimodal/multimodal_dataframe.ipynb +++ b/packages/bigframes/notebooks/multimodal/multimodal_dataframe.ipynb @@ -2,7 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, + "id": "9edad7a6", "metadata": {}, "outputs": [], "source": [ @@ -23,6 +24,7 @@ }, { "cell_type": "markdown", + "id": "816ab253", "metadata": { "id": "YOrUAvz6DMw-" }, @@ -53,6 +55,7 @@ }, { "cell_type": "markdown", + "id": "77d821d4", "metadata": {}, "source": [ "This notebook is introducing BigFrames Multimodal features:\n", @@ -67,6 +70,7 @@ }, { "cell_type": "markdown", + "id": "75ab1c13", "metadata": { "id": 
"PEAJQQ6AFg-n" }, @@ -76,6 +80,7 @@ }, { "cell_type": "markdown", + "id": "750954c4", "metadata": {}, "source": [ "Install the latest bigframes package if bigframes version < 2.4.0" @@ -83,7 +88,8 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, + "id": "2a6fafb1", "metadata": {}, "outputs": [], "source": [ @@ -92,7 +98,8 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, + "id": "df561d04", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -115,6 +122,8 @@ "# Refer to https://cloud.google.com/bigquery/docs/multimodal-data-dataframes-tutorial#grant-permissions for setting up connection service account permissions.\n", "# In this Notebook it uses bigframes-default-connection by default. You can also bring in your own connections in each method.\n", "\n", + "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", + "\n", "import bigframes\n", "# Setup project\n", "bigframes.options.bigquery.project = PROJECT\n", @@ -130,7 +139,8 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 35, + "id": "35bd6e6e", "metadata": {}, "outputs": [], "source": [ @@ -171,22 +181,90 @@ " return bbq.json_value(get_metadata(series), \"$.size\").astype(\"Int64\")\n", "\n", "def get_updated(series):\n", - " return bpd.to_datetime(bbq.json_value(get_metadata(series), \"$.updated\").astype(\"Int64\"), unit=\"us\", utc=True)" + " return bpd.to_datetime(bbq.json_value(get_metadata(series), \"$.updated\").astype(\"Int64\"), unit=\"us\", utc=True)\n", + "\n", + "from IPython.display import HTML, display\n", + "\n", + "def render_images(df):\n", + " \"\"\"Helper to display BigFrames DataFrame with rendered image previews.\"\"\"\n", + " import bigframes.pandas as bpd\n", + " import bigframes.bigquery as bbq\n", + " import bigframes\n", + " from bigframes import dtypes\n", + " import json\n", + " \n", + " if isinstance(df, bpd.Series):\n", + " df = df.to_frame()\n", + " \n", + " 
# 1. Auto-detect columns holding ObjectRefs\n", + " object_cols = [\n", + " col for col, dtype in zip(df.columns, df.dtypes)\n", + " if dtype == dtypes.OBJ_REF_DTYPE\n", + " ]\n", + " \n", + " if not object_cols:\n", + " display(df)\n", + " return\n", + "\n", + " limit = bigframes.options.display.max_rows or 10\n", + " view_df = df.head(limit)\n", + " \n", + " # 2. Bulk-fetch access runtime URLs ONLY (disable with_metadata to bypass potential \n", + " # race conditions on new files where BigQuery may error before async writes finalize)\n", + " runtime_cols = {\n", + " col: get_runtime_json_str(view_df[col], mode=\"R\", with_metadata=False) \n", + " for col in object_cols\n", + " }\n", + " \n", + " pandas_json_df = bpd.DataFrame(runtime_cols).to_pandas()\n", + " final_pd = view_df.to_pandas()\n", + " \n", + " width = bigframes.options.display.blob_display_width or 300\n", + " IMAGE_EXTENSIONS = (\".png\", \".jpg\", \".jpeg\", \".gif\", \".webp\")\n", + " \n", + " def format_cell_html(raw_json):\n", + " if not raw_json:\n", + " return \"\"\n", + " try:\n", + " obj_rt = json.loads(raw_json)\n", + " \n", + " if \"access_urls\" not in obj_rt:\n", + " err = obj_rt.get(\"errors\", [{\"message\": \"URL Generation Failed\"}])[0].get(\"message\")\n", + " return f'Error: {err}'\n", + " \n", + " uri = obj_rt.get(\"objectref\", {}).get(\"uri\", \"\")\n", + " url = obj_rt[\"access_urls\"][\"read_url\"]\n", + " \n", + " # Safely infer type from extension to guarantee immediate display availability\n", + " if uri and str(uri).lower().endswith(IMAGE_EXTENSIONS):\n", + " return f''\n", + " \n", + " return f'{uri if uri else \"view\"}'\n", + " except:\n", + " return \"Format Error\"\n", + "\n", + " for col in object_cols:\n", + " final_pd[col] = pandas_json_df[col].map(format_cell_html)\n", + " \n", + " display(HTML(final_pd.to_html(escape=False)))" ] }, { "cell_type": "markdown", + "id": "be9ce892", "metadata": { "id": "ifKOq7VZGtZy" }, "source": [ - "### 1. 
Create Multimodal DataFrame\n", - "There are several ways to create Multimodal DataFrame. The easiest way is from the wildcard paths." + "To create a Multimodal DataFrame, you can use `bigframes.bigquery.obj.make_ref` on a series of URIs. You can get the URIs from a BigQuery table or by listing them from Cloud Storage.\n", + "\n", + "In this example, we use `gcsfs` to list the files from Cloud Storage, and then use `read_gbq` to load them into a BigQuery DataFrame before creating the object reference." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 36, + "id": "871d02f4", "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -196,15 +274,29 @@ }, "outputs": [], "source": [ - "# Create blob columns from wildcard path.\n", - "df_image = bpd.from_glob_path(\n", - " \"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*\", name=\"image\"\n", - ")" + "import gcsfs\n", + "import bigframes.bigquery as bbq\n", + "\n", + "# List files using gcsfs (public bucket)\n", + "fs = gcsfs.GCSFileSystem(anon=True)\n", + "uris = fs.glob(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/images/*\")\n", + "\n", + "# Ensure URIs have gs:// prefix\n", + "uris = [u if u.startswith(\"gs://\") else f\"gs://{u}\" for u in uris]\n", + "\n", + "# Read the URIs into a BigQuery DataFrame using UNNEST\n", + "# We take the first 5 for this example\n", + "df_image = bpd.read_gbq(f\"SELECT uri FROM UNNEST({uris[:5]}) as uri\")\n", + "\n", + "# Create the object reference column\n", + "df_image['image'] = bbq.obj.make_ref(df_image['uri'], authorizer=FULL_CONNECTION_ID)\n", + "df_image = df_image[['image']]" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 37, + "id": "2e0436b0", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -218,31 +310,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: 
JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n" + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" ] }, { "data": { "text/html": [ - "
\n", - "\n", "\n", " \n", " \n", @@ -253,53 +329,44 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - "
0
1
2
3
4
\n", - "

5 rows × 1 columns

\n", - "
[5 rows x 1 columns in total]" + "" ], "text/plain": [ - " image\n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", - "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", - "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", - "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", - "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3...\n", - "\n", - "[5 rows x 1 columns]" + "" ] }, - "execution_count": 6, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ "# Take only the 5 images to deal with. Preview the content of the Mutimodal DataFrame\n", "df_image = df_image.head(5)\n", - "df_image" + "render_images(df_image)" ] }, { "cell_type": "markdown", + "id": "429b0117", "metadata": { "id": "b6RRZb3qPi_T" }, @@ -309,6 +376,7 @@ }, { "cell_type": "markdown", + "id": "991fa065", "metadata": { "id": "4YJCdmLtR-qu" }, @@ -318,7 +386,8 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 38, + "id": "08722ec5", "metadata": { "id": "YYYVn7NDH0Me" }, @@ -327,31 +396,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. 
Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n" + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" ] }, { "data": { "text/html": [ - "
\n", - "\n", "\n", " \n", " \n", @@ -366,70 +419,53 @@ " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - "
0aliceimage/png15912402025-03-20 17:45:04+00:007157662025-03-20 17:44:38+00:00
1bobimage/png11829512025-03-20 17:45:02+00:0011674062025-03-20 17:44:38+00:00
2bobimage/png15208842025-03-20 17:44:55+00:0011508922025-03-20 17:44:39+00:00
3aliceimage/png12354012025-03-20 17:45:19+00:0017365332025-03-20 17:44:39+00:00
4bobimage/png15919232025-03-20 17:44:47+00:004397402025-03-20 17:44:39+00:00
\n", - "

5 rows × 5 columns

\n", - "
[5 rows x 5 columns in total]" + "" ], "text/plain": [ - " image author content_type \\\n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... alice image/png \n", - "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... bob image/png \n", - "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... bob image/png \n", - "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... alice image/png \n", - "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... bob image/png \n", - "\n", - " size updated \n", - "0 1591240 2025-03-20 17:45:04+00:00 \n", - "1 1182951 2025-03-20 17:45:02+00:00 \n", - "2 1520884 2025-03-20 17:44:55+00:00 \n", - "3 1235401 2025-03-20 17:45:19+00:00 \n", - "4 1591923 2025-03-20 17:44:47+00:00 \n", - "\n", - "[5 rows x 5 columns]" + "" ] }, - "execution_count": 7, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ @@ -439,11 +475,12 @@ "df_image[\"content_type\"] = get_content_type(df_image[\"image\"])\n", "df_image[\"size\"] = get_size(df_image[\"image\"])\n", "df_image[\"updated\"] = get_updated(df_image[\"image\"])\n", - "df_image" + "render_images(df_image)" ] }, { "cell_type": "markdown", + "id": "f90826f6", "metadata": {}, "source": [ "### 3. Conduct image transformations" @@ -451,6 +488,7 @@ }, { "cell_type": "markdown", + "id": "e24c9f8c", "metadata": {}, "source": [ "This section demonstrates how to perform image transformations like blur, resize, and normalize using custom BigQuery Python UDFs and the `opencv-python` library." 
@@ -458,7 +496,8 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 39, + "id": "db665049", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -472,37 +511,19 @@ "name": "stderr", "output_type": "stream", "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:151: PreviewWarning: udf is in preview.\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/pandas/__init__.py:211: PreviewWarning: udf is in preview.\n", " return global_session.with_default_session(\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dataframe.py:4655: FunctionAxisOnePreviewWarning: DataFrame.apply with parameter axis=1 scenario is in preview.\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dataframe.py:4695: FunctionAxisOnePreviewWarning: DataFrame.apply with parameter axis=1 scenario is in preview.\n", " warnings.warn(msg, category=bfe.FunctionAxisOnePreviewWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", + "/usr/local/google/home/shuowei/src/google-cloud-python/google-cloud-python/packages/bigframes/bigframes/dtypes.py:1044: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", "instead of using `db_dtypes` in the future when available in pandas\n", "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. 
Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n" + " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" ] }, { "data": { "text/html": [ - "
\n", - "\n", "\n", " \n", " \n", @@ -514,55 +535,38 @@ " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - "
0
1
2
3
4
\n", - "

5 rows × 2 columns

\n", - "
[5 rows x 2 columns in total]" + "" ], "text/plain": [ - " image \\\n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "\n", - " blurred \n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:3... \n", - "\n", - "[5 rows x 2 columns]" + "" ] }, - "execution_count": 8, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ @@ -573,9 +577,9 @@ " input_types=[str, str, int, int],\n", " output_type=str,\n", " dataset=DATASET_ID,\n", - " name=\"image_blur\",\n", + " name=\"image_blur_v2\",\n", " bigquery_connection=FULL_CONNECTION_ID,\n", - " packages=[\"opencv-python\", \"numpy\", \"requests\"],\n", + " packages=[\"opencv-python-headless\", \"numpy\", \"requests\"],\n", ")\n", "def image_blur(src_rt: str, dst_rt: str, kx: int, ky: int) -> str:\n", " import json\n", @@ -585,6 +589,8 @@ " import base64\n", "\n", " src_obj = json.loads(src_rt)\n", + " if \"access_urls\" not in src_obj:\n", + " raise ValueError(f\"Missing 'access_urls' in source object. Response: {src_obj}\")\n", " src_url = src_obj[\"access_urls\"][\"read_url\"]\n", " \n", " response = requests.get(src_url, timeout=30)\n", @@ -604,6 +610,8 @@ " # Handle two output modes\n", " if dst_rt: # GCS/Series output mode\n", " dst_obj = json.loads(dst_rt)\n", + " if \"access_urls\" not in dst_obj:\n", + " raise ValueError(f\"Missing 'access_urls' in destination object. Verify authorizer permissions. 
Response: {dst_obj}\")\n", " dst_url = dst_obj[\"access_urls\"][\"write_url\"]\n", " \n", " requests.put(dst_url, data=encoded.tobytes(), headers={\"Content-Type\": \"image/jpeg\"}, timeout=30).raise_for_status()\n", @@ -622,7 +630,13 @@ " metadata = bbq.obj.fetch_metadata(series)\n", " current_uri = metadata.struct.field(\"uri\")\n", " dst_uri = current_uri.str.replace(r\"^.*\\/(.*)$\", rf\"{dst_folder}\\1\", regex=True)\n", - " dst_blob = dst_uri.str.to_blob(connection=FULL_CONNECTION_ID)\n", + " \n", + " # To avoid synchronous 404 validation checks on files that don't exist yet, \n", + " # bypass the validator by explicitly constructing an objectref JSON.\n", + " dst_blob_df = bpd.DataFrame({\"uri\": dst_uri})\n", + " dst_blob_df[\"authorizer\"] = FULL_CONNECTION_ID\n", + " dst_blob = bbq.obj.make_ref(bbq.to_json(bbq.struct(dst_blob_df)))\n", + "\n", " df_transform = bpd.DataFrame({\n", " \"src_rt\": get_runtime_json_str(series, mode=\"R\"),\n", " \"dst_rt\": get_runtime_json_str(dst_blob, mode=\"RW\"),\n", @@ -630,18 +644,27 @@ " res = df_transform[[\"src_rt\", \"dst_rt\"]].apply(\n", " udf, axis=1, args=args\n", " )\n", - " return res if verbose else res.str.to_blob(connection=FULL_CONNECTION_ID)\n", + " \n", + " if verbose:\n", + " return res\n", + " \n", + " # Final return MUST also use JSON bypass to eliminate temporary 404 validation \n", + " # errors from embedded ObjectRefs during fused query execution pipelines.\n", + " res_df = bpd.DataFrame({\"uri\": res})\n", + " res_df[\"authorizer\"] = FULL_CONNECTION_ID\n", + " return bbq.obj.make_ref(bbq.to_json(bbq.struct(res_df)))\n", "\n", "# Apply transformations\n", "df_image[\"blurred\"] = apply_transformation(\n", " df_image[\"image\"], f\"gs://{OUTPUT_BUCKET}/image_blur_transformed/\",\n", " image_blur, 20, 20\n", ")\n", - "df_image[[\"image\", \"blurred\"]]" + "render_images(df_image[[\"image\", \"blurred\"]])" ] }, { "cell_type": "markdown", + "id": "11fcc6ec", "metadata": { "id": "Euk5saeVVdTP" }, @@ 
-651,22 +674,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, + "id": "793b2f45", "metadata": { "id": "mRUGfcaFVW-3" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:183: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", - "default model will be removed in BigFrames 3.0. Please supply an\n", - "explicit model to avoid this message.\n", - " return method(*args, **kwargs)\n" - ] - } - ], + "outputs": [], "source": [ "from bigframes.ml import llm\n", "gemini = llm.GeminiTextGenerator()" @@ -674,7 +687,8 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, + "id": "13d7cb93", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -683,113 +697,17 @@ "id": "DNFP7CbjWdR9", "outputId": "3f90a062-0abc-4bce-f53c-db57b06a14b9" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. 
Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ml_generate_text_llm_resultimage
0The item is a container of K9 Guard Dog Paw Balm.
1The item is K9 Guard Dog Hot Spot Spray.
2The image contains three bags of food, likely for small animals like rabbits or guinea pigs. They are labeled \"Timoth Hay Lend Variety Plend\", \"Herbal Greeıs Mix Variety Blend\", and \"Berry & Blossom Treat Blend\", all under the brand \"Fluffy Buns.\" The bags are yellow, green, and purple, respectively. Each bag has a pile of its contents beneath it.
3The item is a cat tree.\\n
4The item is a bag of bird seed. Specifically, it's labeled \"Chirpy Seed\", \"Deluxe Bird Food\".\\n
\n", - "

5 rows × 2 columns

\n", - "
[5 rows x 2 columns in total]" - ], - "text/plain": [ - " ml_generate_text_llm_result \\\n", - "0 The item is a container of K9 Guard Dog Paw Balm. \n", - "1 The item is K9 Guard Dog Hot Spot Spray. \n", - "2 The image contains three bags of food, likely ... \n", - "3 The item is a cat tree.\\n \n", - "4 The item is a bag of bird seed. Specifically, ... \n", - "\n", - " image \n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "\n", - "[5 rows x 2 columns]" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Ask the same question on the images\n", - "answer = gemini.predict(df_image, prompt=[\"what item is it?\", df_image[\"image\"]])\n", - "answer[[\"ml_generate_text_llm_result\", \"image\"]]" + "answer = gemini.predict(df_image, prompt=[\"what item is it?\", \"what color is the picture?\"])\n", + "render_images(answer[[\"ml_generate_text_llm_result\", \"image\"]])" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, + "id": "68857305", "metadata": { "id": "IG3J3HsKhyBY" }, @@ -807,7 +725,8 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, + "id": "829afc69", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -816,112 +735,16 @@ "id": "qKOb765IiVuD", "outputId": "731bafad-ea29-463f-c8c1-cb7acfd70e5d" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - 
"(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ml_generate_text_llm_resultimage
0The item is a container of Dog Paw Balm.
1The picture contains many colors, including white, black, green, and a bright blue. The product label predominantly features a bright blue hue. The background is a solid gray.
2Here are the product names from the image:\\n\\n* **Timoth Hay Lend Variety Plend** is the product in the yellow bag.\\n* **Herbal Greeıs Mix Variety Blend** is the product in the green bag.\\n* **Berry & Blossom Treat Blend** is the product in the purple bag.
3Yes, it is for pets. It appears to be a cat tree or scratching post.\\n
4The image shows that the weight of the product is 15 oz/ 257g.
\n", - "

5 rows × 2 columns

\n", - "
[5 rows x 2 columns in total]" - ], - "text/plain": [ - " ml_generate_text_llm_result \\\n", - "0 The item is a container of Dog Paw Balm. \n", - "1 The picture contains many colors, including wh... \n", - "2 Here are the product names from the image:\\n\\n... \n", - "3 Yes, it is for pets. It appears to be a cat tr... \n", - "4 The image shows that the weight of the product... \n", - "\n", - " image \n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "\n", - "[5 rows x 2 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "answer_alt = gemini.predict(df_image, prompt=[df_image[\"question\"], df_image[\"image\"]])\n", - "answer_alt[[\"ml_generate_text_llm_result\", \"image\"]]" + "render_images(answer_alt[[\"ml_generate_text_llm_result\", \"image\"]])" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, + "id": "e75df430", "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -930,138 +753,7 @@ "id": "KATVv2CO5RT1", "outputId": "6ec01f27-70b6-4f69-c545-e5e3c879480c" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:183: FutureWarning: Since upgrading the default model can cause unintended breakages, the\n", - "default model will be removed in BigFrames 3.0. 
Please supply an\n", - "explicit model to avoid this message.\n", - " return method(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/logging/log_adapter.py:229: ApiDeprecationWarning: The blob accessor is deprecated and will be removed in a future release. Use bigframes.bigquery.obj functions instead.\n", - " return prop(*args, **kwargs)\n", - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
ml_generate_embedding_resultml_generate_embedding_statusml_generate_embedding_start_secml_generate_embedding_end_seccontent
0[ 0.00638822 0.01666385 0.00451817 ... -0.02...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...
1[ 0.00973976 0.02148137 0.0024429 ... 0.00...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...
2[ 0.01195884 0.02139394 0.05968047 ... -0.01...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...
3[-0.02621161 0.02797648 0.04416926 ... -0.01...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...
4[ 0.05918628 0.0125137 0.01907336 ... 0.01...<NA><NA>{\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4...
\n", - "

5 rows × 5 columns

\n", - "
[5 rows x 5 columns in total]" - ], - "text/plain": [ - " ml_generate_embedding_result \\\n", - "0 [ 0.00638822 0.01666385 0.00451817 ... -0.02... \n", - "1 [ 0.00973976 0.02148137 0.0024429 ... 0.00... \n", - "2 [ 0.01195884 0.02139394 0.05968047 ... -0.01... \n", - "3 [-0.02621161 0.02797648 0.04416926 ... -0.01... \n", - "4 [ 0.05918628 0.0125137 0.01907336 ... 0.01... \n", - "\n", - " ml_generate_embedding_status ml_generate_embedding_start_sec \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "\n", - " ml_generate_embedding_end_sec \\\n", - "0 \n", - "1 \n", - "2 \n", - "3 \n", - "4 \n", - "\n", - " content \n", - "0 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "1 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "2 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "3 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "4 {\"access_urls\":{\"expiry_time\":\"2026-02-21T01:4... \n", - "\n", - "[5 rows x 5 columns]" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Generate embeddings.\n", "embed_model = llm.MultimodalEmbeddingGenerator()\n", @@ -1071,6 +763,7 @@ }, { "cell_type": "markdown", + "id": "23892b0e", "metadata": { "id": "iRUi8AjG7cIf" }, @@ -1082,18 +775,10 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, + "id": "136a18b8", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:151: PreviewWarning: udf is in preview.\n", - " return global_session.with_default_session(\n" - ] - } - ], + "outputs": [], "source": [ "# Construct the canonical connection ID\n", "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", @@ -1166,62 +851,27 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, + "id": "234a5f86", 
"metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
extracted_textchunked
0CritterCuisine Pro 5000 - Automatic Pet Feeder...[\"CritterCuisine Pro 5000 - Automatic Pet Feed...
\n", - "

1 rows × 2 columns

\n", - "
[1 rows x 2 columns in total]" - ], - "text/plain": [ - " extracted_text \\\n", - "0 CritterCuisine Pro 5000 - Automatic Pet Feeder... \n", - "\n", - " chunked \n", - "0 [\"CritterCuisine Pro 5000 - Automatic Pet Feed... \n", - "\n", - "[1 rows x 2 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "df_pdf = bpd.from_glob_path(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\", name=\"pdf\")\n", + "import gcsfs\n", + "import bigframes.bigquery as bbq\n", + "\n", + "# List files using gcsfs\n", + "fs = gcsfs.GCSFileSystem(anon=True)\n", + "uris = fs.glob(\"gs://cloud-samples-data/bigquery/tutorials/cymbal-pets/documents/*\")\n", + "\n", + "# Ensure URIs have gs:// prefix\n", + "uris = [u if u.startswith(\"gs://\") else f\"gs://{u}\" for u in uris]\n", + "\n", + "# Read the URIs into a BigQuery DataFrame\n", + "df_pdf = bpd.read_gbq(f\"SELECT uri FROM UNNEST({uris[:5]}) as uri\")\n", + "\n", + "# Create the object reference column\n", + "df_pdf['pdf'] = bbq.obj.make_ref(df_pdf['uri'], authorizer=FULL_CONNECTION_ID)\n", + "df_pdf = df_pdf[['pdf']]\n", "\n", "# Generate a JSON string containing the runtime information (including signed read URLs)\n", "access_urls = get_runtime_json_str(df_pdf[\"pdf\"], mode=\"R\")\n", @@ -1237,36 +887,10 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, + "id": "d80effbe", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
0    CritterCuisine Pro 5000 - Automatic Pet Feeder...\n",
-       "0    on a level, stable surface to prevent tipping....\n",
-       "0    included)\\nto maintain the schedule during pow...\n",
-       "0    digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n",
-       "0    paperclip) for 5\\nseconds. This will reset all...\n",
-       "0    unit with a damp cloth. Do not immerse the bas...\n",
-       "0    continues,\\ncontact customer support.\\nE2: Foo...
" - ], - "text/plain": [ - "0 CritterCuisine Pro 5000 - Automatic Pet Feeder...\n", - "0 on a level, stable surface to prevent tipping....\n", - "0 included)\\nto maintain the schedule during pow...\n", - "0 digits for Meal 1 will flash.\\n\u0000. Use the UP/D...\n", - "0 paperclip) for 5\\nseconds. This will reset all...\n", - "0 unit with a damp cloth. Do not immerse the bas...\n", - "0 continues,\\ncontact customer support.\\nE2: Foo...\n", - "Name: chunked, dtype: string" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Explode the chunks to see each chunk as a separate row\n", "chunked = df_pdf[\"chunked\"].explode()\n", @@ -1275,6 +899,7 @@ }, { "cell_type": "markdown", + "id": "118cf1c7", "metadata": {}, "source": [ "### 6. Audio transcribe" @@ -1282,44 +907,42 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, + "id": "1794c54f", "metadata": {}, "outputs": [], "source": [ + "import gcsfs\n", + "import bigframes.bigquery as bbq\n", + "\n", "audio_gcs_path = \"gs://bigframes_blob_test/audio/*\"\n", - "df = bpd.from_glob_path(audio_gcs_path, name=\"audio\")" + "\n", + "# List files using gcsfs\n", + "fs = gcsfs.GCSFileSystem()\n", + "uris = fs.glob(audio_gcs_path)\n", + "\n", + "# Ensure URIs have gs:// prefix\n", + "uris = [u if u.startswith(\"gs://\") else f\"gs://{u}\" for u in uris]\n", + "\n", + "# Read the URIs into a BigQuery DataFrame\n", + "# If the bucket is empty or doesn't exist, this will result in an empty DataFrame\n", + "if not uris:\n", + " # Fallback to a dummy list or just let it be empty\n", + " uris = [\"gs://bigframes_blob_test/audio/dummy.mp3\"]\n", + "\n", + "df = bpd.read_gbq(f\"SELECT uri FROM UNNEST({uris[:5]}) as uri\")\n", + "\n", + "# Create the object reference column\n", + "df['audio'] = bbq.obj.make_ref(df['uri'], authorizer=FULL_CONNECTION_ID)\n", + "df = df[['audio']]" ] }, { "cell_type": "code", "execution_count": 
null, + "id": "c9f9d484", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/dtypes.py:990: JSONDtypeWarning: JSON columns will be represented as pandas.ArrowDtype(pyarrow.json_())\n", - "instead of using `db_dtypes` in the future when available in pandas\n", - "(https://github.com/pandas-dev/pandas/issues/60958) and pyarrow.\n", - " warnings.warn(msg, bigframes.exceptions.JSONDtypeWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
0    Now, as all books, not primarily intended as p...
" - ], - "text/plain": [ - "0 Now, as all books, not primarily intended as p...\n", - "Name: transcribed_content, dtype: string" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# The audio_transcribe function is a convenience wrapper around bigframes.bigquery.ai.generate.\n", "# Here's how to perform the same operation directly:\n", @@ -1349,24 +972,10 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, + "id": "7209a62a", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
0    {'status': '', 'content': 'Now, as all books, ...
" - ], - "text/plain": [ - "0 {'status': '', 'content': 'Now, as all books, ...\n", - "Name: transcription_results, dtype: struct[pyarrow]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# To get verbose results (including status), we can extract both fields from the result struct.\n", "transcribed_content_series = transcribed_results.struct.field(\"result\")\n", @@ -1385,6 +994,7 @@ }, { "cell_type": "markdown", + "id": "c8351cc3", "metadata": {}, "source": [ "### 7. Extract EXIF metadata from images" @@ -1392,6 +1002,7 @@ }, { "cell_type": "markdown", + "id": "e59670b9", "metadata": {}, "source": [ "This section demonstrates how to extract EXIF metadata from images using a custom BigQuery Python UDF and the `Pillow` library." @@ -1399,18 +1010,10 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, + "id": "fda362f4", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/pandas/__init__.py:151: PreviewWarning: udf is in preview.\n", - " return global_session.with_default_session(\n" - ] - } - ], + "outputs": [], "source": [ "# Construct the canonical connection ID\n", "FULL_CONNECTION_ID = f\"{PROJECT}.{LOCATION}.bigframes-default-connection\"\n", @@ -1447,39 +1050,27 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, + "id": "40bb6bc9", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/google/home/shuowei/src/python-bigquery-dataframes/bigframes/core/utils.py:228: PreviewWarning: The JSON-related API `parse_json` is in preview. Its behavior may\n", - "change in future versions.\n", - " warnings.warn(bfe.format_message(msg), category=bfe.PreviewWarning)\n" - ] - }, - { - "data": { - "text/html": [ - "
0    {\"ExifOffset\":47,\"Make\":\"MyCamera\"}
" - ], - "text/plain": [ - "0 {\"ExifOffset\":47,\"Make\":\"MyCamera\"}\n", - "Name: blob_col, dtype: extension>[pyarrow]" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ + "import gcsfs\n", + "import bigframes.bigquery as bbq\n", + "\n", "# Create a Multimodal DataFrame from the sample image URIs\n", - "exif_image_df = bpd.from_glob_path(\n", - " \"gs://bigframes_blob_test/images_exif/*\",\n", - " name=\"blob_col\",\n", - ")\n", + "fs = gcsfs.GCSFileSystem()\n", + "uris = fs.glob(\"gs://bigframes_blob_test/images_exif/*\")\n", + "\n", + "# Ensure URIs have gs:// prefix\n", + "uris = [u if u.startswith(\"gs://\") else f\"gs://{u}\" for u in uris]\n", + "\n", + "if not uris:\n", + " uris = [\"gs://bigframes_blob_test/images_exif/dummy.jpg\"]\n", + "\n", + "exif_image_df = bpd.read_gbq(f\"SELECT uri FROM UNNEST({uris[:5]}) as uri\")\n", + "exif_image_df['blob_col'] = bbq.obj.make_ref(exif_image_df['uri'], authorizer=FULL_CONNECTION_ID)\n", + "exif_image_df = exif_image_df[['blob_col']]\n", "\n", "# Generate a JSON string containing the runtime information (including signed read URLs)\n", "# This allows the UDF to download the images from Google Cloud Storage\n",