
Commit bc4c245

Authored by cj-zhukov (Sergey Zhukov)
Automatically generate examples documentation adv (#19294) (#19750)
Closes #19294.

Co-authored-by: Sergey Zhukov <szhukov@aligntech.com>
1 parent a02e683 commit bc4c245

19 files changed

Lines changed: 981 additions & 110 deletions


.github/workflows/rust.yml

Lines changed: 16 additions & 1 deletion
@@ -709,6 +709,11 @@ jobs:
         ./dev/update_function_docs.sh
         git diff --exit-code

+  # This job ensures `datafusion-examples/README.md` stays in sync with the source code:
+  # 1. Generates README automatically using the Rust examples docs generator
+  #    (parsing documentation from `examples/<group>/main.rs`)
+  # 2. Formats the generated Markdown using DataFusion's standard Prettier setup
+  # 3. Compares the result against the committed README.md and fails if out-of-date
   examples-docs-check:
     name: check example README is up-to-date
     needs: linux-build-lib
@@ -721,6 +726,16 @@ jobs:
         with:
           submodules: true
           fetch-depth: 1
+
+      - name: Mark repository as safe for git
+        # Required for git commands inside container (avoids "dubious ownership" error)
+        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: Set up Node.js (required for prettier)
+        # doc_prettier_check.sh uses npx to run prettier for Markdown formatting
+        uses: actions/setup-node@v4
+        with:
+          node-version: '18'

       - name: Run examples docs check script
         run: |
@@ -778,4 +793,4 @@ jobs:
         run: cargo msrv --output-format json --log-target stdout verify
       - name: Check datafusion-proto
         working-directory: datafusion/proto
-        run: cargo msrv --output-format json --log-target stdout verify
+        run: cargo msrv --output-format json --log-target stdout verify

ci/scripts/check_examples_docs.sh

Lines changed: 47 additions & 38 deletions
@@ -17,48 +17,57 @@
 # specific language governing permissions and limitations
 # under the License.

-set -euo pipefail
-
-EXAMPLES_DIR="datafusion-examples/examples"
-README="datafusion-examples/README.md"
-
-# ffi examples are skipped because they were not part of the recent example
-# consolidation work and do not follow the new grouping and execution pattern.
-# They are not documented in the README using the new structure, so including
-# them here would cause false CI failures.
-SKIP_LIST=("ffi")
-
-missing=0
+# Generates documentation for DataFusion examples using the Rust-based
+# documentation generator and verifies that the committed README.md
+# is up to date.
+#
+# The README is generated from documentation comments in:
+#   datafusion-examples/examples/<group>/main.rs
+#
+# This script is intended to be run in CI to ensure that example
+# documentation stays in sync with the code.
+#
+# To update the README locally, run this script and replace README.md
+# with the generated output.

-skip() {
-  local value="$1"
-  for item in "${SKIP_LIST[@]}"; do
-    if [[ "$item" == "$value" ]]; then
-      return 0
-    fi
-  done
-  return 1
-}
+set -euo pipefail

-# collect folder names
-folders=$(find "$EXAMPLES_DIR" -mindepth 1 -maxdepth 1 -type d -exec basename {} \;)
+ROOT_DIR="$(git rev-parse --show-toplevel)"
+EXAMPLES_DIR="$ROOT_DIR/datafusion-examples"
+README="$EXAMPLES_DIR/README.md"
+README_NEW="$EXAMPLES_DIR/README-NEW.md"

-# collect group names from README headers
-groups=$(grep "^### Group:" "$README" | sed -E 's/^### Group: `([^`]+)`.*/\1/')
+echo "▶ Generating examples README (Rust generator)…"
+cargo run --quiet \
+  --manifest-path "$EXAMPLES_DIR/Cargo.toml" \
+  --bin examples-docs \
+  > "$README_NEW"

-for folder in $folders; do
-  if skip "$folder"; then
-    echo "Skipped group: $folder"
-    continue
-  fi
+echo "▶ Formatting generated README with Prettier…"
+npx prettier@2.7.1 \
+  --parser markdown \
+  --write "$README_NEW"

-  if ! echo "$groups" | grep -qx "$folder"; then
-    echo "Missing README entry for example group: $folder"
-    missing=1
-  fi
-done
+echo "▶ Comparing generated README with committed version…"

-if [[ $missing -eq 1 ]]; then
-  echo "README is out of sync with examples"
-  exit 1
+if ! diff -u "$README" "$README_NEW" > /tmp/examples-readme.diff; then
+  echo ""
+  echo "❌ Examples README is out of date."
+  echo ""
+  echo "The examples documentation is generated automatically from:"
+  echo "  - datafusion-examples/examples/<group>/main.rs"
+  echo ""
+  echo "To update the README locally, run:"
+  echo ""
+  echo "  cargo run --bin examples-docs \\"
+  echo "    | npx prettier@2.7.1 --parser markdown --write \\"
+  echo "    > datafusion-examples/README.md"
+  echo ""
+  echo "Diff:"
+  echo "------------------------------------------------------------"
+  cat /tmp/examples-readme.diff
+  echo "------------------------------------------------------------"
+  exit 1
 fi
+
+echo "✅ Examples README is up-to-date."
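The core of this check — regenerate the file, then fail CI when the committed copy diverges — can be sketched as a small std-only Rust helper. This is a hypothetical illustration of the pattern, not part of the actual script (which shells out to `diff -u`):

```rust
/// Return the index of the first line where the two texts differ,
/// or None when they are identical.
fn first_divergence(committed: &str, generated: &str) -> Option<usize> {
    let (mut a, mut b) = (committed.lines(), generated.lines());
    let mut i = 0;
    loop {
        match (a.next(), b.next()) {
            // Both iterators exhausted together: the files match.
            (None, None) => return None,
            // Same line on both sides: keep scanning.
            (x, y) if x == y => i += 1,
            // Mismatch or one file ended early: report the line index.
            _ => return Some(i),
        }
    }
}

fn main() {
    assert_eq!(first_divergence("a\nb\n", "a\nb\n"), None);
    assert_eq!(first_divergence("# Examples\n| a |", "# Examples\n| b |"), Some(1));
    println!("in a CI script, Some(_) would trigger a non-zero exit");
}
```

In the real script the two inputs come from the committed `README.md` and the generator's stdout, and a divergence produces a human-readable diff plus `exit 1`.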

datafusion-examples/Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ arrow-schema = { workspace = true }
 datafusion = { workspace = true, default-features = true, features = ["parquet_encryption"] }
 datafusion-common = { workspace = true }
 tempfile = { workspace = true }
-tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
+tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] }

 [dev-dependencies]
 arrow-flight = { workspace = true }

datafusion-examples/README.md

Lines changed: 15 additions & 13 deletions
@@ -71,15 +71,16 @@ cargo run --example dataframe -- dataframe

 #### Category: Single Process

-| Subcommand            | File Path                                                                                               | Description                                   |
-| --------------------- | ------------------------------------------------------------------------------------------------------- | --------------------------------------------- |
-| csv_sql_streaming     | [`custom_data_source/csv_sql_streaming.rs`](examples/custom_data_source/csv_sql_streaming.rs)           | Run a streaming SQL query against CSV data    |
-| csv_json_opener       | [`custom_data_source/csv_json_opener.rs`](examples/custom_data_source/csv_json_opener.rs)               | Use low-level FileOpener APIs for CSV/JSON    |
-| custom_datasource     | [`custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs)           | Query a custom TableProvider                  |
-| custom_file_casts     | [`custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs)           | Implement custom casting rules                |
-| custom_file_format    | [`custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs)         | Write to a custom file format                 |
-| default_column_values | [`custom_data_source/default_column_values.rs`](examples/custom_data_source/default_column_values.rs)   | Custom default values using metadata          |
-| file_stream_provider  | [`custom_data_source/file_stream_provider.rs`](examples/custom_data_source/file_stream_provider.rs)     | Read/write via FileStreamProvider for streams |
+| Subcommand            | File Path                                                                                               | Description                                                                                                         |
+| --------------------- | ------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
+| adapter_serialization | [`custom_data_source/adapter_serialization.rs`](examples/custom_data_source/adapter_serialization.rs)   | Preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception |
+| csv_json_opener       | [`custom_data_source/csv_json_opener.rs`](examples/custom_data_source/csv_json_opener.rs)               | Use low-level FileOpener APIs for CSV/JSON                                                                          |
+| csv_sql_streaming     | [`custom_data_source/csv_sql_streaming.rs`](examples/custom_data_source/csv_sql_streaming.rs)           | Run a streaming SQL query against CSV data                                                                          |
+| custom_datasource     | [`custom_data_source/custom_datasource.rs`](examples/custom_data_source/custom_datasource.rs)           | Query a custom TableProvider                                                                                        |
+| custom_file_casts     | [`custom_data_source/custom_file_casts.rs`](examples/custom_data_source/custom_file_casts.rs)           | Implement custom casting rules                                                                                      |
+| custom_file_format    | [`custom_data_source/custom_file_format.rs`](examples/custom_data_source/custom_file_format.rs)         | Write to a custom file format                                                                                       |
+| default_column_values | [`custom_data_source/default_column_values.rs`](examples/custom_data_source/default_column_values.rs)   | Custom default values using metadata                                                                                |
+| file_stream_provider  | [`custom_data_source/file_stream_provider.rs`](examples/custom_data_source/file_stream_provider.rs)     | Read/write via FileStreamProvider for streams                                                                       |

 ## Data IO Examples

@@ -143,8 +144,8 @@ cargo run --example dataframe -- dataframe

 | Subcommand | File Path                                               | Description                                            |
 | ---------- | ------------------------------------------------------- | ------------------------------------------------------ |
-| server     | [`flight/server.rs`](examples/flight/server.rs)         | Run DataFusion server accepting FlightSQL/JDBC queries |
 | client     | [`flight/client.rs`](examples/flight/client.rs)         | Execute SQL queries via Arrow Flight protocol          |
+| server     | [`flight/server.rs`](examples/flight/server.rs)         | Run DataFusion server accepting FlightSQL/JDBC queries |
 | sql_server | [`flight/sql_server.rs`](examples/flight/sql_server.rs) | Standalone SQL server for JDBC clients                 |

 ## Proto Examples

@@ -153,9 +154,10 @@ cargo run --example dataframe -- dataframe

 #### Category: Single Process

-| Subcommand               | File Path                                                                         | Description                                                     |
-| ------------------------ | --------------------------------------------------------------------------------- | --------------------------------------------------------------- |
-| composed_extension_codec | [`proto/composed_extension_codec.rs`](examples/proto/composed_extension_codec.rs) | Use multiple extension codecs for serialization/deserialization |
+| Subcommand               | File Path                                                                         | Description                                                                   |
+| ------------------------ | --------------------------------------------------------------------------------- | ----------------------------------------------------------------------------- |
+| composed_extension_codec | [`proto/composed_extension_codec.rs`](examples/proto/composed_extension_codec.rs) | Use multiple extension codecs for serialization/deserialization               |
+| expression_deduplication | [`proto/expression_deduplication.rs`](examples/proto/expression_deduplication.rs) | Example of expression caching/deduplication using the codec decorator pattern |

 ## Query Planning Examples

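The regenerated tables above are kept column-aligned. A minimal sketch of how such alignment can be produced from parsed entries — a hypothetical helper for illustration; the actual pipeline delegates Markdown formatting to Prettier:

```rust
/// Render rows as a column-aligned Markdown table, similar to the
/// alignment Prettier enforces on the regenerated README.
fn markdown_table(header: &[&str], rows: &[Vec<String>]) -> String {
    // Column width = widest cell in that column (header included).
    let mut widths: Vec<usize> = header.iter().map(|h| h.len()).collect();
    for row in rows {
        for (c, cell) in row.iter().enumerate() {
            widths[c] = widths[c].max(cell.len());
        }
    }
    // Pad each cell to its column width and wrap the row in pipes.
    let fmt_row = |cells: Vec<String>| -> String {
        let padded: Vec<String> = cells
            .iter()
            .zip(&widths)
            .map(|(cell, w)| format!("{:<1$}", cell, *w))
            .collect();
        format!("| {} |", padded.join(" | "))
    };
    let mut out = String::new();
    out.push_str(&fmt_row(header.iter().map(|s| s.to_string()).collect()));
    out.push('\n');
    out.push_str(&fmt_row(widths.iter().map(|w| "-".repeat(*w)).collect()));
    out.push('\n');
    for row in rows {
        out.push_str(&fmt_row(row.clone()));
        out.push('\n');
    }
    out
}

fn main() {
    let table = markdown_table(
        &["Subcommand", "Description"],
        &[vec!["regexp".to_string(), "Regular expression functions".to_string()]],
    );
    print!("{}", table);
}
```

Computing widths up front is what keeps every `|` vertically aligned, which is why re-sorting or adding a row (as in this commit) rewrites the whole table body.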
datafusion-examples/examples/builtin_functions/main.rs

Lines changed: 9 additions & 3 deletions
@@ -26,9 +26,15 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `date_time` — examples of date-time related functions and queries
-//! - `function_factory` — register `CREATE FUNCTION` handler to implement SQL macros
-//! - `regexp` — examples of using regular expression functions
+//!
+//! - `date_time`
+//!   (file: date_time.rs, desc: Examples of date-time related functions and queries)
+//!
+//! - `function_factory`
+//!   (file: function_factory.rs, desc: Register `CREATE FUNCTION` handler to implement SQL macros)
+//!
+//! - `regexp`
+//!   (file: regexp.rs, desc: Examples of using regular expression functions)

 mod date_time;
 mod function_factory;
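The `(file: …, desc: …)` convention introduced above is what the README generator consumes. A std-only sketch of such a parser — hypothetical, since the actual `examples-docs` source is not shown in this diff:

```rust
/// Parse the doc-comment convention shown above:
///
///   //! - `name`
///   //!   (file: name.rs, desc: Some description)
///
/// into (subcommand, file, description) triples.
fn parse_entries(doc: &str) -> Vec<(String, String, String)> {
    let mut entries = Vec::new();
    let mut pending: Option<String> = None;
    for line in doc.lines() {
        // Strip the `//!` prefix and surrounding whitespace.
        let body = line.trim_start().trim_start_matches("//!").trim();
        if let Some(name) = body
            .strip_prefix("- `")
            .and_then(|rest| rest.strip_suffix('`'))
        {
            // A bare `- `name`` bullet opens a new entry.
            pending = Some(name.to_string());
        } else if let Some(rest) = body.strip_prefix("(file: ") {
            // The `(file: …, desc: …)` line completes the pending entry.
            if let (Some(name), Some((file, desc))) = (
                pending.take(),
                rest.strip_suffix(')').and_then(|r| r.split_once(", desc: ")),
            ) {
                entries.push((name, file.to_string(), desc.to_string()));
            }
        }
    }
    entries
}

fn main() {
    let doc = "//! - `regexp`\n//!   (file: regexp.rs, desc: Examples of using regular expression functions)";
    println!("{:?}", parse_entries(doc));
}
```

Note the `all` bullet is skipped naturally: its line carries trailing prose after the closing backtick and has no `(file: …)` line, so it never forms a complete entry.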

datafusion-examples/examples/custom_data_source/main.rs

Lines changed: 24 additions & 8 deletions
@@ -26,14 +26,30 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `adapter_serialization` — preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception
-//! - `csv_json_opener` — use low level FileOpener APIs to read CSV/JSON into Arrow RecordBatches
-//! - `csv_sql_streaming` — build and run a streaming query plan from a SQL statement against a local CSV file
-//! - `custom_datasource` — run queries against a custom datasource (TableProvider)
-//! - `custom_file_casts` — implement custom casting rules to adapt file schemas
-//! - `custom_file_format` — write data to a custom file format
-//! - `default_column_values` — implement custom default value handling for missing columns using field metadata and PhysicalExprAdapter
-//! - `file_stream_provider` — run a query on FileStreamProvider which implements StreamProvider for reading and writing to arbitrary stream sources/sinks
+//!
+//! - `adapter_serialization`
+//!   (file: adapter_serialization.rs, desc: Preserve custom PhysicalExprAdapter information during plan serialization using PhysicalExtensionCodec interception)
+//!
+//! - `csv_json_opener`
+//!   (file: csv_json_opener.rs, desc: Use low-level FileOpener APIs for CSV/JSON)
+//!
+//! - `csv_sql_streaming`
+//!   (file: csv_sql_streaming.rs, desc: Run a streaming SQL query against CSV data)
+//!
+//! - `custom_datasource`
+//!   (file: custom_datasource.rs, desc: Query a custom TableProvider)
+//!
+//! - `custom_file_casts`
+//!   (file: custom_file_casts.rs, desc: Implement custom casting rules)
+//!
+//! - `custom_file_format`
+//!   (file: custom_file_format.rs, desc: Write to a custom file format)
+//!
+//! - `default_column_values`
+//!   (file: default_column_values.rs, desc: Custom default values using metadata)
+//!
+//! - `file_stream_provider`
+//!   (file: file_stream_provider.rs, desc: Read/write via FileStreamProvider for streams)

 mod adapter_serialization;
 mod csv_json_opener;

datafusion-examples/examples/data_io/main.rs

Lines changed: 30 additions & 10 deletions
@@ -26,16 +26,36 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `catalog` — register the table into a custom catalog
-//! - `json_shredding` — shows how to implement custom filter rewriting for JSON shredding
-//! - `parquet_adv_idx` — create a detailed secondary index that covers the contents of several parquet files
-//! - `parquet_emb_idx` — store a custom index inside a Parquet file and use it to speed up queries
-//! - `parquet_enc_with_kms` — read and write encrypted Parquet files using an encryption factory
-//! - `parquet_enc` — read and write encrypted Parquet files using DataFusion
-//! - `parquet_exec_visitor` — extract statistics by visiting an ExecutionPlan after execution
-//! - `parquet_idx` — create an secondary index over several parquet files and use it to speed up queries
-//! - `query_http_csv` — configure `object_store` and run a query against files via HTTP
-//! - `remote_catalog` — interfacing with a remote catalog (e.g. over a network)
+//!
+//! - `catalog`
+//!   (file: catalog.rs, desc: Register tables into a custom catalog)
+//!
+//! - `json_shredding`
+//!   (file: json_shredding.rs, desc: Implement filter rewriting for JSON shredding)
+//!
+//! - `parquet_adv_idx`
+//!   (file: parquet_advanced_index.rs, desc: Create a secondary index across multiple parquet files)
+//!
+//! - `parquet_emb_idx`
+//!   (file: parquet_embedded_index.rs, desc: Store a custom index inside Parquet files)
+//!
+//! - `parquet_enc`
+//!   (file: parquet_encrypted.rs, desc: Read & write encrypted Parquet files)
+//!
+//! - `parquet_enc_with_kms`
+//!   (file: parquet_encrypted_with_kms.rs, desc: Encrypted Parquet I/O using a KMS-backed factory)
+//!
+//! - `parquet_exec_visitor`
+//!   (file: parquet_exec_visitor.rs, desc: Extract statistics by visiting an ExecutionPlan)
+//!
+//! - `parquet_idx`
+//!   (file: parquet_index.rs, desc: Create a secondary index)
+//!
+//! - `query_http_csv`
+//!   (file: query_http_csv.rs, desc: Query CSV files via HTTP)
+//!
+//! - `remote_catalog`
+//!   (file: remote_catalog.rs, desc: Interact with a remote catalog)

 mod catalog;
 mod json_shredding;

datafusion-examples/examples/dataframe/main.rs

Lines changed: 9 additions & 2 deletions
@@ -26,8 +26,15 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `dataframe` — run a query using a DataFrame API against parquet files, csv files, and in-memory data, including multiple subqueries
-//! - `deserialize_to_struct` — convert query results (Arrow ArrayRefs) into Rust structs
+//!
+//! - `cache_factory`
+//!   (file: cache_factory.rs, desc: Custom lazy caching for DataFrames using `CacheFactory`)
+//!
+//! - `dataframe`
+//!   (file: dataframe.rs, desc: Query DataFrames from various sources and write output)
+//!
+//! - `deserialize_to_struct`
+//!   (file: deserialize_to_struct.rs, desc: Convert Arrow arrays into Rust structs)

 mod cache_factory;
 mod dataframe;

datafusion-examples/examples/execution_monitoring/main.rs

Lines changed: 9 additions & 3 deletions
@@ -26,9 +26,15 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `mem_pool_exec_plan` — shows how to implement memory-aware ExecutionPlan with memory reservation and spilling
-//! - `mem_pool_tracking` — demonstrates TrackConsumersPool for memory tracking and debugging with enhanced error messages
-//! - `tracing` — demonstrates the tracing injection feature for the DataFusion runtime
+//!
+//! - `mem_pool_exec_plan`
+//!   (file: memory_pool_execution_plan.rs, desc: Memory-aware ExecutionPlan with spilling)
+//!
+//! - `mem_pool_tracking`
+//!   (file: memory_pool_tracking.rs, desc: Demonstrates memory tracking)
+//!
+//! - `tracing`
+//!   (file: tracing.rs, desc: Demonstrates tracing integration)

 mod memory_pool_execution_plan;
 mod memory_pool_tracking;

datafusion-examples/examples/external_dependency/main.rs

Lines changed: 6 additions & 2 deletions
@@ -26,8 +26,12 @@
 //!
 //! Each subcommand runs a corresponding example:
 //! - `all` — run all examples included in this module
-//! - `dataframe_to_s3` — run a query using a DataFrame against a parquet file from AWS S3 and writing back to AWS S3
-//! - `query_aws_s3` — configure `object_store` and run a query against files stored in AWS S3
+//!
+//! - `dataframe_to_s3`
+//!   (file: dataframe_to_s3.rs, desc: Query DataFrames and write results to S3)
+//!
+//! - `query_aws_s3`
+//!   (file: query_aws_s3.rs, desc: Query S3-backed data using object_store)

 mod dataframe_to_s3;
 mod query_aws_s3;
