Skip to content

Commit 1fb80a3

Browse files
committed
don't use array_remove
1 parent 1397c5d commit 1fb80a3

1 file changed

Lines changed: 6 additions & 7 deletions

File tree

examples/tpch/q21_suppliers_kept_orders_waiting.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -77,20 +77,19 @@
7777
# arrays with size one. That combination will give us orders that had multiple suppliers where only
7878
# one failed. Use distinct=True in the below aggregation so we don't get multiple line items from the
7979
# same supplier reported in either array.
80+
#
81+
# Note: we use array_agg with filter to exclude nulls from the failed_suppliers array rather than
82+
# aggregating nulls and calling array_remove(..., NULL), which returns NULL per SQL standard.
8083
df = df.aggregate(
8184
[col("o_orderkey")],
8285
[
8386
F.array_agg(col("l_suppkey"), distinct=True).alias("all_suppliers"),
84-
F.array_agg(col("failed_supp"), distinct=True).alias("failed_suppliers"),
87+
F.array_agg(
88+
col("failed_supp"), distinct=True, filter=col("failed_supp").is_not_null()
89+
).alias("failed_suppliers"),
8590
],
8691
)
8792

88-
# Remove the null entries that will get returned by array_agg so we can test to see where we only
89-
# have a single failed supplier in a multiple supplier order
90-
df = df.with_column(
91-
"failed_suppliers", F.array_remove(col("failed_suppliers"), lit(None))
92-
)
93-
9493
# This is the check described above which will identify single failed supplier in a multiple
9594
# supplier order.
9695
df = df.filter(F.array_length(col("failed_suppliers")) == lit(1)).filter(

0 commit comments

Comments
 (0)