File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 7777# arrays with size one. That combination will give us orders that had multiple suppliers where only
7878# one failed. Use distinct=True in the below aggregation so we don't get multiple line items from the
7979# same supplier reported in either array.
80+ #
81+ # Note: we use array_agg with a filter to exclude nulls from the failed_suppliers array rather than
82+ # aggregating nulls and then calling array_remove(..., NULL), because array_remove with a NULL element returns NULL per the SQL standard.
8083df = df .aggregate (
8184 [col ("o_orderkey" )],
8285 [
8386 F .array_agg (col ("l_suppkey" ), distinct = True ).alias ("all_suppliers" ),
84- F .array_agg (col ("failed_supp" ), distinct = True ).alias ("failed_suppliers" ),
87+ F .array_agg (
88+ col ("failed_supp" ), distinct = True , filter = col ("failed_supp" ).is_not_null ()
89+ ).alias ("failed_suppliers" ),
8590 ],
8691)
8792
88- # Remove the null entries that will get returned by array_agg so we can test to see where we only
89- # have a single failed supplier in a multiple supplier order
90- df = df .with_column (
91- "failed_suppliers" , F .array_remove (col ("failed_suppliers" ), lit (None ))
92- )
93-
9493# This is the check described above, which will identify a single failed supplier in a
9594# multiple-supplier order.
9695df = df .filter (F .array_length (col ("failed_suppliers" )) == lit (1 )).filter (
You can’t perform that action at this time.
0 commit comments