
Commit 58038ec

Author: Elisa Tsai (committed)
Commit message: ruff fix
1 parent 6cbbf28 · commit 58038ec

5 files changed: 48 additions & 45 deletions


src/maxdiffusion/kernels/splash_attention/ring_attention_kernel.py

Lines changed: 3 additions & 2 deletions
@@ -544,7 +544,7 @@ class RingSplashAttentionKernel:
   """Implements Ring Attention using SplashAttention for sequence parallelism.
 
   This kernel computes global attention by keeping Keys and Values distributed
-  across the `ring_axis`. Instead of gathering full sequences, it rotates K/V
+  across the `ring_axis`. Instead of gathering full sequences, it rotates K/V
   shards between devices and accumulates results incrementally. This allows
   processing sequence lengths that exceed single-device memory limits.
 
@@ -590,7 +590,8 @@ def manual_sharding_spec(self):
     """
 
     spec = jax.sharding.PartitionSpec(self.ring_axis)
-    _resolve_spec = lambda x: spec if x is not None else None
+    def _resolve_spec(x):
+      return spec if x is not None else None
 
     mask_info_specs = MaskInfo(  # pytype: disable=wrong-arg-types
         mask_next=_resolve_spec(self.fwd_mask_info.mask_next),
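
The hunks above bind what used to be a named lambda to a small def instead, the rewrite ruff's lambda-assignment rule (presumably E731) asks for. A minimal sketch of the equivalence, using a stand-in value for `spec` rather than the kernel's real `jax.sharding.PartitionSpec`:

    spec = "ring"  # stand-in value for illustration only

    # Before: a lambda assigned to a name (what ruff flags).
    _resolve_spec = lambda x: spec if x is not None else None  # noqa: E731

    # After: the equivalent named function, as applied in this commit.
    def _resolve_spec(x):
      return spec if x is not None else None

    assert _resolve_spec("kv_shard") == "ring" and _resolve_spec(None) is None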

src/maxdiffusion/kernels/splash_attention/splash_attention_kernel.py

Lines changed: 24 additions & 21 deletions
@@ -196,7 +196,8 @@ def get_default(cls):
     )
 
 
-to_i32 = lambda x: x.astype(jnp.int32)
+def to_i32(x):
+  return x.astype(jnp.int32)
 
 
 def _apply_mask_and_soft_cap(
@@ -1471,7 +1472,8 @@ def mask_index_map(h, grid_idx, rows_ref, cols_ref, mask_next_ref=None, *_):
       return next_m, 0, 0
 
   else:
-    unravel = lambda f: lambda j, h, i, *_: f(h, i, j)
+    def unravel(f):
+      return lambda j, h, i, *_: f(h, i, j)
     grid = (kv_steps, num_q_heads, q_steps)
 
     def mask_index_map(j, h, i, rows_ref, cols_ref, mask_next_ref=None, *_):
@@ -1656,15 +1658,15 @@ def create_dkv_index_map(h, i, j, *_):
   )
   metadata = {
       "xprof_metadata": json.dumps(
-          dict(
-              block_q_dkv=bq,
-              block_kv_dkv=bkv,
-              block_kv_dkv_compute=bkv_compute,
-              q_layout=config.q_layout,
-              k_layout=config.k_layout,
-              v_layout=config.v_layout,
-              use_experimental_scheduler=config.use_experimental_scheduler,
-          ),
+          {
+              "block_q_dkv": bq,
+              "block_kv_dkv": bkv,
+              "block_kv_dkv_compute": bkv_compute,
+              "q_layout": config.q_layout,
+              "k_layout": config.k_layout,
+              "v_layout": config.v_layout,
+              "use_experimental_scheduler": config.use_experimental_scheduler,
+          },
       )
   }
   args = [
@@ -1970,7 +1972,8 @@ def manual_sharding_spec(self, sharding: jax.sharding.NamedSharding):
     if len(sharding.spec) != 1:
      raise ValueError("Only q sequence sharding is supported.")
 
-    _resolve_spec = lambda x: sharding.spec if x is not None else None
+    def _resolve_spec(x):
+      return sharding.spec if x is not None else None
     mask_info_specs = MaskInfo(  # pytype: disable=wrong-arg-types
         mask_next=_resolve_spec(self.fwd_mask_info.mask_next),
         active_rows=_resolve_spec(self.fwd_mask_info.active_rows),
@@ -2115,15 +2118,15 @@ def process_mask_shard(mask):
 
      return fwd_mask_info, dkv_mask_info
 
-    kwargs = dict(
-        config=config,
-        is_mqa=is_mqa,
-        save_residuals=save_residuals,
-        mask_value=mask_value,
-        mask_function=None,
-        fwd_mask_sparsity=1.0,
-        dkv_mask_sparsity=1.0,
-    )
+    kwargs = {
+        "config": config,
+        "is_mqa": is_mqa,
+        "save_residuals": save_residuals,
+        "mask_value": mask_value,
+        "mask_function": None,
+        "fwd_mask_sparsity": 1.0,
+        "dkv_mask_sparsity": 1.0,
+    }
 
     # If the input mask is replicated we don't need to call shard_map.
     if mask_spec is None:
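
The metadata and kwargs hunks above swap `dict(...)` keyword calls for dict literals, the rewrite suggested by ruff's unnecessary-dict-call rule (C408, from flake8-comprehensions). Both forms build the same mapping; a minimal standalone sketch:

    # dict() call with keyword arguments versus the equivalent literal.
    kwargs_call = dict(atol=1e-3, rtol=3e-3)
    kwargs_literal = {"atol": 1e-3, "rtol": 3e-3}
    assert kwargs_call == kwargs_literal

The literal spells keys out as strings and avoids a name lookup and call, while the keyword form is limited to keys that are valid identifiers; behavior is otherwise identical.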

src/maxdiffusion/kernels/splash_attention/splash_attention_kernel_test.py

Lines changed: 9 additions & 9 deletions
@@ -279,7 +279,7 @@ def block_sizes_strategy(
   q_layout = draw(hps.sampled_from(splash.QKVLayout))
   k_layout = draw(hps.sampled_from(splash.QKVLayout))
   v_layout = draw(hps.sampled_from(splash.QKVLayout))
-  layouts = dict(q_layout=q_layout, k_layout=k_layout, v_layout=v_layout)
+  layouts = {"q_layout": q_layout, "k_layout": k_layout, "v_layout": v_layout}
   q_valid_block_shapes = [bs for bs in all_block_shapes if bs <= q_seq_len]
   kv_valid_block_shapes = [bs for bs in all_block_shapes if bs <= kv_seq_len]
   bq, bkv = (
@@ -494,16 +494,16 @@ def test_splash_attention_fwd(self, is_mqa, is_segmented, is_dynamic_mask,
         sinks,
     )
 
-    lse_tol = dict(atol=1e-3, rtol=3e-3)
-    max_logits_tol = dict(atol=1e-3, rtol=4e-3)
+    lse_tol = {"atol": 1e-3, "rtol": 3e-3}
+    max_logits_tol = {"atol": 1e-3, "rtol": 4e-3}
     if use_sinks:
-      o_tol = dict(atol=8e-2, rtol=1e-1)
+      o_tol = {"atol": 8e-2, "rtol": 1e-1}
      lse_tol['rtol'] = 6e-2
     elif (use_base2_exp or use_max_logit_estimate is not None
           or not fuse_reciprocal):
-      o_tol = dict(atol=8e-3, rtol=3e-3)
+      o_tol = {"atol": 8e-3, "rtol": 3e-3}
     else:
-      o_tol = dict(atol=4e-3, rtol=3e-3)
+      o_tol = {"atol": 4e-3, "rtol": 3e-3}
 
     self._assert_allclose(o, o_ref, **o_tol)
     self._assert_allclose(stats["logsumexp"],
@@ -598,12 +598,12 @@ def test_splash_attention_bwd(
         attn_logits_soft_cap=attn_logits_soft_cap,
     )
     if use_sinks:
-      o_tol = dict(atol=1e-2, rtol=1e-1)
+      o_tol = {"atol": 1e-2, "rtol": 1e-1}
     elif (use_base2_exp or use_max_logit_estimate is not None
           or not fuse_reciprocal):
-      o_tol = dict(atol=8e-3, rtol=1e-2)
+      o_tol = {"atol": 8e-3, "rtol": 1e-2}
     else:
-      o_tol = dict(atol=4e-3, rtol=3e-3)
+      o_tol = {"atol": 4e-3, "rtol": 3e-3}
     self._assert_allclose(o, o_ref, **o_tol)
 
     dq, dk, dv, _, dsinks = attn_vjp(do)
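
For context, the tolerance dicts built above are unpacked as keyword arguments into the comparison helper. A hypothetical usage sketch, with NumPy's assert_allclose standing in for the test class's `_assert_allclose` wrapper and made-up array values:

    import numpy as np

    o_tol = {"atol": 4e-3, "rtol": 3e-3}
    o = np.ones(4)
    o_ref = np.ones(4) + 1e-4  # well within the absolute tolerance
    np.testing.assert_allclose(o, o_ref, **o_tol)  # raises AssertionError if outside o_tol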

src/maxdiffusion/kernels/splash_attention/splash_attention_mask_info.py

Lines changed: 2 additions & 3 deletions
@@ -521,9 +521,8 @@ def _process_mask(
   if return_dynamic_grid:
     # Pad each slice to the largest number of active blocks in any shard.
     max_size = max(num_active_blocks)
-    pad_slice = lambda arr: np.pad(
-        arr, (0, max_size - arr.shape[0]), mode='constant', constant_values=-1
-    )
+    def pad_slice(arr):
+      return np.pad(arr, (0, max_size - arr.shape[0]), mode='constant', constant_values=-1)
     active_rows_slices = list(map(pad_slice, active_rows_slices))
     active_cols_slices = list(map(pad_slice, active_cols_slices))
     mask_next_slices = list(map(pad_slice, mask_next_slices))
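
The rewritten `pad_slice` right-pads each 1-D slice with -1 up to the largest active-block count across shards. A small self-contained sketch of that behavior with an assumed `max_size`:

    import numpy as np

    max_size = 4  # assumed largest number of active blocks in any shard

    def pad_slice(arr):
      return np.pad(arr, (0, max_size - arr.shape[0]), mode='constant', constant_values=-1)

    print(pad_slice(np.array([3, 7])))  # -> [ 3  7 -1 -1]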

src/maxdiffusion/kernels/splash_attention/splash_attention_mask_test.py

Lines changed: 10 additions & 10 deletions
@@ -1362,16 +1362,16 @@ def test_two_qseq_shards_causal_local_stacked(self):
     self._assert_mask_info_match(mask_info_dkv, expected_mask_info_dkv)
 
   @parameterized.named_parameters(
-      dict(
-          testcase_name="q_seq_shards_2",
-          q_seq_shards=2,
-          kv_seq_shards=1,
-      ),
-      dict(
-          testcase_name="kv_seq_shards_2",
-          q_seq_shards=1,
-          kv_seq_shards=2,
-      ),
+      {
+          "testcase_name": "q_seq_shards_2",
+          "q_seq_shards": 2,
+          "kv_seq_shards": 1,
+      },
+      {
+          "testcase_name": "kv_seq_shards_2",
+          "q_seq_shards": 1,
+          "kv_seq_shards": 2,
+      },
   )
   def test_two_shards_local_wide_local_narrow_stacked(
       self, q_seq_shards, kv_seq_shards
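
absl-py's `parameterized.named_parameters` accepts plain dicts keyed by `testcase_name`, so the literal form above is a drop-in replacement for the previous `dict(...)` calls. A minimal sketch with a hypothetical test class (not part of this repository):

    from absl.testing import absltest, parameterized

    class ShardCountTest(parameterized.TestCase):

      @parameterized.named_parameters(
          {"testcase_name": "q_seq_shards_2", "q_seq_shards": 2, "kv_seq_shards": 1},
          {"testcase_name": "kv_seq_shards_2", "q_seq_shards": 1, "kv_seq_shards": 2},
      )
      def test_shard_counts(self, q_seq_shards, kv_seq_shards):
        self.assertEqual(q_seq_shards * kv_seq_shards, 2)

    if __name__ == "__main__":
      absltest.main()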
