Commit 8bd89ba

feat: support _sdpa_cudnn backend for cp (#504)
* feat: fast rope for z-image
* chore: update notes
* chore: update z-image cp example
* feat: allow cudnn attn w/ attn mask for cp
* feat: support _sdpa_cudnn backend for cp
1 parent: 9a5cdd9 · commit: 8bd89ba

File tree: 5 files changed, +205 −26 lines


examples/parallelism/run_zimage_cp.py

Lines changed: 34 additions & 5 deletions
@@ -18,9 +18,9 @@
 
 import cache_dit
 
-# NOTE: Only support context parallelism with 'native' attention backend
-# for ZImage due to the attention mask in ZImage is not None. Please use:
-# --parallel ulysses --attn native
+# NOTE: Context parallelism is only supported with the 'native'/'_sdpa_cudnn' attn
+# backends for Z-Image, because the attention mask in Z-Image is not None. Please use:
+# `--parallel ulysses --attn native` or `--attn _sdpa_cudnn`.
 
 args = get_args()
 print(args)
@@ -59,12 +59,34 @@
 # Only warmup 4 steps (total 9 steps) for distilled models
 args.max_warmup_steps = min(4, args.max_warmup_steps)
 
-cachify(args, pipe)
+cachify(
+    args,
+    pipe,
+    # Total 9 steps for distilled Z-Image-Turbo,
+    # e.g., 111110101; 1: compute, 0: dynamic cache
+    steps_computation_mask=(
+        cache_dit.steps_mask(
+            compute_bins=[5, 1, 1],  # 7 steps compute
+            cache_bins=[1, 1],  # max 2 steps cache
+        )
+        if args.steps_mask
+        else None
+    ),
+)
 
 pipe.to(device)
 
 assert isinstance(pipe.transformer, ZImageTransformer2DModel)
 
+# Allow customizing the attention backend for single-GPU inference
+if args.parallel_type is None:
+    # native, flash, _native_cudnn, sage, etc.
+    # _native_cudnn is faster than native (sdpa) on NVIDIA L20 with CUDA 12.9+.
+    # '_sdpa_cudnn' exists only in cache-dit, to support context parallelism
+    # with attn masks, e.g., Z-Image. It is not in diffusers yet.
+    if args.attn is not None:
+        pipe.transformer.set_attention_backend(args.attn)
+
 pipe.set_progress_bar_config(disable=rank != 0)
 
 # Set default prompt
@@ -94,7 +116,14 @@ def run_pipe(warmup: bool = False):
 
 if args.compile:
     cache_dit.set_compile_configs()
-    pipe.transformer = torch.compile(pipe.transformer)
+    if args.compile_repeated_blocks:
+        pipe.transformer.compile_repeated_blocks(
+            mode="max-autotune-no-cudagraphs" if args.max_autotune else "default"
+        )
+    else:
+        pipe.transformer = torch.compile(
+            pipe.transformer, mode="max-autotune-no-cudagraphs" if args.max_autotune else "default"
+        )
 
 # warmup
 _ = run_pipe(warmup=True)
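A note on the `steps_computation_mask` comment in the hunk above: the pattern `111110101` can be read as interleaving the compute bins with the cache bins, step by step. The sketch below only illustrates that reading on the values from the example; it is an assumption about the semantics, not the actual implementation of `cache_dit.steps_mask`, and `expand_bins` is a hypothetical helper.

# Illustration only: a hypothetical expand_bins() showing how the bins used in the
# example above could yield the "111110101" pattern mentioned in the comment.
def expand_bins(compute_bins, cache_bins):
    mask = ""
    for i, n_compute in enumerate(compute_bins):
        mask += "1" * n_compute  # 1: compute this step
        if i < len(cache_bins):
            mask += "0" * cache_bins[i]  # 0: serve this step from the dynamic cache
    return mask


print(expand_bins(compute_bins=[5, 1, 1], cache_bins=[1, 1]))  # 111110101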

examples/utils.py

Lines changed: 9 additions & 0 deletions
@@ -61,6 +61,8 @@ def get_args(
     parser = argparse.ArgumentParser()
     parser.add_argument("--cache", action="store_true", default=False)
     parser.add_argument("--compile", action="store_true", default=False)
+    parser.add_argument("--compile-repeated-blocks", action="store_true", default=False)
+    parser.add_argument("--max-autotune", action="store_true", default=False)
     parser.add_argument("--fuse-lora", action="store_true", default=False)
     parser.add_argument("--steps", type=int, default=None)
     parser.add_argument("--Fn", type=int, default=8)
@@ -72,6 +74,7 @@ def get_args(
     parser.add_argument("--max-continuous-cached-steps", "--mcc", type=int, default=-1)
     parser.add_argument("--taylorseer", action="store_true", default=False)
    parser.add_argument("--taylorseer-order", "-order", type=int, default=1)
+    parser.add_argument("--steps-mask", "--scm", action="store_true", default=False)
     parser.add_argument("--height", type=int, default=None)
     parser.add_argument("--width", type=int, default=None)
     parser.add_argument("--quantize", "-q", action="store_true", default=False)
@@ -113,6 +116,9 @@ def get_args(
             # Based on this fix: https://github.com/huggingface/diffusers/pull/12563
             "native",  # native pytorch attention: sdpa
             "_native_cudnn",
+            # '_sdpa_cudnn' is only in cache-dit, to support context parallelism
+            # with attn masks, e.g., ZImage. It is not in diffusers yet.
+            "_sdpa_cudnn",
             "sage",  # Need install sageattention: https://github.com/thu-ml/SageAttention
         ],
     )
@@ -220,6 +226,7 @@ def cachify(
             max_continuous_cached_steps=args.max_continuous_cached_steps,
             residual_diff_threshold=args.rdt,
             enable_separate_cfg=kwargs.get("enable_separate_cfg", None),
+            steps_computation_mask=kwargs.get("steps_computation_mask", None),
         )
         if cache_config is None and args.cache
         else cache_config
@@ -262,6 +269,8 @@ def strify(args, pipe_or_stats):
         base_str += "_ulysses_anything"
     if args.ulysses_async_qkv_proj:
         base_str += "_ulysses_async_qkv_proj"
+    if args.attn is not None:
+        base_str += f"_{args.attn.strip('_')}"
     return base_str
 
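For reference, argparse maps the new dashed flags to underscored attribute names, which is how `args.compile_repeated_blocks`, `args.max_autotune`, `args.steps_mask`, and `args.attn` are read back in the Z-Image example above. The sketch below is trimmed down and partly assumed: it keeps only the options touched by this commit, and the `--attn` definition (type and choices) is abbreviated relative to the real `get_args()`.

import argparse

# Trimmed-down sketch of the flags touched in this commit; the real parser in
# examples/utils.py defines many more options, and --attn has a longer choices list.
parser = argparse.ArgumentParser()
parser.add_argument("--compile", action="store_true", default=False)
parser.add_argument("--compile-repeated-blocks", action="store_true", default=False)
parser.add_argument("--max-autotune", action="store_true", default=False)
parser.add_argument("--steps-mask", "--scm", action="store_true", default=False)
parser.add_argument(
    "--attn",
    type=str,
    default=None,
    choices=["native", "_native_cudnn", "_sdpa_cudnn", "sage"],
)

args = parser.parse_args(["--compile", "--steps-mask", "--attn", "_sdpa_cudnn"])
print(args.compile_repeated_blocks, args.steps_mask, args.attn)
# False True _sdpa_cudnn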

src/cache_dit/parallelism/backends/native_diffusers/context_parallelism/attention/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 def maybe_resigter_native_attention_backend():
     """Maybe re-register native attention backend to enable context parallelism."""
     # Import custom attention backend ensuring registration
-    from ._attention_dispatch import _native_attention
+    from ._attention_dispatch import _native_attention, _sdpa_cudnn_attention
 
 
     from ._templated_ulysses_anything import (

src/cache_dit/parallelism/backends/native_diffusers/context_parallelism/attention/_attention_dispatch.py

Lines changed: 150 additions & 17 deletions
@@ -28,6 +28,7 @@
 
 __all__ = [
     "_native_attention",
+    "_sdpa_cudnn_attention",
 ]
 
 # Enable custom native attention backend with context parallelism
@@ -52,25 +53,33 @@ def _is_native_attn_supported_context_parallel() -> bool:
     )
 
 
-if _CACHE_DIT_ENABLE_CUSTOM_CP_NATIVE_ATTN_DISPATCH:
-    logger.warning(
-        "Re-registering NATIVE attention backend to enable context parallelism. "
-        "This is a temporary workaround and should be removed after the native "
-        "attention backend supports context parallelism natively. Please check: "
-        "https://github.com/huggingface/diffusers/pull/12563 for more details. "
-        "Or, you can disable this behavior by setting the environment variable "
-        "`CACHE_DIT_ENABLE_CUSTOM_CP_NATIVE_ATTN_DISPATCH=0`."
-    )
-    _AttentionBackendRegistry._backends.pop(AttentionBackendName.NATIVE)
-    _AttentionBackendRegistry._constraints.pop(AttentionBackendName.NATIVE)
-    _AttentionBackendRegistry._supported_arg_names.pop(AttentionBackendName.NATIVE)
+def _registry_pop_attn_backend(attn_backend: AttentionBackendName):
+    _AttentionBackendRegistry._backends.pop(attn_backend)
+    _AttentionBackendRegistry._constraints.pop(attn_backend)
+    _AttentionBackendRegistry._supported_arg_names.pop(attn_backend)
     if _is_native_attn_supported_context_parallel():
         if isinstance(_AttentionBackendRegistry._supports_context_parallel, dict):
-            _AttentionBackendRegistry._supports_context_parallel.pop(AttentionBackendName.NATIVE)
+            _AttentionBackendRegistry._supports_context_parallel.pop(attn_backend)
         else:
-            _AttentionBackendRegistry._supports_context_parallel.remove(
-                AttentionBackendName.NATIVE.value
-            )
+            _AttentionBackendRegistry._supports_context_parallel.remove(attn_backend.value)
+
+
+def _set_new_attn_backend(member: str, value: str):
+    # e.g., _set_new_attn_backend("_SDPA_CUDNN", "_sdpa_cudnn")
+    new_member = str.__new__(AttentionBackendName, value)
+    new_member._name_ = member
+    new_member._value_ = value
+    setattr(AttentionBackendName, member, new_member)
+    AttentionBackendName._member_map_[member] = new_member
+    AttentionBackendName._member_names_.append(member)
+    AttentionBackendName._value2member_map_[value] = new_member
+
+
+if _CACHE_DIT_ENABLE_CUSTOM_CP_NATIVE_ATTN_DISPATCH:
+    _ATTENTION_OPS_ALLOW_ATTN_MASK = [
+        "_native_attention_forward_op",
+        "_sdpa_cudnn_attention_forward_op",
+    ]
 
     # Re-define templated context parallel attention to support attn mask
     def _templated_context_parallel_attention_v2(
@@ -91,7 +100,7 @@ def _templated_context_parallel_attention_v2(
         if attn_mask is not None:
             # NOTE(DefTruth): Check if forward_op is native attention forward op
             forward_op_name = forward_op.__name__
-            if not forward_op_name == "_native_attention_forward_op":
+            if forward_op_name not in _ATTENTION_OPS_ALLOW_ATTN_MASK:
                 raise ValueError(
                     "Templated context parallel attention with attn_mask "
                     "is only supported for native attention backend, "
@@ -239,6 +248,9 @@ def _native_attention_backward_op(
 
         return grad_query, grad_key, grad_value
 
+    # Re-register the NATIVE attention backend to allow attn mask while using context parallelism
+    _registry_pop_attn_backend(AttentionBackendName.NATIVE)
+
     @_AttentionBackendRegistry.register(
         AttentionBackendName.NATIVE,
         constraints=[_check_device, _check_shape],
@@ -288,9 +300,130 @@ def _native_attention(
             )
         return out
 
+    logger.warning(
+        "Re-registered NATIVE attention backend to enable context parallelism "
+        "with attn mask. You can disable this behavior by exporting: "
+        "CACHE_DIT_ENABLE_CUSTOM_CP_NATIVE_ATTN_DISPATCH=0."
+    )
+
+    def _sdpa_cudnn_attention_forward_op(
+        ctx: torch.autograd.function.FunctionCtx,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+        dropout_p: float = 0.0,
+        is_causal: bool = False,
+        scale: Optional[float] = None,
+        enable_gqa: bool = False,
+        return_lse: bool = False,
+        _save_ctx: bool = True,
+        _parallel_config: Optional["ParallelConfig"] = None,
+    ):
+        # cuDNN attention via SDPA cannot return the log-sum-exp
+        if return_lse:
+            raise ValueError("cudnn attention with sdpa does not support return_lse=True")
+
+        # used for backward pass
+        if _save_ctx:
+            ctx.save_for_backward(query, key, value)
+            ctx.attn_mask = attn_mask
+            ctx.dropout_p = dropout_p
+            ctx.is_causal = is_causal
+            ctx.scale = scale
+            ctx.enable_gqa = enable_gqa
+
+        query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
+        with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.CUDNN_ATTENTION):
+            out = torch.nn.functional.scaled_dot_product_attention(
+                query=query,
+                key=key,
+                value=value,
+                attn_mask=attn_mask,
+                dropout_p=dropout_p,
+                is_causal=is_causal,
+                scale=scale,
+                enable_gqa=enable_gqa,
+            )
+        out = out.permute(0, 2, 1, 3)
+
+        return out
+
+    def _sdpa_cudnn_attention_backward_op(
+        ctx: torch.autograd.function.FunctionCtx,
+        grad_out: torch.Tensor,
+        *args,
+        **kwargs,
+    ):
+        raise NotImplementedError("Backward for cudnn attention with sdpa is not implemented yet.")
+
+    # Register the _sdpa_cudnn attention backend to allow attn mask while using context parallelism
+    _set_new_attn_backend("_SDPA_CUDNN", "_sdpa_cudnn")
+    assert hasattr(AttentionBackendName, "_SDPA_CUDNN")
+
+    @_AttentionBackendRegistry.register(
+        AttentionBackendName._SDPA_CUDNN,  # type: AttentionBackendName
+        constraints=[_check_device, _check_shape],
+        supports_context_parallel=True,
+    )
+    def _sdpa_cudnn_attention(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attn_mask: Optional[torch.Tensor] = None,
+        dropout_p: float = 0.0,
+        is_causal: bool = False,
+        scale: Optional[float] = None,
+        enable_gqa: bool = False,
+        return_lse: bool = False,
+        _parallel_config: Optional["ParallelConfig"] = None,
+    ) -> torch.Tensor:
+        lse = None
+        if _parallel_config is None and not return_lse:
+            query, key, value = (x.permute(0, 2, 1, 3).contiguous() for x in (query, key, value))
+            with torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.CUDNN_ATTENTION):
+                out = torch.nn.functional.scaled_dot_product_attention(
+                    query=query,
+                    key=key,
+                    value=value,
+                    attn_mask=attn_mask,
+                    dropout_p=dropout_p,
+                    is_causal=is_causal,
+                    scale=scale,
+                    enable_gqa=enable_gqa,
+                )
+            out = out.permute(0, 2, 1, 3)
+        else:
+            out = _templated_context_parallel_attention_v2(
+                query,
+                key,
+                value,
+                attn_mask,
+                dropout_p,
+                is_causal,
+                scale,
+                enable_gqa,
+                return_lse,
+                forward_op=_sdpa_cudnn_attention_forward_op,
+                backward_op=_sdpa_cudnn_attention_backward_op,
+                _parallel_config=_parallel_config,
+            )
+            if return_lse:
+                out, lse = out
+
+        return (out, lse) if return_lse else out
+
+    logger.info(
+        "Registered new attention backend: _SDPA_CUDNN, to enable "
+        "context parallelism with attn mask. You can disable it by exporting: "
+        "CACHE_DIT_ENABLE_CUSTOM_CP_NATIVE_ATTN_DISPATCH=0."
+    )
+
 else:
     from diffusers.models.attention_dispatch import (
         _native_attention,
     )  # noqa: F401
 
+    _sdpa_cudnn_attention = None  # type: ignore[assignment]
+
     logger.info("Native attention backend already supports context parallelism.")

src/cache_dit/parallelism/backends/native_diffusers/context_parallelism/cp_plan_zimage.py

Lines changed: 11 additions & 3 deletions
@@ -3,6 +3,7 @@
 from diffusers.models.modeling_utils import ModelMixin
 from diffusers import ZImageTransformer2DModel
 
+
 try:
     from diffusers.models._modeling_parallel import (
         ContextParallelInput,
@@ -51,9 +52,7 @@ def apply(
         # hooks in each block/layer in the initialization of DBCache.
         # Issue: https://github.com/vipshop/cache-dit/issues/498
         maybe_patch_cp_find_submodule_by_name()
-        # Otherwise, use the custom CP plan defined here, this maybe
-        # a little different from the native diffusers implementation
-        # for some models.
+        # TODO: Patch the rotary embedding function to avoid complex number ops
         n_noise_refiner_layers = len(transformer.noise_refiner)  # 2
         n_context_refiner_layers = len(transformer.context_refiner)  # 2
         # num_layers = len(transformer.layers)  # 30
@@ -93,3 +92,12 @@ def apply(
             # f"layers.{num_layers - 1}": ContextParallelOutput(gather_dim=1, expected_dims=3),
         }
         return _cp_plan
+
+
+# TODO: The original implementation uses complex numbers, which torch.compile does not support yet.
+# References:
+# - https://github.com/triple-Mu/Z-Image-TensorRT/blob/4efc5749e9a0d22344e6c4b8a09d2223dd0a7e17/step_by_step/2-remove-complex-op.py#L26C1-L36C25
+# - https://github.com/huggingface/diffusers/pull/12725
+
+
+# TODO: Support Async Ulysses QKV projection for Z-Image
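On the complex-number TODO above: the usual workaround described by the linked references is to carry `cos`/`sin` tensors instead of a complex `freqs_cis` and apply the rotation with real arithmetic, which torch.compile can trace. The sketch below only demonstrates that the two formulations agree on dummy tensors; the function names, shapes, and pairing convention are illustrative assumptions, not the patch cache-dit will ship.

import torch


def rope_complex(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor:
    # Complex-number formulation (what the TODO refers to): not torch.compile friendly.
    x_c = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    return torch.view_as_real(x_c * freqs_cis).flatten(-2).type_as(x)


def rope_real(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # Equivalent real-arithmetic form: rotate each (even, odd) pair by the same angle.
    x0, x1 = x.float().reshape(*x.shape[:-1], -1, 2).unbind(-1)
    out = torch.stack((x0 * cos - x1 * sin, x0 * sin + x1 * cos), dim=-1)
    return out.flatten(-2).type_as(x)


# Quick numerical check of the equivalence on dummy tensors.
x = torch.randn(1, 4, 8)    # (batch, seq, head_dim)
angles = torch.randn(4, 4)  # one angle per (position, pair)
freqs_cis = torch.polar(torch.ones_like(angles), angles)
torch.testing.assert_close(
    rope_complex(x, freqs_cis), rope_real(x, angles.cos(), angles.sin())
)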
