Support diffusers models

kaix-nv · kaix-nv · commit a8e41b7f326a · 2026-04-02T21:12:21.000-07:00
Signed-off-by: Kai Xu <kaix@nvidia.com>
diff --git a/modelopt/torch/kernels/__init__.py b/modelopt/torch/kernels/__init__.py
@@ -39,11 +39,8 @@
 
         register_triton_attention = _register_triton_attention
 
-from .hf_vsa_attention import register_vsa_attention  # noqa: E402
-
 __all__ = [
     "IS_AVAILABLE",
     "attention",
     "register_triton_attention",
-    "register_vsa_attention",
 ]
diff --git a/modelopt/torch/kernels/hf_vsa_attention.py b/modelopt/torch/kernels/hf_vsa_attention.py
diff --git a/modelopt/torch/sparsity/attention_sparsity/conversion.py b/modelopt/torch/sparsity/attention_sparsity/conversion.py
@@ -35,14 +35,14 @@
 def _set_attn_implementation(model: nn.Module, config: SparseAttentionConfig) -> None:
     """Set the correct attn_implementation based on the sparse attention method/backend.
 
-    - ``method="vsa"``: registers the VSA kernel with HF and sets
-      ``attn_implementation="modelopt_vsa"``.  HF calls VSA directly via the
-      registered attention function — no monkey-patching needed.
     - ``backend="triton"``: registers the Triton kernel with HF and sets
       ``attn_implementation="modelopt_triton"``.
     - ``backend="pytorch"`` (default): sets ``attn_implementation="eager"`` so that
       softmax-patching methods (e.g. skip-softmax) work correctly.  FlashAttention
       and SDPA bypass ``F.softmax``, so eager is required.
+    - ``method="vsa"``: no-op. VSA patches ``F.scaled_dot_product_attention``
+      directly in ``SparseAttentionModule.forward()``, so no ``attn_implementation``
+      change is needed.
 
     This is called automatically during ``mtsa.sparsify()`` so users never need
     to manually set ``attn_implementation``.
@@ -55,31 +55,23 @@ def _set_attn_implementation(model: nn.Module, config: SparseAttentionConfig) ->
     methods = {v.get("method") for v in layer_cfgs}
     backends = {v.get("backend", "pytorch") for v in layer_cfgs}
 
-    # VSA uses attn_implementation="modelopt_vsa", which is incompatible
-    # with softmax-patching methods that need "eager" or triton methods that need
-    # "modelopt_triton". Reject mixed configs.
+    # VSA patches F.scaled_dot_product_attention directly — it does not change
+    # attn_implementation.  Skip the rest for VSA-only configs.
+    if methods == {"vsa"}:
+        return
+
+    # Reject mixed VSA + non-VSA configs (VSA swaps F.scaled_dot_product_attention during
+    # each wrapped module's forward; softmax-patching methods need attn_implementation="eager").
     non_vsa_methods = methods - {"vsa"}
     if "vsa" in methods and non_vsa_methods:
         raise ValueError(
             f"Cannot mix VSA with other sparse attention methods ({non_vsa_methods}). "
-            f"VSA sets attn_implementation='modelopt_vsa' model-wide, which is incompatible "
+            f"VSA patches F.scaled_dot_product_attention, which is incompatible "
             f"with softmax-patching or triton methods."
         )
 
     model_config = getattr(model, "config", None)
 
-    if "vsa" in methods:
-        from .kernels import register_vsa_attention
-
-        if not register_vsa_attention():
-            raise RuntimeError(
-                "Failed to register VSA attention with HuggingFace. "
-                "Check that your transformers version supports ALL_ATTENTION_FUNCTIONS."
-            )
-        if model_config is not None:
-            model_config._attn_implementation = "modelopt_vsa"
-        return
-
     if "triton" in backends and "pytorch" in backends:
         raise ValueError(
             "Mixed backends ('triton' and 'pytorch') in the same model are not "
diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py b/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py
@@ -15,16 +15,10 @@
 
 """Re-exports from modelopt.torch.kernels for backward compatibility."""
 
-from modelopt.torch.kernels import (
-    IS_AVAILABLE,
-    attention,
-    register_triton_attention,
-    register_vsa_attention,
-)
+from modelopt.torch.kernels import IS_AVAILABLE, attention, register_triton_attention
 
 __all__ = [
     "IS_AVAILABLE",
     "attention",
     "register_triton_attention",
-    "register_vsa_attention",
 ]
diff --git a/modelopt/torch/sparsity/attention_sparsity/methods/registry.py b/modelopt/torch/sparsity/attention_sparsity/methods/registry.py
@@ -38,6 +38,29 @@ def __init__(self):
         # Target sparsity ratio per phase: {"prefill": 0.5, "decode": 0.5}
         self.target_sparse_ratio: dict[str, float] | None = None
 
+    def forward_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        **kwargs,
+    ) -> tuple[torch.Tensor, dict]:
+        """Run a full attention replacement (e.g. VSA) on the given Q/K/V.
+
+        Base stub: always raises. Override in methods that replace attention outright.
+
+        Args:
+            query: Query tensor [batch, heads, seq_len, dim].
+            key: Key tensor [batch, heads, seq_len, dim].
+            value: Value tensor [batch, heads, seq_len, dim].
+            **kwargs: Method-specific arguments.
+
+        Returns:
+            Tuple of (attention_output, stats_dict).
+        """
+        owner = type(self).__name__
+        raise NotImplementedError(f"{owner} does not implement forward_attention.")
+
     def calculate_sparsity(
         self,
         attention_scores: torch.Tensor,
diff --git a/modelopt/torch/sparsity/attention_sparsity/methods/vsa.py b/modelopt/torch/sparsity/attention_sparsity/methods/vsa.py
@@ -22,18 +22,10 @@
 Uses the optimized Triton kernel from fastvideo_kernel.
 
 Integration:
-    For HuggingFace models, VSA registers as ``attn_implementation="modelopt_vsa"``
-    via ``ALL_ATTENTION_FUNCTIONS`` (same pattern as the Triton FA backend).  HF
-    dispatches Q, K, V directly to the VSA kernel — no monkey-patching needed.
-    This is set up automatically by ``mtsa.sparsify()``.
-
-    For non-HF models, call ``forward_attention(q, k, v, ...)`` directly::
-
-        for module in model.modules():
-            if isinstance(module, SparseAttentionModule):
-                vsa = module._sparse_method_instance
-                vsa.set_video_shape((T, H, W))
-                output, stats = vsa.forward_attention(q, k, v)
+    After ``mtsa.sparsify(model, VSA_DEFAULT)``, each attention layer's
+    ``F.scaled_dot_product_attention`` call is intercepted and replaced by the VSA
+    kernel.  Cross-attention (Q/K have different seq_len) is automatically skipped.
+    This works with HF transformers and diffusers layers that route attention through SDPA.
 """
 
 import math
@@ -302,11 +294,11 @@ def forward_attention(
         # Kernel operates on tiled tensors in [batch, heads, padded_seq, dim] format
         try:
             from fastvideo_kernel import video_sparse_attn as triton_vsa_kernel
-        except ModuleNotFoundError:
-            raise ModuleNotFoundError(
+        except ImportError as e:
+            raise ImportError(
                 "VSA requires the 'fastvideo_kernel' package for its Triton sparse attention "
-                "kernel. Install it with: pip install fastvideo_kernel"
-            ) from None
+                f"kernel. Install it with: pip install fastvideo_kernel (error: {e})"
+            ) from e
         output_tiled = triton_vsa_kernel(
             query_tiled,
             key_tiled,
diff --git a/modelopt/torch/sparsity/attention_sparsity/sparse_attention.py b/modelopt/torch/sparsity/attention_sparsity/sparse_attention.py
@@ -176,21 +176,22 @@ def _setup(self):
     def forward(self, *args, **kwargs):
         """Forward with selected sparse attention method.
 
-        - VSA: dispatched by HF via ``ALL_ATTENTION_FUNCTIONS["modelopt_vsa"]``
-          inside the original forward — just pass through.
+        - VSA: patches ``F.scaled_dot_product_attention`` to intercept the SDPA
+          call inside the original forward. Cross-attention is skipped.
         - Softmax-patching methods (e.g. ``flash_skip_softmax``): use the
           context manager path below.
         """
         # Pass through if sparse attention is disabled
         if not self.is_enabled:
             return super().forward(*args, **kwargs)
 
-        # VSA is dispatched by HF via ALL_ATTENTION_FUNCTIONS["modelopt_vsa"]
-        # inside the original forward — pass through and let HF call our
-        # registered vsa_attention_forward().
+        # VSA: patch F.scaled_dot_product_attention so the VSA kernel intercepts
+        # the SDPA call inside the original forward. This works for diffusers models
+        # since SDPA is the common attention primitive.
+        # Only self-attention is replaced. Cross-attention (Q/K have different seq_len) is skipped.
         if self._method == "vsa":
-            result = super().forward(*args, **kwargs)
-            # Collect stats set by vsa_attention_forward
+            result = self._forward_with_vsa_sdpa_patch(args, kwargs)
+
             if self._stats_manager is not None and self._last_stats is not None:
                 self._stats_manager.collect(self._last_stats)
                 self._last_stats = None
@@ -210,6 +211,52 @@ def forward(self, *args, **kwargs):
 
         return result
 
+    def _forward_with_vsa_sdpa_patch(self, args, kwargs):
+        """Run forward with F.scaled_dot_product_attention patched for VSA.
+
+        Self-attention calls (equal Q/K seq_len) run the VSA kernel on Q/K/V only, dropping
+        mask/dropout/causal args; others use the original SDPA.  Warns if SDPA is never called.
+        """
+        import torch.nn.functional as F
+
+        from modelopt.torch.quantization.utils import replace_function
+
+        vsa = self._sparse_method_instance
+        original_sdpa = F.scaled_dot_product_attention
+        self._vsa_sdpa_called = False
+
+        def _patched_sdpa(query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, **kw):
+            self._vsa_sdpa_called = True
+            # Skip VSA for cross-attention: SDPA's sequence axis is dim -2 for any input rank.
+            if query.shape[-2] != key.shape[-2]:
+                return original_sdpa(
+                    query,
+                    key,
+                    value,
+                    attn_mask=attn_mask,
+                    dropout_p=dropout_p,
+                    is_causal=is_causal,
+                    **kw,
+                )
+            output, stats = vsa.forward_attention(query, key, value)
+            self._last_stats = stats
+            return output
+
+        with replace_function(F, "scaled_dot_product_attention", _patched_sdpa):
+            result = super().forward(*args, **kwargs)
+
+        if not self._vsa_sdpa_called:
+            import warnings
+
+            warnings.warn(
+                f"VSA: F.scaled_dot_product_attention was not called during "
+                f"{type(self).__name__}.forward(). The attention layer may use a "
+                f"custom kernel that bypasses SDPA. VSA had no effect on this layer.",
+                stacklevel=2,
+            )
+
+        return result
+
     def _get_sparse_context(self):
         """Get the context manager for applying sparse attention.
 
diff --git a/tests/unit/torch/sparsity/attention_sparsity/test_vsa.py b/tests/unit/torch/sparsity/attention_sparsity/test_vsa.py