Commit df80a0f

[5/n] Add VSA for Video Diffusion (#1053)
### What does this PR do?

Type of change: New feature.

Adds Video Sparse Attention (VSA) as a new sparse attention method in ModelOpt. VSA implements a two-branch architecture (compression + sparse) using 3D block tiling for video diffusion models. VSA integrates with HuggingFace models by registering as `attn_implementation="modelopt_vsa"` in HF's `ALL_ATTENTION_FUNCTIONS`, the same pattern used by the existing Triton FA backend. After `sparsify()`, HF dispatches Q, K, V directly to the VSA kernel with no monkey-patching needed.

### Usage

```python
# Load any HuggingFace model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B")

# Define VSA config
vsa_config = {
    "sparse_cfg": {
        "*attn*": {
            "method": "vsa",
            "block_size_3d": (4, 4, 4),  # 3D tile dimensions (T, H, W)
            "top_k_ratio": 0.5,          # keep top 50% of blocks
            "video_shape": (8, 16, 16),  # video dims after patchification
            "enable": True,
        },
        "default": {"enable": False},
    },
}

# Apply — registers modelopt_vsa with HF automatically
model = mtsa.sparsify(model, vsa_config)
```

### Testing

`pytest tests/unit/torch/sparsity/attention_sparsity/test_vsa.py`

### Before your PR is "*Ready for review*"

Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, `torch.load(..., weights_only=False)`, `pickle`, etc.).

- Is this change backward compatible?: ✅ / ❌ / N/A
- If you copied code from any other sources or added a new PIP dependency, did you follow guidance in `CONTRIBUTING.md`?: ✅ / ❌ / N/A
- Did you write any new necessary tests?: ✅ / ❌ / N/A
- Did you update the [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ✅ / ❌ / N/A

## Summary by CodeRabbit

* **New Features**
  * Added Video Sparse Attention (VSA) as a new sparse attention method for attention optimization.
  * Introduced VSA configuration support with customizable block sizes and sparsity ratios.
  * Integrated VSA with HuggingFace transformers for enhanced model compatibility.

Signed-off-by: Kai Xu <kaix@nvidia.com>
1 parent 18ce04f commit df80a0f

11 files changed: 1248 additions & 26 deletions


.pre-commit-config.yaml

Lines changed: 1 addition & 0 deletions

```diff
@@ -101,6 +101,7 @@ repos:
           examples/speculative_decoding/server_generate.py|
           experimental/dms/models/qwen3/configuration_qwen3_dms.py|
           experimental/dms/models/qwen3/modeling_qwen3_dms.py|
+          modelopt/torch/sparsity/attention_sparsity/methods/vsa_utils.py|
           )$
 
       # Default hook for Apache 2.0 in c/c++/cuda files
```

CHANGELOG.rst

Lines changed: 1 addition & 0 deletions

```diff
@@ -9,6 +9,7 @@ NVIDIA Model Optimizer Changelog
 - Added iterator interface using CalibrationDataReader in ONNX quantization workflow.
 - Add N:M sparse softmax support to the Triton flash attention kernel (``modelopt.torch.kernels.triton_fa``). See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
 - Add skip-softmax skipping to the Triton flash attention kernel (``modelopt.torch.kernels.triton_fa``). See `examples/llm_sparsity/attention_sparsity/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_sparsity/attention_sparsity>`_ for usage.
+- Add Video Sparse Attention (VSA) method for video diffusion models (``modelopt.torch.sparsity.attention_sparsity``). VSA uses 3D block tiling with a two-branch architecture for attention speedup.
 - Enable PTQ workflow for the Step3.5-Flash MoE model with NVFP4 W4A4 + FP8 KV cache quantization. See `modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml <https://github.com/NVIDIA/Model-Optimizer/blob/main/modelopt_recipes/models/Step3.5-Flash/nvfp4-mlp-only.yaml>`_ for more details.
 - Add support for vLLM fakequant reload using ModelOpt state for HF models. See `examples/vllm_serve/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/vllm_serve#load-qatptq-model-and-serve-in-vllm-wip>`_ for more details.
 - [Early Testing] Add Claude Code PTQ skill (``.claude/skills/ptq/``) for agent-assisted post-training quantization. The skill guides the agent through environment detection, model support checking, format selection, and execution via the launcher or manual SLURM/Docker/bare GPU paths. Includes handling for unlisted models with custom module patching. This feature is in early testing — use with caution.
```

modelopt/torch/kernels/__init__.py

Lines changed: 2 additions & 3 deletions

```diff
@@ -35,10 +35,9 @@
 
     attention = _attention
     IS_AVAILABLE = True
-    with import_plugin("transformers"):
-        from .hf_triton_attention import register_triton_attention as _register_triton_attention
+    from .hf_triton_attention import register_triton_attention as _register_triton_attention
 
-        register_triton_attention = _register_triton_attention
+    register_triton_attention = _register_triton_attention
 
 __all__ = [
     "IS_AVAILABLE",
```

modelopt/torch/sparsity/attention_sparsity/config.py

Lines changed: 134 additions & 1 deletion

```diff
@@ -498,7 +498,7 @@ class FlashSkipSoftmaxConfig(SparseAttentionConfig):
 
 
 # Configuration with RULER calibration
-# Note: threshold field is omitted - calibration determines dynamic threshold λ = a / length
+# Note: threshold field is omitted - calibration determines dynamic threshold lambda = a / length
 # The calibrated threshold adapts to sequence length for optimal sparsity
 SKIP_SOFTMAX_CALIB = {
     "sparse_cfg": {
@@ -521,6 +521,136 @@ class FlashSkipSoftmaxConfig(SparseAttentionConfig):
 }
 
 
+class VSAAttributeConfig(ModeloptBaseConfig):
+    """Video Sparse Attention (VSA) attribute configuration.
+
+    VSA uses a two-branch architecture optimized for video diffusion models:
+    1. Compression branch: Block-averaged coarse attention
+    2. Sparse branch: Top-K block selection for fine-grained attention
+    """
+
+    method: str = ModeloptField(
+        default="vsa",
+        title="Sparse attention method.",
+        description="Must be 'vsa' for Video Sparse Attention.",
+    )
+
+    enable: bool = ModeloptField(
+        default=True,
+        title="Enable VSA.",
+        description="If True, enables Video Sparse Attention. If False, bypasses sparsity.",
+    )
+
+    block_size_3d: tuple[int, int, int] | list[int] = ModeloptField(
+        default=(4, 4, 4),
+        title="3D block size.",
+        description=(
+            "Video block dimensions (T, H, W) for spatial-temporal tiling. "
+            "Default (4, 4, 4) creates 64-token blocks."
+        ),
+    )
+
+    top_k_ratio: float = ModeloptField(
+        default=0.5,
+        title="Top-K selection ratio.",
+        description=(
+            "Ratio of blocks to keep in sparse branch (0.0 to 1.0). "
+            "Lower values mean more sparsity. Default 0.5 keeps 50% of blocks."
+        ),
+    )
+
+    video_shape: tuple[int, int, int] | list[int] | None = ModeloptField(
+        default=None,
+        title="Video shape.",
+        description=(
+            "Video dimensions (T, H, W) after patchification. "
+            "Required for VSA — set via config or call set_video_shape() at runtime."
+        ),
+    )
+
+    collect_stats: bool = ModeloptField(
+        default=False,
+        title="Collect statistics.",
+        description="Whether to collect sparsity statistics during forward pass.",
+    )
+
+    @field_validator("method")
+    @classmethod
+    def validate_vsa_method(cls, v):
+        """Validate method is 'vsa'."""
+        if v != "vsa":
+            raise ValueError(f"VSAAttributeConfig method must be 'vsa', got '{v}'")
+        return v
+
+    @field_validator("block_size_3d")
+    @classmethod
+    def validate_block_size_3d(cls, v):
+        """Validate 3D block size."""
+        if isinstance(v, list):
+            v = tuple(v)
+        if len(v) != 3:
+            raise ValueError(f"block_size_3d must have 3 elements (T, H, W), got {len(v)}")
+        if any(x <= 0 for x in v):
+            raise ValueError(f"All block_size_3d values must be positive, got {v}")
+        return v
+
+    @field_validator("top_k_ratio")
+    @classmethod
+    def validate_top_k_ratio(cls, v):
+        """Validate top-K ratio is in valid range."""
+        if not 0.0 < v <= 1.0:
+            raise ValueError(f"top_k_ratio must be in range (0, 1], got {v}")
+        return v
+
+    @field_validator("video_shape")
+    @classmethod
+    def validate_video_shape(cls, v):
+        """Validate video shape if provided."""
+        if v is None:
+            return v
+        if isinstance(v, list):
+            v = tuple(v)
+        if len(v) != 3:
+            raise ValueError(f"video_shape must have 3 elements (T, H, W), got {len(v)}")
+        if any(x <= 0 for x in v):
+            raise ValueError(f"All video_shape values must be positive, got {v}")
+        return v
+
+
+class VSAConfig(SparseAttentionConfig):
+    """Configuration for Video Sparse Attention optimization."""
+
+    sparse_cfg: SparseAttentionCfgType = ModeloptField(
+        default={
+            "*attn*": {
+                "method": "vsa",
+                "block_size_3d": (4, 4, 4),
+                "top_k_ratio": 0.5,
+                "enable": True,
+            },
+            "default": {"enable": False},
+        },
+        title="VSA configuration",
+        description="Pattern-based configuration for Video Sparse Attention.",
+        validate_default=True,
+    )
+
+
+# Pre-defined VSA Configuration for video diffusion models.
+# Pattern "*attn*" matches attention module names by convention.
+VSA_DEFAULT = {
+    "sparse_cfg": {
+        "*attn*": {
+            "method": "vsa",
+            "block_size_3d": (4, 4, 4),
+            "top_k_ratio": 0.5,
+            "enable": True,
+        },
+        "default": {"enable": False},
+    },
+}
+
+
 # Default N:M sparse softmax configuration
 SPARSE_SOFTMAX_DEFAULT = {
     "sparse_cfg": {
@@ -557,10 +687,13 @@ class FlashSkipSoftmaxConfig(SparseAttentionConfig):
     "SKIP_SOFTMAX_DEFAULT",
     "SKIP_SOFTMAX_TRITON_DEFAULT",
     "SPARSE_SOFTMAX_DEFAULT",
+    "VSA_DEFAULT",
     "CalibrationConfig",
     "FlashSkipSoftmaxConfig",
     "SparseAttentionAttributeConfig",
     "SparseAttentionCfgType",
     "SparseAttentionConfig",
     "SparseAttributeConfig",
+    "VSAAttributeConfig",
+    "VSAConfig",
 ]
```
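Taken together with the PR description, a typical way to consume these additions would look like the sketch below. It assumes `mtsa` aliases `modelopt.torch.sparsity.attention_sparsity` (as the PR's usage snippet implies) and uses a hypothetical loader as a stand-in for a real video diffusion transformer; `VSA_DEFAULT` and the `video_shape` field come from the diff above.

```python
import copy

import modelopt.torch.sparsity.attention_sparsity as mtsa  # assumed alias, per the PR description
from modelopt.torch.sparsity.attention_sparsity.config import VSA_DEFAULT

# Start from the preset and fill in the one field it leaves unset: the post-patchification
# video dimensions (T, H, W). Without it, VSA cannot map flattened tokens onto 3D blocks.
vsa_cfg = copy.deepcopy(VSA_DEFAULT)
vsa_cfg["sparse_cfg"]["*attn*"]["video_shape"] = (8, 16, 16)
vsa_cfg["sparse_cfg"]["*attn*"]["top_k_ratio"] = 0.25  # keep only the top 25% of blocks

model = load_video_diffusion_transformer()  # hypothetical loader for an HF video DiT
model = mtsa.sparsify(model, vsa_cfg)
```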

modelopt/torch/sparsity/attention_sparsity/conversion.py

Lines changed: 26 additions & 12 deletions

```diff
@@ -33,42 +33,57 @@
 
 
 def _set_attn_implementation(model: nn.Module, config: SparseAttentionConfig) -> None:
-    """Set the correct attn_implementation based on the sparse attention backend.
+    """Set the correct attn_implementation based on the sparse attention method/backend.
 
     - ``backend="triton"``: registers the Triton kernel with HF and sets
       ``attn_implementation="modelopt_triton"``.
     - ``backend="pytorch"`` (default): sets ``attn_implementation="eager"`` so that
       softmax-patching methods (e.g. skip-softmax) work correctly. FlashAttention
       and SDPA bypass ``F.softmax``, so eager is required.
+    - ``method="vsa"``: no-op. VSA patches ``F.scaled_dot_product_attention``
+      directly in ``SparseAttentionModule.forward()``, so no ``attn_implementation``
+      change is needed.
 
     This is called automatically during ``mtsa.sparsify()`` so users never need
     to manually set ``attn_implementation``.
     """
     sparse_cfg = config.sparse_cfg if hasattr(config, "sparse_cfg") else {}
 
-    # Collect backends only from layer configs (identified by having a "method" key).
+    # Collect methods and backends only from layer configs (identified by having a "method" key).
     # Other dict entries (e.g. "calibration") are not layer configs.
-    backends = {
-        v.get("backend", "pytorch")
-        for v in sparse_cfg.values()
-        if isinstance(v, dict) and "method" in v
-    }
+    layer_cfgs = [v for v in sparse_cfg.values() if isinstance(v, dict) and "method" in v]
+    methods = {v.get("method") for v in layer_cfgs}
+    backends = {v.get("backend", "pytorch") for v in layer_cfgs}
+
+    # VSA patches F.scaled_dot_product_attention directly — it does not change
+    # attn_implementation. Skip the rest for VSA-only configs.
+    if methods == {"vsa"}:
+        return
+
+    # Reject mixed VSA + non-VSA configs (VSA patches SDPA globally per-module,
+    # while softmax-patching methods need attn_implementation="eager").
+    non_vsa_methods = methods - {"vsa"}
+    if "vsa" in methods and non_vsa_methods:
+        raise ValueError(
+            f"Cannot mix VSA with other sparse attention methods ({non_vsa_methods}). "
+            f"VSA patches F.scaled_dot_product_attention, which is incompatible "
+            f"with softmax-patching or triton methods."
+        )
+
+    model_config = getattr(model, "config", None)
 
     if "triton" in backends and "pytorch" in backends:
         raise ValueError(
             "Mixed backends ('triton' and 'pytorch') in the same model are not "
             "supported. All sparse attention layers must use the same backend."
         )
 
-    model_config = getattr(model, "config", None)
-
     if "triton" in backends:
         from .kernels import register_triton_attention
 
         if register_triton_attention is None:
             raise ImportError(
-                "Triton backend requires 'triton' and 'transformers' packages. "
-                "Install with: pip install triton transformers"
+                "Triton backend requires 'triton' package. Install with: pip install triton"
             )
         if not register_triton_attention():
             raise RuntimeError(
@@ -83,7 +98,6 @@ def _set_attn_implementation(model: nn.Module, config: SparseAttentionConfig) ->
             model_config._attn_implementation = "modelopt_triton"
         elif model_config is not None:
             # For pytorch backend, force eager for softmax patching.
-            # TODO: Add the triton backend support for skip-softmax.
             model_config._attn_implementation = "eager"
 
 
```
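To make the new dispatch concrete, the sketch below shows the three kinds of `sparse_cfg` the function now distinguishes. The dict shapes follow the presets in `config.py`; the non-VSA method name is illustrative, and in practice these configs are passed to `mtsa.sparsify()` rather than to this helper directly.

```python
# Case 1: VSA only. _set_attn_implementation returns early; SDPA is patched per module instead.
vsa_only = {
    "sparse_cfg": {
        "*attn*": {"method": "vsa", "enable": True},
        "default": {"enable": False},
    }
}

# Case 2: a softmax-patching method on the default PyTorch backend.
# attn_implementation is forced to "eager" so the patched softmax is actually hit.
skip_softmax_only = {
    "sparse_cfg": {
        "*attn*": {"method": "flash_skip_softmax", "enable": True},  # illustrative method name
    }
}

# Case 3: VSA mixed with a softmax-patching method. Rejected with a ValueError, since the two
# interception points (SDPA patching vs. eager softmax) cannot coexist in one model.
mixed = {
    "sparse_cfg": {
        "*self_attn*": {"method": "vsa", "enable": True},
        "*cross_attn*": {"method": "flash_skip_softmax", "enable": True},
    }
}
```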

modelopt/torch/sparsity/attention_sparsity/methods/__init__.py

Lines changed: 2 additions & 1 deletion

```diff
@@ -24,4 +24,5 @@
 ]
 
 # Import method implementations to trigger registration
-from . import flash_skip_softmax, triton_skip_softmax, triton_sparse_softmax
+# Note: vsa imports no external deps at module level; fastvideo_kernel is imported lazily at runtime.
+from . import flash_skip_softmax, triton_skip_softmax, triton_sparse_softmax, vsa
```
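The lazy-import note keeps `from . import vsa` importable on machines without the kernel available. A minimal sketch of that pattern follows, assuming a deferred import inside a helper; `video_sparse_attn` is a hypothetical entry-point name, and only the `fastvideo_kernel` name comes from the comment above.

```python
def _load_vsa_kernel():
    """Defer the heavy kernel import until attention is actually computed.

    Keeping this out of module scope means registering the VSA method never
    requires the kernel to be installed; only running it does.
    """
    try:
        from fastvideo_kernel import video_sparse_attn  # hypothetical entry point
    except ImportError as err:
        raise ImportError("VSA needs the fastvideo kernel available at runtime.") from err
    return video_sparse_attn
```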

modelopt/torch/sparsity/attention_sparsity/methods/registry.py

Lines changed: 25 additions & 0 deletions

```diff
@@ -37,6 +37,31 @@ def __init__(self):
         self.calibration_params: dict[str, dict[str, float]] | None = None
         # Target sparsity ratio per phase: {"prefill": 0.5, "decode": 0.5}
         self.target_sparse_ratio: dict[str, float] | None = None
+        # Video shape for VSA (T, H, W). None for non-VSA methods.
+        self.video_shape: tuple[int, int, int] | None = None
+
+    def forward_attention(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        **kwargs,
+    ) -> tuple[torch.Tensor, dict]:
+        """Compute full attention replacement (e.g. VSA).
+
+        Default: raises NotImplementedError. Override for methods that replace
+        the entire attention computation rather than patching softmax.
+
+        Args:
+            query: Query tensor [batch, heads, seq_len, dim].
+            key: Key tensor [batch, heads, seq_len, dim].
+            value: Value tensor [batch, heads, seq_len, dim].
+            **kwargs: Method-specific arguments.
+
+        Returns:
+            Tuple of (attention_output, stats_dict).
+        """
+        raise NotImplementedError(f"{type(self).__name__} does not implement forward_attention.")
 
     def calculate_sparsity(
         self,
```