
Commit 97366d6

Bump up to v1.1.0 (#441)
* Bump up to v1.1.0
1 parent 3878a8e commit 97366d6

126 files changed: +953 -2187 lines changed


.github/workflows/issue.yml

Lines changed: 2 additions & 2 deletions

@@ -15,8 +15,8 @@ jobs:
           days-before-issue-stale: 360
           days-before-issue-close: 360
           stale-issue-label: "stale"
-          stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
-          close-issue-message: "This issue was closed because it has been inactive for 7 days since being marked as stale."
+          stale-issue-message: "This issue is stale because it has been open for 360 days with no activity."
+          close-issue-message: "This issue was closed because it has been inactive for 360 days since being marked as stale."
           days-before-pr-stale: -1
           days-before-pr-close: -1
           repo-token: ${{ secrets.GITHUB_TOKEN }}

.gitignore

Lines changed: 0 additions & 1 deletion

@@ -181,5 +181,4 @@ init_env.sh
 *.hdf5

 uv.lock
-
 CLAUDE.local.md

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

@@ -28,4 +28,4 @@ repos:
     hooks:
       - id: black-jupyter
         args:
-          - --line-length=80
+          - --line-length=100

LICENSE

Lines changed: 1 addition & 1 deletion

@@ -194,7 +194,7 @@ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

-http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,

MANIFEST.in

Lines changed: 1 addition & 2 deletions

@@ -1,5 +1,4 @@
 include MANIFEST.in
 include LICENSE
-include requirements.txt
 prune */__pycache__
-global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp *.mp4 *.png *.jpg assets docs examples tests .github tmp debug
+global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp *.mp4 *.png *.jpg *.jpeg assets docs examples tests .github tmp debug

README.md

Lines changed: 56 additions & 17 deletions

@@ -4,30 +4,21 @@
 <img src=https://github.com/vipshop/cache-dit/raw/main/assets/cache-dit-logo.png height="90" align="left">
 A Unified and Flexible Inference Engine with 🤗🎉<br>Hybrid Cache Acceleration and Parallelism for DiTs<br>
 <a href="https://pepy.tech/projects/cache-dit"><img src=https://static.pepy.tech/personalized-badge/cache-dit?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=BLUE&left_text=downloads></a>
-<img src=https://img.shields.io/badge/Release-v1.0.*-blue.svg >
+<img src=https://img.shields.io/badge/Release-v1.1.*-blue.svg >
 <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>
 <a href="https://hellogithub.com/repository/vipshop/cache-dit" target="_blank"><img src="https://api.hellogithub.com/v1/widgets/recommend.svg?rid=b8b03b3b32a449ea84cfc2b96cd384f3&claim_uid=ofSCbzTmdeQk3FD&theme=small" alt="Featured|HelloGitHub" /></a>
 <img src=https://img.shields.io/badge/Models-30+-orange.svg >
 </h2>
 </p>
-<img src=./assets/speedup_v4.png>
+<img src=https://github.com/vipshop/cache-dit/raw/main/assets/speedup_v4.png>
 </div>

-<!--
-<img src=https://img.shields.io/github/release/vipshop/cache-dit.svg >
-<img src=https://img.shields.io/github/license/vipshop/cache-dit.svg?color=blue >
-<a href="https://pepy.tech/projects/cache-dit"><img src=https://static.pepy.tech/personalized-badge/cache-dit?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=GREEN&left_text=downloads></a>
-<a href="https://pypi.org/project/cache-dit/"><img src=https://img.shields.io/pypi/dm/cache-dit.svg ></a>
-<img src=https://img.shields.io/github/stars/vipshop/cache-dit.svg?style=dark >
--->
-
 ## 🔥Highlight

-We are excited to announce that the **first API-stable version (v1.0.0)** of cache-dit has finally been released!
-**[cache-dit](https://github.com/vipshop/cache-dit)** is a **Unified** and **Flexible** Inference Engine for 🤗Diffusers, enabling acceleration with just ♥️**one line**♥️ of code. Key features: **Unified Cache APIs**, **Forward Pattern Matching**, **Automatic Block Adapter**, **DBCache**, **DBPrune**, **Hybrid TaylorSeer Calibrator**, **Hybrid Cache CFG**, **Context Parallelism**, **Tensor Parallelism**, **Torch Compile Compatible** and **🎉SOTA** performance.
+We are excited to announce that the 🎉[**v1.1.0**](https://github.com/vipshop/cache-dit/releases/tag/v1.1.0) version of cache-dit has finally been released! It brings **[🔥Context Parallelism](./docs/User_Guide.md/#️hybrid-context-parallelism)** and **[🔥Tensor Parallelism](./docs/User_Guide.md#️hybrid-tensor-parallelism)** to cache-dit, making it a Unified and Flexible Inference Engine for 🤗DiTs. Key features: **Unified Cache APIs**, **Forward Pattern Matching**, **Block Adapter**, **DBCache**, **DBPrune**, **Cache CFG**, **TaylorSeer**, **Context Parallelism**, **Tensor Parallelism** and **🎉SOTA** performance.

 ```bash
-pip3 install -U cache-dit # pip3 install git+https://github.com/vipshop/cache-dit.git
+pip3 install -U cache-dit # Also, pip3 install git+https://github.com/huggingface/diffusers.git (latest)
 ```
 You can install the stable release of cache-dit from PyPI, or the latest development version from GitHub. Then try ♥️ Cache Acceleration with just **one line** of code ~ ♥️
 ```python
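The body of that Python snippet is elided at the hunk boundary above. As a hedged illustration of the "one line" API the README references, here is a minimal sketch assuming the documented `cache_dit.enable_cache` entry point and a standard Diffusers pipeline (the model ID is only an example):

```python
# Minimal sketch, not part of this commit's diff. Assumes the public
# cache_dit.enable_cache() API and a Diffusers pipeline; the model ID
# is illustrative only.
import cache_dit
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev")

# The "one line" of code: enable hybrid cache acceleration (DBCache by default).
cache_dit.enable_cache(pipe)

# Inference runs unchanged; transformer block outputs are cached and
# reused across denoising steps.
image = pipe("a photo of a cat").images[0]
```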
@@ -51,7 +42,54 @@ You can install the stable release of cache-dit from PyPI, or the latest develop
 - **[🎉Hybrid Cache Acceleration](./docs/User_Guide.md#taylorseer-calibrator)**: Now supports hybrid **Block-wise Cache + Calibrator** schemes (e.g., DBCache or DBPrune + TaylorSeerCalibrator). DBCache or DBPrune acts as the **Indicator** to decide *when* to cache, while the Calibrator decides *how* to cache. More mainstream cache acceleration algorithms (e.g., FoCa) will be supported in the future, along with additional benchmarks—stay tuned for updates!
 - **[🤗Diffusers Ecosystem Integration](https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit)**: 🔥**cache-dit** has joined the Diffusers community ecosystem as the **first** DiT-specific cache acceleration framework! Check out the documentation here: <a href="https://huggingface.co/docs/diffusers/main/en/optimization/cache_dit"><img src=https://img.shields.io/badge/🤗Diffusers-ecosystem-yellow.svg ></a>

-![](./assets/clip-score-bench-v2.png)
+![](https://github.com/vipshop/cache-dit/raw/main/assets/clip-score-bench-v2.png)
+
+The comparison between **cache-dit** and other algorithms shows that, at speedup ratios (measured in TFLOPs) below 🎉**4x**, cache-dit achieves **SOTA** performance. Please refer to [📚Benchmarks](https://github.com/vipshop/cache-dit/tree/main/bench/) for more details.
+
+<div align="center">
+
+| Method | TFLOPs(↓) | SpeedUp(↑) | ImageReward(↑) | Clip Score(↑) |
+| --- | --- | --- | --- | --- |
+| [**FLUX.1**-dev]: 50 steps | 3726.87 | 1.00× | 0.9898 | 32.404 |
+| [**FLUX.1**-dev]: 60% steps | 2231.70 | 1.67× | 0.9663 | 32.312 |
+| Δ-DiT(N=2) | 2480.01 | 1.50× | 0.9444 | 32.273 |
+| Δ-DiT(N=3) | 1686.76 | 2.21× | 0.8721 | 32.102 |
+| [**FLUX.1**-dev]: 34% steps | 1264.63 | 3.13× | 0.9453 | 32.114 |
+| Chipmunk | 1505.87 | 2.47× | 0.9936 | 32.776 |
+| FORA(N=3) | 1320.07 | 2.82× | 0.9776 | 32.266 |
+| **[DBCache(S)](https://github.com/vipshop/cache-dit)** | 1400.08 | **2.66×** | **1.0065** | 32.838 |
+| DuCa(N=5) | 978.76 | 3.80× | 0.9955 | 32.241 |
+| TaylorSeer(N=4,O=2) | 1042.27 | 3.57× | 0.9857 | 32.413 |
+| **[DBCache(S)+TS](https://github.com/vipshop/cache-dit)** | 1153.05 | **3.23×** | **1.0221** | 32.819 |
+| **[DBCache(M)](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | 0.9997 | 32.849 |
+| **[DBCache(M)+TS](https://github.com/vipshop/cache-dit)** | 944.75 | **3.94×** | **1.0107** | 32.865 |
+| **[FoCa(N=5): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 893.54 | **4.16×** | 1.0029 | **32.948** |
+| [**FLUX.1**-dev]: 22% steps | 818.29 | 4.55× | 0.8183 | 31.772 |
+| FORA(N=7) | 670.14 | 5.55× | 0.7418 | 31.519 |
+| ToCa(N=12) | 644.70 | 5.77× | 0.7155 | 31.808 |
+| DuCa(N=10) | 606.91 | 6.13× | 0.8382 | 31.759 |
+| TeaCache(l=1.2) | 669.27 | 5.56× | 0.7394 | 31.704 |
+| TaylorSeer(N=7,O=2) | 670.44 | 5.54× | 0.9128 | 32.128 |
+| **[DBCache(F)](https://github.com/vipshop/cache-dit)** | 651.90 | **5.72×** | 0.9271 | 32.552 |
+| **[FoCa(N=8): arxiv.2508.16211](https://arxiv.org/pdf/2508.16211)** | 596.07 | 6.24× | 0.9502 | 32.706 |
+| **[DBCache(F)+TS](https://github.com/vipshop/cache-dit)** | 651.90 | **5.72×** | **0.9526** | 32.568 |
+| **[DBCache(U)+TS](https://github.com/vipshop/cache-dit)** | 505.47 | **7.37×** | 0.8645 | **32.719** |
+
+</div>
+
+🎉Surprisingly, **cache-dit** still works on **extremely few-step** distilled models such as **Qwen-Image-Lightning**: with the F16B16 config, the PSNR is 34.8 and the ImageReward is 1.26, maintaining relatively high precision.
+<div align="center">
+
+| Config | PSNR(↑) | Clip Score(↑) | ImageReward(↑) | TFLOPs(↓) | SpeedUp(↑) |
+|----------------------------|-----------|------------|--------------|----------|------------|
+| [**Lightning**]: 4 steps | INF | 35.5797 | 1.2630 | 274.33 | 1.00× |
+| F24B24_W2MC1_R0.8 | 36.3242 | 35.6224 | 1.2630 | 264.74 | 1.04× |
+| F16B16_W2MC1_R0.8 | 34.8163 | 35.6109 | 1.2614 | 244.25 | 1.12× |
+| F12B12_W2MC1_R0.8 | 33.8953 | 35.6535 | 1.2549 | 234.63 | 1.17× |
+| F8B8_W2MC1_R0.8 | 33.1374 | 35.7284 | 1.2517 | 224.29 | 1.22× |
+| F1B0_W2MC1_R0.8 | 31.8317 | 35.6651 | 1.2397 | 206.90 | 1.33× |
+
+</div>

 ## 🔥Supported DiTs
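For orientation, the SpeedUp column is the baseline's TFLOPs divided by the method's TFLOPs; for example, DBCache(M) gives 3726.87 / 944.75 ≈ 3.94×. The config tags in the second table plausibly decode as DBCache hyperparameters: F16B16_W2MC1_R0.8 would read as 16 first (Fn) and 16 last (Bn) compute blocks, 2 warmup steps, at most 1 continuous cached step, and a residual-diff threshold of 0.8. Below is a hedged sketch of such a hybrid DBCache + TaylorSeer setup; the config class names and keyword arguments are assumptions inferred from this naming scheme and from the bench flags in this commit, not quoted from the diff:

```python
# Hedged sketch of a hybrid "Block-wise Cache + Calibrator" setup.
# DBCacheConfig / TaylorSeerCalibratorConfig and their kwargs are assumed
# names based on the F16B16_W2MC1_R0.8 tag; consult the User_Guide for
# the authoritative API.
import cache_dit
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev")

cache_dit.enable_cache(
    pipe,
    # DBCache is the Indicator: it decides *when* a step can be cached.
    cache_config=cache_dit.DBCacheConfig(
        Fn_compute_blocks=16,           # F16: always compute the first 16 blocks
        Bn_compute_blocks=16,           # B16: always compute the last 16 blocks
        max_warmup_steps=2,             # W2: never cache during the first 2 steps
        max_continuous_cached_steps=1,  # MC1: at most 1 cached step in a row
        residual_diff_threshold=0.8,    # R0.8: cache only when the residual diff is below this
    ),
    # TaylorSeer is the Calibrator: it decides *how* to cache, extrapolating
    # cached residuals instead of reusing them verbatim.
    calibrator_config=cache_dit.TaylorSeerCalibratorConfig(taylorseer_order=2),
)
```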

@@ -217,14 +255,15 @@ For more advanced features such as **Unified Cache APIs**, **Forward Pattern Mat
 - [⚡️Hybrid Context Parallelism](./docs/User_Guide.md#context-parallelism)
 - [⚡️Hybrid Tensor Parallelism](./docs/User_Guide.md#tensor-parallelism)
 - [🤖Low-bits Quantization](./docs/User_Guide.md#quantization)
+- [🤖How to use FP8 Attention](./docs/User_Guide.md#fp8-attention)
 - [🛠Metrics Command Line](./docs/User_Guide.md#metrics-cli)
 - [⚙️Torch Compile](./docs/User_Guide.md#️torch-compile)
 - [📚API Documents](./docs/User_Guide.md#api-documentation)

 ## 👋Contribute
 <div id="contribute"></div>

-How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](https://github.com/vipshop/cache-dit/raw/main/CONTRIBUTE.md).
+How to contribute? Star ⭐️ this repo to support us or check [CONTRIBUTE.md](https://github.com/vipshop/cache-dit/raw/main/docs/CONTRIBUTE.md).

 <div align='center'>
 <a href="https://star-history.com/#vipshop/cache-dit&Date">

@@ -243,15 +282,15 @@ Here is a curated list of open-source projects integrating **CacheDiT**, includi

 ## ©️Acknowledgements

-Special thanks to vipshop's Computer Vision AI Team for supporting document, testing and production-level deployment of this project.
+Special thanks to vipshop's Computer Vision AI Team for supporting documentation, testing and production-level deployment of this project. We learned from the design of, and reused code from, the following projects: [🤗diffusers](https://huggingface.co/docs/diffusers), [ParaAttention](https://github.com/chengzeyi/ParaAttention), [xDiT](https://github.com/xdit-project/xDiT) and [TaylorSeer](https://github.com/Shenyi-Z/TaylorSeer).

 ## ©️Citations

 <div id="citations"></div>

 ```BibTeX
 @misc{cache-dit@2025,
-  title={cache-dit: A Unified and Flexible Inference Engine with Hybrid Cache Acceleration and Parallelism for Diffusers.},
+  title={cache-dit: A Unified and Flexible Inference Engine with Hybrid Cache Acceleration and Parallelism for DiTs.},
   url={https://github.com/vipshop/cache-dit.git},
   note={Open-source software available at https://github.com/vipshop/cache-dit.git},
   author={DefTruth, vipshop.com},

assets/speedup.png

-2.88 MB
Binary file not shown.

assets/speedup_v2.png

-3.53 MB
Binary file not shown.

assets/speedup_v3.png

-8.6 MB
Binary file not shown.

bench/bench.py

Lines changed: 8 additions & 24 deletions

@@ -113,9 +113,7 @@ def init_flux_pipe(args: argparse.Namespace) -> FluxPipeline:
         cache_dit.set_compile_configs()
     else:
         torch._dynamo.config.recompile_limit = 96  # default is 8
-        torch._dynamo.config.accumulated_recompile_limit = (
-            2048  # default is 256
-        )
+        torch._dynamo.config.accumulated_recompile_limit = 2048  # default is 256
     if not args.compile_all:
         logger.warning(
             "Only compile transformer blocks not the whole model "

@@ -134,9 +132,7 @@ def init_flux_pipe(args: argparse.Namespace) -> FluxPipeline:
     return pipe


-def gen_flux_image(
-    args: argparse.Namespace, pipe: FluxPipeline, prompt: str = None
-) -> Image.Image:
+def gen_flux_image(args: argparse.Namespace, pipe: FluxPipeline, prompt: str = None) -> Image.Image:
     assert prompt is not None
     image = pipe(
         prompt,

@@ -163,30 +159,20 @@ def get_args() -> argparse.ArgumentParser:
     parser.add_argument("--max-warmup-steps", "--w", type=int, default=8)
     parser.add_argument("--warmup-interval", type=int, default=1)
     parser.add_argument("--max-cached-steps", "--mc", type=int, default=-1)
-    parser.add_argument(
-        "--max-continuous-cached-steps", "--mcc", type=int, default=-1
-    )
-    parser.add_argument(
-        "--disable-block-adapter", action="store_true", default=False
-    )
+    parser.add_argument("--max-continuous-cached-steps", "--mcc", type=int, default=-1)
+    parser.add_argument("--disable-block-adapter", action="store_true", default=False)
     # Compile & FP8
     parser.add_argument("--compile", action="store_true", default=False)
     parser.add_argument("--inductor-flags", action="store_true", default=False)
     parser.add_argument("--compile-all", action="store_true", default=False)
     parser.add_argument("--quantize", "--q", action="store_true", default=False)
     # Test data
-    parser.add_argument(
-        "--save-dir", type=str, default="./tmp/DrawBench200_Default"
-    )
-    parser.add_argument(
-        "--prompt-file", type=str, default="./prompts/DrawBench200.txt"
-    )
+    parser.add_argument("--save-dir", type=str, default="./tmp/DrawBench200_Default")
+    parser.add_argument("--prompt-file", type=str, default="./prompts/DrawBench200.txt")
     parser.add_argument("--width", type=int, default=1024, help="Image width")
     parser.add_argument("--height", type=int, default=1024, help="Image height")
     parser.add_argument("--test-num", type=int, default=None)
-    parser.add_argument(
-        "--cal-flops", "--flops", action="store_true", default=False
-    )
+    parser.add_argument("--cal-flops", "--flops", action="store_true", default=False)
     return parser.parse_args()

@@ -208,9 +194,7 @@ def main():
     logger.info(f"Loaded {len(prompts)} prompts from: {args.prompt_file}")

     all_times = []
-    perf_tag = (
-        f"C{int(args.compile)}_Q{int(args.quantize)}_{cache_dit.strify(pipe)}"
-    )
+    perf_tag = f"C{int(args.compile)}_Q{int(args.quantize)}_{cache_dit.strify(pipe)}"
     save_dir = os.path.join(args.save_dir, perf_tag)
     os.makedirs(save_dir, exist_ok=True)