diff --git a/expose.h b/expose.h
index 0847392aef0..16558cf7dc4 100644
--- a/expose.h
+++ b/expose.h
@@ -6,7 +6,6 @@ const int images_max = 8;
 const int audio_max = 4;
 const int logprobs_max = 10;
 const int overridekv_max = 16;
-const int lora_filenames_max = 4;
 
 // match kobold's sampler list and order
 enum samplers
@@ -189,8 +188,9 @@ struct sd_load_model_inputs
     const char * clip1_filename = nullptr;
     const char * clip2_filename = nullptr;
     const char * vae_filename = nullptr;
-    const char * lora_filenames[lora_filenames_max] = {};
-    const float lora_multiplier = 1.0f;
+    const int lora_len = 0;
+    const char ** lora_filenames = nullptr;
+    const float * lora_multipliers = nullptr;
     const int lora_apply_mode = 0;
     const char * photomaker_filename = nullptr;
     const char * upscaler_filename = nullptr;
@@ -227,6 +227,8 @@ struct sd_generation_inputs
     const bool circular_x = false;
     const bool circular_y = false;
     const bool upscale = false;
+    const int lora_len = 0;
+    const float * lora_multipliers = nullptr;
 };
 struct sd_generation_outputs
 {
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 2c4071c4c62..3d131a8ce6c 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -4465,7 +4465,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                             {
                                 skipdecodelater = true;
                                 //decode until nearly done, then snapshot and decode the last 64
-                                std::vector<std::vector<gpt_vocab::id>> parts = split_big_vector_in_two(embd,64);
+                                std::vector<std::vector<gpt_vocab::id>> parts = split_big_vector(embd,64);
                                 int temp_past = n_past;
                                 evalres = true;
                                 for(int p=0;p<parts.size();++p)
@@ -4477,11 +4477,6 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
                                     std::vector<gpt_vocab::id> chunk = parts[p];
                                     kcpp_embd_batch smallbatch = kcpp_embd_batch(chunk, temp_past, use_mrope, false);
                                     decode_status = llama_decode(llama_ctx_v4, smallbatch.batch);
-                                    if(p==0 && decode_status==1)
-                                    {
-                                        skipdecodelater = false;
-                                        break; //big pp failed
-                                    }
                                     evalres = (evalres && (decode_status==0));
                                     temp_past += chunk.size();
                                 }
diff --git a/koboldcpp.py b/koboldcpp.py
index d2da96236ab..29fb771278c 100755
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -89,6 +89,7 @@
 embeddingsmodelpath = "" #if empty, not initialized
 musicllmmodelpath = "" #if empty, not initialized
 musicdiffusionmodelpath = "" #if empty, not initialized
+imglorainfo = []
 maxctx = 8192
 maxhordectx = 0 #set to whatever maxctx is if 0
 maxhordelen = 1024
@@ -320,8 +321,9 @@ class sd_load_model_inputs(ctypes.Structure):
                 ("clip1_filename", ctypes.c_char_p),
                 ("clip2_filename", ctypes.c_char_p),
                 ("vae_filename", ctypes.c_char_p),
-                ("lora_filenames", ctypes.c_char_p * lora_filenames_max),
-                ("lora_multiplier", ctypes.c_float),
+                ("lora_len", ctypes.c_int),
+                ("lora_filenames", ctypes.POINTER(ctypes.c_char_p)),
+                ("lora_multipliers", ctypes.POINTER(ctypes.c_float)),
                 ("lora_apply_mode", ctypes.c_int),
                 ("photomaker_filename", ctypes.c_char_p),
                 ("upscaler_filename", ctypes.c_char_p),
@@ -356,7 +358,9 @@ class sd_generation_inputs(ctypes.Structure):
                 ("remove_limits", ctypes.c_bool),
                 ("circular_x", ctypes.c_bool),
                 ("circular_y", ctypes.c_bool),
-                ("upscale", ctypes.c_bool)]
+                ("upscale", ctypes.c_bool),
+                ("lora_len", ctypes.c_int),
+                ("lora_multipliers", ctypes.POINTER(ctypes.c_float))]
 
 class sd_generation_outputs(ctypes.Structure):
     _fields_ = [("status", ctypes.c_int),
@@ -1994,30 +1998,38 @@ def sd_load_model(model_filename,vae_filename,lora_filenames,t5xxl_filename,clip
     inputs.taesd = True if args.sdvaeauto else False
     inputs.tiled_vae_threshold = args.sdtiledvae
     inputs.vae_filename = vae_filename.encode("UTF-8")
-    for n in range(lora_filenames_max):
-        if n >= len(lora_filenames):
-            inputs.lora_filenames[n] = "".encode("UTF-8")
-        else:
-            inputs.lora_filenames[n] = lora_filenames[n].encode("UTF-8")
-
-    inputs.lora_multiplier = args.sdloramult
     inputs.t5xxl_filename = t5xxl_filename.encode("UTF-8")
     inputs.clip1_filename = clip1_filename.encode("UTF-8")
     inputs.clip2_filename = clip2_filename.encode("UTF-8")
     inputs.photomaker_filename = photomaker_filename.encode("UTF-8")
     inputs.upscaler_filename = upscaler_filename.encode("UTF-8")
+
+    lora_filenames = [l.encode("UTF-8") for l in lora_filenames[:lora_filenames_max] if l]
+    lora_len = len(lora_filenames)
+    lora_multipliers = args.sdloramult[:lora_len]
+    if len(lora_multipliers) < lora_len:
+        missing = lora_len - len(lora_multipliers)
+        if len(lora_multipliers) == 1:
+            # previous behavior: all get the same weight
+            lora_multipliers.extend(lora_multipliers * missing)
+        else:
+            lora_multipliers.extend([0.] * missing)
+    inputs.lora_len = lora_len
+    inputs.lora_filenames = (ctypes.c_char_p * lora_len)(*lora_filenames)
+    inputs.lora_multipliers = (ctypes.c_float * lora_len)(*lora_multipliers)
+    # auto if no zero-weight lora, dynamic otherwise
+    inputs.lora_apply_mode = 3 if 0. in inputs.lora_multipliers else 0
+
     inputs.img_hard_limit = args.sdclamped
     inputs.img_soft_limit = args.sdclampedsoft
-    inputs.lora_apply_mode = 0 #auto for now
     inputs = set_backend_props(inputs)
     ret = handle.sd_load_model(inputs)
     return ret
 
-def sd_oai_tranform_params(genparams):
-    size = genparams.get('size', "512x512")
-    if size and size!="":
-        pattern = r'^\D*(\d+)x(\d+)$'
-        match = re.fullmatch(pattern, size)
+def sd_oai_transform_params(genparams):
+    size = genparams.get('size') or ''
+    pattern = r'^\D*(\d+)x(\d+)$'
+    match = re.fullmatch(pattern, size)
     if match:
         width = int(match.group(1))
         height = int(match.group(2))
@@ -2111,6 +2123,84 @@ def sd_upscale(genparams):
         data_main = ret.data.decode("UTF-8","ignore")
     return data_main
 
+def sanitize_lora_multipliers(sdloramult):  # normalize a LoRA multiplier setting into a list of tryparsefloat results
+    if sdloramult is None:
+        sdloramult = [1.0] * lora_filenames_max  # unset: lora_filenames_max entries of 1.0
+    elif not isinstance(sdloramult, list):
+        sdloramult = [sdloramult]  # a lone value becomes a one-element list
+    sdloramult = [tryparsefloat(m, 0.) for m in sdloramult]  # 0. is handed to tryparsefloat as its second argument (presumably the fallback - confirm)
+    return sdloramult
+
+def prepare_lora_multipliers(request_list):
+
+    orig_multipliers = [lora[3] for lora in imglorainfo]
+    dynamic = 0. in orig_multipliers
+    if not dynamic:
+        return orig_multipliers
+
+    req_by_path = {}
+    for r in request_list:
+        if not isinstance(r, dict):
+            continue
+        multiplier = tryparsefloat(r.get('multiplier'), 0.)
+        path = r.get('path')
+        if path and isinstance(path, str):
+            req_by_path[path] = req_by_path.get(path, 0.) + multiplier
+
+    result = []
+    for i, (fullpath, name, path, origmul) in enumerate(imglorainfo):
+        multiplier = orig_multipliers[i]
+        if multiplier == 0. and path in req_by_path:
+            multiplier = req_by_path[path]
+        result.append(multiplier)
+
+    return result
+
+def extract_loras_from_prompt(prompt):
+    # Pull <lora:name:multiplier> tags out of prompt; returns (prompt without the parsed tags, list of tag dicts).
+    pattern = r'<lora:([^:>]+):([^>]+)>'
+    lora_data = []
+
+    matches = list(re.finditer(pattern, prompt))  # collected up front, before prompt is edited below
+
+    for match in matches:
+        raw_path = match.group(1)
+        raw_mul = match.group(2)
+        try:
+            mul = float(raw_mul)
+        except ValueError:
+            continue  # non-numeric multiplier: the tag is not recorded and stays in the prompt
+
+        is_high_noise = False
+        prefix = "|high_noise|"
+        if raw_path.startswith(prefix):
+            raw_path = raw_path[len(prefix):]  # drop the marker, keep the rest as the name
+            is_high_noise = True
+
+        lora_data.append({
+            'name': raw_path,
+            'multiplier': mul,
+            'is_high_noise': is_high_noise,
+            })
+
+        prompt = prompt.replace(match.group(0), "", 1)  # removes the first occurrence of this tag's text
+
+    return prompt, lora_data
+
+def lora_map_name_to_path(request_list):  # turn name-keyed LoRA requests into path-keyed ones
+    name2path = {}
+    for _, name, path, _ in imglorainfo:
+        name2path[name] = path
+    result = []
+    for req in request_list:
+        out = dict(req)  # copy, so the caller's dict is not modified
+        name = out.pop('name')  # raises KeyError when a request has no 'name' key
+        path = name2path.get(name)
+        if path:  # names not found in imglorainfo are left out of the result
+            out['path'] = path
+            result.append(out)
+    return result
+
 def sd_generate(genparams):
     global maxctx, args, currentusergenkey, totalgens, pendingabortkey, chatcompl_adapter
 
@@ -2209,6 +2299,11 @@ def sd_generate(genparams):
     inputs.circular_x = tryparseint(adapter_obj.get("circular_x", genparams.get("circular_x",0)),0)
     inputs.circular_y = tryparseint(adapter_obj.get("circular_y", genparams.get("circular_y",0)),0)
     inputs.upscale = (True if tryparseint(genparams.get("enable_hr", 0),0) else False)
+
+    lora_multipliers = prepare_lora_multipliers(genparams.get("lora", []))
+    inputs.lora_len = len(lora_multipliers)
+    inputs.lora_multipliers = (ctypes.c_float * inputs.lora_len)(*lora_multipliers)
+
     ret = handle.sd_generate(inputs)
     data_main = ""
     data_extra = ""
@@ -4098,6 +4193,9 @@ def do_GET(self):
         elif clean_path.endswith('/v1/models') or clean_path=='/models':
             response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":int(time.time()),"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode())
 
+        elif clean_path.endswith('/sdapi/v1/loras'):
+            response_body = (json.dumps([{'name': name, 'path': path} for _, name, path, multiplier in imglorainfo if multiplier == 0.])).encode()
+
         elif clean_path.endswith('/sdapi/v1/upscalers'):
             if args.sdupscaler:
                 response_body = (json.dumps([{"name":"ESRGAN_4x","model_name":"ESRGAN_4x","model_path":"upscaler_model.gguf","model_url":None,"scale":4}]).encode())
@@ -5106,7 +5204,13 @@ def do_POST(self):
                             lastgeneratedcomfyimg = b''
                             genparams = sd_comfyui_tranform_params(genparams)
                         elif is_oai_imggen:
-                            genparams = sd_oai_tranform_params(genparams)
+                            genparams = sd_oai_transform_params(genparams)
+                        if not genparams.get('lora'):
+                            # process <lora:name:multiplier> syntax
+                            prompt, loras = extract_loras_from_prompt(genparams['prompt'])
+                            if loras:
+                                genparams['prompt'] = prompt
+                                genparams['lora'] = lora_map_name_to_path(loras)
                         gen = sd_generate(genparams)
                         gendat = gen["data"]
                         genanim = gen["animated"]
@@ -6936,9 +7040,10 @@ def export_vars():
         args.sdquant = sd_quant_option(sd_quant_var.get())
         if sd_lora_var.get() != "":
             args.sdlora = [item.strip() for item in sd_lora_var.get().split("|") if item]
-            args.sdloramult = float(sd_loramult_var.get())
         else:
             args.sdlora = None
+        # XXX the user may have used '|' since it's used for the LoRAs
+        args.sdloramult = sanitize_lora_multipliers(re.split(r"[ |]+", sd_lora_var.get()))
 
         if gen_defaults_var.get() != "":
             args.gendefaults = gen_defaults_var.get()
@@ -7197,7 +7302,7 @@ def import_vars(dict):
                 sd_lora_var.set(dict["sdlora"] if ("sdlora" in dict and dict["sdlora"]) else "")
         else:
             sd_lora_var.set("")
-        sd_loramult_var.set(str(dict["sdloramult"]) if ("sdloramult" in dict and dict["sdloramult"]) else "1.0")
+        sd_loramult_var.set(" ".join(f"{n:.3f}".rstrip('0') for n in dict.get("sdloramult", [])))
         gen_defaults_var.set(dict["gendefaults"] if ("gendefaults" in dict and dict["gendefaults"]) else "")
         gen_defaults_overwrite_var.set(1 if "gendefaultsoverwrite" in dict and dict["gendefaultsoverwrite"] else 0)
 
@@ -7641,6 +7746,8 @@ def convert_invalid_args(args):
         dict["noflashattention"] = not dict["flashattention"]
     if "sdlora" in dict and isinstance(dict["sdlora"], str):
         dict["sdlora"] = ([dict["sdlora"]] if dict["sdlora"] else None)
+    if "sdloramult" in dict:
+        dict["sdloramult"] = sanitize_lora_multipliers(dict["sdloramult"])
     return args
 
 def setuptunnel(global_memory, has_sd):
@@ -8325,6 +8432,30 @@ def main(launch_args, default_args):
                 print("Press ENTER key to exit.", flush=True)
                 input()
 
+
+def mk_lora_info(imgloras, multipliers):
+    # (full path, name, name+extension, can change multiplier)
+    # XXX for each LoRA, sdapi needs a name and a path; we could use
+    # the full filename as a path, but we don't know if we can expose it
+    used_lora_names = set()
+    result = []
+    for i, lora_path in enumerate(imgloras):
+        multiplier = 0. if i >= len(multipliers) else multipliers[i]
+        lora_file = os.path.basename(lora_path)
+        lora_name, lora_ext = os.path.splitext(lora_file)
+        # ensure unique names
+        i = 1
+        mapped_name = lora_name
+        while True:
+            if mapped_name not in used_lora_names:
+                result.append((lora_path, mapped_name, mapped_name + lora_ext, multiplier))
+                used_lora_names.add(mapped_name)
+                break
+            i += 1
+            mapped_name = lora_name + '_' + str(i)
+    return result
+
+
 def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
     global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, embedded_kailite_gz, embedded_kcpp_docs_gz, embedded_kcpp_sdui_gz, embedded_lcpp_ui_gz, embedded_musicui, embedded_musicui_gz, start_time, exitcounter, global_memory, using_gui_launcher
     global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, musicdiffusionmodelpath, musicllmmodelpath, friendlyembeddingsmodelname, has_audio_support, has_vision_support, cached_chat_template
@@ -8770,6 +8901,9 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False):
                         imgloras.append(os.path.abspath(curr))
                     else:
                         print(f"Missing SD LORA model file {curr}...")
+            global imglorainfo
+            args.sdloramult = sanitize_lora_multipliers(args.sdloramult)
+            imglorainfo = mk_lora_info(imgloras, args.sdloramult)
             if args.sdvae:
                 if os.path.exists(args.sdvae):
                     imgvae = os.path.abspath(args.sdvae)
@@ -9365,7 +9499,7 @@ def range_checker(arg: str):
     sdparsergrouplora = sdparsergroup.add_mutually_exclusive_group()
     sdparsergrouplora.add_argument("--sdquant",  metavar=('[quantization level 0/1/2]'), help="If specified, loads the model quantized to save memory. 0=off, 1=q8, 2=q4", type=int, choices=[0,1,2], nargs="?", const=2, default=0)
     sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify image generation LoRAs safetensors models to be applied. Multiple LoRAs are accepted.", nargs='+')
-    sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LoRA model to be applied.", type=float, default=1.0)
+    sdparsergroup.add_argument("--sdloramult", metavar=('[amounts]'), help="Multipliers for the image LoRA model to be applied.", type=float, nargs='+', default=[1.0])
     sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold)
     whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands')
     whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="")
diff --git a/otherarch/sdcpp/anima.hpp b/otherarch/sdcpp/anima.hpp
new file mode 100644
index 00000000000..191a096d40f
--- /dev/null
+++ b/otherarch/sdcpp/anima.hpp
@@ -0,0 +1,686 @@
+#ifndef __ANIMA_HPP__
+#define __ANIMA_HPP__
+
+#include <cmath>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "common_block.hpp"
+#include "flux.hpp"
+#include "rope.hpp"
+
+namespace Anima {
+    constexpr int ANIMA_GRAPH_SIZE = 65536;
+
+    __STATIC_INLINE__ struct ggml_tensor* apply_gate(struct ggml_context* ctx,  // returns ggml_mul of x and the reshaped gate
+                                                     struct ggml_tensor* x,
+                                                     struct ggml_tensor* gate) {
+        gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]);  // [N, 1, C]
+        return ggml_mul(ctx, x, gate);  // the inserted size-1 dimension presumably lets gate broadcast over x's middle axis - confirm
+    }
+
+    struct XEmbedder : public GGMLBlock {  // wraps a single Linear registered under "proj.1"
+    public:
+        XEmbedder(int64_t in_dim, int64_t out_dim) {
+            blocks["proj.1"] = std::make_shared<Linear>(in_dim, out_dim, false);  // third argument false: presumably no bias - confirm
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {  // returns proj.1 applied to x
+            auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj.1"]);
+            return proj->forward(ctx, x);
+        }
+    };
+
+    struct TimestepEmbedder : public GGMLBlock {  // two Linear layers with a SiLU in between
+    public:
+        TimestepEmbedder(int64_t in_dim, int64_t out_dim) {
+            blocks["1.linear_1"] = std::make_shared<Linear>(in_dim, in_dim, false);  // keeps the width at in_dim
+            blocks["1.linear_2"] = std::make_shared<Linear>(in_dim, out_dim, false);  // maps in_dim to out_dim
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {  // returns linear_2(silu(linear_1(x)))
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1.linear_1"]);
+            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["1.linear_2"]);
+
+            x = linear_1->forward(ctx, x);
+            x = ggml_silu_inplace(ctx->ggml_ctx, x);  // in-place variant of the activation
+            x = linear_2->forward(ctx, x);
+            return x;
+        }
+    };
+
+    struct AdaLayerNormZero : public GGMLBlock {  // LayerNorm modulated by shift/scale derived from a timestep embedding; also yields a gate
+    protected:
+        int64_t in_features;  // stored by the constructor; not read by the code visible in this block
+
+    public:
+        AdaLayerNormZero(int64_t in_features, int64_t hidden_features = 256)
+            : in_features(in_features) {
+            blocks["norm"] = std::make_shared<LayerNorm>(in_features, 1e-6f, false, false);
+            blocks["1"]    = std::make_shared<Linear>(in_features, hidden_features, false);  // in_features -> hidden_features
+            blocks["2"]    = std::make_shared<Linear>(hidden_features, 3 * in_features, false);  // widened to 3x: shift, scale, gate
+        }
+
+        std::pair<struct ggml_tensor*, struct ggml_tensor*> forward(GGMLRunnerContext* ctx,  // returns {modulated hidden_states, gate}
+                                                                    struct ggml_tensor* hidden_states,
+                                                                    struct ggml_tensor* embedded_timestep,
+                                                                    struct ggml_tensor* temb = nullptr) {
+            auto norm     = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
+            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
+
+            auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
+            emb      = linear_1->forward(ctx, emb);
+            emb      = linear_2->forward(ctx, emb);  // [N, 3*C]
+
+            if (temb != nullptr) {  // optional extra embedding, added before the split
+                emb = ggml_add(ctx->ggml_ctx, emb, temb);
+            }
+
+            auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 3, 0);  // three chunks, used below in this order
+            auto shift      = emb_chunks[0];
+            auto scale      = emb_chunks[1];
+            auto gate       = emb_chunks[2];
+
+            auto x = norm->forward(ctx, hidden_states);
+            x      = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
+
+            return {x, gate};  // the gate is handed back unapplied
+        }
+    };
+
+    struct AdaLayerNorm : public GGMLBlock {  // LayerNorm modulated by shift/scale derived from a timestep embedding (no gate)
+    protected:
+        int64_t embedding_dim;  // set from in_features; sizes the temb view in forward
+
+    public:
+        AdaLayerNorm(int64_t in_features, int64_t hidden_features = 256)
+            : embedding_dim(in_features) {
+            blocks["norm"] = std::make_shared<LayerNorm>(in_features, 1e-6f, false, false);
+            blocks["1"]    = std::make_shared<Linear>(in_features, hidden_features, false);  // in_features -> hidden_features
+            blocks["2"]    = std::make_shared<Linear>(hidden_features, 2 * in_features, false);  // widened to 2x: shift, scale
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,  // returns the modulated, normalized hidden_states
+                                    struct ggml_tensor* hidden_states,
+                                    struct ggml_tensor* embedded_timestep,
+                                    struct ggml_tensor* temb = nullptr) {
+            auto norm     = std::dynamic_pointer_cast<LayerNorm>(blocks["norm"]);
+            auto linear_1 = std::dynamic_pointer_cast<Linear>(blocks["1"]);
+            auto linear_2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
+
+            auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep);
+            emb      = linear_1->forward(ctx, emb);
+            emb      = linear_2->forward(ctx, emb);  // [N, 2*C]
+
+            if (temb != nullptr) {
+                auto temb_2c = ggml_view_2d(ctx->ggml_ctx, temb, 2 * embedding_dim, temb->ne[1], temb->nb[1], 0);  // first 2*embedding_dim elements of each temb row (offset 0)
+                emb          = ggml_add(ctx->ggml_ctx, emb, temb_2c);
+            }
+
+            auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0);  // two chunks, used below in this order
+            auto shift      = emb_chunks[0];
+            auto scale      = emb_chunks[1];
+
+            auto x = norm->forward(ctx, hidden_states);
+            x      = Flux::modulate(ctx->ggml_ctx, x, shift, scale);
+            return x;
+        }
+    };
+
+    struct AnimaAttention : public GGMLBlock {  // attention block: q/k/v projections, RMSNorm on q and k, optional RoPE, output projection
+    protected:
+        int64_t num_heads;
+        int64_t head_dim;
+        std::string out_proj_name;  // key under which the output projection is registered in blocks
+
+    public:
+        AnimaAttention(int64_t query_dim,
+                       int64_t context_dim,
+                       int64_t num_heads,
+                       int64_t head_dim,
+                       const std::string& out_proj_name = "output_proj")
+            : num_heads(num_heads), head_dim(head_dim), out_proj_name(out_proj_name) {
+            int64_t inner_dim = num_heads * head_dim;
+
+            blocks["q_proj"]            = std::make_shared<Linear>(query_dim, inner_dim, false);
+            blocks["k_proj"]            = std::make_shared<Linear>(context_dim, inner_dim, false);  // keys and values are projected from context_dim
+            blocks["v_proj"]            = std::make_shared<Linear>(context_dim, inner_dim, false);
+            blocks["q_norm"]            = std::make_shared<RMSNorm>(head_dim, 1e-6f);  // norms are sized to head_dim
+            blocks["k_norm"]            = std::make_shared<RMSNorm>(head_dim, 1e-6f);
+            blocks[this->out_proj_name] = std::make_shared<Linear>(inner_dim, query_dim, false);
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,  // returns the output projection of the attention result
+                                    struct ggml_tensor* hidden_states,
+                                    struct ggml_tensor* encoder_hidden_states = nullptr,
+                                    struct ggml_tensor* pe_q                  = nullptr,
+                                    struct ggml_tensor* pe_k                  = nullptr) {
+            if (encoder_hidden_states == nullptr) {  // no context given: keys/values come from hidden_states
+                encoder_hidden_states = hidden_states;
+            }
+
+            auto q_proj   = std::dynamic_pointer_cast<Linear>(blocks["q_proj"]);
+            auto k_proj   = std::dynamic_pointer_cast<Linear>(blocks["k_proj"]);
+            auto v_proj   = std::dynamic_pointer_cast<Linear>(blocks["v_proj"]);
+            auto q_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["q_norm"]);
+            auto k_norm   = std::dynamic_pointer_cast<RMSNorm>(blocks["k_norm"]);
+            auto out_proj = std::dynamic_pointer_cast<Linear>(blocks[out_proj_name]);
+
+            auto q = q_proj->forward(ctx, hidden_states);
+            auto k = k_proj->forward(ctx, encoder_hidden_states);
+            auto v = v_proj->forward(ctx, encoder_hidden_states);
+
+            int64_t N   = q->ne[2];
+            int64_t L_q = q->ne[1];
+            int64_t L_k = k->ne[1];
+
+            auto q4 = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, L_q, N);  // [N, L_q, H, D]
+            auto k4 = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, L_k, N);  // [N, L_k, H, D]
+            auto v4 = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, L_k, N);  // [N, L_k, H, D]
+
+            q4 = q_norm->forward(ctx, q4);  // normalization is applied on the per-head layout
+            k4 = k_norm->forward(ctx, k4);
+
+            struct ggml_tensor* attn_out = nullptr;
+            if (pe_q != nullptr || pe_k != nullptr) {  // at least one positional table given: take the RoPE path
+                if (pe_q == nullptr) {  // a missing table falls back to the other one
+                    pe_q = pe_k;
+                }
+                if (pe_k == nullptr) {
+                    pe_k = pe_q;
+                }
+                auto q_rope = Rope::apply_rope(ctx->ggml_ctx, q4, pe_q, false);
+                auto k_rope = Rope::apply_rope(ctx->ggml_ctx, k4, pe_k, false);
+                attn_out    = ggml_ext_attention_ext(ctx->ggml_ctx,
+                                                     ctx->backend,
+                                                     q_rope,
+                                                     k_rope,
+                                                     v4,  // the 4-D value tensor is used on this path
+                                                     num_heads,
+                                                     nullptr,
+                                                     true,  // NOTE(review): true here, false in the other branch - presumably a skip-reshape flag; confirm against ggml_ext_attention_ext
+                                                     ctx->flash_attn_enabled);
+            } else {  // no positional tables: q/k are flattened back to 3-D and the unreshaped v is used
+                auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N);
+                auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N);
+                attn_out    = ggml_ext_attention_ext(ctx->ggml_ctx,
+                                                     ctx->backend,
+                                                     q_flat,
+                                                     k_flat,
+                                                     v,
+                                                     num_heads,
+                                                     nullptr,
+                                                     false,
+                                                     ctx->flash_attn_enabled);
+            }
+
+            return out_proj->forward(ctx, attn_out);
+        }
+    };
+
+    struct AnimaMLP : public GGMLBlock {  // feed-forward block: layer1, GELU, layer2
+    public:
+        AnimaMLP(int64_t dim, int64_t hidden_dim) {
+            blocks["layer1"] = std::make_shared<Linear>(dim, hidden_dim, false);  // dim -> hidden_dim
+            blocks["layer2"] = std::make_shared<Linear>(hidden_dim, dim, false);  // hidden_dim -> dim
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {  // returns layer2(gelu(layer1(x)))
+            auto layer1 = std::dynamic_pointer_cast<Linear>(blocks["layer1"]);
+            auto layer2 = std::dynamic_pointer_cast<Linear>(blocks["layer2"]);
+
+            x = layer1->forward(ctx, x);
+            x = ggml_ext_gelu(ctx->ggml_ctx, x, true);  // trailing true: presumably selects the in-place variant - confirm
+            x = layer2->forward(ctx, x);
+            return x;
+        }
+    };
+
+    struct AdapterMLP : public GGMLBlock {  // same layer/GELU/layer shape as AnimaMLP, with keys "0"/"2" and true as the Linear's third argument
+    public:
+        AdapterMLP(int64_t dim, int64_t hidden_dim) {
+            blocks["0"] = std::make_shared<Linear>(dim, hidden_dim, true);  // dim -> hidden_dim
+            blocks["2"] = std::make_shared<Linear>(hidden_dim, dim, true);  // hidden_dim -> dim
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {  // returns layer2(gelu(layer0(x)))
+            auto layer0 = std::dynamic_pointer_cast<Linear>(blocks["0"]);
+            auto layer2 = std::dynamic_pointer_cast<Linear>(blocks["2"]);
+
+            x = layer0->forward(ctx, x);
+            x = ggml_ext_gelu(ctx->ggml_ctx, x, true);  // trailing true: presumably selects the in-place variant - confirm
+            x = layer2->forward(ctx, x);
+            return x;
+        }
+    };
+
+    struct LLMAdapterBlock : public GGMLBlock {  // one adapter layer: self-attention, cross-attention and MLP, each behind an RMSNorm with a residual add
+    public:
+        LLMAdapterBlock(int64_t model_dim = 1024, int64_t source_dim = 1024, int64_t num_heads = 16, int64_t head_dim = 64) {
+            blocks["norm_self_attn"]  = std::make_shared<RMSNorm>(model_dim, 1e-6f);
+            blocks["self_attn"]       = std::make_shared<AnimaAttention>(model_dim, model_dim, num_heads, head_dim, "o_proj");
+            blocks["norm_cross_attn"] = std::make_shared<RMSNorm>(model_dim, 1e-6f);
+            blocks["cross_attn"]      = std::make_shared<AnimaAttention>(model_dim, source_dim, num_heads, head_dim, "o_proj");  // keys/values are projected from source_dim
+            blocks["norm_mlp"]        = std::make_shared<RMSNorm>(model_dim, 1e-6f);
+            blocks["mlp"]             = std::make_shared<AdapterMLP>(model_dim, model_dim * 4);  // hidden width is 4x model_dim
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,  // returns x after the three residual sub-layers
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* context,
+                                    struct ggml_tensor* target_pe,
+                                    struct ggml_tensor* context_pe) {
+            auto norm_self_attn  = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_self_attn"]);
+            auto self_attn       = std::dynamic_pointer_cast<AnimaAttention>(blocks["self_attn"]);
+            auto norm_cross_attn = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_cross_attn"]);
+            auto cross_attn      = std::dynamic_pointer_cast<AnimaAttention>(blocks["cross_attn"]);
+            auto norm_mlp        = std::dynamic_pointer_cast<RMSNorm>(blocks["norm_mlp"]);
+            auto mlp             = std::dynamic_pointer_cast<AdapterMLP>(blocks["mlp"]);
+
+            auto h = norm_self_attn->forward(ctx, x);
+            h      = self_attn->forward(ctx, h, nullptr, target_pe, target_pe);  // no context tensor: attends over h itself, target_pe on both sides
+            x      = ggml_add(ctx->ggml_ctx, x, h);
+
+            h = norm_cross_attn->forward(ctx, x);
+            h = cross_attn->forward(ctx, h, context, target_pe, context_pe);  // queries use target_pe, keys use context_pe
+            x = ggml_add(ctx->ggml_ctx, x, h);
+
+            h = norm_mlp->forward(ctx, x);
+            h = mlp->forward(ctx, h);
+            x = ggml_add(ctx->ggml_ctx, x, h);
+
+            return x;
+        }
+    };
+
+    struct LLMAdapter : public GGMLBlock {  // embeds target token ids, runs them through num_layers LLMAdapterBlocks against the source states, then projects and normalizes
+    protected:
+        int num_layers;  // number of "blocks.<i>" entries created and iterated
+
+    public:
+        LLMAdapter(int64_t source_dim = 1024,
+                   int64_t target_dim = 1024,
+                   int64_t model_dim  = 1024,
+                   int num_layers     = 6,
+                   int num_heads      = 16)
+            : num_layers(num_layers) {
+            int64_t head_dim = model_dim / num_heads;  // integer division
+
+            blocks["embed"] = std::make_shared<Embedding>(32128, target_dim);  // 32128 is hard-coded; presumably the target vocabulary size - confirm
+            for (int i = 0; i < num_layers; i++) {
+                blocks["blocks." + std::to_string(i)] =
+                    std::make_shared<LLMAdapterBlock>(model_dim, source_dim, num_heads, head_dim);
+            }
+            blocks["out_proj"] = std::make_shared<Linear>(model_dim, target_dim, true);
+            blocks["norm"]     = std::make_shared<RMSNorm>(target_dim, 1e-6f);
+        }
+
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,  // returns norm(out_proj(x)) after the adapter blocks
+                                    struct ggml_tensor* source_hidden_states,
+                                    struct ggml_tensor* target_input_ids,
+                                    struct ggml_tensor* target_pe,
+                                    struct ggml_tensor* source_pe) {
+            GGML_ASSERT(target_input_ids != nullptr);
+            if (ggml_n_dims(target_input_ids) == 1) {  // 1-D ids get a second dimension of size 1
+                target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1);
+            }
+
+            auto embed    = std::dynamic_pointer_cast<Embedding>(blocks["embed"]);
+            auto out_proj = std::dynamic_pointer_cast<Linear>(blocks["out_proj"]);
+            auto norm     = std::dynamic_pointer_cast<RMSNorm>(blocks["norm"]);
+
+            auto x = embed->forward(ctx, target_input_ids);  // [N, target_len, target_dim]
+
+            for (int i = 0; i < num_layers; i++) {  // NOTE(review): embed yields target_dim features while the blocks are built for model_dim; the defaults make them equal
+                auto block = std::dynamic_pointer_cast<LLMAdapterBlock>(blocks["blocks." + std::to_string(i)]);
+                x          = block->forward(ctx, x, source_hidden_states, target_pe, source_pe);
+            }
+
+            x = out_proj->forward(ctx, x);
+            x = norm->forward(ctx, x);
+            return x;
+        }
+    };
+
+    // One diffusion transformer layer: self-attention, cross-attention and an MLP, each
+    // preceded by an AdaLayerNormZero modulation and added back to the residual stream
+    // through apply_gate().
+    struct TransformerBlock : public GGMLBlock {
+    public:
+        // Registers the three modulation blocks plus "self_attn", "cross_attn" and "mlp".
+        // The MLP's second constructor argument is hidden_size * mlp_ratio.
+        TransformerBlock(int64_t hidden_size,
+                         int64_t text_embed_dim,
+                         int64_t num_heads,
+                         int64_t head_dim,
+                         int64_t mlp_ratio      = 4,
+                         int64_t adaln_lora_dim = 256) {
+            blocks["adaln_modulation_self_attn"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
+            blocks["self_attn"] = std::make_shared<AnimaAttention>(hidden_size, hidden_size, num_heads, head_dim);
+
+            blocks["adaln_modulation_cross_attn"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
+            blocks["cross_attn"] = std::make_shared<AnimaAttention>(hidden_size, text_embed_dim, num_heads, head_dim);
+
+            blocks["adaln_modulation_mlp"] = std::make_shared<AdaLayerNormZero>(hidden_size, adaln_lora_dim);
+            blocks["mlp"]                  = std::make_shared<AnimaMLP>(hidden_size, hidden_size * mlp_ratio);
+        }
+
+        // Applies the three gated residual sub-layers in order and returns the updated
+        // hidden states.
+        //   encoder_hidden_states: context for the cross-attention
+        //   embedded_timestep/temb: conditioning passed to every modulation block
+        //   image_pe: positional embedding used for both query and key of the self-attention
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* hidden_states,
+                                    struct ggml_tensor* encoder_hidden_states,
+                                    struct ggml_tensor* embedded_timestep,
+                                    struct ggml_tensor* temb,
+                                    struct ggml_tensor* image_pe) {
+            auto self_attn_mod  = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_self_attn"]);
+            auto self_attn      = std::dynamic_pointer_cast<AnimaAttention>(blocks["self_attn"]);
+            auto cross_attn_mod = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_cross_attn"]);
+            auto cross_attn     = std::dynamic_pointer_cast<AnimaAttention>(blocks["cross_attn"]);
+            auto mlp_mod        = std::dynamic_pointer_cast<AdaLayerNormZero>(blocks["adaln_modulation_mlp"]);
+            auto feed_forward   = std::dynamic_pointer_cast<AnimaMLP>(blocks["mlp"]);
+
+            {  // self-attention (no context tensor, image_pe on both sides)
+                auto [normed, gate] = self_attn_mod->forward(ctx, hidden_states, embedded_timestep, temb);
+                auto branch         = self_attn->forward(ctx, normed, nullptr, image_pe, image_pe);
+                hidden_states       = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, branch, gate));
+            }
+
+            {  // cross-attention over encoder_hidden_states (no positional embeddings)
+                auto [normed, gate] = cross_attn_mod->forward(ctx, hidden_states, embedded_timestep, temb);
+                auto branch         = cross_attn->forward(ctx, normed, encoder_hidden_states, nullptr, nullptr);
+                hidden_states       = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, branch, gate));
+            }
+
+            {  // feed-forward
+                auto [normed, gate] = mlp_mod->forward(ctx, hidden_states, embedded_timestep, temb);
+                auto branch         = feed_forward->forward(ctx, normed);
+                hidden_states       = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, branch, gate));
+            }
+
+            return hidden_states;
+        }
+    };
+
+    // Output head: an AdaLayerNorm modulation followed by a bias-free linear projection to
+    // patch_size * patch_size * out_channels features per token.
+    struct FinalLayer : public GGMLBlock {
+    protected:
+        int64_t hidden_size;
+        int64_t patch_size;
+        int64_t out_channels;
+
+    public:
+        // Registers "adaln_modulation" (built with a second argument of 256) and "linear".
+        FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels)
+            : hidden_size(hidden_size), patch_size(patch_size), out_channels(out_channels) {
+            blocks["adaln_modulation"] = std::make_shared<AdaLayerNorm>(hidden_size, 256);
+            blocks["linear"] = std::make_shared<Linear>(hidden_size, patch_size * patch_size * out_channels, false);
+        }
+
+        // Modulates `hidden_states` with the timestep conditioning, then projects it.
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* hidden_states,
+                                    struct ggml_tensor* embedded_timestep,
+                                    struct ggml_tensor* temb) {
+            auto modulation = std::dynamic_pointer_cast<AdaLayerNorm>(blocks["adaln_modulation"]);
+            auto projection = std::dynamic_pointer_cast<Linear>(blocks["linear"]);
+
+            auto modulated = modulation->forward(ctx, hidden_states, embedded_timestep, temb);
+            return projection->forward(ctx, modulated);
+        }
+    };
+
+    // Top-level Anima network: patch embedder, timestep embedder, a stack of
+    // TransformerBlock layers, the final projection layer and an LLM adapter that can
+    // replace the text context when T5 token ids are supplied.
+    struct AnimaNet : public GGMLBlock {
+    public:
+        int64_t in_channels       = 16;
+        int64_t out_channels      = 16;
+        int64_t hidden_size       = 2048;
+        int64_t text_embed_dim    = 1024;
+        int64_t num_heads         = 16;
+        int64_t head_dim          = 128;
+        int patch_size            = 2;
+        int64_t num_layers        = 28;
+        std::vector<int> axes_dim = {44, 42, 42};
+        int theta                 = 10000;
+
+    protected:
+        // Runs the LLM adapter, optionally scales its output by `t5_weights`, and then forces
+        // dimension 1 of the result to exactly 512 entries (zero-padded or sliced).
+        static struct ggml_tensor* adapt_context(GGMLRunnerContext* ctx,
+                                                 const std::shared_ptr<LLMAdapter>& llm_adapter,
+                                                 struct ggml_tensor* encoder_hidden_states,
+                                                 struct ggml_tensor* t5_ids,
+                                                 struct ggml_tensor* t5_weights,
+                                                 struct ggml_tensor* adapter_q_pe,
+                                                 struct ggml_tensor* adapter_k_pe) {
+            constexpr int kContextLen = 512;
+
+            auto adapted = llm_adapter->forward(ctx, encoder_hidden_states, t5_ids, adapter_q_pe, adapter_k_pe);
+
+            if (t5_weights != nullptr) {
+                // A 1-D weight vector becomes [1, len, 1] so it can be repeated to the
+                // adapter output's shape.
+                auto weights = (ggml_n_dims(t5_weights) == 1)
+                                   ? ggml_reshape_3d(ctx->ggml_ctx, t5_weights, 1, t5_weights->ne[0], 1)
+                                   : t5_weights;
+                weights      = ggml_repeat_4d(ctx->ggml_ctx, weights, adapted->ne[0], adapted->ne[1], adapted->ne[2], 1);
+                adapted      = ggml_mul(ctx->ggml_ctx, adapted, weights);
+            }
+
+            const int64_t seq_len = adapted->ne[1];
+            if (seq_len < kContextLen) {
+                auto zeros = ggml_ext_zeros(ctx->ggml_ctx,
+                                            adapted->ne[0],
+                                            kContextLen - seq_len,
+                                            adapted->ne[2],
+                                            1);
+                adapted    = ggml_concat(ctx->ggml_ctx, adapted, zeros, 1);
+            } else if (seq_len > kContextLen) {
+                adapted = ggml_ext_slice(ctx->ggml_ctx, adapted, 1, 0, kContextLen);
+            }
+            return adapted;
+        }
+
+    public:
+        AnimaNet() = default;
+
+        // Registers every sub-block for a network with `num_layers` transformer layers; all
+        // other hyper-parameters keep their in-class defaults.
+        explicit AnimaNet(int64_t num_layers)
+            : num_layers(num_layers) {
+            // The patch embedder sees one extra input channel (the padding mask added in forward()).
+            blocks["x_embedder"]       = std::make_shared<XEmbedder>((in_channels + 1) * patch_size * patch_size, hidden_size);
+            blocks["t_embedder"]       = std::make_shared<TimestepEmbedder>(hidden_size, hidden_size * 3);
+            blocks["t_embedding_norm"] = std::make_shared<RMSNorm>(hidden_size, 1e-6f);
+
+            for (int64_t layer = 0; layer < num_layers; ++layer) {
+                const std::string key = "blocks." + std::to_string(layer);
+                blocks[key]           = std::make_shared<TransformerBlock>(hidden_size, text_embed_dim, num_heads, head_dim);
+            }
+
+            blocks["final_layer"] = std::make_shared<FinalLayer>(hidden_size, patch_size, out_channels);
+            blocks["llm_adapter"] = std::make_shared<LLMAdapter>(1024, 1024, 1024, 6, 16);
+        }
+
+        // Builds the full forward graph for a single-sample input (x->ne[3] must be 1).
+        // When `t5_ids` is non-null the encoder hidden states are replaced by the adapter
+        // output; the remaining optional tensors are only used on that path.
+        struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                    struct ggml_tensor* x,
+                                    struct ggml_tensor* timestep,
+                                    struct ggml_tensor* encoder_hidden_states,
+                                    struct ggml_tensor* image_pe,
+                                    struct ggml_tensor* t5_ids       = nullptr,
+                                    struct ggml_tensor* t5_weights   = nullptr,
+                                    struct ggml_tensor* adapter_q_pe = nullptr,
+                                    struct ggml_tensor* adapter_k_pe = nullptr) {
+            GGML_ASSERT(x->ne[3] == 1);
+
+            auto patch_embedder = std::dynamic_pointer_cast<XEmbedder>(blocks["x_embedder"]);
+            auto time_embedder  = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
+            auto time_norm      = std::dynamic_pointer_cast<RMSNorm>(blocks["t_embedding_norm"]);
+            auto output_head    = std::dynamic_pointer_cast<FinalLayer>(blocks["final_layer"]);
+            auto llm_adapter    = std::dynamic_pointer_cast<LLMAdapter>(blocks["llm_adapter"]);
+
+            // Remember the un-padded spatial size for the final crop.
+            const int64_t W = x->ne[0];
+            const int64_t H = x->ne[1];
+
+            // Append an all-zero mask channel along dimension 2.
+            auto zero_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]);
+            x              = ggml_concat(ctx->ggml_ctx, x, zero_mask, 2);  // [N, C + 1, H, W]
+
+            x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size);  // [N, h*w, (C+1)*ph*pw]
+            x = patch_embedder->forward(ctx, x);
+
+            auto timestep_proj     = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast<int>(hidden_size));
+            auto temb              = time_embedder->forward(ctx, timestep_proj);
+            auto embedded_timestep = time_norm->forward(ctx, timestep_proj);
+
+            if (t5_ids != nullptr) {
+                encoder_hidden_states = adapt_context(ctx,
+                                                      llm_adapter,
+                                                      encoder_hidden_states,
+                                                      t5_ids,
+                                                      t5_weights,
+                                                      adapter_q_pe,
+                                                      adapter_k_pe);
+            }
+
+            for (int64_t layer = 0; layer < num_layers; ++layer) {
+                const std::string key = "blocks." + std::to_string(layer);
+                auto layer_block      = std::dynamic_pointer_cast<TransformerBlock>(blocks[key]);
+                x                     = layer_block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe);
+            }
+
+            x = output_head->forward(ctx, x, embedded_timestep, temb);  // [N, h*w, ph*pw*C]
+
+            return DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false);  // [N, C, H, W]
+        }
+    };
+
+    // GGMLRunner wrapper around AnimaNet: sizes the network from the checkpoint's tensor
+    // names, generates the positional-embedding tables on the host, and builds/executes the
+    // compute graph.
+    struct AnimaRunner : public GGMLRunner {
+    public:
+        // Host-side storage for the positional-embedding tables. build_graph() refills them
+        // and passes their data pointers to set_backend_tensor_data().
+        // NOTE(review): presumably they are members so the buffers outlive build_graph() - confirm.
+        std::vector<float> image_pe_vec;
+        std::vector<float> adapter_q_pe_vec;
+        std::vector<float> adapter_k_pe_vec;
+        AnimaNet net;
+
+        // Determines the transformer depth by scanning `tensor_storage_map` for names that
+        // contain "<prefix>.net.blocks.<id>." and taking the largest id + 1; falls back to 28
+        // layers when no such tensor is found. Then constructs and initialises the network
+        // under "<prefix>.net".
+        AnimaRunner(ggml_backend_t backend,
+                    bool offload_params_to_cpu,
+                    const String2TensorStorage& tensor_storage_map = {},
+                    const std::string prefix                       = "model.diffusion_model")
+            : GGMLRunner(backend, offload_params_to_cpu) {
+            int64_t num_layers    = 0;
+            std::string layer_tag = prefix + ".net.blocks.";
+            for (const auto& kv : tensor_storage_map) {
+                const std::string& tensor_name = kv.first;
+                size_t pos                     = tensor_name.find(layer_tag);
+                if (pos == std::string::npos) {
+                    continue;
+                }
+                // The layer index is the text between the tag and the next '.'.
+                size_t start = pos + layer_tag.size();
+                size_t end   = tensor_name.find('.', start);
+                if (end == std::string::npos) {
+                    continue;
+                }
+                // atoll yields 0 for non-numeric text, so such names count as layer 0.
+                int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str());
+                num_layers       = std::max(num_layers, layer_id + 1);
+            }
+            if (num_layers <= 0) {
+                num_layers = 28;
+            }
+            LOG_INFO("anima net layers: %" PRId64, num_layers);
+
+            net = AnimaNet(num_layers);
+            net.init(params_ctx, tensor_storage_map, prefix + ".net");
+        }
+
+        // Short identifier for this runner.
+        std::string get_desc() override {
+            return "anima";
+        }
+
+        // Collects the network's parameter tensors into `tensors` under "<prefix>.net".
+        void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
+            net.get_param_tensors(tensors, prefix + ".net");
+        }
+
+        // Builds a flattened 1-D RoPE table for positions 0 .. seq_len - 1 using
+        // Rope::rope() followed by Rope::flatten().
+        static std::vector<float> gen_1d_rope_pe_vec(int64_t seq_len, int dim, float theta = 10000.f) {
+            std::vector<float> pos(seq_len);
+            for (int64_t i = 0; i < seq_len; i++) {
+                pos[i] = static_cast<float>(i);
+            }
+            auto rope_emb = Rope::rope(pos, dim, theta);
+            return Rope::flatten(rope_emb);
+        }
+
+        // Multiplier applied to the RoPE base for one axis:
+        // extrapolation_ratio ^ (axis_dim / (axis_dim - 2)).
+        // Returns 1 when the ratio is exactly 1 or when axis_dim <= 2 (which also avoids a
+        // zero or negative denominator).
+        static float calc_ntk_factor(float extrapolation_ratio, int axis_dim) {
+            if (extrapolation_ratio == 1.0f || axis_dim <= 2) {
+                return 1.0f;
+            }
+            return std::pow(extrapolation_ratio, static_cast<float>(axis_dim) / static_cast<float>(axis_dim - 2));
+        }
+
+        // Generates the image positional-embedding table: position ids come from
+        // Rope::gen_flux_ids() and are embedded per axis with a separately scaled theta.
+        // `axes_dim` must have at least three entries; they are paired with the t, h and w
+        // extrapolation ratios in that order.
+        static std::vector<float> gen_anima_image_pe_vec(int bs,
+                                                         int h,
+                                                         int w,
+                                                         int patch_size,
+                                                         int theta,
+                                                         const std::vector<int>& axes_dim,
+                                                         float h_extrapolation_ratio,
+                                                         float w_extrapolation_ratio,
+                                                         float t_extrapolation_ratio) {
+            static const std::vector<ggml_tensor*> empty_ref_latents;
+            auto ids = Rope::gen_flux_ids(h,
+                                          w,
+                                          patch_size,
+                                          bs,
+                                          static_cast<int>(axes_dim.size()),
+                                          0,
+                                          {},
+                                          empty_ref_latents,
+                                          false,
+                                          1.0f);
+
+            std::vector<float> axis_thetas = {
+                static_cast<float>(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]),
+                static_cast<float>(theta) * calc_ntk_factor(h_extrapolation_ratio, axes_dim[1]),
+                static_cast<float>(theta) * calc_ntk_factor(w_extrapolation_ratio, axes_dim[2]),
+            };
+            return Rope::embed_nd(ids, bs, axis_thetas, axes_dim);
+        }
+
+        // Builds the compute graph for one forward pass. `x` must have ne[3] == 1.
+        // `t5_ids` / `t5_weights` are optional; the adapter positional embeddings are only
+        // created when `t5_ids` is non-null.
+        struct ggml_cgraph* build_graph(struct ggml_tensor* x,
+                                        struct ggml_tensor* timesteps,
+                                        struct ggml_tensor* context,
+                                        struct ggml_tensor* t5_ids     = nullptr,
+                                        struct ggml_tensor* t5_weights = nullptr) {
+            GGML_ASSERT(x->ne[3] == 1);
+            struct ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE);
+
+            x          = to_backend(x);
+            timesteps  = to_backend(timesteps);
+            context    = to_backend(context);
+            t5_ids     = to_backend(t5_ids);
+            t5_weights = to_backend(t5_weights);
+
+            // Round the spatial size up to the next multiple of the patch size.
+            int64_t pad_h = (net.patch_size - x->ne[1] % net.patch_size) % net.patch_size;
+            int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size;
+            int64_t h_pad = x->ne[1] + pad_h;
+            int64_t w_pad = x->ne[0] + pad_w;
+
+            // Image RoPE table: h and w extrapolation ratios are 4.0, t is 1.0.
+            image_pe_vec          = gen_anima_image_pe_vec(1,
+                                                           static_cast<int>(h_pad),
+                                                           static_cast<int>(w_pad),
+                                                           static_cast<int>(net.patch_size),
+                                                           net.theta,
+                                                           net.axes_dim,
+                                                           4.0f,
+                                                           4.0f,
+                                                           1.0f);
+            // Each position contributes 2 * 2 * (head_dim / 2) floats to the table.
+            int64_t image_pos_len = static_cast<int64_t>(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2));
+            auto image_pe         = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len);
+            set_backend_tensor_data(image_pe, image_pe_vec.data());
+
+            ggml_tensor* adapter_q_pe = nullptr;
+            ggml_tensor* adapter_k_pe = nullptr;
+            if (t5_ids != nullptr) {
+                int64_t target_len = t5_ids->ne[0];
+                int64_t source_len = context->ne[1];
+
+                // 1-D RoPE tables with dim 64 (laid out below as 2 x 2 x 32 per position).
+                // NOTE(review): 64 presumably equals the adapter head dim (1024 / 16) - confirm.
+                adapter_q_pe_vec = gen_1d_rope_pe_vec(target_len, 64, 10000.f);
+                adapter_k_pe_vec = gen_1d_rope_pe_vec(source_len, 64, 10000.f);
+
+                int64_t target_pos_len = static_cast<int64_t>(adapter_q_pe_vec.size()) / (2 * 2 * 32);
+                int64_t source_pos_len = static_cast<int64_t>(adapter_k_pe_vec.size()) / (2 * 2 * 32);
+
+                adapter_q_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, target_pos_len);
+                adapter_k_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, source_pos_len);
+                set_backend_tensor_data(adapter_q_pe, adapter_q_pe_vec.data());
+                set_backend_tensor_data(adapter_k_pe, adapter_k_pe_vec.data());
+            }
+
+            auto runner_ctx = get_context();
+            auto out        = net.forward(&runner_ctx,
+                                          x,
+                                          timesteps,
+                                          context,
+                                          image_pe,
+                                          t5_ids,
+                                          t5_weights,
+                                          adapter_q_pe,
+                                          adapter_k_pe);
+
+            ggml_build_forward_expand(gf, out);
+            return gf;
+        }
+
+        // Runs one forward pass by handing a graph-builder callback to GGMLRunner::compute().
+        // The lambda captures the arguments by reference and is only used within this call.
+        bool compute(int n_threads,
+                     struct ggml_tensor* x,
+                     struct ggml_tensor* timesteps,
+                     struct ggml_tensor* context,
+                     struct ggml_tensor* t5_ids      = nullptr,
+                     struct ggml_tensor* t5_weights  = nullptr,
+                     struct ggml_tensor** output     = nullptr,
+                     struct ggml_context* output_ctx = nullptr) {
+            auto get_graph = [&]() -> struct ggml_cgraph* {
+                return build_graph(x, timesteps, context, t5_ids, t5_weights);
+            };
+            return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
+        }
+    };
+}  // namespace Anima
+
+#endif  // __ANIMA_HPP__
diff --git a/otherarch/sdcpp/common_block.hpp b/otherarch/sdcpp/common_block.hpp
new file mode 100644
index 00000000000..435afa4f415
--- /dev/null
+++ b/otherarch/sdcpp/common_block.hpp
@@ -0,0 +1,593 @@
+#ifndef __COMMON_BLOCK_HPP__
+#define __COMMON_BLOCK_HPP__
+
+#include "ggml_extend.hpp"
+
+// Stride-2 3x3 convolution used for downsampling. In VAE mode the conv is registered as
+// "conv" with no built-in padding and the input is padded explicitly in forward();
+// otherwise it is registered as "op" with padding {1, 1}.
+class DownSampleBlock : public GGMLBlock {
+protected:
+    int channels;
+    int out_channels;
+    bool vae_downsample;
+
+public:
+    DownSampleBlock(int channels,
+                    int out_channels,
+                    bool vae_downsample = false)
+        : channels(channels),
+          out_channels(out_channels),
+          vae_downsample(vae_downsample) {
+        // Only the block key and the conv padding differ between the two modes.
+        const int conv_pad = vae_downsample ? 0 : 1;
+        blocks[vae_downsample ? "conv" : "op"] =
+            std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {conv_pad, conv_pad}));
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+        // x: [N, channels, h, w]
+        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks[vae_downsample ? "conv" : "op"]);
+
+        if (vae_downsample) {
+            // Explicit padding, honouring the context's circular-padding flags.
+            x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
+        }
+        return conv->forward(ctx, x);  // [N, out_channels, h/2, w/2]
+    }
+};
+
+// 2x nearest-neighbour upscale followed by a 3x3 convolution (stride 1, padding 1).
+class UpSampleBlock : public GGMLBlock {
+protected:
+    int channels;
+    int out_channels;
+
+public:
+    UpSampleBlock(int channels,
+                  int out_channels)
+        : channels(channels),
+          out_channels(out_channels) {
+        blocks["conv"] = std::shared_ptr<GGMLBlock>(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1}));
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+        // x: [N, channels, h, w]
+        auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
+
+        auto upscaled = ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
+        return conv->forward(ctx, upscaled);                                         // [N, out_channels, h*2, w*2]
+    }
+};
+
+// Residual block: GroupNorm32 -> SiLU -> conv, an optional timestep-embedding injection,
+// then GroupNorm32 -> SiLU -> conv, added to the (optionally 1x1-projected) input.
+// Supports 2-D inputs and, with dims == 3, inputs handled through Conv3d.
+class ResBlock : public GGMLBlock {
+protected:
+    // network hparams
+    int64_t channels;      // model_channels * (1, 1, 1, 2, 2, 4, 4, 4)
+    int64_t emb_channels;  // time_embed_dim
+    int64_t out_channels;  // mult * model_channels
+    std::pair<int, int> kernel_size;
+    int dims;
+    bool skip_t_emb;
+    bool exchange_temb_dims;
+
+    // Creates the convolution for the given dimensionality (only 2 or 3 is accepted).
+    // For dims == 3 only kernel_size.first / padding.first are used, as the leading
+    // kernel/padding entry of a Conv3d whose other two entries are 1 and 0; stride is
+    // always 1.
+    std::shared_ptr<GGMLBlock> conv_nd(int dims,
+                                       int64_t in_channels,
+                                       int64_t out_channels,
+                                       std::pair<int, int> kernel_size,
+                                       std::pair<int, int> padding) {
+        GGML_ASSERT(dims == 2 || dims == 3);
+        if (dims == 3) {
+            return std::shared_ptr<GGMLBlock>(new Conv3d(in_channels, out_channels, {kernel_size.first, 1, 1}, {1, 1, 1}, {padding.first, 0, 0}));
+        } else {
+            return std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding));
+        }
+    }
+
+public:
+    // Registers the sub-blocks. The block keys follow the indices of the corresponding
+    // PyTorch Sequential containers, so parameter-free layers (SiLU, Dropout) leave gaps.
+    // "emb_layers.1" exists only when !skip_t_emb; "skip_connection" only when
+    // out_channels != channels.
+    ResBlock(int64_t channels,
+             int64_t emb_channels,
+             int64_t out_channels,
+             std::pair<int, int> kernel_size = {3, 3},
+             int dims                        = 2,
+             bool exchange_temb_dims         = false,
+             bool skip_t_emb                 = false)
+        : channels(channels),
+          emb_channels(emb_channels),
+          out_channels(out_channels),
+          kernel_size(kernel_size),
+          dims(dims),
+          skip_t_emb(skip_t_emb),
+          exchange_temb_dims(exchange_temb_dims) {
+        // Padding of half the kernel size in each direction.
+        std::pair<int, int> padding = {kernel_size.first / 2, kernel_size.second / 2};
+        blocks["in_layers.0"]       = std::shared_ptr<GGMLBlock>(new GroupNorm32(channels));
+        // in_layer_1 is nn.SILU()
+        blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding);
+
+        if (!skip_t_emb) {
+            // emb_layer_0 is nn.SILU()
+            blocks["emb_layers.1"] = std::shared_ptr<GGMLBlock>(new Linear(emb_channels, out_channels));
+        }
+
+        blocks["out_layers.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(out_channels));
+        // out_layer_1 is nn.SILU()
+        // out_layer_2 is nn.Dropout(), skip for inference
+        blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding);
+
+        if (out_channels != channels) {
+            blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0});
+        }
+    }
+
+    // Builds the residual sub-graph. `emb` may be null only when the block was constructed
+    // with skip_t_emb == true (asserted below).
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) {
+        // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml
+        // [N, c, t, h, w] => [N, c, t, h * w]
+        // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w]
+        // emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels]
+        auto in_layers_0  = std::dynamic_pointer_cast<GroupNorm32>(blocks["in_layers.0"]);
+        auto in_layers_2  = std::dynamic_pointer_cast<UnaryBlock>(blocks["in_layers.2"]);
+        auto out_layers_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks["out_layers.0"]);
+        auto out_layers_3 = std::dynamic_pointer_cast<UnaryBlock>(blocks["out_layers.3"]);
+
+        if (emb == nullptr) {
+            GGML_ASSERT(skip_t_emb);
+        }
+
+        // in_layers
+        auto h = in_layers_0->forward(ctx, x);
+        h      = ggml_silu_inplace(ctx->ggml_ctx, h);
+        h      = in_layers_2->forward(ctx, h);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+
+        // emb_layers
+        if (!skip_t_emb) {
+            auto emb_layer_1 = std::dynamic_pointer_cast<Linear>(blocks["emb_layers.1"]);
+
+            auto emb_out = ggml_silu(ctx->ggml_ctx, emb);
+            emb_out      = emb_layer_1->forward(ctx, emb_out);  // [N, out_channels] if dims == 2 else [N, t, out_channels]
+
+            // Insert singleton dimensions so the embedding can be added to h.
+            if (dims == 2) {
+                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]);  // [N, out_channels, 1, 1]
+            } else {
+                emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]);  // [N, t, out_channels, 1]
+                if (exchange_temb_dims) {
+                    // emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
+                    emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3));  // [N, out_channels, t, 1]
+                }
+            }
+
+            h = ggml_add(ctx->ggml_ctx, h, emb_out);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+        }
+
+        // out_layers
+        h = out_layers_0->forward(ctx, h);
+        h = ggml_silu_inplace(ctx->ggml_ctx, h);
+        // dropout, skip for inference
+        h = out_layers_3->forward(ctx, h);
+
+        // skip connection
+        if (out_channels != channels) {
+            // Project the input so its channel count matches h before the residual add.
+            auto skip_connection = std::dynamic_pointer_cast<UnaryBlock>(blocks["skip_connection"]);
+            x                    = skip_connection->forward(ctx, x);  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+        }
+
+        h = ggml_add(ctx->ggml_ctx, h, x);
+        return h;  // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w]
+    }
+};
+
+// Gated GELU unit: one linear projection to 2 * dim_out features, split in two along
+// dimension 0; the first half is multiplied by the GELU of the second half.
+class GEGLU : public UnaryBlock {
+protected:
+    int64_t dim_in;
+    int64_t dim_out;
+
+public:
+    GEGLU(int64_t dim_in, int64_t dim_out)
+        : dim_in(dim_in), dim_out(dim_out) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out * 2));
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+        // x: [ne3, ne2, ne1, dim_in]
+        // return: [ne3, ne2, ne1, dim_out]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+        auto projected = proj->forward(ctx, x);  // [ne3, ne2, ne1, dim_out*2]
+        auto halves    = ggml_ext_chunk(ctx->ggml_ctx, projected, 2, 0, false);
+        auto value     = halves[0];  // [ne3, ne2, ne1, dim_out]
+
+        // The gate half is made contiguous before the activation is applied.
+        auto gate = ggml_cont(ctx->ggml_ctx, halves[1]);  // [ne3, ne2, ne1, dim_out]
+        gate      = ggml_ext_gelu(ctx->ggml_ctx, gate, true);
+
+        return ggml_mul(ctx->ggml_ctx, value, gate);  // [ne3, ne2, ne1, dim_out]
+    }
+};
+
+// Linear projection followed by a GELU activation.
+class GELU : public UnaryBlock {
+public:
+    GELU(int64_t dim_in, int64_t dim_out, bool bias = true) {
+        blocks["proj"] = std::shared_ptr<GGMLBlock>(new Linear(dim_in, dim_out, bias));
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override {
+        // x: [ne3, ne2, ne1, dim_in]
+        // return: [ne3, ne2, ne1, dim_out]
+        auto proj = std::dynamic_pointer_cast<Linear>(blocks["proj"]);
+
+        auto projected = proj->forward(ctx, x);
+        return ggml_ext_gelu(ctx->ggml_ctx, projected, true);
+    }
+};
+
+// Two-stage feed-forward network: an activation block ("net.0", GEGLU or GELU) that
+// expands to dim * mult features, followed by a linear projection ("net.2") to dim_out.
+class FeedForward : public GGMLBlock {
+public:
+    enum class Activation {
+        GEGLU,
+        GELU
+    };
+
+    // precision_fix scales the output projection by 1/128 and, on Vulkan builds, also
+    // forces it to run in f32 precision.
+    FeedForward(int64_t dim,
+                int64_t dim_out,
+                int64_t mult          = 4,
+                Activation activation = Activation::GEGLU,
+                bool precision_fix    = false) {
+        int64_t inner_dim = dim * mult;
+
+        switch (activation) {
+            case Activation::GELU:
+                blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GELU(dim, inner_dim));
+                break;
+            default:  // every other value selects the gated variant
+                blocks["net.0"] = std::shared_ptr<GGMLBlock>(new GEGLU(dim, inner_dim));
+                break;
+        }
+
+        // net_1 is nn.Dropout(), skip for inference
+
+        // The scale guards against NaNs that show up in some configurations, e.g. Vulkan
+        // without force_prec_f32, or CUDA with k-quant weights.
+        const float scale = precision_fix ? 1.f / 128.f : 1.f;
+#ifdef SD_USE_VULKAN
+        const bool force_prec_f32 = precision_fix;
+#else
+        const bool force_prec_f32 = false;
+#endif
+        blocks["net.2"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale));
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) {
+        // x: [ne3, ne2, ne1, dim]
+        // return: [ne3, ne2, ne1, dim_out]
+        auto activation_block  = std::dynamic_pointer_cast<UnaryBlock>(blocks["net.0"]);
+        auto output_projection = std::dynamic_pointer_cast<Linear>(blocks["net.2"]);
+
+        auto expanded = activation_block->forward(ctx, x);  // [ne3, ne2, ne1, inner_dim]
+        return output_projection->forward(ctx, expanded);   // [ne3, ne2, ne1, dim_out]
+    }
+};
+
+// Multi-head attention in which queries are projected from `x` and keys/values from
+// `context`. The q/k/v projections carry no bias; the output projection ("to_out.0")
+// uses Linear's default bias setting.
+class CrossAttention : public GGMLBlock {
+protected:
+    int64_t query_dim;
+    int64_t context_dim;
+    int64_t n_head;
+    int64_t d_head;
+
+public:
+    // query_dim:   feature size of x (and of the returned tensor)
+    // context_dim: feature size of context
+    // n_head/d_head: number of heads and per-head size; the inner size is n_head * d_head
+    CrossAttention(int64_t query_dim,
+                   int64_t context_dim,
+                   int64_t n_head,
+                   int64_t d_head)
+        // Initialisers are listed in declaration order, which is the order they actually run in.
+        : query_dim(query_dim),
+          context_dim(context_dim),
+          n_head(n_head),
+          d_head(d_head) {
+        const int64_t inner_dim = d_head * n_head;
+
+        blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
+        blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
+        blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
+
+        blocks["to_out.0"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, query_dim));
+        // to_out_1 is nn.Dropout(), skip for inference
+    }
+
+    // Builds the attention sub-graph. Both tensors must be non-null.
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* context) {
+        // x: [N, n_token, query_dim]
+        // context: [N, n_context, context_dim]
+        // return: [N, n_token, query_dim]
+        GGML_ASSERT(x != nullptr);
+        GGML_ASSERT(context != nullptr);
+
+        auto to_q     = std::dynamic_pointer_cast<Linear>(blocks["to_q"]);
+        auto to_k     = std::dynamic_pointer_cast<Linear>(blocks["to_k"]);
+        auto to_v     = std::dynamic_pointer_cast<Linear>(blocks["to_v"]);
+        auto to_out_0 = std::dynamic_pointer_cast<Linear>(blocks["to_out.0"]);
+
+        auto q = to_q->forward(ctx, x);        // [N, n_token, inner_dim]
+        auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
+        auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]
+
+        // No attention mask is passed; flash attention follows the runner context's setting.
+        x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled);  // [N, n_token, inner_dim]
+
+        x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
+        return x;
+    }
+};
+
+class BasicTransformerBlock : public GGMLBlock {  // pre-norm transformer block: self-attn, cross-attn, feed-forward, each with a residual add
+protected:
+    int64_t n_head;
+    int64_t d_head;
+    bool ff_in;  // when true, an extra norm_in + ff_in residual stage runs before the attention stages
+
+public:
+    BasicTransformerBlock(int64_t dim,
+                          int64_t n_head,
+                          int64_t d_head,
+                          int64_t context_dim,
+                          bool ff_in = false)
+        : n_head(n_head), d_head(d_head), ff_in(ff_in) {
+        // disable_self_attn is always False
+        // disable_temporal_crossattention is always False
+        // switch_temporal_ca_to_sa is always False
+        // inner_dim is always None or equal to dim
+        // gated_ff is always True
+        blocks["attn1"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, dim, n_head, d_head));          // self-attention: context_dim == dim
+        blocks["attn2"] = std::shared_ptr<GGMLBlock>(new CrossAttention(dim, context_dim, n_head, d_head));  // cross-attention against context
+        blocks["ff"]    = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
+        blocks["norm1"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+        blocks["norm2"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+        blocks["norm3"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+
+        if (ff_in) {
+            blocks["norm_in"] = std::shared_ptr<GGMLBlock>(new LayerNorm(dim));
+            blocks["ff_in"]   = std::shared_ptr<GGMLBlock>(new FeedForward(dim, dim));
+        }
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* context) {
+        // x: [N, n_token, query_dim]
+        // context: [N, n_context, context_dim]
+        // return: [N, n_token, query_dim]
+
+        auto attn1 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn1"]);
+        auto attn2 = std::dynamic_pointer_cast<CrossAttention>(blocks["attn2"]);
+        auto ff    = std::dynamic_pointer_cast<FeedForward>(blocks["ff"]);
+        auto norm1 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm1"]);
+        auto norm2 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm2"]);
+        auto norm3 = std::dynamic_pointer_cast<LayerNorm>(blocks["norm3"]);
+
+        if (ff_in) {
+            auto norm_in     = std::dynamic_pointer_cast<LayerNorm>(blocks["norm_in"]);
+            auto ff_in_block = std::dynamic_pointer_cast<FeedForward>(blocks["ff_in"]);  // named so it does not shadow the bool member ff_in
+
+            auto x_skip = x;
+            x           = norm_in->forward(ctx, x);
+            x           = ff_in_block->forward(ctx, x);
+            // self.is_res is always True
+            x = ggml_add(ctx->ggml_ctx, x, x_skip);
+        }
+
+        auto r = x;  // r holds the residual input of the stage currently being computed
+        x      = norm1->forward(ctx, x);
+        x      = attn1->forward(ctx, x, x);  // self-attention
+        x      = ggml_add(ctx->ggml_ctx, x, r);
+        r      = x;
+        x      = norm2->forward(ctx, x);
+        x      = attn2->forward(ctx, x, context);  // cross-attention
+        x      = ggml_add(ctx->ggml_ctx, x, r);
+        r      = x;
+        x      = norm3->forward(ctx, x);
+        x      = ff->forward(ctx, x);
+        x      = ggml_add(ctx->ggml_ctx, x, r);
+
+        return x;
+    }
+};
+
+class SpatialTransformer : public GGMLBlock {  // runs `depth` BasicTransformerBlocks over the flattened h*w positions of a feature map, plus a residual add
+protected:
+    int64_t in_channels;  // mult * model_channels
+    int64_t n_head;
+    int64_t d_head;
+    int64_t depth       = 1;    // 1
+    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2
+    bool use_linear     = false;  // true: proj_in/proj_out are Linear; false: they are 1x1 Conv2d
+
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") override {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");  // the stored proj_out weight decides which projection kind is used
+        if (iter != tensor_storage_map.end()) {
+            int64_t inner_dim = n_head * d_head;
+            if (iter->second.n_dims == 4 && use_linear) {  // 4-d weight but Linear was configured: switch to Conv2d
+                use_linear         = false;
+                blocks["proj_in"]  = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {  // 2-d weight but Conv2d was configured: switch to Linear
+                use_linear         = true;
+                blocks["proj_in"]  = std::make_shared<Linear>(in_channels, inner_dim);
+                blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
+            }
+        }
+    }
+
+public:
+    SpatialTransformer(int64_t in_channels,
+                       int64_t n_head,
+                       int64_t d_head,
+                       int64_t depth,
+                       int64_t context_dim,
+                       bool use_linear)
+        : in_channels(in_channels),
+          n_head(n_head),
+          d_head(d_head),
+          depth(depth),
+          context_dim(context_dim),
+          use_linear(use_linear) {
+        // disable_self_attn is always False
+        int64_t inner_dim = n_head * d_head;  // in_channels
+        blocks["norm"]    = std::shared_ptr<GGMLBlock>(new GroupNorm32(in_channels));
+        if (use_linear) {
+            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Linear(in_channels, inner_dim));
+        } else {
+            blocks["proj_in"] = std::shared_ptr<GGMLBlock>(new Conv2d(in_channels, inner_dim, {1, 1}));
+        }
+
+        for (int i = 0; i < depth; i++) {
+            std::string name = "transformer_blocks." + std::to_string(i);
+            blocks[name]     = std::shared_ptr<GGMLBlock>(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false));
+        }
+
+        if (use_linear) {
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, in_channels));
+        } else {
+            blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Conv2d(inner_dim, in_channels, {1, 1}));
+        }
+    }
+
+    virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                        struct ggml_tensor* x,
+                                        struct ggml_tensor* context) {
+        // x: [N, in_channels, h, w]
+        // context: [N, max_position(aka n_token), hidden_size(aka context_dim)]
+        auto norm     = std::dynamic_pointer_cast<GroupNorm32>(blocks["norm"]);
+        auto proj_in  = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_in"]);   // cast to the common base because the block may be Linear or Conv2d
+        auto proj_out = std::dynamic_pointer_cast<UnaryBlock>(blocks["proj_out"]);
+
+        auto x_in         = x;  // kept for the residual add at the end
+        int64_t n         = x->ne[3];
+        int64_t h         = x->ne[1];
+        int64_t w         = x->ne[0];
+        int64_t inner_dim = n_head * d_head;
+
+        x = norm->forward(ctx, x);
+        if (use_linear) {
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, in_channels]
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, in_channels]; relies on in_channels == inner_dim
+            x = proj_in->forward(ctx, x);                                              // [N, h * w, inner_dim]
+        } else {
+            x = proj_in->forward(ctx, x);                                              // [N, inner_dim, h, w]
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3));  // [N, h, w, inner_dim]
+            x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n);                // [N, h * w, inner_dim]
+        }
+
+        for (int i = 0; i < depth; i++) {
+            std::string name       = "transformer_blocks." + std::to_string(i);
+            auto transformer_block = std::dynamic_pointer_cast<BasicTransformerBlock>(blocks[name]);
+
+            x = transformer_block->forward(ctx, x, context);
+        }
+
+        if (use_linear) {
+            // proj_out
+            x = proj_out->forward(ctx, x);  // [N, h * w, in_channels]
+
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, in_channels, h * w]
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, in_channels, h, w]; relies on in_channels == inner_dim
+        } else {
+            x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3));  // [N, inner_dim, h * w]
+            x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n);                 // [N, inner_dim, h, w]
+
+            // proj_out
+            x = proj_out->forward(ctx, x);  // [N, in_channels, h, w]
+        }
+
+        x = ggml_add(ctx->ggml_ctx, x, x_in);
+        return x;
+    }
+};
+
+class AlphaBlender : public GGMLBlock {  // blends a spatial and a temporal tensor with a weight derived from the "mix_factor" parameter
+protected:
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override {
+        // "mix_factor" is always allocated as a single-element F32 tensor; tensor_storage_map and prefix are not consulted here
+        enum ggml_type wtype = GGML_TYPE_F32;
+        params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1);
+    }
+
+    float get_alpha() {
+        // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,]
+        // so learned_with_images is same as learned
+        float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]);  // reads the scalar out of the parameter tensor
+        return sigmoid(alpha);
+    }
+
+public:
+    AlphaBlender() {
+        // merge_strategy is always learned_with_images
+        // for inference, we don't need to set alpha
+        // since mix_factor.shape is [1,], we don't need rearrange using  rearrange_pattern
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x_spatial,
+                                struct ggml_tensor* x_temporal) {
+        // image_only_indicator is always tensor([0.])
+        float alpha = get_alpha();  // NOTE(review): read as a host float when forward() runs, so presumably mix_factor must already hold its loaded value - confirm
+        auto x      = ggml_add(ctx->ggml_ctx,  // x = alpha * x_spatial + (1 - alpha) * x_temporal
+                               ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha),
+                               ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha));
+        return x;
+    }
+};
+
+class VideoResBlock : public ResBlock {  // ResBlock followed by a second "time_stack" ResBlock over the frame axis, mixed back in by an AlphaBlender
+public:
+    VideoResBlock(int64_t channels,
+                  int64_t emb_channels,
+                  int64_t out_channels,
+                  std::pair<int, int> kernel_size = {3, 3},
+                  int64_t video_kernel_size       = 3,  // not referenced in this constructor
+                  int dims                        = 2)  // always 2
+        : ResBlock(channels, emb_channels, out_channels, kernel_size, dims) {
+        blocks["time_stack"] = std::shared_ptr<GGMLBlock>(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true));  // 3 is passed in the position where the base call above passes dims
+        blocks["time_mixer"] = std::shared_ptr<GGMLBlock>(new AlphaBlender());
+    }
+
+    struct ggml_tensor* forward(GGMLRunnerContext* ctx,
+                                struct ggml_tensor* x,
+                                struct ggml_tensor* emb,
+                                int num_video_frames) {
+        // x: [N, channels, h, w] aka [b*t, channels, h, w]
+        // emb: [N, emb_channels] aka [b*t, emb_channels]
+        // image_only_indicator is always tensor([0.])
+        auto time_stack = std::dynamic_pointer_cast<ResBlock>(blocks["time_stack"]);
+        auto time_mixer = std::dynamic_pointer_cast<AlphaBlender>(blocks["time_mixer"]);
+
+        x = ResBlock::forward(ctx, x, emb);  // spatial pass through the base ResBlock
+
+        int64_t T = num_video_frames;
+        int64_t B = x->ne[3] / T;  // integer division: assumes the batch axis is an exact multiple of T
+        int64_t C = x->ne[2];
+        int64_t H = x->ne[1];
+        int64_t W = x->ne[0];
+
+        x          = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B);                     // (b t) c h w -> b t c (h w)
+        x          = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b t c (h w) -> b c t (h w)
+        auto x_mix = x;  // spatial result in b c t (h w) layout, used as x_spatial by the mixer
+
+        emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]);  // (b t) ... -> b t ...
+
+        x = time_stack->forward(ctx, x, emb);  // input is b c t (h w); the permute below treats the output as the same layout
+
+        x = time_mixer->forward(ctx, x_mix, x);  // blend of spatial (x_mix) and temporal (x), b c t (h w)
+
+        x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3));  // b c t (h w) -> b t c (h w)
+        x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B);                     // b t c (h w) -> (b t) c h w
+
+        return x;
+    }
+};
+
+#endif  // __COMMON_BLOCK_HPP__
diff --git a/otherarch/sdcpp/common_dit.hpp b/otherarch/sdcpp/common_dit.hpp
new file mode 100644
index 00000000000..0e6f0f0870a
--- /dev/null
+++ b/otherarch/sdcpp/common_dit.hpp
@@ -0,0 +1,108 @@
+#ifndef __COMMON_DIT_HPP__
+#define __COMMON_DIT_HPP__
+
+#include "ggml_extend.hpp"
+
+namespace DiT {
+    ggml_tensor* patchify(ggml_context* ctx,  // splits an image tensor into non-overlapping ph x pw patches, one token per patch
+                          ggml_tensor* x,
+                          int pw,  // patch width (divides W); note width comes before height in this signature
+                          int ph,  // patch height (divides H)
+                          bool patch_last = true) {  // selects the ordering of the flattened per-token features, see below
+        // x: [N, C, H, W]
+        // return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
+        int64_t N = x->ne[3];
+        int64_t C = x->ne[2];
+        int64_t H = x->ne[1];
+        int64_t W = x->ne[0];
+        int64_t h = H / ph;  // number of patch rows
+        int64_t w = W / pw;  // number of patch columns
+
+        GGML_ASSERT(h * ph == H && w * pw == W);  // H and W must already be exact multiples of the patch size
+
+        x = ggml_reshape_4d(ctx, x, pw, w, ph, h * C * N);     // [N*C*h, ph, w, pw]
+        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, w, ph, pw]
+        x = ggml_reshape_4d(ctx, x, pw * ph, w * h, C, N);     // [N, C, h*w, ph*pw]
+        if (patch_last) {
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, h*w, C, ph*pw]
+            x = ggml_reshape_3d(ctx, x, pw * ph * C, w * h, N);    // [N, h*w, C*ph*pw]
+        } else {
+            x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3));  // presumably [N, h*w, ph*pw, C], the layout unpatchify's patch_last == false branch starts from - confirm against ggml_ext_torch_permute
+            x = ggml_reshape_3d(ctx, x, C * pw * ph, w * h, N);              // [N, h*w, ph*pw*C]
+        }
+        return x;
+    }
+
+    ggml_tensor* unpatchify(ggml_context* ctx,  // inverse of patchify: reassembles patch tokens into an image tensor
+                            ggml_tensor* x,
+                            int64_t h,  // number of patch rows
+                            int64_t w,  // number of patch columns
+                            int ph,     // patch height; note height comes before width here, unlike patchify
+                            int pw,     // patch width
+                            bool patch_last = true) {  // must describe the feature ordering x was produced with
+        // x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C]
+        // return: [N, C, H, W]
+        int64_t N = x->ne[2];
+        int64_t C = x->ne[0] / ph / pw;  // channel count recovered from the per-token feature size
+        int64_t H = h * ph;
+        int64_t W = w * pw;
+
+        GGML_ASSERT(C * ph * pw == x->ne[0]);  // the feature size must be an exact multiple of ph * pw
+
+        if (patch_last) {
+            x = ggml_reshape_4d(ctx, x, pw * ph, C, w * h, N);     // [N, h*w, C, ph*pw]
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, C, h*w, ph*pw]
+        } else {
+            x = ggml_reshape_4d(ctx, x, C, pw * ph, w * h, N);     // [N, h*w, ph*pw, C]
+            x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3));  // [N, C, h*w, ph*pw]
+        }
+
+        x = ggml_reshape_4d(ctx, x, pw, ph, w, h * C * N);     // [N*C*h, w, ph, pw]
+        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, ph, w, pw]
+        x = ggml_reshape_4d(ctx, x, W, H, C, N);               // [N, C, h*ph, w*pw]
+
+        return x;
+    }
+
+    ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,  // pads x so that its height and width become multiples of ph and pw
+                                   ggml_tensor* x,
+                                   int ph,    // patch height
+                                   int pw) {  // patch width
+        int64_t W = x->ne[0];
+        int64_t H = x->ne[1];
+
+        int pad_h = (ph - H % ph) % ph;  // rows missing to reach the next multiple of ph; 0 when H is already a multiple
+        int pad_w = (pw - W % pw) % pw;  // columns missing to reach the next multiple of pw; 0 when W is already a multiple
+        x         = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);  // the circular flags come from the runner context
+        return x;
+    }
+
+    ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx,  // pads x up to a multiple of the patch size, then patchifies it
+                                  ggml_tensor* x,
+                                  int ph,  // patch height
+                                  int pw,  // patch width
+                                  bool patch_last = true) {  // forwarded unchanged to patchify
+        x = pad_to_patch_size(ctx, x, ph, pw);
+        x = patchify(ctx->ggml_ctx, x, pw, ph, patch_last);  // patchify takes width before height: (pw, ph)
+        return x;
+    }
+
+    ggml_tensor* unpatchify_and_crop(ggml_context* ctx,  // unpatchifies tokens of a padded image, then slices the result down to H x W
+                                     ggml_tensor* x,
+                                     int64_t H,  // target (unpadded) height
+                                     int64_t W,  // target (unpadded) width
+                                     int ph,     // patch height
+                                     int pw,     // patch width
+                                     bool patch_last = true) {  // forwarded unchanged to unpatchify
+        int pad_h = (ph - H % ph) % ph;  // same padding formula as pad_to_patch_size
+        int pad_w = (pw - W % pw) % pw;
+        int64_t h = ((H + pad_h) / ph);  // patch rows of the padded image
+        int64_t w = ((W + pad_w) / pw);  // patch columns of the padded image
+        x         = unpatchify(ctx, x, h, w, ph, pw, patch_last);  // [N, C, H + pad_h, W + pad_w]
+        x         = ggml_ext_slice(ctx, x, 1, 0, H);               // [N, C, H, W + pad_w]
+        x         = ggml_ext_slice(ctx, x, 0, 0, W);               // [N, C, H, W]
+        return x;
+    }
+}  // namespace DiT
+
+#endif  // __COMMON_DIT_HPP__
\ No newline at end of file
diff --git a/otherarch/sdcpp/conditioner.hpp b/otherarch/sdcpp/conditioner.hpp
index 4317ed18a97..d4a3146b8c4 100644
--- a/otherarch/sdcpp/conditioner.hpp
+++ b/otherarch/sdcpp/conditioner.hpp
@@ -1641,6 +1641,142 @@ struct T5CLIPEmbedder : public Conditioner {
     }
 };
 
+struct AnimaConditioner : public Conditioner {  // conditioner that runs a Qwen3 LLM for hidden states and also emits T5 token ids and weights
+    std::shared_ptr<LLM::BPETokenizer> qwen_tokenizer;
+    T5UniGramTokenizer t5_tokenizer;  // default-constructed; used only to produce token ids, no T5 model is run in this struct
+    std::shared_ptr<LLM::LLMRunner> llm;
+
+    AnimaConditioner(ggml_backend_t backend,
+                     bool offload_params_to_cpu,
+                     const String2TensorStorage& tensor_storage_map = {}) {
+        qwen_tokenizer = std::make_shared<LLM::Qwen2Tokenizer>();
+        llm            = std::make_shared<LLM::LLMRunner>(LLM::LLMArch::QWEN3,
+                                               backend,
+                                               offload_params_to_cpu,
+                                               tensor_storage_map,
+                                               "text_encoders.llm",  // same prefix that get_param_tensors() uses below
+                                               false);
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+        llm->get_param_tensors(tensors, "text_encoders.llm");
+    }
+
+    void alloc_params_buffer() override {
+        llm->alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        llm->free_params_buffer();
+    }
+
+    size_t get_params_buffer_size() override {
+        return llm->get_params_buffer_size();
+    }
+
+    void set_flash_attention_enabled(bool enabled) override {
+        llm->set_flash_attention_enabled(enabled);
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        llm->set_weight_adapter(adapter);
+    }
+
+    std::tuple<std::vector<int>, std::vector<float>, std::vector<int>, std::vector<float>> tokenize(std::string text) {  // returns {qwen_tokens, qwen_weights, t5_tokens, t5_weights}
+        auto parsed_attention = parse_prompt_attention(text);  // iterated below as (text segment, weight) pairs
+
+        {
+            std::stringstream ss;  // debug-only rendering of the parsed segments
+            ss << "[";
+            for (const auto& item : parsed_attention) {
+                ss << "['" << item.first << "', " << item.second << "], ";
+            }
+            ss << "]";
+            LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
+        }
+
+        std::vector<int> qwen_tokens;
+        std::vector<float> qwen_weights;
+        std::vector<int> t5_tokens;
+        std::vector<float> t5_weights;
+
+        for (const auto& item : parsed_attention) {  // Qwen pass: each segment is tokenized separately and appended
+            const std::string& curr_text = item.first;
+            std::vector<int> curr_tokens = qwen_tokenizer->tokenize(curr_text, nullptr);
+            qwen_tokens.insert(qwen_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+            // Anima uses uniform Qwen token weights.
+            qwen_weights.insert(qwen_weights.end(), curr_tokens.size(), 1.f);
+        }
+        if (qwen_tokens.empty()) {  // guarantees at least one token is handed to the LLM
+            qwen_tokens.push_back(151643);  // qwen3 pad token
+            qwen_weights.push_back(1.f);
+        }
+
+        for (const auto& item : parsed_attention) {  // T5 pass: unlike the Qwen pass, the parsed per-segment weight is kept
+            const std::string& curr_text = item.first;
+            float curr_weight            = item.second;
+            std::vector<int> curr_tokens = t5_tokenizer.Encode(curr_text, true);  // NOTE(review): meaning of the `true` flag is not visible here - confirm it is intended per segment
+            t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end());
+            t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight);
+        }
+
+        return {qwen_tokens, qwen_weights, t5_tokens, t5_weights};
+    }
+
+    SDCondition get_learned_condition(ggml_context* work_ctx,  // tokenizes the prompt, runs the LLM, and packages the result as an SDCondition
+                                      int n_threads,
+                                      const ConditionerParams& conditioner_params) override {
+        int64_t t0 = ggml_time_ms();
+
+        auto tokenized     = tokenize(conditioner_params.text);
+        auto& qwen_tokens  = std::get<0>(tokenized);
+        auto& qwen_weights = std::get<1>(tokenized);
+        auto& t5_tokens    = std::get<2>(tokenized);
+        auto& t5_weights   = std::get<3>(tokenized);
+
+        auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens);
+
+        struct ggml_tensor* hidden_states = nullptr;  // [N, n_token, 1024]
+        llm->compute(n_threads,  // NOTE(review): the return value is ignored and hidden_states is used below without a null check - confirm compute() always sets it
+                     input_ids,
+                     nullptr,
+                     {},
+                     {},
+                     &hidden_states,
+                     work_ctx);
+
+        {
+            auto tensor         = hidden_states;
+            float original_mean = ggml_ext_tensor_mean(tensor);
+            for (int i2 = 0; i2 < tensor->ne[2]; i2++) {
+                for (int i1 = 0; i1 < tensor->ne[1]; i1++) {  // i1 indexes tokens; assumes tensor->ne[1] does not exceed qwen_weights.size()
+                    for (int i0 = 0; i0 < tensor->ne[0]; i0++) {
+                        float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2);
+                        value *= qwen_weights[i1];  // tokenize() sets every Qwen weight to 1.f
+                        ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2);
+                    }
+                }
+            }
+            float new_mean = ggml_ext_tensor_mean(tensor);
+            if (new_mean != 0.f) {  // skip the rescale when it would divide by zero
+                ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean));  // rescales so the mean matches its value before weighting
+            }
+        }
+
+        struct ggml_tensor* t5_ids_tensor    = nullptr;
+        struct ggml_tensor* t5_weight_tensor = nullptr;
+        if (!t5_tokens.empty()) {  // both T5 tensors stay nullptr when there are no T5 tokens
+            t5_ids_tensor    = vector_to_ggml_tensor_i32(work_ctx, t5_tokens);
+            t5_weight_tensor = vector_to_ggml_tensor(work_ctx, t5_weights);
+        }
+
+        int64_t t1 = ggml_time_ms();
+        LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0);
+
+        return {hidden_states, t5_weight_tensor, t5_ids_tensor};  // which SDCondition fields these land in depends on its member order, not visible here
+    }
+};
+
 struct LLMEmbedder : public Conditioner {
     SDVersion version;
     std::shared_ptr<LLM::BPETokenizer> tokenizer;
diff --git a/otherarch/sdcpp/control.hpp b/otherarch/sdcpp/control.hpp
index f7842021c7b..5bab0381a3b 100644
--- a/otherarch/sdcpp/control.hpp
+++ b/otherarch/sdcpp/control.hpp
@@ -1,8 +1,7 @@
 #ifndef __CONTROL_HPP__
 #define __CONTROL_HPP__
 
-#include "common.hpp"
-#include "ggml_extend.hpp"
+#include "common_block.hpp"
 #include "model.h"
 
 #define CONTROL_NET_GRAPH_SIZE 1536
diff --git a/otherarch/sdcpp/diffusion_model.hpp b/otherarch/sdcpp/diffusion_model.hpp
index 3293ba9b702..329bb9d9a96 100644
--- a/otherarch/sdcpp/diffusion_model.hpp
+++ b/otherarch/sdcpp/diffusion_model.hpp
@@ -1,6 +1,7 @@
 #ifndef __DIFFUSION_MODEL_H__
 #define __DIFFUSION_MODEL_H__
 
+#include "anima.hpp"
 #include "flux.hpp"
 #include "mmdit.hpp"
 #include "qwen_image.hpp"
@@ -242,6 +243,72 @@ struct FluxModel : public DiffusionModel {
     }
 };
 
+struct AnimaModel : public DiffusionModel {  // DiffusionModel adapter that forwards every call to an Anima::AnimaRunner
+    std::string prefix;  // tensor-name prefix, reused by get_param_tensors()
+    Anima::AnimaRunner anima;
+
+    AnimaModel(ggml_backend_t backend,
+               bool offload_params_to_cpu,
+               const String2TensorStorage& tensor_storage_map = {},
+               const std::string prefix                       = "model.diffusion_model")
+        : prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) {
+    }
+
+    std::string get_desc() override {
+        return anima.get_desc();
+    }
+
+    void alloc_params_buffer() override {
+        anima.alloc_params_buffer();
+    }
+
+    void free_params_buffer() override {
+        anima.free_params_buffer();
+    }
+
+    void free_compute_buffer() override {
+        anima.free_compute_buffer();
+    }
+
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) override {
+        anima.get_param_tensors(tensors, prefix);
+    }
+
+    size_t get_params_buffer_size() override {
+        return anima.get_params_buffer_size();
+    }
+
+    void set_weight_adapter(const std::shared_ptr<WeightAdapter>& adapter) override {
+        anima.set_weight_adapter(adapter);
+    }
+
+    int64_t get_adm_in_channels() override {
+        return 768;  // fixed constant; it does not depend on the loaded weights
+    }
+
+    void set_flash_attention_enabled(bool enabled) {  // NOTE(review): the only member here without `override` - confirm whether DiffusionModel declares this as virtual
+        anima.set_flash_attention_enabled(enabled);
+    }
+
+    void set_circular_axes(bool circular_x, bool circular_y) override {
+        anima.set_circular_axes(circular_x, circular_y);
+    }
+
+    bool compute(int n_threads,
+                 DiffusionParams diffusion_params,
+                 struct ggml_tensor** output     = nullptr,
+                 struct ggml_context* output_ctx = nullptr) override {
+        return anima.compute(n_threads,  // only x, timesteps, context, c_concat and y are read from diffusion_params
+                             diffusion_params.x,
+                             diffusion_params.timesteps,
+                             diffusion_params.context,
+                             diffusion_params.c_concat,
+                             diffusion_params.y,
+                             output,
+                             output_ctx);
+    }
+};
+
 struct WanModel : public DiffusionModel {
     std::string prefix;
     WAN::WanRunner wan;
diff --git a/otherarch/sdcpp/flux.hpp b/otherarch/sdcpp/flux.hpp
index ff8c18997fb..1204ae1e5e9 100644
--- a/otherarch/sdcpp/flux.hpp
+++ b/otherarch/sdcpp/flux.hpp
@@ -4,7 +4,7 @@
 #include <memory>
 #include <vector>
 
-#include "ggml_extend.hpp"
+#include "common_dit.hpp"
 #include "model.h"
 #include "rope.hpp"
 
@@ -103,11 +103,13 @@ namespace Flux {
             auto norm     = std::dynamic_pointer_cast<QKNorm>(blocks["norm"]);
 
             auto qkv         = qkv_proj->forward(ctx, x);
-            auto qkv_vec     = ggml_ext_chunk(ctx->ggml_ctx, qkv, 3, 0, true);
-            int64_t head_dim = qkv_vec[0]->ne[0] / num_heads;
-            auto q           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]);
-            auto k           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]);
-            auto v           = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]);
+            int64_t head_dim = qkv->ne[0] / 3 / num_heads;
+            auto q           = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
+                                            qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0);
+            auto k           = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
+                                            qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * qkv->ne[0] / 3);
+            auto v           = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2],
+                                            qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * 2 * qkv->ne[0] / 3);
             q                = norm->query_norm(ctx, q);
             k                = norm->key_norm(ctx, k);
             return {q, k, v};
@@ -491,15 +493,14 @@ namespace Flux {
             auto x_mod   = Flux::modulate(ctx->ggml_ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale);
             auto qkv_mlp = linear1->forward(ctx, x_mod);  // [N, n_token, hidden_size * 3 + mlp_hidden_dim*mlp_mult_factor]
 
-            auto q = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], 0);
-            auto k = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * qkv_mlp->nb[0]);
-            auto v = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * 2 * qkv_mlp->nb[0]);
-
             int64_t head_dim = hidden_size / num_heads;
 
-            q = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, q), head_dim, num_heads, q->ne[1], q->ne[2]);  // [N, n_token, n_head, d_head]
-            k = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, k), head_dim, num_heads, k->ne[1], k->ne[2]);  // [N, n_token, n_head, d_head]
-            v = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, v), head_dim, num_heads, v->ne[1], v->ne[2]);  // [N, n_token, n_head, d_head]
+            auto q = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
+                                  qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], 0);
+            auto k = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
+                                  qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * hidden_size);
+            auto v = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2],
+                                  qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * 2 * hidden_size);
 
             q         = norm->query_norm(ctx, q);
             k         = norm->key_norm(ctx, k);
@@ -846,70 +847,6 @@ namespace Flux {
             }
         }
 
-        struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
-                                              struct ggml_tensor* x) {
-            int64_t W = x->ne[0];
-            int64_t H = x->ne[1];
-
-            int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
-            int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
-            x         = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
-            return x;
-        }
-
-        struct ggml_tensor* patchify(struct ggml_context* ctx,
-                                     struct ggml_tensor* x) {
-            // x: [N, C, H, W]
-            // return: [N, h*w, C * patch_size * patch_size]
-            int64_t N = x->ne[3];
-            int64_t C = x->ne[2];
-            int64_t H = x->ne[1];
-            int64_t W = x->ne[0];
-            int64_t p = params.patch_size;
-            int64_t h = H / params.patch_size;
-            int64_t w = W / params.patch_size;
-
-            GGML_ASSERT(h * p == H && w * p == W);
-
-            x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N);       // [N*C*h, p, w, p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, w, p, p]
-            x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N);       // [N, C, h*w, p*p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, h*w, C, p*p]
-            x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N);      // [N, h*w, C*p*p]
-            return x;
-        }
-
-        struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
-                                        struct ggml_tensor* x) {
-            // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
-            x = pad_to_patch_size(ctx, x);
-            x = patchify(ctx->ggml_ctx, x);
-            return x;
-        }
-
-        struct ggml_tensor* unpatchify(struct ggml_context* ctx,
-                                       struct ggml_tensor* x,
-                                       int64_t h,
-                                       int64_t w) {
-            // x: [N, h*w, C*patch_size*patch_size]
-            // return: [N, C, H, W]
-            int64_t N = x->ne[2];
-            int64_t C = x->ne[0] / params.patch_size / params.patch_size;
-            int64_t H = h * params.patch_size;
-            int64_t W = w * params.patch_size;
-            int64_t p = params.patch_size;
-
-            GGML_ASSERT(C * p * p == x->ne[0]);
-
-            x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N);       // [N, h*w, C, p*p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, C, h*w, p*p]
-            x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N);       // [N*C*h, w, p, p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, p, w, p]
-            x = ggml_reshape_4d(ctx, x, W, H, C, N);               // [N, C, h*p, w*p]
-
-            return x;
-        }
-
         struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
                                          struct ggml_tensor* img,
                                          struct ggml_tensor* txt,
@@ -1060,7 +997,7 @@ namespace Flux {
             int pad_h      = (patch_size - H % patch_size) % patch_size;
             int pad_w      = (patch_size - W % patch_size) % patch_size;
 
-            auto img      = pad_to_patch_size(ctx, x);
+            auto img      = DiT::pad_to_patch_size(ctx, x, params.patch_size, params.patch_size);
             auto orig_img = img;
 
             if (params.chroma_radiance_params.fake_patch_size_x2) {
@@ -1082,7 +1019,7 @@ namespace Flux {
             auto nerf_image_embedder   = std::dynamic_pointer_cast<NerfEmbedder>(blocks["nerf_image_embedder"]);
             auto nerf_final_layer_conv = std::dynamic_pointer_cast<NerfFinalLayerConv>(blocks["nerf_final_layer_conv"]);
 
-            auto nerf_pixels    = patchify(ctx->ggml_ctx, orig_img);  // [N, num_patches, C * patch_size * patch_size]
+            auto nerf_pixels    = DiT::patchify(ctx->ggml_ctx, orig_img, patch_size, patch_size);  // [N, num_patches, C * patch_size * patch_size]
             int64_t num_patches = nerf_pixels->ne[1];
             nerf_pixels         = ggml_reshape_3d(ctx->ggml_ctx,
                                                   nerf_pixels,
@@ -1102,7 +1039,7 @@ namespace Flux {
 
             img_dct = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img_dct, 1, 0, 2, 3));                                 // [N*num_patches, nerf_hidden_size, patch_size*patch_size]
             img_dct = ggml_reshape_3d(ctx->ggml_ctx, img_dct, img_dct->ne[0] * img_dct->ne[1], num_patches, img_dct->ne[2] / num_patches);  // [N, num_patches, nerf_hidden_size*patch_size*patch_size]
-            img_dct = unpatchify(ctx->ggml_ctx, img_dct, (H + pad_h) / patch_size, (W + pad_w) / patch_size);                               // [N, nerf_hidden_size, H, W]
+            img_dct = DiT::unpatchify(ctx->ggml_ctx, img_dct, (H + pad_h) / patch_size, (W + pad_w) / patch_size, patch_size, patch_size);  // [N, nerf_hidden_size, H, W]
 
             out = nerf_final_layer_conv->forward(ctx, img_dct);  // [N, C, H, W]
 
@@ -1134,7 +1071,7 @@ namespace Flux {
             int pad_h      = (patch_size - H % patch_size) % patch_size;
             int pad_w      = (patch_size - W % patch_size) % patch_size;
 
-            auto img           = process_img(ctx, x);
+            auto img           = DiT::pad_and_patchify(ctx, x, patch_size, patch_size);
             int64_t img_tokens = img->ne[1];
 
             if (params.version == VERSION_FLUX_FILL) {
@@ -1142,8 +1079,8 @@ namespace Flux {
                 ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
                 ggml_tensor* mask   = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
 
-                masked = process_img(ctx, masked);
-                mask   = process_img(ctx, mask);
+                masked = DiT::pad_and_patchify(ctx, masked, patch_size, patch_size);
+                mask   = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size);
 
                 img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0);
             } else if (params.version == VERSION_FLEX_2) {
@@ -1152,21 +1089,21 @@ namespace Flux {
                 ggml_tensor* mask    = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
                 ggml_tensor* control = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1));
 
-                masked  = process_img(ctx, masked);
-                mask    = process_img(ctx, mask);
-                control = process_img(ctx, control);
+                masked  = DiT::pad_and_patchify(ctx, masked, patch_size, patch_size);
+                mask    = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size);
+                control = DiT::pad_and_patchify(ctx, control, patch_size, patch_size);
 
                 img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0);
             } else if (params.version == VERSION_FLUX_CONTROLS) {
                 GGML_ASSERT(c_concat != nullptr);
 
-                auto control = process_img(ctx, c_concat);
+                auto control = DiT::pad_and_patchify(ctx, c_concat, patch_size, patch_size);
                 img          = ggml_concat(ctx->ggml_ctx, img, control, 0);
             }
 
             if (ref_latents.size() > 0) {
                 for (ggml_tensor* ref : ref_latents) {
-                    ref = process_img(ctx, ref);
+                    ref = DiT::pad_and_patchify(ctx, ref, patch_size, patch_size);
                     img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                 }
             }
@@ -1178,8 +1115,7 @@ namespace Flux {
                 out = ggml_cont(ctx->ggml_ctx, out);
             }
 
-            // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
-            out = unpatchify(ctx->ggml_ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size);  // [N, C, H + pad_h, W + pad_w]
+            out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, patch_size, patch_size);  // [N, C, H, W]
             return out;
         }
 
diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp
index cac79bb2165..5265aeed46b 100644
--- a/otherarch/sdcpp/ggml_extend.hpp
+++ b/otherarch/sdcpp/ggml_extend.hpp
@@ -1219,6 +1219,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros(struct ggml_context* ctx,
     return ggml_ext_full(ctx, 0.f, ne0, ne1, ne2, ne3);
 }
 
+__STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros_like(struct ggml_context* ctx,  // convenience wrapper over ggml_ext_zeros
+                                                          struct ggml_tensor* x) {  // x: only its ne[0..3] extents are read
+    return ggml_ext_zeros(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]);  // forwards x's four extents unchanged
+}
+
 __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
                                                     int64_t ne0,
                                                     int64_t ne1,
@@ -1227,6 +1232,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx,
     return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3);
 }
 
+__STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones_like(struct ggml_context* ctx,  // convenience wrapper over ggml_ext_ones
+                                                         struct ggml_tensor* x) {  // x: only its ne[0..3] extents are read
+    return ggml_ext_ones(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]);  // forwards x's four extents unchanged
+}
+
 __STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) {
 #ifdef SD_USE_VULKAN
     auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int");
diff --git a/otherarch/sdcpp/ltxv.hpp b/otherarch/sdcpp/ltxv.hpp
index 0a2877a8639..9dcdd4b2058 100644
--- a/otherarch/sdcpp/ltxv.hpp
+++ b/otherarch/sdcpp/ltxv.hpp
@@ -1,8 +1,7 @@
 #ifndef __LTXV_HPP__
 #define __LTXV_HPP__
 
-#include "common.hpp"
-#include "ggml_extend.hpp"
+#include "common_block.hpp"
 
 namespace LTXV {
 
diff --git a/otherarch/sdcpp/mmdit.hpp b/otherarch/sdcpp/mmdit.hpp
index 726f60c2f0b..ba1c35d66e4 100644
--- a/otherarch/sdcpp/mmdit.hpp
+++ b/otherarch/sdcpp/mmdit.hpp
@@ -745,28 +745,6 @@ struct MMDiT : public GGMLBlock {
         return spatial_pos_embed;
     }
 
-    struct ggml_tensor* unpatchify(struct ggml_context* ctx,
-                                   struct ggml_tensor* x,
-                                   int64_t h,
-                                   int64_t w) {
-        // x: [N, H*W, patch_size * patch_size * C]
-        // return: [N, C, H, W]
-        int64_t n = x->ne[2];
-        int64_t c = out_channels;
-        int64_t p = patch_size;
-        h         = (h + 1) / p;
-        w         = (w + 1) / p;
-
-        GGML_ASSERT(h * w == x->ne[1]);
-
-        x = ggml_reshape_4d(ctx, x, c, p * p, w * h, n);       // [N, H*W, P*P, C]
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3));  // [N, C, H*W, P*P]
-        x = ggml_reshape_4d(ctx, x, p, p, w, h * c * n);       // [N*C*H, W, P, P]
-        x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*H, P, W, P]
-        x = ggml_reshape_4d(ctx, x, p * w, p * h, c, n);       // [N, C, H*P, W*P]
-        return x;
-    }
-
     struct ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx,
                                                  struct ggml_tensor* x,
                                                  struct ggml_tensor* c_mod,
@@ -811,11 +789,11 @@ struct MMDiT : public GGMLBlock {
         auto x_embedder = std::dynamic_pointer_cast<PatchEmbed>(blocks["x_embedder"]);
         auto t_embedder = std::dynamic_pointer_cast<TimestepEmbedder>(blocks["t_embedder"]);
 
-        int64_t w = x->ne[0];
-        int64_t h = x->ne[1];
+        int64_t W = x->ne[0];
+        int64_t H = x->ne[1];
 
         auto patch_embed = x_embedder->forward(ctx, x);                      // [N, H*W, hidden_size]
-        auto pos_embed   = cropped_pos_embed(ctx->ggml_ctx, h, w);           // [1, H*W, hidden_size]
+        auto pos_embed   = cropped_pos_embed(ctx->ggml_ctx, H, W);           // [1, H*W, hidden_size]
         x                = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed);  // [N, H*W, hidden_size]
 
         auto c = t_embedder->forward(ctx, t);  // [N, hidden_size]
@@ -834,7 +812,7 @@ struct MMDiT : public GGMLBlock {
 
         x = forward_core_with_concat(ctx, x, c, context, skip_layers);  // (N, H*W, patch_size ** 2 * out_channels)
 
-        x = unpatchify(ctx->ggml_ctx, x, h, w);  // [N, C, H, W]
+        x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, /*patch_last*/ false);  // [N, C, H, W]
 
         return x;
     }
diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp
index ef1752d1d5f..eee00618fc2 100644
--- a/otherarch/sdcpp/model.cpp
+++ b/otherarch/sdcpp/model.cpp
@@ -1083,6 +1083,9 @@ SDVersion ModelLoader::get_sd_version() {
             if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) {
                 return VERSION_QWEN_IMAGE;
             }
+            if (tensor_storage.name.find("llm_adapter.blocks.0.cross_attn.q_proj.weight") != std::string::npos) {
+                return VERSION_ANIMA;
+            }
             if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) {
                 is_flux2 = true;
             }
diff --git a/otherarch/sdcpp/model.h b/otherarch/sdcpp/model.h
index 66b347ab8d4..afa20e8c626 100644
--- a/otherarch/sdcpp/model.h
+++ b/otherarch/sdcpp/model.h
@@ -45,6 +45,7 @@ enum SDVersion {
     VERSION_WAN2_2_I2V,
     VERSION_WAN2_2_TI2V,
     VERSION_QWEN_IMAGE,
+    VERSION_ANIMA,
     VERSION_FLUX2,
     VERSION_FLUX2_KLEIN,
     VERSION_Z_IMAGE,
@@ -122,6 +123,13 @@ static inline bool sd_version_is_qwen_image(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_anima(SDVersion version) {  // predicate: is `version` the Anima model family?
+    if (version == VERSION_ANIMA) {  // the family currently has a single enumerator
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_z_image(SDVersion version) {
     if (version == VERSION_Z_IMAGE) {
         return true;
@@ -146,6 +154,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
         sd_version_is_sd3(version) ||
         sd_version_is_wan(version) ||
         sd_version_is_qwen_image(version) ||
+        sd_version_is_anima(version) ||
         sd_version_is_z_image(version)) {
         return true;
     }
diff --git a/otherarch/sdcpp/name_conversion.cpp b/otherarch/sdcpp/name_conversion.cpp
index d3e863b8a86..3b3abfb63e7 100644
--- a/otherarch/sdcpp/name_conversion.cpp
+++ b/otherarch/sdcpp/name_conversion.cpp
@@ -653,6 +653,14 @@ std::string convert_diffusers_dit_to_original_lumina2(std::string name) {
     return name;
 }
 
+std::string convert_other_dit_to_original_anima(std::string name) {  // returns `name`, prefixed with "net." when starts_with() says it is missing
+    static const std::string anima_net_prefix = "net.";
+    if (!starts_with(name, anima_net_prefix)) {  // names that already carry the prefix pass through unchanged
+        name = anima_net_prefix + name;
+    }
+    return name;
+}
+
 std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) {
     if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) {
         name = convert_diffusers_unet_to_original_sd1(name);
@@ -664,6 +672,8 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S
         name = convert_diffusers_dit_to_original_flux(name);
     } else if (sd_version_is_z_image(version)) {
         name = convert_diffusers_dit_to_original_lumina2(name);
+    } else if (sd_version_is_anima(version)) {
+        name = convert_other_dit_to_original_anima(name);
     }
     return name;
 }
diff --git a/otherarch/sdcpp/qwen_image.hpp b/otherarch/sdcpp/qwen_image.hpp
index 3044eb45680..2c70344cc4c 100644
--- a/otherarch/sdcpp/qwen_image.hpp
+++ b/otherarch/sdcpp/qwen_image.hpp
@@ -3,9 +3,8 @@
 
 #include <memory>
 
-#include "common.hpp"
+#include "common_block.hpp"
 #include "flux.hpp"
-#include "ggml_extend.hpp"
 
 namespace Qwen {
     constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480;
@@ -390,69 +389,6 @@ namespace Qwen {
             blocks["proj_out"] = std::shared_ptr<GGMLBlock>(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels));
         }
 
-        struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
-                                              struct ggml_tensor* x) {
-            int64_t W = x->ne[0];
-            int64_t H = x->ne[1];
-
-            int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size;
-            int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size;
-            x         = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
-            return x;
-        }
-
-        struct ggml_tensor* patchify(struct ggml_context* ctx,
-                                     struct ggml_tensor* x) {
-            // x: [N, C, H, W]
-            // return: [N, h*w, C * patch_size * patch_size]
-            int64_t N = x->ne[3];
-            int64_t C = x->ne[2];
-            int64_t H = x->ne[1];
-            int64_t W = x->ne[0];
-            int64_t p = params.patch_size;
-            int64_t h = H / params.patch_size;
-            int64_t w = W / params.patch_size;
-
-            GGML_ASSERT(h * p == H && w * p == W);
-
-            x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N);       // [N*C*h, p, w, p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, w, p, p]
-            x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N);       // [N, C, h*w, p*p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, h*w, C, p*p]
-            x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N);      // [N, h*w, C*p*p]
-            return x;
-        }
-
-        struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
-                                        struct ggml_tensor* x) {
-            x = pad_to_patch_size(ctx, x);
-            x = patchify(ctx->ggml_ctx, x);
-            return x;
-        }
-
-        struct ggml_tensor* unpatchify(struct ggml_context* ctx,
-                                       struct ggml_tensor* x,
-                                       int64_t h,
-                                       int64_t w) {
-            // x: [N, h*w, C*patch_size*patch_size]
-            // return: [N, C, H, W]
-            int64_t N = x->ne[2];
-            int64_t C = x->ne[0] / params.patch_size / params.patch_size;
-            int64_t H = h * params.patch_size;
-            int64_t W = w * params.patch_size;
-            int64_t p = params.patch_size;
-
-            GGML_ASSERT(C * p * p == x->ne[0]);
-
-            x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N);       // [N, h*w, C, p*p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N, C, h*w, p*p]
-            x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N);       // [N*C*h, w, p, p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));  // [N*C*h, p, w, p]
-            x = ggml_reshape_4d(ctx, x, W, H, C, N);               // [N, C, h*p, w*p]
-
-            return x;
-        }
-
         struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx,
                                          struct ggml_tensor* x,
                                          struct ggml_tensor* timestep,
@@ -468,7 +404,7 @@ namespace Qwen {
 
             auto t_emb = time_text_embed->forward(ctx, timestep);
             if (params.zero_cond_t) {
-                auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros(ctx->ggml_ctx, timestep->ne[0], timestep->ne[1], timestep->ne[2], timestep->ne[3]));
+                auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep));
                 t_emb        = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1);
             }
             auto img = img_in->forward(ctx, x);
@@ -512,19 +448,16 @@ namespace Qwen {
             int64_t C = x->ne[2];
             int64_t N = x->ne[3];
 
-            auto img           = process_img(ctx, x);
+            auto img           = DiT::pad_and_patchify(ctx, x, params.patch_size, params.patch_size);
             int64_t img_tokens = img->ne[1];
 
             if (ref_latents.size() > 0) {
                 for (ggml_tensor* ref : ref_latents) {
-                    ref = process_img(ctx, ref);
+                    ref = DiT::pad_and_patchify(ctx, ref, params.patch_size, params.patch_size);
                     img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                 }
             }
 
-            int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size);
-            int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size);
-
             auto out = forward_orig(ctx, img, timestep, context, pe, modulate_index);  // [N, h_len*w_len, ph*pw*C]
 
             if (out->ne[1] > img_tokens) {
@@ -533,11 +466,7 @@ namespace Qwen {
                 out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3));  // [N, h*w, C * patch_size * patch_size]
             }
 
-            out = unpatchify(ctx->ggml_ctx, out, h_len, w_len);  // [N, C, H + pad_h, W + pad_w]
-
-            // slice
-            out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H);  // [N, C, H, W + pad_w]
-            out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W);  // [N, C, H, W]
+            out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, params.patch_size, params.patch_size);  // [N, C, H, W]
 
             return out;
         }
diff --git a/otherarch/sdcpp/rope.hpp b/otherarch/sdcpp/rope.hpp
index 45e88c831c8..b26e4fccd30 100644
--- a/otherarch/sdcpp/rope.hpp
+++ b/otherarch/sdcpp/rope.hpp
@@ -43,7 +43,7 @@ namespace Rope {
 
     __STATIC_INLINE__ std::vector<std::vector<float>> rope(const std::vector<float>& pos,
                                                            int dim,
-                                                           int theta,
+                                                           float theta,
                                                            const std::vector<int>& axis_wrap_dims = {}) {
         assert(dim % 2 == 0);
         int half_dim = dim / 2;
@@ -167,7 +167,7 @@ namespace Rope {
 
     __STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,
                                                   int bs,
-                                                  int theta,
+                                                  const std::vector<float>& axis_thetas,
                                                   const std::vector<int>& axes_dim,
                                                   const std::vector<std::vector<int>>& wrap_dims = {}) {
         std::vector<std::vector<float>> trans_ids = transpose(ids);
@@ -188,8 +188,12 @@ namespace Rope {
             if (!wrap_dims.empty() && i < (int)wrap_dims.size()) {
                 axis_wrap_dims = wrap_dims[i];
             }
+            float axis_theta = 10000.0f;  // fallback base used when no per-axis thetas are supplied
+            if (!axis_thetas.empty()) {
+                axis_theta = axis_thetas[std::min<size_t>(i, axis_thetas.size() - 1)];  // explicit <size_t> so this compiles whether i is int or size_t; axes past the end reuse the last theta
+            }
             std::vector<std::vector<float>> rope_emb =
-                rope(trans_ids[i], axes_dim[i], theta, axis_wrap_dims);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
+                rope(trans_ids[i], axes_dim[i], axis_theta, axis_wrap_dims);  // [bs*pos_len, axes_dim[i]/2 * 2 * 2]
             for (int b = 0; b < bs; ++b) {
                 for (int j = 0; j < pos_len; ++j) {
                     for (int k = 0; k < rope_emb[0].size(); ++k) {
@@ -203,6 +207,15 @@ namespace Rope {
         return flatten(emb);
     }
 
+    __STATIC_INLINE__ std::vector<float> embed_nd(const std::vector<std::vector<float>>& ids,  // single-theta overload of embed_nd
+                                                  int bs,
+                                                  float theta,  // one base value applied to every axis
+                                                  const std::vector<int>& axes_dim,
+                                                  const std::vector<std::vector<int>>& wrap_dims = {}) {
+        std::vector<float> axis_thetas(axes_dim.size(), theta);  // one copy of theta per entry in axes_dim
+        return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims);  // delegate to the overload that takes a theta vector
+    }
+
     __STATIC_INLINE__ std::vector<std::vector<float>> gen_refs_ids(int patch_size,
                                                                    int bs,
                                                                    int axes_dim_num,
@@ -332,7 +345,7 @@ namespace Rope {
                 }
             }
         }
-        return embed_nd(ids, bs, theta, axes_dim, wrap_dims);
+        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
     }
 
     __STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen_image_ids(int h,
@@ -421,7 +434,7 @@ namespace Rope {
                 }
             }
         }
-        return embed_nd(ids, bs, theta, axes_dim, wrap_dims);
+        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
     }
 
     __STATIC_INLINE__ std::vector<std::vector<float>> gen_vid_ids(int t,
@@ -475,7 +488,7 @@ namespace Rope {
                                                     int theta,
                                                     const std::vector<int>& axes_dim) {
         std::vector<std::vector<float>> ids = gen_vid_ids(t, h, w, pt, ph, pw, bs);
-        return embed_nd(ids, bs, theta, axes_dim);
+        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim);
     }
 
     __STATIC_INLINE__ std::vector<std::vector<float>> gen_qwen2vl_ids(int grid_h,
@@ -511,7 +524,7 @@ namespace Rope {
                                                         int theta,
                                                         const std::vector<int>& axes_dim) {
         std::vector<std::vector<float>> ids = gen_qwen2vl_ids(grid_h, grid_w, merge_size, window_index);
-        return embed_nd(ids, 1, theta, axes_dim);
+        return embed_nd(ids, 1, static_cast<float>(theta), axes_dim);
     }
 
     __STATIC_INLINE__ int bound_mod(int a, int m) {
@@ -584,7 +597,7 @@ namespace Rope {
             }
         }
 
-        return embed_nd(ids, bs, theta, axes_dim, wrap_dims);
+        return embed_nd(ids, bs, static_cast<float>(theta), axes_dim, wrap_dims);
     }
 
     __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx,
diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp
index 9fdb2747ce9..9491084d912 100644
--- a/otherarch/sdcpp/sdtype_adapter.cpp
+++ b/otherarch/sdcpp/sdtype_adapter.cpp
@@ -80,8 +80,8 @@ struct SDParams {
     bool chroma_use_dit_mask     = true;
 
     std::vector<std::string> lora_paths;
-    std::vector<sd_lora_t> lora_specs;
-    uint32_t lora_count;
+    std::vector<float> lora_multipliers;
+    bool lora_dynamic = false;
 };
 
 //shared
@@ -208,14 +208,12 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     set_sd_quiet(sd_is_quiet);
     executable_path = inputs.executable_path;
     std::string taesdpath = "";
-    std::vector<std::string> lorafilenames;
-    for(int i=0;i<lora_filenames_max;++i)
+    std::vector<std::string> lora_paths;
+    std::vector<float> lora_multipliers;
+    for(int i=0;i<inputs.lora_len;++i)
     {
-        std::string temp = inputs.lora_filenames[i];
-        if(temp!="")
-        {
-            lorafilenames.push_back(temp);
-        }
+        lora_paths.push_back(inputs.lora_filenames[i]);
+        lora_multipliers.push_back(inputs.lora_multipliers[i]);
     }
     std::string vaefilename = inputs.vae_filename;
     std::string t5xxl_filename = inputs.t5xxl_filename;
@@ -230,19 +228,28 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     cfg_square_limit = inputs.img_soft_limit;
     printf("\nImageGen Init - Load Model: %s\n",inputs.model_filename);
 
-    int lora_apply_mode = std::max(0, std::min(2, inputs.lora_apply_mode));
+    int lora_apply_mode = LORA_APPLY_AT_RUNTIME;
+    bool lora_dynamic = false;
+    if(inputs.lora_apply_mode >= 0 && inputs.lora_apply_mode <= 2) {
+        lora_apply_mode = inputs.lora_apply_mode;
+    }
+    else if(inputs.lora_apply_mode == 3) {
+        lora_dynamic = true;
+    }
 
-    if(lorafilenames.size()>0)
+    if(lora_paths.size() > 0)
     {
-        for(int i=0;i<lorafilenames.size();++i)
+        const char* lora_apply_mode_name = lora_apply_mode == 1 ? "immediately"
+                                         : lora_apply_mode == 2 ? "at runtime"
+                                         : "auto";
+        const char * lora_dynamic_name = lora_dynamic ? " (dynamic)" : "";
+        printf("With LoRAs in apply mode %s%s:\n", lora_apply_mode_name, lora_dynamic_name);
+        for(int i=0;i<lora_paths.size();++i)
         {
-            const char* lora_apply_mode_name = lora_apply_mode == 1 ? "immediately"
-                                            : lora_apply_mode == 2 ? "at runtime"
-                                            : "auto";
-            printf("With LoRA: %s at %f power, apply mode: %s\n",
-                lorafilenames[i].c_str(),inputs.lora_multiplier,lora_apply_mode_name);
+            printf("  %s at %f power\n", lora_paths[i].c_str(),lora_multipliers[i]);
         }
     }
+
     if(inputs.taesd)
     {
         taesdpath = executable_path + "embd_res/taesd.embd";
@@ -327,7 +334,9 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     sd_params->clip_l_path = clip1_filename;
     sd_params->clip_g_path = clip2_filename;
     sd_params->stacked_id_embeddings_path = photomaker_filename;
-    sd_params->lora_paths = lorafilenames;
+    sd_params->lora_paths = lora_paths;
+    sd_params->lora_multipliers = lora_multipliers;
+    sd_params->lora_dynamic = lora_dynamic;
     //if t5 is set, and model is a gguf, load it as a diffusion model path
     bool endswithgguf = (sd_params->model_path.rfind(".gguf") == sd_params->model_path.size() - 5);
     if((sd_params->t5xxl_path!="" || sd_params->clip_l_path!="" || sd_params->clip_g_path!="") && endswithgguf)
@@ -416,21 +425,22 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) {
     std::filesystem::path mpath(inputs.model_filename);
     sdmodelfilename = mpath.filename().string();
 
-    sd_params->lora_specs.clear();
-    sd_params->lora_specs.reserve(lora_filenames_max*2);
+    // preload the LoRAs with the initial multipliers
+    std::vector<sd_lora_t> lora_specs;
     for(int i=0;i<sd_params->lora_paths.size();++i)
     {
+        if (!lora_dynamic && sd_params->lora_multipliers[i] == 0.)
+            continue;
         sd_lora_t spec = {};
         spec.path = sd_params->lora_paths[i].c_str();
-        spec.multiplier = inputs.lora_multiplier;
-        sd_params->lora_specs.push_back(spec);
+        spec.multiplier = sd_params->lora_multipliers[i];
+        lora_specs.push_back(spec);
     }
 
-    if(sd_params->lora_specs.size()>0 && inputs.lora_multiplier>0)
+    if(lora_specs.size()>0)
     {
-        printf("\nApply %d LoRAs...\n",sd_params->lora_specs.size());
-        sd_params->lora_count = sd_params->lora_specs.size();
-        sd_ctx->sd->apply_loras(sd_params->lora_specs.data(), sd_params->lora_count);
+        printf("  applying %d LoRAs...\n", static_cast<int>(lora_specs.size()));  // size() is size_t; cast so the argument matches %d
+        sd_ctx->sd->apply_loras(lora_specs.data(), lora_specs.size());
     }
 
     input_extraimage_buffers.reserve(max_extra_images);
@@ -1034,10 +1044,34 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs)
     params.vae_tiling_params.enabled = dotile;
     params.batch_count = 1;
 
-    // needs to be "reapplied" because sdcpp tracks previously applied LoRAs
-    // and weights, and apply/unapply the differences at each gen
-    params.loras = sd_params->lora_specs.data();
-    params.lora_count = sd_params->lora_count;
+    std::vector<sd_lora_t> lora_specs; // LoRAs (path + weight) passed to sdcpp for this generation
+    for(size_t i=0;i<sd_params->lora_paths.size();++i)
+    {
+        float multiplier = sd_params->lora_multipliers[i]; // weight stored at load time
+        if (sd_params->lora_dynamic) { // per-request weights; lora_len is a signed int, so reject null/negative before the unsigned compare
+            multiplier = (inputs.lora_multipliers != nullptr && inputs.lora_len > 0 && i < static_cast<size_t>(inputs.lora_len)) ? inputs.lora_multipliers[i] : 0.f;
+        }
+        if (multiplier != 0.f) { // a zero weight leaves the LoRA out of this generation's list
+            sd_lora_t spec = {};
+            spec.path = sd_params->lora_paths[i].c_str();
+            spec.multiplier = multiplier;
+            lora_specs.push_back(spec);
+        }
+    }
+    if(!sd_is_quiet && sddebugmode==1) {
+        if (lora_specs.size() > 0) {
+            printf("Applying LoRAs:\n");
+            for(size_t i=0;i<lora_specs.size();++i)
+            {
+                printf("  %s @ %.3f\n", lora_specs[i].path, lora_specs[i].multiplier);
+            }
+        }
+    }
+
+    // note: sdcpp tracks the previously applied LoRAs and their weights,
+    // and applies/unapplies the differences at each gen
+    params.loras = lora_specs.data();
+    params.lora_count = lora_specs.size();
 
     params.ref_images = reference_imgs.data();
     params.ref_images_count = reference_imgs.size();
diff --git a/otherarch/sdcpp/stable-diffusion.cpp b/otherarch/sdcpp/stable-diffusion.cpp
index 773ff731db5..e1983a23d80 100644
--- a/otherarch/sdcpp/stable-diffusion.cpp
+++ b/otherarch/sdcpp/stable-diffusion.cpp
@@ -50,6 +50,7 @@ const char* model_version_to_str[] = {
     "Wan 2.2 I2V",
     "Wan 2.2 TI2V",
     "Qwen Image",
+    "Anima",
     "Flux.2",
     "Flux.2 klein",
     "Z-Image",
@@ -133,6 +134,7 @@ class StableDiffusionGGML {
     std::vector<std::shared_ptr<LoraModel>> diffusion_lora_models;
     std::vector<std::shared_ptr<LoraModel>> first_stage_lora_models;
     bool apply_lora_immediately = false;
+    std::map<std::string, std::shared_ptr<LoraModel>> kcpp_lora_cache;
 
     std::string taesd_path;
     bool use_tiny_autoencoder            = false;
@@ -310,15 +312,30 @@ class StableDiffusionGGML {
             }
         }
 
+        if (tempver == VERSION_ANIMA &&
+            strlen(SAFE_STR(sd_ctx_params->model_path)) > 0 &&
+            strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) == 0 &&
+            !model_loader.has_diffusion_model_tensors()
+            )
+        {
+            LOG_INFO("Anima: SD Diffusion Model tensors missing! Fallback trying alternative tensor names...\n");
+            if (!model_loader.init_from_file(sd_ctx_params->model_path, "model.diffusion_model.")) {
+                LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->model_path);
+            }
+            tempver = model_loader.get_sd_version();
+        }
+
         bool iswan = (tempver==VERSION_WAN2 || tempver==VERSION_WAN2_2_I2V || tempver==VERSION_WAN2_2_TI2V);
         bool isqwenimg = (tempver==VERSION_QWEN_IMAGE);
         bool iszimg = (tempver==VERSION_Z_IMAGE);
         bool isflux2 = (tempver==VERSION_FLUX2);
         bool isflux2k = (tempver==VERSION_FLUX2_KLEIN);
         bool is_ovis =  (tempver==VERSION_OVIS_IMAGE);
+        bool is_anima = (tempver==VERSION_ANIMA);
+        bool conditioner_is_llm = (isqwenimg||iszimg||isflux2||isflux2k||is_ovis||is_anima);
 
         //kcpp qol fallback: if qwen image, and they loaded the qwen2vl llm as t5 by mistake
-        if((isqwenimg||iszimg||isflux2||isflux2k||is_ovis) && t5_path_fixed!="")
+        if(conditioner_is_llm && t5_path_fixed!="")
         {
             if(clipl_path_fixed=="" && clipg_path_fixed=="")
             {
@@ -350,7 +367,7 @@ class StableDiffusionGGML {
                 prefix = "cond_stage_model.transformer.";
                 LOG_INFO("swap clip_vision from '%s'", clipl_path_fixed.c_str());
             }
-            if(isqwenimg||iszimg||isflux2||isflux2k||is_ovis)
+            if(conditioner_is_llm)
             {
                 prefix = "text_encoders.llm.";
                 LOG_INFO("swap llm from '%s'", clipl_path_fixed.c_str());
@@ -452,7 +469,7 @@ class StableDiffusionGGML {
             {
                 to_replace = "taesd_f2.embd";
             }
-            else if((sd_version_is_wan(version) && version != VERSION_WAN2_2_TI2V)||sd_version_is_qwen_image(version))
+            else if((sd_version_is_wan(version) && version != VERSION_WAN2_2_TI2V)||sd_version_is_qwen_image(version)||sd_version_is_anima(version))
             {
                 to_replace = "taesd_w21.embd";
             }
@@ -545,6 +562,7 @@ class StableDiffusionGGML {
             shift_factor = 0.1159f;
         } else if (sd_version_is_wan(version) ||
                    sd_version_is_qwen_image(version) ||
+                   sd_version_is_anima(version) ||
                    sd_version_is_flux2(version)) {
             scale_factor = 1.0f;
             shift_factor = 0.f;
@@ -675,6 +693,14 @@ class StableDiffusionGGML {
                                                                    "model.diffusion_model",
                                                                    version,
                                                                    sd_ctx_params->qwen_image_zero_cond_t);
+            } else if (sd_version_is_anima(version)) {
+                cond_stage_model = std::make_shared<AnimaConditioner>(clip_backend,
+                                                                      offload_params_to_cpu,
+                                                                      tensor_storage_map);
+                diffusion_model  = std::make_shared<AnimaModel>(backend,
+                                                               offload_params_to_cpu,
+                                                               tensor_storage_map,
+                                                               "model.diffusion_model");
             } else if (sd_version_is_z_image(version)) {
                 cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
                                                                  offload_params_to_cpu,
@@ -737,7 +763,7 @@ class StableDiffusionGGML {
             }
 
             if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) {
-                if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
+                if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                     first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
                                                                             offload_params_to_cpu,
                                                                             tensor_storage_map,
@@ -775,7 +801,7 @@ class StableDiffusionGGML {
                 }
             }
             if (use_tiny_autoencoder || version == VERSION_SDXS) {
-                if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
+                if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                     tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend,
                                                                              offload_params_to_cpu,
                                                                              tensor_storage_map,
@@ -1051,6 +1077,7 @@ class StableDiffusionGGML {
                 } else if (sd_version_is_sd3(version) ||
                            sd_version_is_wan(version) ||
                            sd_version_is_qwen_image(version) ||
+                           sd_version_is_anima(version) ||
                            sd_version_is_z_image(version)) {
                     pred_type = FLOW_PRED;
                     if (sd_version_is_wan(version)) {
@@ -1167,7 +1194,23 @@ class StableDiffusionGGML {
     std::shared_ptr<LoraModel> load_lora_model_from_file(const std::string& lora_id,
                                                          float multiplier,
                                                          ggml_backend_t backend,
+                                                         std::string stage = "",
                                                          LoraModel::filter_t lora_tensor_filter = nullptr) {
+        // kcpp
+        // first check the cache
+        bool kcpp_at_runtime = (stage != "");
+        std::string lora_key = "|" + stage + "|" + lora_id;
+        if (kcpp_at_runtime) {
+            auto it = kcpp_lora_cache.find(lora_key);
+            if (it != kcpp_lora_cache.end()) {
+                if (it->second) {
+                    it->second->multiplier = multiplier;
+                }
+                return it->second;
+            }
+        }
+        // by construction, kcpp will always find the preloaded LoRAs on the cache
+
         std::string lora_path             = lora_id;
         static std::string high_noise_tag = "|high_noise|";
         bool is_high_noise                = false;
@@ -1179,10 +1222,16 @@ class StableDiffusionGGML {
         auto lora = std::make_shared<LoraModel>(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version);
         if (!lora->load_from_file(n_threads, lora_tensor_filter)) {
             LOG_WARN("load lora tensors from %s failed", lora_path.c_str());
-            return nullptr;
+            // also cache negatives to avoid I/O at runtime
+            lora = nullptr;
+            if (kcpp_at_runtime)
+                kcpp_lora_cache[lora_key] = lora;
+            return lora;
         }
 
         lora->multiplier = multiplier;
+        if (kcpp_at_runtime)
+            kcpp_lora_cache[lora_key] = lora;
         return lora;
     }
 
@@ -1234,6 +1283,18 @@ class StableDiffusionGGML {
         cond_stage_lora_models.clear();
         diffusion_lora_models.clear();
         first_stage_lora_models.clear();
+        if (cond_stage_model) {
+            cond_stage_model->set_weight_adapter(nullptr);
+        }
+        if (diffusion_model) {
+            diffusion_model->set_weight_adapter(nullptr);
+        }
+        if (high_noise_diffusion_model) {
+            high_noise_diffusion_model->set_weight_adapter(nullptr);
+        }
+        if (first_stage_model) {
+            first_stage_model->set_weight_adapter(nullptr);
+        }
         if (lora_state.empty()) {
             return;
         }
@@ -1261,7 +1322,7 @@ class StableDiffusionGGML {
                 const std::string& lora_id = kv.first;
                 float multiplier           = kv.second;
 
-                auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter);
+                auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, "cond_stage", lora_tensor_filter);
                 if (lora && !lora->lora_tensors.empty()) {
                     lora->preprocess_lora_tensors(tensors);
                     cond_stage_lora_models.push_back(lora);
@@ -1293,7 +1354,7 @@ class StableDiffusionGGML {
                 const std::string& lora_name = kv.first;
                 float multiplier             = kv.second;
 
-                auto lora = load_lora_model_from_file(lora_name, multiplier, backend, lora_tensor_filter);
+                auto lora = load_lora_model_from_file(lora_name, multiplier, backend, "diffusion", lora_tensor_filter);
                 if (lora && !lora->lora_tensors.empty()) {
                     lora->preprocess_lora_tensors(tensors);
                     diffusion_lora_models.push_back(lora);
@@ -1329,7 +1390,7 @@ class StableDiffusionGGML {
                 const std::string& lora_name = kv.first;
                 float multiplier             = kv.second;
 
-                auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, lora_tensor_filter);
+                auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, "first_stage", lora_tensor_filter);
                 if (lora && !lora->lora_tensors.empty()) {
                     lora->preprocess_lora_tensors(tensors);
                     first_stage_lora_models.push_back(lora);
@@ -1650,7 +1711,7 @@ class StableDiffusionGGML {
                 } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) {
                     latent_rgb_proj = flux_latent_rgb_proj;
                     latent_rgb_bias = flux_latent_rgb_bias;
-                } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
+                } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                     latent_rgb_proj = wan_21_latent_rgb_proj;
                     latent_rgb_bias = wan_21_latent_rgb_bias;
                 } else {
@@ -2131,6 +2192,9 @@ class StableDiffusionGGML {
                 shifted_t             = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t));
                 LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma);
                 timesteps_vec.assign(1, (float)shifted_t);
+            } else if (sd_version_is_anima(version)) {
+                // Anima uses normalized flow timesteps.
+                timesteps_vec.assign(1, t / static_cast<float>(TIMESTEPS));
             } else if (sd_version_is_z_image(version)) {
                 timesteps_vec.assign(1, 1000.f - t);
             } else {
@@ -2542,7 +2606,7 @@ class StableDiffusionGGML {
     }
 
     void process_latent_in(ggml_tensor* latent) {
-        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) {
+        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) {
             int channel_dim = sd_version_is_flux2(version) ? 2 : 3;
             std::vector<float> latents_mean_vec;
             std::vector<float> latents_std_vec;
@@ -2581,7 +2645,7 @@ class StableDiffusionGGML {
     }
 
     void process_latent_out(ggml_tensor* latent) {
-        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) {
+        if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) {
             int channel_dim = sd_version_is_flux2(version) ? 2 : 3;
             std::vector<float> latents_mean_vec;
             std::vector<float> latents_std_vec;
@@ -2659,7 +2723,7 @@ class StableDiffusionGGML {
             // TODO wan2.2 vae support?
             int64_t ne2;
             int64_t ne3;
-            if (sd_version_is_qwen_image(version)) {
+            if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                 ne2 = 1;
                 ne3 = C * x->ne[3];
             } else {
@@ -2677,7 +2741,7 @@ class StableDiffusionGGML {
             result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
         }
 
-        if (sd_version_is_qwen_image(version)) {
+        if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
             x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
         }
 
@@ -2750,6 +2814,7 @@ class StableDiffusionGGML {
         ggml_tensor* latent;
         if (use_tiny_autoencoder ||
             sd_version_is_qwen_image(version) ||
+            sd_version_is_anima(version) ||
             sd_version_is_wan(version) ||
             sd_version_is_flux2(version) ||
             version == VERSION_CHROMA_RADIANCE) {
@@ -2769,7 +2834,7 @@ class StableDiffusionGGML {
         if (!use_tiny_autoencoder) {
             process_latent_in(latent);
         }
-        if (sd_version_is_qwen_image(version)) {
+        if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
             latent = ggml_reshape_4d(work_ctx, latent, latent->ne[0], latent->ne[1], latent->ne[3], 1);
         }
         return latent;
@@ -2807,7 +2872,7 @@ class StableDiffusionGGML {
         }
         int64_t t0 = ggml_time_ms();
         if (!use_tiny_autoencoder) {
-            if (sd_version_is_qwen_image(version)) {
+            if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) {
                 x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]);
             }
             process_latent_out(x);
diff --git a/otherarch/sdcpp/unet.hpp b/otherarch/sdcpp/unet.hpp
index 2dd79e0e197..e0fd4c52761 100644
--- a/otherarch/sdcpp/unet.hpp
+++ b/otherarch/sdcpp/unet.hpp
@@ -1,8 +1,7 @@
 #ifndef __UNET_HPP__
 #define __UNET_HPP__
 
-#include "common.hpp"
-#include "ggml_extend.hpp"
+#include "common_block.hpp"
 #include "model.h"
 
 /*==================================================== UnetModel =====================================================*/
diff --git a/otherarch/sdcpp/vae.hpp b/otherarch/sdcpp/vae.hpp
index c627616c210..7ccba6eed33 100644
--- a/otherarch/sdcpp/vae.hpp
+++ b/otherarch/sdcpp/vae.hpp
@@ -1,8 +1,7 @@
 #ifndef __VAE_HPP__
 #define __VAE_HPP__
 
-#include "common.hpp"
-#include "ggml_extend.hpp"
+#include "common_block.hpp"
 
 /*================================================== AutoEncoderKL ===================================================*/
 
diff --git a/otherarch/sdcpp/wan.hpp b/otherarch/sdcpp/wan.hpp
index 90de3bdd161..d94fbd482a5 100644
--- a/otherarch/sdcpp/wan.hpp
+++ b/otherarch/sdcpp/wan.hpp
@@ -5,9 +5,8 @@
 #include <memory>
 #include <utility>
 
-#include "common.hpp"
+#include "common_block.hpp"
 #include "flux.hpp"
-#include "ggml_extend.hpp"
 #include "rope.hpp"
 #include "vae.hpp"
 
diff --git a/otherarch/sdcpp/z_image.hpp b/otherarch/sdcpp/z_image.hpp
index cee23833aa7..8f405a590b7 100644
--- a/otherarch/sdcpp/z_image.hpp
+++ b/otherarch/sdcpp/z_image.hpp
@@ -346,69 +346,6 @@ namespace ZImage {
             blocks["final_layer"] = std::make_shared<FinalLayer>(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels);
         }
 
-        struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx,
-                                              struct ggml_tensor* x) {
-            int64_t W = x->ne[0];
-            int64_t H = x->ne[1];
-
-            int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size;
-            int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size;
-            x         = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled);
-            return x;
-        }
-
-        struct ggml_tensor* patchify(struct ggml_context* ctx,
-                                     struct ggml_tensor* x) {
-            // x: [N, C, H, W]
-            // return: [N, h*w, patch_size*patch_size*C]
-            int64_t N = x->ne[3];
-            int64_t C = x->ne[2];
-            int64_t H = x->ne[1];
-            int64_t W = x->ne[0];
-            int64_t p = z_image_params.patch_size;
-            int64_t h = H / z_image_params.patch_size;
-            int64_t w = W / z_image_params.patch_size;
-
-            GGML_ASSERT(h * p == H && w * p == W);
-
-            x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N);                 // [N*C*h, p, w, p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));            // [N*C*h, w, p, p]
-            x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N);                 // [N, C, h*w, p*p]
-            x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3));  // [N, h*w, C, p*p]
-            x = ggml_reshape_3d(ctx, x, C * p * p, w * h, N);                // [N, h*w, p*p*C]
-            return x;
-        }
-
-        struct ggml_tensor* process_img(GGMLRunnerContext* ctx,
-                                        struct ggml_tensor* x) {
-            x = pad_to_patch_size(ctx, x);
-            x = patchify(ctx->ggml_ctx, x);
-            return x;
-        }
-
-        struct ggml_tensor* unpatchify(struct ggml_context* ctx,
-                                       struct ggml_tensor* x,
-                                       int64_t h,
-                                       int64_t w) {
-            // x: [N, h*w, patch_size*patch_size*C]
-            // return: [N, C, H, W]
-            int64_t N = x->ne[2];
-            int64_t C = x->ne[0] / z_image_params.patch_size / z_image_params.patch_size;
-            int64_t H = h * z_image_params.patch_size;
-            int64_t W = w * z_image_params.patch_size;
-            int64_t p = z_image_params.patch_size;
-
-            GGML_ASSERT(C * p * p == x->ne[0]);
-
-            x = ggml_reshape_4d(ctx, x, C, p * p, w * h, N);                 // [N, h*w, p*p, C]
-            x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3));  // [N, C, h*w, p*p]
-            x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N);                 // [N*C*h, w, p, p]
-            x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3));            // [N*C*h, p, w, p]
-            x = ggml_reshape_4d(ctx, x, W, H, C, N);                         // [N, C, h*p, w*p]
-
-            return x;
-        }
-
         struct ggml_tensor* forward_core(GGMLRunnerContext* ctx,
                                          struct ggml_tensor* x,
                                          struct ggml_tensor* timestep,
@@ -495,27 +432,22 @@ namespace ZImage {
             int64_t C = x->ne[2];
             int64_t N = x->ne[3];
 
-            auto img             = process_img(ctx, x);
+            int patch_size = z_image_params.patch_size;
+
+            auto img             = DiT::pad_and_patchify(ctx, x, patch_size, patch_size, false);
             uint64_t n_img_token = img->ne[1];
 
             if (ref_latents.size() > 0) {
                 for (ggml_tensor* ref : ref_latents) {
-                    ref = process_img(ctx, ref);
+                    ref = DiT::pad_and_patchify(ctx, ref, patch_size, patch_size, false);
                     img = ggml_concat(ctx->ggml_ctx, img, ref, 1);
                 }
             }
 
-            int64_t h_len = ((H + (z_image_params.patch_size / 2)) / z_image_params.patch_size);
-            int64_t w_len = ((W + (z_image_params.patch_size / 2)) / z_image_params.patch_size);
-
             auto out = forward_core(ctx, img, timestep, context, pe);
 
-            out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token);  // [N, n_img_token, ph*pw*C]
-            out = unpatchify(ctx->ggml_ctx, out, h_len, w_len);           // [N, C, H + pad_h, W + pad_w]
-
-            // slice
-            out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H);  // [N, C, H, W + pad_w]
-            out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W);  // [N, C, H, W]
+            out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token);                              // [N, n_img_token, ph*pw*C]
+            out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, patch_size, patch_size, false);  // [N, C, H, W]
 
             out = ggml_ext_scale(ctx->ggml_ctx, out, -1.f);
 
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index 372d1c26eb6..1834a631d69 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -366,23 +366,6 @@ std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr,
     return small_arrs;
 }
 
-std::vector<std::vector<int>> split_big_vector_in_two(const std::vector<int>& big_arr, size_t chunk_size)
-{
-    std::vector<std::vector<int>> result;
-    if (chunk_size == 0 || big_arr.empty())
-        return result;
-
-    if (big_arr.size() <= chunk_size) {
-        // Only one chunk (all elements)
-        result.emplace_back(big_arr);
-        return result;
-    }
-    size_t split_point = big_arr.size() - chunk_size;
-    result.emplace_back(big_arr.begin(), big_arr.begin() + split_point);  // First big chunk
-    result.emplace_back(big_arr.begin() + split_point, big_arr.end()); // Last chunk (size <= chunk_size)
-    return result;
-}
-
 std::vector<float> resample_wav(const std::vector<float> & input, uint32_t input_rate, uint32_t output_rate) {
     if (input.empty() || input_rate == 0 || output_rate == 0)
         return {};
diff --git a/otherarch/utils.h b/otherarch/utils.h
index 4482cc90387..b5137c08e20 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -62,8 +62,6 @@ std::string kcpp_base64_encode(const std::string &data);
 
 std::string get_timestamp_str();
 std::vector<std::vector<int>> split_big_vector(const std::vector<int>& big_arr, size_t chunk_size);
-std::vector<std::vector<int>> split_big_vector_in_two(const std::vector<int>& big_arr, size_t chunk_size);
-
 std::vector<float> resample_wav(const std::vector<float>& input, uint32_t input_rate, uint32_t output_rate);
 std::vector<float> mix_planar_stereo_to_mono(const float* audio, int T_audio);