diff --git a/expose.h b/expose.h index 0847392aef0..16558cf7dc4 100644 --- a/expose.h +++ b/expose.h @@ -6,7 +6,6 @@ const int images_max = 8; const int audio_max = 4; const int logprobs_max = 10; const int overridekv_max = 16; -const int lora_filenames_max = 4; // match kobold's sampler list and order enum samplers @@ -189,8 +188,9 @@ struct sd_load_model_inputs const char * clip1_filename = nullptr; const char * clip2_filename = nullptr; const char * vae_filename = nullptr; - const char * lora_filenames[lora_filenames_max] = {}; - const float lora_multiplier = 1.0f; + const int lora_len = 0; + const char ** lora_filenames = nullptr; + const float * lora_multipliers = nullptr; const int lora_apply_mode = 0; const char * photomaker_filename = nullptr; const char * upscaler_filename = nullptr; @@ -227,6 +227,8 @@ struct sd_generation_inputs const bool circular_x = false; const bool circular_y = false; const bool upscale = false; + const int lora_len = 0; + const float * lora_multipliers = nullptr; }; struct sd_generation_outputs { diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 2c4071c4c62..3d131a8ce6c 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -4465,7 +4465,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs) { skipdecodelater = true; //decode until nearly done, then snapshot and decode the last 64 - std::vector> parts = split_big_vector_in_two(embd,64); + std::vector> parts = split_big_vector(embd,64); int temp_past = n_past; evalres = true; for(int p=0;p chunk = parts[p]; kcpp_embd_batch smallbatch = kcpp_embd_batch(chunk, temp_past, use_mrope, false); decode_status = llama_decode(llama_ctx_v4, smallbatch.batch); - if(p==0 && decode_status==1) - { - skipdecodelater = false; - break; //big pp failed - } evalres = (evalres && (decode_status==0)); temp_past += chunk.size(); } diff --git a/koboldcpp.py b/koboldcpp.py index d2da96236ab..29fb771278c 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -89,6 +89,7 @@ 
embeddingsmodelpath = "" #if empty, not initialized musicllmmodelpath = "" #if empty, not initialized musicdiffusionmodelpath = "" #if empty, not initialized +imglorainfo = [] maxctx = 8192 maxhordectx = 0 #set to whatever maxctx is if 0 maxhordelen = 1024 @@ -320,8 +321,9 @@ class sd_load_model_inputs(ctypes.Structure): ("clip1_filename", ctypes.c_char_p), ("clip2_filename", ctypes.c_char_p), ("vae_filename", ctypes.c_char_p), - ("lora_filenames", ctypes.c_char_p * lora_filenames_max), - ("lora_multiplier", ctypes.c_float), + ("lora_len", ctypes.c_int), + ("lora_filenames", ctypes.POINTER(ctypes.c_char_p)), + ("lora_multipliers", ctypes.POINTER(ctypes.c_float)), ("lora_apply_mode", ctypes.c_int), ("photomaker_filename", ctypes.c_char_p), ("upscaler_filename", ctypes.c_char_p), @@ -356,7 +358,9 @@ class sd_generation_inputs(ctypes.Structure): ("remove_limits", ctypes.c_bool), ("circular_x", ctypes.c_bool), ("circular_y", ctypes.c_bool), - ("upscale", ctypes.c_bool)] + ("upscale", ctypes.c_bool), + ("lora_len", ctypes.c_int), + ("lora_multipliers", ctypes.POINTER(ctypes.c_float))] class sd_generation_outputs(ctypes.Structure): _fields_ = [("status", ctypes.c_int), @@ -1994,30 +1998,38 @@ def sd_load_model(model_filename,vae_filename,lora_filenames,t5xxl_filename,clip inputs.taesd = True if args.sdvaeauto else False inputs.tiled_vae_threshold = args.sdtiledvae inputs.vae_filename = vae_filename.encode("UTF-8") - for n in range(lora_filenames_max): - if n >= len(lora_filenames): - inputs.lora_filenames[n] = "".encode("UTF-8") - else: - inputs.lora_filenames[n] = lora_filenames[n].encode("UTF-8") - - inputs.lora_multiplier = args.sdloramult inputs.t5xxl_filename = t5xxl_filename.encode("UTF-8") inputs.clip1_filename = clip1_filename.encode("UTF-8") inputs.clip2_filename = clip2_filename.encode("UTF-8") inputs.photomaker_filename = photomaker_filename.encode("UTF-8") inputs.upscaler_filename = upscaler_filename.encode("UTF-8") + + lora_filenames = [l.encode("UTF-8") for 
l in lora_filenames[:lora_filenames_max] if l] + lora_len = len(lora_filenames) + lora_multipliers = args.sdloramult[:lora_len] + if len(lora_multipliers) < lora_len: + missing = lora_len - len(lora_multipliers) + if len(lora_multipliers) == 1: + # previous behavior: all get the same weight + lora_multipliers.extend(lora_multipliers * missing) + else: + lora_multipliers.extend([0.] * missing) + inputs.lora_len = lora_len + inputs.lora_filenames = (ctypes.c_char_p * lora_len)(*lora_filenames) + inputs.lora_multipliers = (ctypes.c_float * lora_len)(*lora_multipliers) + # auto (0) if no zero-weight lora, dynamic (3) otherwise; test the Python list, since the ctypes pointer field has no length and "in" on it would read past the buffer + inputs.lora_apply_mode = 3 if 0. in lora_multipliers else 0 + inputs.img_hard_limit = args.sdclamped inputs.img_soft_limit = args.sdclampedsoft - inputs.lora_apply_mode = 0 #auto for now inputs = set_backend_props(inputs) ret = handle.sd_load_model(inputs) return ret -def sd_oai_tranform_params(genparams): - size = genparams.get('size', "512x512") - if size and size!="": - pattern = r'^\D*(\d+)x(\d+)$' - match = re.fullmatch(pattern, size) +def sd_oai_transform_params(genparams): + size = genparams.get('size') or '' + pattern = r'^\D*(\d+)x(\d+)$' + match = re.fullmatch(pattern, size) if match: width = int(match.group(1)) height = int(match.group(2)) @@ -2111,6 +2123,84 @@ def sd_upscale(genparams): data_main = ret.data.decode("UTF-8","ignore") return data_main +def sanitize_lora_multipliers(sdloramult): + if sdloramult is None: + sdloramult = [1.0] * lora_filenames_max + elif not isinstance(sdloramult, list): + sdloramult = [sdloramult] + sdloramult = [tryparsefloat(m, 0.) for m in sdloramult] + return sdloramult + +def prepare_lora_multipliers(request_list): + + orig_multipliers = [lora[3] for lora in imglorainfo] + dynamic = 0. in orig_multipliers + if not dynamic: + return orig_multipliers + + req_by_path = {} + for r in request_list: + if not isinstance(r, dict): + continue + multiplier = tryparsefloat(r.get('multiplier'), 0.) 
+ path = r.get('path') + if path and isinstance(path, str): + req_by_path[path] = req_by_path.get(path, 0.) + multiplier + + result = [] + for i, (fullpath, name, path, origmul) in enumerate(imglorainfo): + multiplier = orig_multipliers[i] + if multiplier == 0. and path in req_by_path: + multiplier = req_by_path[path] + result.append(multiplier) + + return result + +def extract_loras_from_prompt(prompt): + + pattern = r']+):([^>]+)>' + lora_data = [] + + matches = list(re.finditer(pattern, prompt)) + + for match in matches: + raw_path = match.group(1) + raw_mul = match.group(2) + try: + mul = float(raw_mul) + except ValueError: + continue + + is_high_noise = False + prefix = "|high_noise|" + if raw_path.startswith(prefix): + raw_path = raw_path[len(prefix):] + is_high_noise = True + + lora_data.append({ + 'name': raw_path, + 'multiplier': mul, + 'is_high_noise': is_high_noise, + }) + + prompt = prompt.replace(match.group(0), "", 1) + + return prompt, lora_data + +def lora_map_name_to_path(request_list): + name2path = {} + for _, name, path, _ in imglorainfo: + name2path[name] = path + result = [] + for req in request_list: + out = dict(req) + name = out.pop('name') + path = name2path.get(name) + if path: + out['path'] = path + result.append(out) + return result + def sd_generate(genparams): global maxctx, args, currentusergenkey, totalgens, pendingabortkey, chatcompl_adapter @@ -2209,6 +2299,11 @@ def sd_generate(genparams): inputs.circular_x = tryparseint(adapter_obj.get("circular_x", genparams.get("circular_x",0)),0) inputs.circular_y = tryparseint(adapter_obj.get("circular_y", genparams.get("circular_y",0)),0) inputs.upscale = (True if tryparseint(genparams.get("enable_hr", 0),0) else False) + + lora_multipliers = prepare_lora_multipliers(genparams.get("lora", [])) + inputs.lora_len = len(lora_multipliers) + inputs.lora_multipliers = (ctypes.c_float * inputs.lora_len)(*lora_multipliers) + ret = handle.sd_generate(inputs) data_main = "" data_extra = "" @@ -4098,6 
+4193,9 @@ def do_GET(self): elif clean_path.endswith('/v1/models') or clean_path=='/models': response_body = (json.dumps({"object":"list","data":[{"id":friendlymodelname,"object":"model","created":int(time.time()),"owned_by":"koboldcpp","permission":[],"root":"koboldcpp"}]}).encode()) + elif clean_path.endswith('/sdapi/v1/loras'): + response_body = (json.dumps([{'name': name, 'path': path} for _, name, path, multiplier in imglorainfo if multiplier == 0.])).encode() + elif clean_path.endswith('/sdapi/v1/upscalers'): if args.sdupscaler: response_body = (json.dumps([{"name":"ESRGAN_4x","model_name":"ESRGAN_4x","model_path":"upscaler_model.gguf","model_url":None,"scale":4}]).encode()) @@ -5106,7 +5204,13 @@ def do_POST(self): lastgeneratedcomfyimg = b'' genparams = sd_comfyui_tranform_params(genparams) elif is_oai_imggen: - genparams = sd_oai_tranform_params(genparams) + genparams = sd_oai_transform_params(genparams) + if not genparams.get('lora'): + # process syntax + prompt, loras = extract_loras_from_prompt(genparams['prompt']) + if loras: + genparams['prompt'] = prompt + genparams['lora'] = lora_map_name_to_path(loras) gen = sd_generate(genparams) gendat = gen["data"] genanim = gen["animated"] @@ -6936,9 +7040,10 @@ def export_vars(): args.sdquant = sd_quant_option(sd_quant_var.get()) if sd_lora_var.get() != "": args.sdlora = [item.strip() for item in sd_lora_var.get().split("|") if item] - args.sdloramult = float(sd_loramult_var.get()) else: args.sdlora = None + # XXX read the multipliers field (not the LoRA path field); split on '|' as well as spaces, since the user may have reused the separator of the LoRA list + args.sdloramult = sanitize_lora_multipliers(re.split(r"[ |]+", sd_loramult_var.get())) if gen_defaults_var.get() != "": args.gendefaults = gen_defaults_var.get() @@ -7197,7 +7302,7 @@ def import_vars(dict): sd_lora_var.set(dict["sdlora"] if ("sdlora" in dict and dict["sdlora"]) else "") else: sd_lora_var.set("") - sd_loramult_var.set(str(dict["sdloramult"]) if ("sdloramult" in dict and dict["sdloramult"]) else "1.0") + sd_loramult_var.set(" 
".join(f"{n:.3f}".rstrip('0') for n in dict.get("sdloramult", []))) gen_defaults_var.set(dict["gendefaults"] if ("gendefaults" in dict and dict["gendefaults"]) else "") gen_defaults_overwrite_var.set(1 if "gendefaultsoverwrite" in dict and dict["gendefaultsoverwrite"] else 0) @@ -7641,6 +7746,8 @@ def convert_invalid_args(args): dict["noflashattention"] = not dict["flashattention"] if "sdlora" in dict and isinstance(dict["sdlora"], str): dict["sdlora"] = ([dict["sdlora"]] if dict["sdlora"] else None) + if "sdloramult" in dict: + dict["sdloramult"] = sanitize_lora_multipliers(dict["sdloramult"]) return args def setuptunnel(global_memory, has_sd): @@ -8325,6 +8432,30 @@ def main(launch_args, default_args): print("Press ENTER key to exit.", flush=True) input() + +def mk_lora_info(imgloras, multipliers): + # (full path, name, name+extension, can change multiplier) + # XXX for each LoRA, sdapi needs a name and a path; we could use + # the full filename as a path, but we don't know if we can expose it + used_lora_names = set() + result = [] + for i, lora_path in enumerate(imgloras): + multiplier = 0. 
if i >= len(multipliers) else multipliers[i] + lora_file = os.path.basename(lora_path) + lora_name, lora_ext = os.path.splitext(lora_file) + # ensure unique names + i = 1 + mapped_name = lora_name + while True: + if mapped_name not in used_lora_names: + result.append((lora_path, mapped_name, mapped_name + lora_ext, multiplier)) + used_lora_names.add(mapped_name) + break + i += 1 + mapped_name = lora_name + '_' + str(i) + return result + + def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): global embedded_kailite, embedded_kcpp_docs, embedded_kcpp_sdui, embedded_kailite_gz, embedded_kcpp_docs_gz, embedded_kcpp_sdui_gz, embedded_lcpp_ui_gz, embedded_musicui, embedded_musicui_gz, start_time, exitcounter, global_memory, using_gui_launcher global libname, args, friendlymodelname, friendlysdmodelname, fullsdmodelpath, password, fullwhispermodelpath, ttsmodelpath, embeddingsmodelpath, musicdiffusionmodelpath, musicllmmodelpath, friendlyembeddingsmodelname, has_audio_support, has_vision_support, cached_chat_template @@ -8770,6 +8901,9 @@ def kcpp_main_process(launch_args, g_memory=None, gui_launcher=False): imgloras.append(os.path.abspath(curr)) else: print(f"Missing SD LORA model file {curr}...") + global imglorainfo + args.sdloramult = sanitize_lora_multipliers(args.sdloramult) + imglorainfo = mk_lora_info(imgloras, args.sdloramult) if args.sdvae: if os.path.exists(args.sdvae): imgvae = os.path.abspath(args.sdvae) @@ -9365,7 +9499,7 @@ def range_checker(arg: str): sdparsergrouplora = sdparsergroup.add_mutually_exclusive_group() sdparsergrouplora.add_argument("--sdquant", metavar=('[quantization level 0/1/2]'), help="If specified, loads the model quantized to save memory. 0=off, 1=q8, 2=q4", type=int, choices=[0,1,2], nargs="?", const=2, default=0) sdparsergrouplora.add_argument("--sdlora", metavar=('[filename]'), help="Specify image generation LoRAs safetensors models to be applied. 
Multiple LoRAs are accepted.", nargs='+') - sdparsergroup.add_argument("--sdloramult", metavar=('[amount]'), help="Multiplier for the image LoRA model to be applied.", type=float, default=1.0) + sdparsergroup.add_argument("--sdloramult", metavar=('[amounts]'), help="Multipliers for the image LoRA model to be applied.", type=float, nargs='+', default=[1.0]) sdparsergroup.add_argument("--sdtiledvae", metavar=('[maxres]'), help="Adjust the automatic VAE tiling trigger for images above this size. 0 disables vae tiling.", type=int, default=default_vae_tile_threshold) whisperparsergroup = parser.add_argument_group('Whisper Transcription Commands') whisperparsergroup.add_argument("--whispermodel", metavar=('[filename]'), help="Specify a Whisper .bin model to enable Speech-To-Text transcription.", default="") diff --git a/otherarch/sdcpp/anima.hpp b/otherarch/sdcpp/anima.hpp new file mode 100644 index 00000000000..191a096d40f --- /dev/null +++ b/otherarch/sdcpp/anima.hpp @@ -0,0 +1,686 @@ +#ifndef __ANIMA_HPP__ +#define __ANIMA_HPP__ + +#include +#include +#include +#include + +#include "common_block.hpp" +#include "flux.hpp" +#include "rope.hpp" + +namespace Anima { + constexpr int ANIMA_GRAPH_SIZE = 65536; + + __STATIC_INLINE__ struct ggml_tensor* apply_gate(struct ggml_context* ctx, + struct ggml_tensor* x, + struct ggml_tensor* gate) { + gate = ggml_reshape_3d(ctx, gate, gate->ne[0], 1, gate->ne[1]); // [N, 1, C] + return ggml_mul(ctx, x, gate); + } + + struct XEmbedder : public GGMLBlock { + public: + XEmbedder(int64_t in_dim, int64_t out_dim) { + blocks["proj.1"] = std::make_shared(in_dim, out_dim, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto proj = std::dynamic_pointer_cast(blocks["proj.1"]); + return proj->forward(ctx, x); + } + }; + + struct TimestepEmbedder : public GGMLBlock { + public: + TimestepEmbedder(int64_t in_dim, int64_t out_dim) { + blocks["1.linear_1"] = std::make_shared(in_dim, in_dim, false); + 
blocks["1.linear_2"] = std::make_shared(in_dim, out_dim, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto linear_1 = std::dynamic_pointer_cast(blocks["1.linear_1"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["1.linear_2"]); + + x = linear_1->forward(ctx, x); + x = ggml_silu_inplace(ctx->ggml_ctx, x); + x = linear_2->forward(ctx, x); + return x; + } + }; + + struct AdaLayerNormZero : public GGMLBlock { + protected: + int64_t in_features; + + public: + AdaLayerNormZero(int64_t in_features, int64_t hidden_features = 256) + : in_features(in_features) { + blocks["norm"] = std::make_shared(in_features, 1e-6f, false, false); + blocks["1"] = std::make_shared(in_features, hidden_features, false); + blocks["2"] = std::make_shared(hidden_features, 3 * in_features, false); + } + + std::pair forward(GGMLRunnerContext* ctx, + struct ggml_tensor* hidden_states, + struct ggml_tensor* embedded_timestep, + struct ggml_tensor* temb = nullptr) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto linear_1 = std::dynamic_pointer_cast(blocks["1"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["2"]); + + auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep); + emb = linear_1->forward(ctx, emb); + emb = linear_2->forward(ctx, emb); // [N, 3*C] + + if (temb != nullptr) { + emb = ggml_add(ctx->ggml_ctx, emb, temb); + } + + auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 3, 0); + auto shift = emb_chunks[0]; + auto scale = emb_chunks[1]; + auto gate = emb_chunks[2]; + + auto x = norm->forward(ctx, hidden_states); + x = Flux::modulate(ctx->ggml_ctx, x, shift, scale); + + return {x, gate}; + } + }; + + struct AdaLayerNorm : public GGMLBlock { + protected: + int64_t embedding_dim; + + public: + AdaLayerNorm(int64_t in_features, int64_t hidden_features = 256) + : embedding_dim(in_features) { + blocks["norm"] = std::make_shared(in_features, 1e-6f, false, false); + blocks["1"] = std::make_shared(in_features, 
hidden_features, false); + blocks["2"] = std::make_shared(hidden_features, 2 * in_features, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* hidden_states, + struct ggml_tensor* embedded_timestep, + struct ggml_tensor* temb = nullptr) { + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto linear_1 = std::dynamic_pointer_cast(blocks["1"]); + auto linear_2 = std::dynamic_pointer_cast(blocks["2"]); + + auto emb = ggml_silu(ctx->ggml_ctx, embedded_timestep); + emb = linear_1->forward(ctx, emb); + emb = linear_2->forward(ctx, emb); // [N, 2*C] + + if (temb != nullptr) { + auto temb_2c = ggml_view_2d(ctx->ggml_ctx, temb, 2 * embedding_dim, temb->ne[1], temb->nb[1], 0); + emb = ggml_add(ctx->ggml_ctx, emb, temb_2c); + } + + auto emb_chunks = ggml_ext_chunk(ctx->ggml_ctx, emb, 2, 0); + auto shift = emb_chunks[0]; + auto scale = emb_chunks[1]; + + auto x = norm->forward(ctx, hidden_states); + x = Flux::modulate(ctx->ggml_ctx, x, shift, scale); + return x; + } + }; + + struct AnimaAttention : public GGMLBlock { + protected: + int64_t num_heads; + int64_t head_dim; + std::string out_proj_name; + + public: + AnimaAttention(int64_t query_dim, + int64_t context_dim, + int64_t num_heads, + int64_t head_dim, + const std::string& out_proj_name = "output_proj") + : num_heads(num_heads), head_dim(head_dim), out_proj_name(out_proj_name) { + int64_t inner_dim = num_heads * head_dim; + + blocks["q_proj"] = std::make_shared(query_dim, inner_dim, false); + blocks["k_proj"] = std::make_shared(context_dim, inner_dim, false); + blocks["v_proj"] = std::make_shared(context_dim, inner_dim, false); + blocks["q_norm"] = std::make_shared(head_dim, 1e-6f); + blocks["k_norm"] = std::make_shared(head_dim, 1e-6f); + blocks[this->out_proj_name] = std::make_shared(inner_dim, query_dim, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* hidden_states, + struct ggml_tensor* encoder_hidden_states = nullptr, + struct 
ggml_tensor* pe_q = nullptr, + struct ggml_tensor* pe_k = nullptr) { + if (encoder_hidden_states == nullptr) { + encoder_hidden_states = hidden_states; + } + + auto q_proj = std::dynamic_pointer_cast(blocks["q_proj"]); + auto k_proj = std::dynamic_pointer_cast(blocks["k_proj"]); + auto v_proj = std::dynamic_pointer_cast(blocks["v_proj"]); + auto q_norm = std::dynamic_pointer_cast(blocks["q_norm"]); + auto k_norm = std::dynamic_pointer_cast(blocks["k_norm"]); + auto out_proj = std::dynamic_pointer_cast(blocks[out_proj_name]); + + auto q = q_proj->forward(ctx, hidden_states); + auto k = k_proj->forward(ctx, encoder_hidden_states); + auto v = v_proj->forward(ctx, encoder_hidden_states); + + int64_t N = q->ne[2]; + int64_t L_q = q->ne[1]; + int64_t L_k = k->ne[1]; + + auto q4 = ggml_reshape_4d(ctx->ggml_ctx, q, head_dim, num_heads, L_q, N); // [N, L_q, H, D] + auto k4 = ggml_reshape_4d(ctx->ggml_ctx, k, head_dim, num_heads, L_k, N); // [N, L_k, H, D] + auto v4 = ggml_reshape_4d(ctx->ggml_ctx, v, head_dim, num_heads, L_k, N); // [N, L_k, H, D] + + q4 = q_norm->forward(ctx, q4); + k4 = k_norm->forward(ctx, k4); + + struct ggml_tensor* attn_out = nullptr; + if (pe_q != nullptr || pe_k != nullptr) { + if (pe_q == nullptr) { + pe_q = pe_k; + } + if (pe_k == nullptr) { + pe_k = pe_q; + } + auto q_rope = Rope::apply_rope(ctx->ggml_ctx, q4, pe_q, false); + auto k_rope = Rope::apply_rope(ctx->ggml_ctx, k4, pe_k, false); + attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, + ctx->backend, + q_rope, + k_rope, + v4, + num_heads, + nullptr, + true, + ctx->flash_attn_enabled); + } else { + auto q_flat = ggml_reshape_3d(ctx->ggml_ctx, q4, head_dim * num_heads, L_q, N); + auto k_flat = ggml_reshape_3d(ctx->ggml_ctx, k4, head_dim * num_heads, L_k, N); + attn_out = ggml_ext_attention_ext(ctx->ggml_ctx, + ctx->backend, + q_flat, + k_flat, + v, + num_heads, + nullptr, + false, + ctx->flash_attn_enabled); + } + + return out_proj->forward(ctx, attn_out); + } + }; + + struct AnimaMLP : public 
GGMLBlock { + public: + AnimaMLP(int64_t dim, int64_t hidden_dim) { + blocks["layer1"] = std::make_shared(dim, hidden_dim, false); + blocks["layer2"] = std::make_shared(hidden_dim, dim, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto layer1 = std::dynamic_pointer_cast(blocks["layer1"]); + auto layer2 = std::dynamic_pointer_cast(blocks["layer2"]); + + x = layer1->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x, true); + x = layer2->forward(ctx, x); + return x; + } + }; + + struct AdapterMLP : public GGMLBlock { + public: + AdapterMLP(int64_t dim, int64_t hidden_dim) { + blocks["0"] = std::make_shared(dim, hidden_dim, true); + blocks["2"] = std::make_shared(hidden_dim, dim, true); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + auto layer0 = std::dynamic_pointer_cast(blocks["0"]); + auto layer2 = std::dynamic_pointer_cast(blocks["2"]); + + x = layer0->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x, true); + x = layer2->forward(ctx, x); + return x; + } + }; + + struct LLMAdapterBlock : public GGMLBlock { + public: + LLMAdapterBlock(int64_t model_dim = 1024, int64_t source_dim = 1024, int64_t num_heads = 16, int64_t head_dim = 64) { + blocks["norm_self_attn"] = std::make_shared(model_dim, 1e-6f); + blocks["self_attn"] = std::make_shared(model_dim, model_dim, num_heads, head_dim, "o_proj"); + blocks["norm_cross_attn"] = std::make_shared(model_dim, 1e-6f); + blocks["cross_attn"] = std::make_shared(model_dim, source_dim, num_heads, head_dim, "o_proj"); + blocks["norm_mlp"] = std::make_shared(model_dim, 1e-6f); + blocks["mlp"] = std::make_shared(model_dim, model_dim * 4); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x, + struct ggml_tensor* context, + struct ggml_tensor* target_pe, + struct ggml_tensor* context_pe) { + auto norm_self_attn = std::dynamic_pointer_cast(blocks["norm_self_attn"]); + auto self_attn = 
std::dynamic_pointer_cast(blocks["self_attn"]); + auto norm_cross_attn = std::dynamic_pointer_cast(blocks["norm_cross_attn"]); + auto cross_attn = std::dynamic_pointer_cast(blocks["cross_attn"]); + auto norm_mlp = std::dynamic_pointer_cast(blocks["norm_mlp"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + + auto h = norm_self_attn->forward(ctx, x); + h = self_attn->forward(ctx, h, nullptr, target_pe, target_pe); + x = ggml_add(ctx->ggml_ctx, x, h); + + h = norm_cross_attn->forward(ctx, x); + h = cross_attn->forward(ctx, h, context, target_pe, context_pe); + x = ggml_add(ctx->ggml_ctx, x, h); + + h = norm_mlp->forward(ctx, x); + h = mlp->forward(ctx, h); + x = ggml_add(ctx->ggml_ctx, x, h); + + return x; + } + }; + + struct LLMAdapter : public GGMLBlock { + protected: + int num_layers; + + public: + LLMAdapter(int64_t source_dim = 1024, + int64_t target_dim = 1024, + int64_t model_dim = 1024, + int num_layers = 6, + int num_heads = 16) + : num_layers(num_layers) { + int64_t head_dim = model_dim / num_heads; + + blocks["embed"] = std::make_shared(32128, target_dim); + for (int i = 0; i < num_layers; i++) { + blocks["blocks." 
+ std::to_string(i)] = + std::make_shared(model_dim, source_dim, num_heads, head_dim); + } + blocks["out_proj"] = std::make_shared(model_dim, target_dim, true); + blocks["norm"] = std::make_shared(target_dim, 1e-6f); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* source_hidden_states, + struct ggml_tensor* target_input_ids, + struct ggml_tensor* target_pe, + struct ggml_tensor* source_pe) { + GGML_ASSERT(target_input_ids != nullptr); + if (ggml_n_dims(target_input_ids) == 1) { + target_input_ids = ggml_reshape_2d(ctx->ggml_ctx, target_input_ids, target_input_ids->ne[0], 1); + } + + auto embed = std::dynamic_pointer_cast(blocks["embed"]); + auto out_proj = std::dynamic_pointer_cast(blocks["out_proj"]); + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + + auto x = embed->forward(ctx, target_input_ids); // [N, target_len, target_dim] + + for (int i = 0; i < num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]); + x = block->forward(ctx, x, source_hidden_states, target_pe, source_pe); + } + + x = out_proj->forward(ctx, x); + x = norm->forward(ctx, x); + return x; + } + }; + + struct TransformerBlock : public GGMLBlock { + public: + TransformerBlock(int64_t hidden_size, + int64_t text_embed_dim, + int64_t num_heads, + int64_t head_dim, + int64_t mlp_ratio = 4, + int64_t adaln_lora_dim = 256) { + blocks["adaln_modulation_self_attn"] = std::make_shared(hidden_size, adaln_lora_dim); + blocks["self_attn"] = std::make_shared(hidden_size, hidden_size, num_heads, head_dim); + blocks["adaln_modulation_cross_attn"] = std::make_shared(hidden_size, adaln_lora_dim); + blocks["cross_attn"] = std::make_shared(hidden_size, text_embed_dim, num_heads, head_dim); + blocks["adaln_modulation_mlp"] = std::make_shared(hidden_size, adaln_lora_dim); + blocks["mlp"] = std::make_shared(hidden_size, hidden_size * mlp_ratio); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* 
hidden_states, + struct ggml_tensor* encoder_hidden_states, + struct ggml_tensor* embedded_timestep, + struct ggml_tensor* temb, + struct ggml_tensor* image_pe) { + auto norm1 = std::dynamic_pointer_cast(blocks["adaln_modulation_self_attn"]); + auto attn1 = std::dynamic_pointer_cast(blocks["self_attn"]); + auto norm2 = std::dynamic_pointer_cast(blocks["adaln_modulation_cross_attn"]); + auto attn2 = std::dynamic_pointer_cast(blocks["cross_attn"]); + auto norm3 = std::dynamic_pointer_cast(blocks["adaln_modulation_mlp"]); + auto mlp = std::dynamic_pointer_cast(blocks["mlp"]); + + auto [normed1, gate1] = norm1->forward(ctx, hidden_states, embedded_timestep, temb); + auto h = attn1->forward(ctx, normed1, nullptr, image_pe, image_pe); + hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate1)); + + auto [normed2, gate2] = norm2->forward(ctx, hidden_states, embedded_timestep, temb); + h = attn2->forward(ctx, normed2, encoder_hidden_states, nullptr, nullptr); + hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate2)); + + auto [normed3, gate3] = norm3->forward(ctx, hidden_states, embedded_timestep, temb); + h = mlp->forward(ctx, normed3); + hidden_states = ggml_add(ctx->ggml_ctx, hidden_states, apply_gate(ctx->ggml_ctx, h, gate3)); + + return hidden_states; + } + }; + + struct FinalLayer : public GGMLBlock { + protected: + int64_t hidden_size; + int64_t patch_size; + int64_t out_channels; + + public: + FinalLayer(int64_t hidden_size, int64_t patch_size, int64_t out_channels) + : hidden_size(hidden_size), patch_size(patch_size), out_channels(out_channels) { + blocks["adaln_modulation"] = std::make_shared(hidden_size, 256); + blocks["linear"] = std::make_shared(hidden_size, patch_size * patch_size * out_channels, false); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* hidden_states, + struct ggml_tensor* embedded_timestep, + struct ggml_tensor* temb) { + auto adaln = 
std::dynamic_pointer_cast(blocks["adaln_modulation"]); + auto linear = std::dynamic_pointer_cast(blocks["linear"]); + + hidden_states = adaln->forward(ctx, hidden_states, embedded_timestep, temb); + hidden_states = linear->forward(ctx, hidden_states); + return hidden_states; + } + }; + + struct AnimaNet : public GGMLBlock { + public: + int64_t in_channels = 16; + int64_t out_channels = 16; + int64_t hidden_size = 2048; + int64_t text_embed_dim = 1024; + int64_t num_heads = 16; + int64_t head_dim = 128; + int patch_size = 2; + int64_t num_layers = 28; + std::vector axes_dim = {44, 42, 42}; + int theta = 10000; + + public: + AnimaNet() = default; + explicit AnimaNet(int64_t num_layers) + : num_layers(num_layers) { + blocks["x_embedder"] = std::make_shared((in_channels + 1) * patch_size * patch_size, hidden_size); + blocks["t_embedder"] = std::make_shared(hidden_size, hidden_size * 3); + blocks["t_embedding_norm"] = std::make_shared(hidden_size, 1e-6f); + for (int i = 0; i < num_layers; i++) { + blocks["blocks." 
+ std::to_string(i)] = std::make_shared(hidden_size, + text_embed_dim, + num_heads, + head_dim); + } + blocks["final_layer"] = std::make_shared(hidden_size, patch_size, out_channels); + blocks["llm_adapter"] = std::make_shared(1024, 1024, 1024, 6, 16); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x, + struct ggml_tensor* timestep, + struct ggml_tensor* encoder_hidden_states, + struct ggml_tensor* image_pe, + struct ggml_tensor* t5_ids = nullptr, + struct ggml_tensor* t5_weights = nullptr, + struct ggml_tensor* adapter_q_pe = nullptr, + struct ggml_tensor* adapter_k_pe = nullptr) { + GGML_ASSERT(x->ne[3] == 1); + + auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]); + auto t_embedder = std::dynamic_pointer_cast(blocks["t_embedder"]); + auto t_embedding_norm = std::dynamic_pointer_cast(blocks["t_embedding_norm"]); + auto final_layer = std::dynamic_pointer_cast(blocks["final_layer"]); + auto llm_adapter = std::dynamic_pointer_cast(blocks["llm_adapter"]); + + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + + auto padding_mask = ggml_ext_zeros(ctx->ggml_ctx, x->ne[0], x->ne[1], 1, x->ne[3]); + x = ggml_concat(ctx->ggml_ctx, x, padding_mask, 2); // [N, C + 1, H, W] + + x = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); // [N, h*w, (C+1)*ph*pw] + + x = x_embedder->forward(ctx, x); + + auto timestep_proj = ggml_ext_timestep_embedding(ctx->ggml_ctx, timestep, static_cast(hidden_size)); + auto temb = t_embedder->forward(ctx, timestep_proj); + auto embedded_timestep = t_embedding_norm->forward(ctx, timestep_proj); + + if (t5_ids != nullptr) { + auto adapted_context = llm_adapter->forward(ctx, encoder_hidden_states, t5_ids, adapter_q_pe, adapter_k_pe); + if (t5_weights != nullptr) { + auto w = t5_weights; + if (ggml_n_dims(w) == 1) { + w = ggml_reshape_3d(ctx->ggml_ctx, w, 1, w->ne[0], 1); + } + w = ggml_repeat_4d(ctx->ggml_ctx, w, adapted_context->ne[0], adapted_context->ne[1], adapted_context->ne[2], 1); + 
adapted_context = ggml_mul(ctx->ggml_ctx, adapted_context, w); + } + if (adapted_context->ne[1] < 512) { + auto pad_ctx = ggml_ext_zeros(ctx->ggml_ctx, + adapted_context->ne[0], + 512 - adapted_context->ne[1], + adapted_context->ne[2], + 1); + adapted_context = ggml_concat(ctx->ggml_ctx, adapted_context, pad_ctx, 1); + } else if (adapted_context->ne[1] > 512) { + adapted_context = ggml_ext_slice(ctx->ggml_ctx, adapted_context, 1, 0, 512); + } + encoder_hidden_states = adapted_context; + } + + for (int i = 0; i < num_layers; i++) { + auto block = std::dynamic_pointer_cast(blocks["blocks." + std::to_string(i)]); + x = block->forward(ctx, x, encoder_hidden_states, embedded_timestep, temb, image_pe); + } + + x = final_layer->forward(ctx, x, embedded_timestep, temb); // [N, h*w, ph*pw*C] + + x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, false); // [N, C, H, W] + + return x; + } + }; + + struct AnimaRunner : public GGMLRunner { + public: + std::vector image_pe_vec; + std::vector adapter_q_pe_vec; + std::vector adapter_k_pe_vec; + AnimaNet net; + + AnimaRunner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "model.diffusion_model") + : GGMLRunner(backend, offload_params_to_cpu) { + int64_t num_layers = 0; + std::string layer_tag = prefix + ".net.blocks."; + for (const auto& kv : tensor_storage_map) { + const std::string& tensor_name = kv.first; + size_t pos = tensor_name.find(layer_tag); + if (pos == std::string::npos) { + continue; + } + size_t start = pos + layer_tag.size(); + size_t end = tensor_name.find('.', start); + if (end == std::string::npos) { + continue; + } + int64_t layer_id = atoll(tensor_name.substr(start, end - start).c_str()); + num_layers = std::max(num_layers, layer_id + 1); + } + if (num_layers <= 0) { + num_layers = 28; + } + LOG_INFO("anima net layers: %" PRId64, num_layers); + + net = AnimaNet(num_layers); + net.init(params_ctx, 
tensor_storage_map, prefix + ".net"); + } + + std::string get_desc() override { + return "anima"; + } + + void get_param_tensors(std::map& tensors, const std::string prefix) { + net.get_param_tensors(tensors, prefix + ".net"); + } + + static std::vector gen_1d_rope_pe_vec(int64_t seq_len, int dim, float theta = 10000.f) { + std::vector pos(seq_len); + for (int64_t i = 0; i < seq_len; i++) { + pos[i] = static_cast(i); + } + auto rope_emb = Rope::rope(pos, dim, theta); + return Rope::flatten(rope_emb); + } + + static float calc_ntk_factor(float extrapolation_ratio, int axis_dim) { + if (extrapolation_ratio == 1.0f || axis_dim <= 2) { + return 1.0f; + } + return std::pow(extrapolation_ratio, static_cast(axis_dim) / static_cast(axis_dim - 2)); + } + + static std::vector gen_anima_image_pe_vec(int bs, + int h, + int w, + int patch_size, + int theta, + const std::vector& axes_dim, + float h_extrapolation_ratio, + float w_extrapolation_ratio, + float t_extrapolation_ratio) { + static const std::vector empty_ref_latents; + auto ids = Rope::gen_flux_ids(h, + w, + patch_size, + bs, + static_cast(axes_dim.size()), + 0, + {}, + empty_ref_latents, + false, + 1.0f); + + std::vector axis_thetas = { + static_cast(theta) * calc_ntk_factor(t_extrapolation_ratio, axes_dim[0]), + static_cast(theta) * calc_ntk_factor(h_extrapolation_ratio, axes_dim[1]), + static_cast(theta) * calc_ntk_factor(w_extrapolation_ratio, axes_dim[2]), + }; + return Rope::embed_nd(ids, bs, axis_thetas, axes_dim); + } + + struct ggml_cgraph* build_graph(struct ggml_tensor* x, + struct ggml_tensor* timesteps, + struct ggml_tensor* context, + struct ggml_tensor* t5_ids = nullptr, + struct ggml_tensor* t5_weights = nullptr) { + GGML_ASSERT(x->ne[3] == 1); + struct ggml_cgraph* gf = new_graph_custom(ANIMA_GRAPH_SIZE); + + x = to_backend(x); + timesteps = to_backend(timesteps); + context = to_backend(context); + t5_ids = to_backend(t5_ids); + t5_weights = to_backend(t5_weights); + + int64_t pad_h = (net.patch_size - 
x->ne[1] % net.patch_size) % net.patch_size; + int64_t pad_w = (net.patch_size - x->ne[0] % net.patch_size) % net.patch_size; + int64_t h_pad = x->ne[1] + pad_h; + int64_t w_pad = x->ne[0] + pad_w; + + image_pe_vec = gen_anima_image_pe_vec(1, + static_cast(h_pad), + static_cast(w_pad), + static_cast(net.patch_size), + net.theta, + net.axes_dim, + 4.0f, + 4.0f, + 1.0f); + int64_t image_pos_len = static_cast(image_pe_vec.size()) / (2 * 2 * (net.head_dim / 2)); + auto image_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, net.head_dim / 2, image_pos_len); + set_backend_tensor_data(image_pe, image_pe_vec.data()); + + ggml_tensor* adapter_q_pe = nullptr; + ggml_tensor* adapter_k_pe = nullptr; + if (t5_ids != nullptr) { + int64_t target_len = t5_ids->ne[0]; + int64_t source_len = context->ne[1]; + + adapter_q_pe_vec = gen_1d_rope_pe_vec(target_len, 64, 10000.f); + adapter_k_pe_vec = gen_1d_rope_pe_vec(source_len, 64, 10000.f); + + int64_t target_pos_len = static_cast(adapter_q_pe_vec.size()) / (2 * 2 * 32); + int64_t source_pos_len = static_cast(adapter_k_pe_vec.size()) / (2 * 2 * 32); + + adapter_q_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, target_pos_len); + adapter_k_pe = ggml_new_tensor_4d(compute_ctx, GGML_TYPE_F32, 2, 2, 32, source_pos_len); + set_backend_tensor_data(adapter_q_pe, adapter_q_pe_vec.data()); + set_backend_tensor_data(adapter_k_pe, adapter_k_pe_vec.data()); + } + + auto runner_ctx = get_context(); + auto out = net.forward(&runner_ctx, + x, + timesteps, + context, + image_pe, + t5_ids, + t5_weights, + adapter_q_pe, + adapter_k_pe); + + ggml_build_forward_expand(gf, out); + return gf; + } + + bool compute(int n_threads, + struct ggml_tensor* x, + struct ggml_tensor* timesteps, + struct ggml_tensor* context, + struct ggml_tensor* t5_ids = nullptr, + struct ggml_tensor* t5_weights = nullptr, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) { + auto get_graph = [&]() -> struct ggml_cgraph* { 
+ return build_graph(x, timesteps, context, t5_ids, t5_weights); + }; + return GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); + } + }; +} // namespace Anima + +#endif // __ANIMA_HPP__ diff --git a/otherarch/sdcpp/common_block.hpp b/otherarch/sdcpp/common_block.hpp new file mode 100644 index 00000000000..435afa4f415 --- /dev/null +++ b/otherarch/sdcpp/common_block.hpp @@ -0,0 +1,593 @@ +#ifndef __COMMON_BLOCK_HPP__ +#define __COMMON_BLOCK_HPP__ + +#include "ggml_extend.hpp" + +class DownSampleBlock : public GGMLBlock { +protected: + int channels; + int out_channels; + bool vae_downsample; + +public: + DownSampleBlock(int channels, + int out_channels, + bool vae_downsample = false) + : channels(channels), + out_channels(out_channels), + vae_downsample(vae_downsample) { + if (vae_downsample) { + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {0, 0})); + } else { + blocks["op"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {2, 2}, {1, 1})); + } + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + // x: [N, channels, h, w] + if (vae_downsample) { + auto conv = std::dynamic_pointer_cast(blocks["conv"]); + + x = ggml_ext_pad(ctx->ggml_ctx, x, 1, 1, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled); + x = conv->forward(ctx, x); + } else { + auto conv = std::dynamic_pointer_cast(blocks["op"]); + + x = conv->forward(ctx, x); + } + return x; // [N, out_channels, h/2, w/2] + } +}; + +class UpSampleBlock : public GGMLBlock { +protected: + int channels; + int out_channels; + +public: + UpSampleBlock(int channels, + int out_channels) + : channels(channels), + out_channels(out_channels) { + blocks["conv"] = std::shared_ptr(new Conv2d(channels, out_channels, {3, 3}, {1, 1}, {1, 1})); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + // x: [N, channels, h, w] + auto conv = std::dynamic_pointer_cast(blocks["conv"]); + + x = 
ggml_upscale(ctx->ggml_ctx, x, 2, GGML_SCALE_MODE_NEAREST); // [N, channels, h*2, w*2] + x = conv->forward(ctx, x); // [N, out_channels, h*2, w*2] + return x; + } +}; + +class ResBlock : public GGMLBlock { +protected: + // network hparams + int64_t channels; // model_channels * (1, 1, 1, 2, 2, 4, 4, 4) + int64_t emb_channels; // time_embed_dim + int64_t out_channels; // mult * model_channels + std::pair kernel_size; + int dims; + bool skip_t_emb; + bool exchange_temb_dims; + + std::shared_ptr conv_nd(int dims, + int64_t in_channels, + int64_t out_channels, + std::pair kernel_size, + std::pair padding) { + GGML_ASSERT(dims == 2 || dims == 3); + if (dims == 3) { + return std::shared_ptr(new Conv3d(in_channels, out_channels, {kernel_size.first, 1, 1}, {1, 1, 1}, {padding.first, 0, 0})); + } else { + return std::shared_ptr(new Conv2d(in_channels, out_channels, kernel_size, {1, 1}, padding)); + } + } + +public: + ResBlock(int64_t channels, + int64_t emb_channels, + int64_t out_channels, + std::pair kernel_size = {3, 3}, + int dims = 2, + bool exchange_temb_dims = false, + bool skip_t_emb = false) + : channels(channels), + emb_channels(emb_channels), + out_channels(out_channels), + kernel_size(kernel_size), + dims(dims), + skip_t_emb(skip_t_emb), + exchange_temb_dims(exchange_temb_dims) { + std::pair padding = {kernel_size.first / 2, kernel_size.second / 2}; + blocks["in_layers.0"] = std::shared_ptr(new GroupNorm32(channels)); + // in_layer_1 is nn.SILU() + blocks["in_layers.2"] = conv_nd(dims, channels, out_channels, kernel_size, padding); + + if (!skip_t_emb) { + // emb_layer_0 is nn.SILU() + blocks["emb_layers.1"] = std::shared_ptr(new Linear(emb_channels, out_channels)); + } + + blocks["out_layers.0"] = std::shared_ptr(new GroupNorm32(out_channels)); + // out_layer_1 is nn.SILU() + // out_layer_2 is nn.Dropout(), skip for inference + blocks["out_layers.3"] = conv_nd(dims, out_channels, out_channels, kernel_size, padding); + + if (out_channels != channels) { + 
blocks["skip_connection"] = conv_nd(dims, channels, out_channels, {1, 1}, {0, 0}); + } + } + + virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* emb = nullptr) { + // For dims==3, we reduce dimension from 5d to 4d by merging h and w, in order not to change ggml + // [N, c, t, h, w] => [N, c, t, h * w] + // x: [N, channels, h, w] if dims == 2 else [N, channels, t, h, w] + // emb: [N, emb_channels] if dims == 2 else [N, t, emb_channels] + auto in_layers_0 = std::dynamic_pointer_cast(blocks["in_layers.0"]); + auto in_layers_2 = std::dynamic_pointer_cast(blocks["in_layers.2"]); + auto out_layers_0 = std::dynamic_pointer_cast(blocks["out_layers.0"]); + auto out_layers_3 = std::dynamic_pointer_cast(blocks["out_layers.3"]); + + if (emb == nullptr) { + GGML_ASSERT(skip_t_emb); + } + + // in_layers + auto h = in_layers_0->forward(ctx, x); + h = ggml_silu_inplace(ctx->ggml_ctx, h); + h = in_layers_2->forward(ctx, h); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w] + + // emb_layers + if (!skip_t_emb) { + auto emb_layer_1 = std::dynamic_pointer_cast(blocks["emb_layers.1"]); + + auto emb_out = ggml_silu(ctx->ggml_ctx, emb); + emb_out = emb_layer_1->forward(ctx, emb_out); // [N, out_channels] if dims == 2 else [N, t, out_channels] + + if (dims == 2) { + emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, 1, emb_out->ne[0], emb_out->ne[1]); // [N, out_channels, 1, 1] + } else { + emb_out = ggml_reshape_4d(ctx->ggml_ctx, emb_out, 1, emb_out->ne[0], emb_out->ne[1], emb_out->ne[2]); // [N, t, out_channels, 1] + if (exchange_temb_dims) { + // emb_out = rearrange(emb_out, "b t c ... 
-> b c t ...") + emb_out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, emb_out, 0, 2, 1, 3)); // [N, out_channels, t, 1] + } + } + + h = ggml_add(ctx->ggml_ctx, h, emb_out); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w] + } + + // out_layers + h = out_layers_0->forward(ctx, h); + h = ggml_silu_inplace(ctx->ggml_ctx, h); + // dropout, skip for inference + h = out_layers_3->forward(ctx, h); + + // skip connection + if (out_channels != channels) { + auto skip_connection = std::dynamic_pointer_cast(blocks["skip_connection"]); + x = skip_connection->forward(ctx, x); // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w] + } + + h = ggml_add(ctx->ggml_ctx, h, x); + return h; // [N, out_channels, h, w] if dims == 2 else [N, out_channels, t, h, w] + } +}; + +class GEGLU : public UnaryBlock { +protected: + int64_t dim_in; + int64_t dim_out; + +public: + GEGLU(int64_t dim_in, int64_t dim_out) + : dim_in(dim_in), dim_out(dim_out) { + blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out * 2)); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + // x: [ne3, ne2, ne1, dim_in] + // return: [ne3, ne2, ne1, dim_out] + auto proj = std::dynamic_pointer_cast(blocks["proj"]); + + x = proj->forward(ctx, x); // [ne3, ne2, ne1, dim_out*2] + auto x_vec = ggml_ext_chunk(ctx->ggml_ctx, x, 2, 0, false); + x = x_vec[0]; // [ne3, ne2, ne1, dim_out] + auto gate = x_vec[1]; // [ne3, ne2, ne1, dim_out] + + gate = ggml_cont(ctx->ggml_ctx, gate); + + gate = ggml_ext_gelu(ctx->ggml_ctx, gate, true); + + x = ggml_mul(ctx->ggml_ctx, x, gate); // [ne3, ne2, ne1, dim_out] + + return x; + } +}; + +class GELU : public UnaryBlock { +public: + GELU(int64_t dim_in, int64_t dim_out, bool bias = true) { + blocks["proj"] = std::shared_ptr(new Linear(dim_in, dim_out, bias)); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) override { + // x: [ne3, ne2, ne1, dim_in] + // return: 
[ne3, ne2, ne1, dim_out] + auto proj = std::dynamic_pointer_cast(blocks["proj"]); + + x = proj->forward(ctx, x); + x = ggml_ext_gelu(ctx->ggml_ctx, x, true); + return x; + } +}; + +class FeedForward : public GGMLBlock { +public: + enum class Activation { + GEGLU, + GELU + }; + FeedForward(int64_t dim, + int64_t dim_out, + int64_t mult = 4, + Activation activation = Activation::GEGLU, + bool precision_fix = false) { + int64_t inner_dim = dim * mult; + if (activation == Activation::GELU) { + blocks["net.0"] = std::shared_ptr(new GELU(dim, inner_dim)); + } else { + blocks["net.0"] = std::shared_ptr(new GEGLU(dim, inner_dim)); + } + + // net_1 is nn.Dropout(), skip for inference + bool force_prec_f32 = false; + float scale = 1.f; + if (precision_fix) { + scale = 1.f / 128.f; +#ifdef SD_USE_VULKAN + force_prec_f32 = true; +#endif + } + // The purpose of the scale here is to prevent NaN issues in certain situations. + // For example, when using Vulkan without enabling force_prec_f32, + // or when using CUDA but the weights are k-quants. 
+ blocks["net.2"] = std::shared_ptr(new Linear(inner_dim, dim_out, true, false, force_prec_f32, scale)); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, struct ggml_tensor* x) { + // x: [ne3, ne2, ne1, dim] + // return: [ne3, ne2, ne1, dim_out] + + auto net_0 = std::dynamic_pointer_cast(blocks["net.0"]); + auto net_2 = std::dynamic_pointer_cast(blocks["net.2"]); + + x = net_0->forward(ctx, x); // [ne3, ne2, ne1, inner_dim] + x = net_2->forward(ctx, x); // [ne3, ne2, ne1, dim_out] + return x; + } +}; + +class CrossAttention : public GGMLBlock { +protected: + int64_t query_dim; + int64_t context_dim; + int64_t n_head; + int64_t d_head; + +public: + CrossAttention(int64_t query_dim, + int64_t context_dim, + int64_t n_head, + int64_t d_head) + : n_head(n_head), + d_head(d_head), + query_dim(query_dim), + context_dim(context_dim) { + int64_t inner_dim = d_head * n_head; + + blocks["to_q"] = std::shared_ptr(new Linear(query_dim, inner_dim, false)); + blocks["to_k"] = std::shared_ptr(new Linear(context_dim, inner_dim, false)); + blocks["to_v"] = std::shared_ptr(new Linear(context_dim, inner_dim, false)); + + blocks["to_out.0"] = std::shared_ptr(new Linear(inner_dim, query_dim)); + // to_out_1 is nn.Dropout(), skip for inference + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x, + struct ggml_tensor* context) { + // x: [N, n_token, query_dim] + // context: [N, n_context, context_dim] + // return: [N, n_token, query_dim] + auto to_q = std::dynamic_pointer_cast(blocks["to_q"]); + auto to_k = std::dynamic_pointer_cast(blocks["to_k"]); + auto to_v = std::dynamic_pointer_cast(blocks["to_v"]); + auto to_out_0 = std::dynamic_pointer_cast(blocks["to_out.0"]); + + int64_t n = x->ne[2]; + int64_t n_token = x->ne[1]; + int64_t n_context = context->ne[1]; + int64_t inner_dim = d_head * n_head; + + auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim] + auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim] + auto v = 
to_v->forward(ctx, context); // [N, n_context, inner_dim] + + x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim] + + x = to_out_0->forward(ctx, x); // [N, n_token, query_dim] + return x; + } +}; + +class BasicTransformerBlock : public GGMLBlock { +protected: + int64_t n_head; + int64_t d_head; + bool ff_in; + +public: + BasicTransformerBlock(int64_t dim, + int64_t n_head, + int64_t d_head, + int64_t context_dim, + bool ff_in = false) + : n_head(n_head), d_head(d_head), ff_in(ff_in) { + // disable_self_attn is always False + // disable_temporal_crossattention is always False + // switch_temporal_ca_to_sa is always False + // inner_dim is always None or equal to dim + // gated_ff is always True + blocks["attn1"] = std::shared_ptr(new CrossAttention(dim, dim, n_head, d_head)); + blocks["attn2"] = std::shared_ptr(new CrossAttention(dim, context_dim, n_head, d_head)); + blocks["ff"] = std::shared_ptr(new FeedForward(dim, dim)); + blocks["norm1"] = std::shared_ptr(new LayerNorm(dim)); + blocks["norm2"] = std::shared_ptr(new LayerNorm(dim)); + blocks["norm3"] = std::shared_ptr(new LayerNorm(dim)); + + if (ff_in) { + blocks["norm_in"] = std::shared_ptr(new LayerNorm(dim)); + blocks["ff_in"] = std::shared_ptr(new FeedForward(dim, dim)); + } + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x, + struct ggml_tensor* context) { + // x: [N, n_token, query_dim] + // context: [N, n_context, context_dim] + // return: [N, n_token, query_dim] + + auto attn1 = std::dynamic_pointer_cast(blocks["attn1"]); + auto attn2 = std::dynamic_pointer_cast(blocks["attn2"]); + auto ff = std::dynamic_pointer_cast(blocks["ff"]); + auto norm1 = std::dynamic_pointer_cast(blocks["norm1"]); + auto norm2 = std::dynamic_pointer_cast(blocks["norm2"]); + auto norm3 = std::dynamic_pointer_cast(blocks["norm3"]); + + if (ff_in) { + auto norm_in = 
std::dynamic_pointer_cast(blocks["norm_in"]); + auto ff_in = std::dynamic_pointer_cast(blocks["ff_in"]); + + auto x_skip = x; + x = norm_in->forward(ctx, x); + x = ff_in->forward(ctx, x); + // self.is_res is always True + x = ggml_add(ctx->ggml_ctx, x, x_skip); + } + + auto r = x; + x = norm1->forward(ctx, x); + x = attn1->forward(ctx, x, x); // self-attention + x = ggml_add(ctx->ggml_ctx, x, r); + r = x; + x = norm2->forward(ctx, x); + x = attn2->forward(ctx, x, context); // cross-attention + x = ggml_add(ctx->ggml_ctx, x, r); + r = x; + x = norm3->forward(ctx, x); + x = ff->forward(ctx, x); + x = ggml_add(ctx->ggml_ctx, x, r); + + return x; + } +}; + +class SpatialTransformer : public GGMLBlock { +protected: + int64_t in_channels; // mult * model_channels + int64_t n_head; + int64_t d_head; + int64_t depth = 1; // 1 + int64_t context_dim = 768; // hidden_size, 1024 for VERSION_SD2 + bool use_linear = false; + + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") { + auto iter = tensor_storage_map.find(prefix + "proj_out.weight"); + if (iter != tensor_storage_map.end()) { + int64_t inner_dim = n_head * d_head; + if (iter->second.n_dims == 4 && use_linear) { + use_linear = false; + blocks["proj_in"] = std::make_shared(in_channels, inner_dim, std::pair{1, 1}); + blocks["proj_out"] = std::make_shared(inner_dim, in_channels, std::pair{1, 1}); + } else if (iter->second.n_dims == 2 && !use_linear) { + use_linear = true; + blocks["proj_in"] = std::make_shared(in_channels, inner_dim); + blocks["proj_out"] = std::make_shared(inner_dim, in_channels); + } + } + } + +public: + SpatialTransformer(int64_t in_channels, + int64_t n_head, + int64_t d_head, + int64_t depth, + int64_t context_dim, + bool use_linear) + : in_channels(in_channels), + n_head(n_head), + d_head(d_head), + depth(depth), + context_dim(context_dim), + use_linear(use_linear) { + // disable_self_attn is always False + int64_t inner_dim 
= n_head * d_head; // in_channels + blocks["norm"] = std::shared_ptr(new GroupNorm32(in_channels)); + if (use_linear) { + blocks["proj_in"] = std::shared_ptr(new Linear(in_channels, inner_dim)); + } else { + blocks["proj_in"] = std::shared_ptr(new Conv2d(in_channels, inner_dim, {1, 1})); + } + + for (int i = 0; i < depth; i++) { + std::string name = "transformer_blocks." + std::to_string(i); + blocks[name] = std::shared_ptr(new BasicTransformerBlock(inner_dim, n_head, d_head, context_dim, false)); + } + + if (use_linear) { + blocks["proj_out"] = std::shared_ptr(new Linear(inner_dim, in_channels)); + } else { + blocks["proj_out"] = std::shared_ptr(new Conv2d(inner_dim, in_channels, {1, 1})); + } + } + + virtual struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x, + struct ggml_tensor* context) { + // x: [N, in_channels, h, w] + // context: [N, max_position(aka n_token), hidden_size(aka context_dim)] + auto norm = std::dynamic_pointer_cast(blocks["norm"]); + auto proj_in = std::dynamic_pointer_cast(blocks["proj_in"]); + auto proj_out = std::dynamic_pointer_cast(blocks["proj_out"]); + + auto x_in = x; + int64_t n = x->ne[3]; + int64_t h = x->ne[1]; + int64_t w = x->ne[0]; + int64_t inner_dim = n_head * d_head; + + x = norm->forward(ctx, x); + if (use_linear) { + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim] + x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim] + x = proj_in->forward(ctx, x); // [N, inner_dim, h, w] + } else { + x = proj_in->forward(ctx, x); // [N, inner_dim, h, w] + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 2, 0, 3)); // [N, h, w, inner_dim] + x = ggml_reshape_3d(ctx->ggml_ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim] + } + + for (int i = 0; i < depth; i++) { + std::string name = "transformer_blocks." 
+ std::to_string(i); + auto transformer_block = std::dynamic_pointer_cast(blocks[name]); + + x = transformer_block->forward(ctx, x, context); + } + + if (use_linear) { + // proj_out + x = proj_out->forward(ctx, x); // [N, in_channels, h, w] + + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w] + x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w] + } else { + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 1, 0, 2, 3)); // [N, inner_dim, h * w] + x = ggml_reshape_4d(ctx->ggml_ctx, x, w, h, inner_dim, n); // [N, inner_dim, h, w] + + // proj_out + x = proj_out->forward(ctx, x); // [N, in_channels, h, w] + } + + x = ggml_add(ctx->ggml_ctx, x, x_in); + return x; + } +}; + +class AlphaBlender : public GGMLBlock { +protected: + void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, std::string prefix = "") override { + // Get the type of the "mix_factor" tensor from the input tensors map with the specified prefix + enum ggml_type wtype = GGML_TYPE_F32; + params["mix_factor"] = ggml_new_tensor_1d(ctx, wtype, 1); + } + + float get_alpha() { + // image_only_indicator is always tensor([0.]) and since mix_factor.shape is [1,] + // so learned_with_images is same as learned + float alpha = ggml_ext_backend_tensor_get_f32(params["mix_factor"]); + return sigmoid(alpha); + } + +public: + AlphaBlender() { + // merge_strategy is always learned_with_images + // for inference, we don't need to set alpha + // since mix_factor.shape is [1,], we don't need rearrange using rearrange_pattern + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x_spatial, + struct ggml_tensor* x_temporal) { + // image_only_indicator is always tensor([0.]) + float alpha = get_alpha(); + auto x = ggml_add(ctx->ggml_ctx, + ggml_ext_scale(ctx->ggml_ctx, x_spatial, alpha), + ggml_ext_scale(ctx->ggml_ctx, x_temporal, 1.0f - alpha)); + return x; + } +}; + 
+class VideoResBlock : public ResBlock { +public: + VideoResBlock(int64_t channels, + int64_t emb_channels, + int64_t out_channels, + std::pair kernel_size = {3, 3}, + int64_t video_kernel_size = 3, + int dims = 2) // always 2 + : ResBlock(channels, emb_channels, out_channels, kernel_size, dims) { + blocks["time_stack"] = std::shared_ptr(new ResBlock(out_channels, emb_channels, out_channels, kernel_size, 3, true)); + blocks["time_mixer"] = std::shared_ptr(new AlphaBlender()); + } + + struct ggml_tensor* forward(GGMLRunnerContext* ctx, + struct ggml_tensor* x, + struct ggml_tensor* emb, + int num_video_frames) { + // x: [N, channels, h, w] aka [b*t, channels, h, w] + // emb: [N, emb_channels] aka [b*t, emb_channels] + // image_only_indicator is always tensor([0.]) + auto time_stack = std::dynamic_pointer_cast(blocks["time_stack"]); + auto time_mixer = std::dynamic_pointer_cast(blocks["time_mixer"]); + + x = ResBlock::forward(ctx, x, emb); + + int64_t T = num_video_frames; + int64_t B = x->ne[3] / T; + int64_t C = x->ne[2]; + int64_t H = x->ne[1]; + int64_t W = x->ne[0]; + + x = ggml_reshape_4d(ctx->ggml_ctx, x, W * H, C, T, B); // (b t) c h w -> b t c (h w) + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b t c (h w) -> b c t (h w) + auto x_mix = x; + + emb = ggml_reshape_4d(ctx->ggml_ctx, emb, emb->ne[0], T, B, emb->ne[3]); // (b t) ... -> b t ... 
+ + x = time_stack->forward(ctx, x, emb); // b t c (h w) + + x = time_mixer->forward(ctx, x_mix, x); // b t c (h w) + + x = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, x, 0, 2, 1, 3)); // b c t (h w) -> b t c (h w) + x = ggml_reshape_4d(ctx->ggml_ctx, x, W, H, C, T * B); // b t c (h w) -> (b t) c h w + + return x; + } +}; + +#endif // __COMMON_BLOCK_HPP__ diff --git a/otherarch/sdcpp/common_dit.hpp b/otherarch/sdcpp/common_dit.hpp new file mode 100644 index 00000000000..0e6f0f0870a --- /dev/null +++ b/otherarch/sdcpp/common_dit.hpp @@ -0,0 +1,108 @@ +#ifndef __COMMON_DIT_HPP__ +#define __COMMON_DIT_HPP__ + +#include "ggml_extend.hpp" + +namespace DiT { + ggml_tensor* patchify(ggml_context* ctx, + ggml_tensor* x, + int pw, + int ph, + bool patch_last = true) { + // x: [N, C, H, W] + // return: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C] + int64_t N = x->ne[3]; + int64_t C = x->ne[2]; + int64_t H = x->ne[1]; + int64_t W = x->ne[0]; + int64_t h = H / ph; + int64_t w = W / pw; + + GGML_ASSERT(h * ph == H && w * pw == W); + + x = ggml_reshape_4d(ctx, x, pw, w, ph, h * C * N); // [N*C*h, ph, w, pw] + x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, ph, pw] + x = ggml_reshape_4d(ctx, x, pw * ph, w * h, C, N); // [N, C, h*w, ph*pw] + if (patch_last) { + x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, ph*pw] + x = ggml_reshape_3d(ctx, x, pw * ph * C, w * h, N); // [N, h*w, C*ph*pw] + } else { + x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3)); // [N, h*w, C, ph*pw] + x = ggml_reshape_3d(ctx, x, C * pw * ph, w * h, N); // [N, h*w, ph*pw*C] + } + return x; + } + + ggml_tensor* unpatchify(ggml_context* ctx, + ggml_tensor* x, + int64_t h, + int64_t w, + int ph, + int pw, + bool patch_last = true) { + // x: [N, h*w, C*ph*pw] if patch_last else [N, h*w, ph*pw*C] + // return: [N, C, H, W] + int64_t N = x->ne[2]; + int64_t C = x->ne[0] / ph / pw; + int64_t H = h * ph; + int64_t W = w * pw; + + GGML_ASSERT(C * 
ph * pw == x->ne[0]); + + if (patch_last) { + x = ggml_reshape_4d(ctx, x, pw * ph, C, w * h, N); // [N, h*w, C, ph*pw] + x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, ph*pw] + } else { + x = ggml_reshape_4d(ctx, x, C, pw * ph, w * h, N); // [N, h*w, ph*pw, C] + x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, h*w, ph*pw] + } + + x = ggml_reshape_4d(ctx, x, pw, ph, w, h * C * N); // [N*C*h, w, ph, pw] + x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, ph, w, pw] + x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*ph, w*pw] + + return x; + } + + ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, + ggml_tensor* x, + int ph, + int pw) { + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; + + int pad_h = (ph - H % ph) % ph; + int pad_w = (pw - W % pw) % pw; + x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled); + return x; + } + + ggml_tensor* pad_and_patchify(GGMLRunnerContext* ctx, + ggml_tensor* x, + int ph, + int pw, + bool patch_last = true) { + x = pad_to_patch_size(ctx, x, ph, pw); + x = patchify(ctx->ggml_ctx, x, ph, pw, patch_last); + return x; + } + + ggml_tensor* unpatchify_and_crop(ggml_context* ctx, + ggml_tensor* x, + int64_t H, + int64_t W, + int ph, + int pw, + bool patch_last = true) { + int pad_h = (ph - H % ph) % ph; + int pad_w = (pw - W % pw) % pw; + int64_t h = ((H + pad_h) / ph); + int64_t w = ((W + pad_w) / pw); + x = unpatchify(ctx, x, h, w, ph, pw, patch_last); // [N, C, H + pad_h, W + pad_w] + x = ggml_ext_slice(ctx, x, 1, 0, H); // [N, C, H, W + pad_w] + x = ggml_ext_slice(ctx, x, 0, 0, W); // [N, C, H, W] + return x; + } +} // namespace DiT + +#endif // __COMMON_DIT_HPP__ \ No newline at end of file diff --git a/otherarch/sdcpp/conditioner.hpp b/otherarch/sdcpp/conditioner.hpp index 4317ed18a97..d4a3146b8c4 100644 --- a/otherarch/sdcpp/conditioner.hpp +++ b/otherarch/sdcpp/conditioner.hpp @@ -1641,6 +1641,142 @@ struct T5CLIPEmbedder 
: public Conditioner { } }; +struct AnimaConditioner : public Conditioner { + std::shared_ptr qwen_tokenizer; + T5UniGramTokenizer t5_tokenizer; + std::shared_ptr llm; + + AnimaConditioner(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}) { + qwen_tokenizer = std::make_shared(); + llm = std::make_shared(LLM::LLMArch::QWEN3, + backend, + offload_params_to_cpu, + tensor_storage_map, + "text_encoders.llm", + false); + } + + void get_param_tensors(std::map& tensors) override { + llm->get_param_tensors(tensors, "text_encoders.llm"); + } + + void alloc_params_buffer() override { + llm->alloc_params_buffer(); + } + + void free_params_buffer() override { + llm->free_params_buffer(); + } + + size_t get_params_buffer_size() override { + return llm->get_params_buffer_size(); + } + + void set_flash_attention_enabled(bool enabled) override { + llm->set_flash_attention_enabled(enabled); + } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + llm->set_weight_adapter(adapter); + } + + std::tuple, std::vector, std::vector, std::vector> tokenize(std::string text) { + auto parsed_attention = parse_prompt_attention(text); + + { + std::stringstream ss; + ss << "["; + for (const auto& item : parsed_attention) { + ss << "['" << item.first << "', " << item.second << "], "; + } + ss << "]"; + LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str()); + } + + std::vector qwen_tokens; + std::vector qwen_weights; + std::vector t5_tokens; + std::vector t5_weights; + + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + std::vector curr_tokens = qwen_tokenizer->tokenize(curr_text, nullptr); + qwen_tokens.insert(qwen_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + // Anima uses uniform Qwen token weights. 
+ qwen_weights.insert(qwen_weights.end(), curr_tokens.size(), 1.f); + } + if (qwen_tokens.empty()) { + qwen_tokens.push_back(151643); // qwen3 pad token + qwen_weights.push_back(1.f); + } + + for (const auto& item : parsed_attention) { + const std::string& curr_text = item.first; + float curr_weight = item.second; + std::vector curr_tokens = t5_tokenizer.Encode(curr_text, true); + t5_tokens.insert(t5_tokens.end(), curr_tokens.begin(), curr_tokens.end()); + t5_weights.insert(t5_weights.end(), curr_tokens.size(), curr_weight); + } + + return {qwen_tokens, qwen_weights, t5_tokens, t5_weights}; + } + + SDCondition get_learned_condition(ggml_context* work_ctx, + int n_threads, + const ConditionerParams& conditioner_params) override { + int64_t t0 = ggml_time_ms(); + + auto tokenized = tokenize(conditioner_params.text); + auto& qwen_tokens = std::get<0>(tokenized); + auto& qwen_weights = std::get<1>(tokenized); + auto& t5_tokens = std::get<2>(tokenized); + auto& t5_weights = std::get<3>(tokenized); + + auto input_ids = vector_to_ggml_tensor_i32(work_ctx, qwen_tokens); + + struct ggml_tensor* hidden_states = nullptr; // [N, n_token, 1024] + llm->compute(n_threads, + input_ids, + nullptr, + {}, + {}, + &hidden_states, + work_ctx); + + { + auto tensor = hidden_states; + float original_mean = ggml_ext_tensor_mean(tensor); + for (int i2 = 0; i2 < tensor->ne[2]; i2++) { + for (int i1 = 0; i1 < tensor->ne[1]; i1++) { + for (int i0 = 0; i0 < tensor->ne[0]; i0++) { + float value = ggml_ext_tensor_get_f32(tensor, i0, i1, i2); + value *= qwen_weights[i1]; + ggml_ext_tensor_set_f32(tensor, value, i0, i1, i2); + } + } + } + float new_mean = ggml_ext_tensor_mean(tensor); + if (new_mean != 0.f) { + ggml_ext_tensor_scale_inplace(tensor, (original_mean / new_mean)); + } + } + + struct ggml_tensor* t5_ids_tensor = nullptr; + struct ggml_tensor* t5_weight_tensor = nullptr; + if (!t5_tokens.empty()) { + t5_ids_tensor = vector_to_ggml_tensor_i32(work_ctx, t5_tokens); + t5_weight_tensor = 
vector_to_ggml_tensor(work_ctx, t5_weights); + } + + int64_t t1 = ggml_time_ms(); + LOG_DEBUG("computing condition graph completed, taking %" PRId64 " ms", t1 - t0); + + return {hidden_states, t5_weight_tensor, t5_ids_tensor}; + } +}; + struct LLMEmbedder : public Conditioner { SDVersion version; std::shared_ptr tokenizer; diff --git a/otherarch/sdcpp/control.hpp b/otherarch/sdcpp/control.hpp index f7842021c7b..5bab0381a3b 100644 --- a/otherarch/sdcpp/control.hpp +++ b/otherarch/sdcpp/control.hpp @@ -1,8 +1,7 @@ #ifndef __CONTROL_HPP__ #define __CONTROL_HPP__ -#include "common.hpp" -#include "ggml_extend.hpp" +#include "common_block.hpp" #include "model.h" #define CONTROL_NET_GRAPH_SIZE 1536 diff --git a/otherarch/sdcpp/diffusion_model.hpp b/otherarch/sdcpp/diffusion_model.hpp index 3293ba9b702..329bb9d9a96 100644 --- a/otherarch/sdcpp/diffusion_model.hpp +++ b/otherarch/sdcpp/diffusion_model.hpp @@ -1,6 +1,7 @@ #ifndef __DIFFUSION_MODEL_H__ #define __DIFFUSION_MODEL_H__ +#include "anima.hpp" #include "flux.hpp" #include "mmdit.hpp" #include "qwen_image.hpp" @@ -242,6 +243,72 @@ struct FluxModel : public DiffusionModel { } }; +struct AnimaModel : public DiffusionModel { + std::string prefix; + Anima::AnimaRunner anima; + + AnimaModel(ggml_backend_t backend, + bool offload_params_to_cpu, + const String2TensorStorage& tensor_storage_map = {}, + const std::string prefix = "model.diffusion_model") + : prefix(prefix), anima(backend, offload_params_to_cpu, tensor_storage_map, prefix) { + } + + std::string get_desc() override { + return anima.get_desc(); + } + + void alloc_params_buffer() override { + anima.alloc_params_buffer(); + } + + void free_params_buffer() override { + anima.free_params_buffer(); + } + + void free_compute_buffer() override { + anima.free_compute_buffer(); + } + + void get_param_tensors(std::map& tensors) override { + anima.get_param_tensors(tensors, prefix); + } + + size_t get_params_buffer_size() override { + return anima.get_params_buffer_size(); 
+ } + + void set_weight_adapter(const std::shared_ptr& adapter) override { + anima.set_weight_adapter(adapter); + } + + int64_t get_adm_in_channels() override { + return 768; + } + + void set_flash_attention_enabled(bool enabled) { + anima.set_flash_attention_enabled(enabled); + } + + void set_circular_axes(bool circular_x, bool circular_y) override { + anima.set_circular_axes(circular_x, circular_y); + } + + bool compute(int n_threads, + DiffusionParams diffusion_params, + struct ggml_tensor** output = nullptr, + struct ggml_context* output_ctx = nullptr) override { + return anima.compute(n_threads, + diffusion_params.x, + diffusion_params.timesteps, + diffusion_params.context, + diffusion_params.c_concat, + diffusion_params.y, + output, + output_ctx); + } +}; + struct WanModel : public DiffusionModel { std::string prefix; WAN::WanRunner wan; diff --git a/otherarch/sdcpp/flux.hpp b/otherarch/sdcpp/flux.hpp index ff8c18997fb..1204ae1e5e9 100644 --- a/otherarch/sdcpp/flux.hpp +++ b/otherarch/sdcpp/flux.hpp @@ -4,7 +4,7 @@ #include #include -#include "ggml_extend.hpp" +#include "common_dit.hpp" #include "model.h" #include "rope.hpp" @@ -103,11 +103,13 @@ namespace Flux { auto norm = std::dynamic_pointer_cast(blocks["norm"]); auto qkv = qkv_proj->forward(ctx, x); - auto qkv_vec = ggml_ext_chunk(ctx->ggml_ctx, qkv, 3, 0, true); - int64_t head_dim = qkv_vec[0]->ne[0] / num_heads; - auto q = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[0], head_dim, num_heads, qkv_vec[0]->ne[1], qkv_vec[0]->ne[2]); - auto k = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[1], head_dim, num_heads, qkv_vec[1]->ne[1], qkv_vec[1]->ne[2]); - auto v = ggml_reshape_4d(ctx->ggml_ctx, qkv_vec[2], head_dim, num_heads, qkv_vec[2]->ne[1], qkv_vec[2]->ne[2]); + int64_t head_dim = qkv->ne[0] / 3 / num_heads; + auto q = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2], + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], 0); + auto k = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, 
qkv->ne[1], qkv->ne[2], + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * qkv->ne[0] / 3); + auto v = ggml_view_4d(ctx->ggml_ctx, qkv, head_dim, num_heads, qkv->ne[1], qkv->ne[2], + qkv->nb[0] * head_dim, qkv->nb[1], qkv->nb[2], (qkv->nb[0]) * 2 * qkv->ne[0] / 3); q = norm->query_norm(ctx, q); k = norm->key_norm(ctx, k); return {q, k, v}; @@ -491,15 +493,14 @@ namespace Flux { auto x_mod = Flux::modulate(ctx->ggml_ctx, pre_norm->forward(ctx, x), mod.shift, mod.scale); auto qkv_mlp = linear1->forward(ctx, x_mod); // [N, n_token, hidden_size * 3 + mlp_hidden_dim*mlp_mult_factor] - auto q = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], 0); - auto k = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * qkv_mlp->nb[0]); - auto v = ggml_view_3d(ctx->ggml_ctx, qkv_mlp, hidden_size, qkv_mlp->ne[1], qkv_mlp->ne[2], qkv_mlp->nb[1], qkv_mlp->nb[2], hidden_size * 2 * qkv_mlp->nb[0]); - int64_t head_dim = hidden_size / num_heads; - q = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, q), head_dim, num_heads, q->ne[1], q->ne[2]); // [N, n_token, n_head, d_head] - k = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, k), head_dim, num_heads, k->ne[1], k->ne[2]); // [N, n_token, n_head, d_head] - v = ggml_reshape_4d(ctx->ggml_ctx, ggml_cont(ctx->ggml_ctx, v), head_dim, num_heads, v->ne[1], v->ne[2]); // [N, n_token, n_head, d_head] + auto q = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2], + qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], 0); + auto k = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2], + qkv_mlp->nb[0] * head_dim, qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * hidden_size); + auto v = ggml_view_4d(ctx->ggml_ctx, qkv_mlp, head_dim, num_heads, qkv_mlp->ne[1], qkv_mlp->ne[2], + qkv_mlp->nb[0] * head_dim, 
qkv_mlp->nb[1], qkv_mlp->nb[2], (qkv_mlp->nb[0]) * 2 * hidden_size); q = norm->query_norm(ctx, q); k = norm->key_norm(ctx, k); @@ -846,70 +847,6 @@ namespace Flux { } } - struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - - int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; - int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled); - return x; - } - - struct ggml_tensor* patchify(struct ggml_context* ctx, - struct ggml_tensor* x) { - // x: [N, C, H, W] - // return: [N, h*w, C * patch_size * patch_size] - int64_t N = x->ne[3]; - int64_t C = x->ne[2]; - int64_t H = x->ne[1]; - int64_t W = x->ne[0]; - int64_t p = params.patch_size; - int64_t h = H / params.patch_size; - int64_t w = W / params.patch_size; - - GGML_ASSERT(h * p == H && w * p == W); - - x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, p, p] - x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, p*p] - x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N); // [N, h*w, C*p*p] - return x; - } - - struct ggml_tensor* process_img(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { - // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) - x = pad_to_patch_size(ctx, x); - x = patchify(ctx->ggml_ctx, x); - return x; - } - - struct ggml_tensor* unpatchify(struct ggml_context* ctx, - struct ggml_tensor* x, - int64_t h, - int64_t w) { - // x: [N, h*w, C*patch_size*patch_size] - // return: [N, C, H, W] - int64_t N = x->ne[2]; - int64_t C = x->ne[0] / params.patch_size / params.patch_size; - int64_t H = h * params.patch_size; - int64_t W = w * params.patch_size; - int64_t 
p = params.patch_size; - - GGML_ASSERT(C * p * p == x->ne[0]); - - x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N); // [N, h*w, C, p*p] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, p*p] - x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p] - x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p] - - return x; - } - struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx, struct ggml_tensor* img, struct ggml_tensor* txt, @@ -1060,7 +997,7 @@ namespace Flux { int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - auto img = pad_to_patch_size(ctx, x); + auto img = DiT::pad_to_patch_size(ctx, x, params.patch_size, params.patch_size); auto orig_img = img; if (params.chroma_radiance_params.fake_patch_size_x2) { @@ -1082,7 +1019,7 @@ namespace Flux { auto nerf_image_embedder = std::dynamic_pointer_cast(blocks["nerf_image_embedder"]); auto nerf_final_layer_conv = std::dynamic_pointer_cast(blocks["nerf_final_layer_conv"]); - auto nerf_pixels = patchify(ctx->ggml_ctx, orig_img); // [N, num_patches, C * patch_size * patch_size] + auto nerf_pixels = DiT::patchify(ctx->ggml_ctx, orig_img, patch_size, patch_size); // [N, num_patches, C * patch_size * patch_size] int64_t num_patches = nerf_pixels->ne[1]; nerf_pixels = ggml_reshape_3d(ctx->ggml_ctx, nerf_pixels, @@ -1102,7 +1039,7 @@ namespace Flux { img_dct = ggml_cont(ctx->ggml_ctx, ggml_ext_torch_permute(ctx->ggml_ctx, img_dct, 1, 0, 2, 3)); // [N*num_patches, nerf_hidden_size, patch_size*patch_size] img_dct = ggml_reshape_3d(ctx->ggml_ctx, img_dct, img_dct->ne[0] * img_dct->ne[1], num_patches, img_dct->ne[2] / num_patches); // [N, num_patches, nerf_hidden_size*patch_size*patch_size] - img_dct = unpatchify(ctx->ggml_ctx, img_dct, (H + pad_h) / patch_size, (W + pad_w) / patch_size); // [N, nerf_hidden_size, H, W] + img_dct = 
DiT::unpatchify(ctx->ggml_ctx, img_dct, (H + pad_h) / patch_size, (W + pad_w) / patch_size, patch_size, patch_size); // [N, nerf_hidden_size, H, W] out = nerf_final_layer_conv->forward(ctx, img_dct); // [N, C, H, W] @@ -1134,7 +1071,7 @@ namespace Flux { int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; - auto img = process_img(ctx, x); + auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size); int64_t img_tokens = img->ne[1]; if (params.version == VERSION_FLUX_FILL) { @@ -1142,8 +1079,8 @@ namespace Flux { ggml_tensor* masked = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); - masked = process_img(ctx, masked); - mask = process_img(ctx, mask); + masked = DiT::pad_and_patchify(ctx, masked, patch_size, patch_size); + mask = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size); img = ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, masked, mask, 0), 0); } else if (params.version == VERSION_FLEX_2) { @@ -1152,21 +1089,21 @@ namespace Flux { ggml_tensor* mask = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 1, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); ggml_tensor* control = ggml_view_4d(ctx->ggml_ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * (C + 1)); - masked = process_img(ctx, masked); - mask = process_img(ctx, mask); - control = process_img(ctx, control); + masked = DiT::pad_and_patchify(ctx, masked, patch_size, patch_size); + mask = DiT::pad_and_patchify(ctx, mask, patch_size, patch_size); + control = DiT::pad_and_patchify(ctx, control, patch_size, patch_size); img = 
ggml_concat(ctx->ggml_ctx, img, ggml_concat(ctx->ggml_ctx, ggml_concat(ctx->ggml_ctx, masked, mask, 0), control, 0), 0); } else if (params.version == VERSION_FLUX_CONTROLS) { GGML_ASSERT(c_concat != nullptr); - auto control = process_img(ctx, c_concat); + auto control = DiT::pad_and_patchify(ctx, c_concat, patch_size, patch_size); img = ggml_concat(ctx->ggml_ctx, img, control, 0); } if (ref_latents.size() > 0) { for (ggml_tensor* ref : ref_latents) { - ref = process_img(ctx, ref); + ref = DiT::pad_and_patchify(ctx, ref, patch_size, patch_size); img = ggml_concat(ctx->ggml_ctx, img, ref, 1); } } @@ -1178,8 +1115,7 @@ namespace Flux { out = ggml_cont(ctx->ggml_ctx, out); } - // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2) - out = unpatchify(ctx->ggml_ctx, out, (H + pad_h) / patch_size, (W + pad_w) / patch_size); // [N, C, H + pad_h, W + pad_w] + out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, patch_size, patch_size); // [N, C, H, W] return out; } diff --git a/otherarch/sdcpp/ggml_extend.hpp b/otherarch/sdcpp/ggml_extend.hpp index cac79bb2165..5265aeed46b 100644 --- a/otherarch/sdcpp/ggml_extend.hpp +++ b/otherarch/sdcpp/ggml_extend.hpp @@ -1219,6 +1219,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros(struct ggml_context* ctx, return ggml_ext_full(ctx, 0.f, ne0, ne1, ne2, ne3); } +__STATIC_INLINE__ struct ggml_tensor* ggml_ext_zeros_like(struct ggml_context* ctx, + struct ggml_tensor* x) { + return ggml_ext_zeros(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); +} + __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx, int64_t ne0, int64_t ne1, @@ -1227,6 +1232,11 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones(struct ggml_context* ctx, return ggml_ext_full(ctx, 1.f, ne0, ne1, ne2, ne3); } +__STATIC_INLINE__ struct ggml_tensor* ggml_ext_ones_like(struct ggml_context* ctx, + struct ggml_tensor* x) { + return ggml_ext_ones(ctx, x->ne[0], x->ne[1], x->ne[2], x->ne[3]); +} + 
__STATIC_INLINE__ ggml_tensor* ggml_ext_cast_f32(ggml_context* ctx, ggml_tensor* a) { #ifdef SD_USE_VULKAN auto zero_index = ggml_get_tensor(ctx, "ggml_runner_build_in_tensor:zero_int"); diff --git a/otherarch/sdcpp/ltxv.hpp b/otherarch/sdcpp/ltxv.hpp index 0a2877a8639..9dcdd4b2058 100644 --- a/otherarch/sdcpp/ltxv.hpp +++ b/otherarch/sdcpp/ltxv.hpp @@ -1,8 +1,7 @@ #ifndef __LTXV_HPP__ #define __LTXV_HPP__ -#include "common.hpp" -#include "ggml_extend.hpp" +#include "common_block.hpp" namespace LTXV { diff --git a/otherarch/sdcpp/mmdit.hpp b/otherarch/sdcpp/mmdit.hpp index 726f60c2f0b..ba1c35d66e4 100644 --- a/otherarch/sdcpp/mmdit.hpp +++ b/otherarch/sdcpp/mmdit.hpp @@ -745,28 +745,6 @@ struct MMDiT : public GGMLBlock { return spatial_pos_embed; } - struct ggml_tensor* unpatchify(struct ggml_context* ctx, - struct ggml_tensor* x, - int64_t h, - int64_t w) { - // x: [N, H*W, patch_size * patch_size * C] - // return: [N, C, H, W] - int64_t n = x->ne[2]; - int64_t c = out_channels; - int64_t p = patch_size; - h = (h + 1) / p; - w = (w + 1) / p; - - GGML_ASSERT(h * w == x->ne[1]); - - x = ggml_reshape_4d(ctx, x, c, p * p, w * h, n); // [N, H*W, P*P, C] - x = ggml_cont(ctx, ggml_permute(ctx, x, 2, 0, 1, 3)); // [N, C, H*W, P*P] - x = ggml_reshape_4d(ctx, x, p, p, w, h * c * n); // [N*C*H, W, P, P] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*H, P, W, P] - x = ggml_reshape_4d(ctx, x, p * w, p * h, c, n); // [N, C, H*P, W*P] - return x; - } - struct ggml_tensor* forward_core_with_concat(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* c_mod, @@ -811,11 +789,11 @@ struct MMDiT : public GGMLBlock { auto x_embedder = std::dynamic_pointer_cast(blocks["x_embedder"]); auto t_embedder = std::dynamic_pointer_cast(blocks["t_embedder"]); - int64_t w = x->ne[0]; - int64_t h = x->ne[1]; + int64_t W = x->ne[0]; + int64_t H = x->ne[1]; auto patch_embed = x_embedder->forward(ctx, x); // [N, H*W, hidden_size] - auto pos_embed = 
cropped_pos_embed(ctx->ggml_ctx, h, w); // [1, H*W, hidden_size] + auto pos_embed = cropped_pos_embed(ctx->ggml_ctx, H, W); // [1, H*W, hidden_size] x = ggml_add(ctx->ggml_ctx, patch_embed, pos_embed); // [N, H*W, hidden_size] auto c = t_embedder->forward(ctx, t); // [N, hidden_size] @@ -834,7 +812,7 @@ struct MMDiT : public GGMLBlock { x = forward_core_with_concat(ctx, x, c, context, skip_layers); // (N, H*W, patch_size ** 2 * out_channels) - x = unpatchify(ctx->ggml_ctx, x, h, w); // [N, C, H, W] + x = DiT::unpatchify_and_crop(ctx->ggml_ctx, x, H, W, patch_size, patch_size, /*patch_last*/ false); // [N, C, H, W] return x; } diff --git a/otherarch/sdcpp/model.cpp b/otherarch/sdcpp/model.cpp index ef1752d1d5f..eee00618fc2 100644 --- a/otherarch/sdcpp/model.cpp +++ b/otherarch/sdcpp/model.cpp @@ -1083,6 +1083,9 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.transformer_blocks.0.img_mod.1.weight") != std::string::npos) { return VERSION_QWEN_IMAGE; } + if (tensor_storage.name.find("llm_adapter.blocks.0.cross_attn.q_proj.weight") != std::string::npos) { + return VERSION_ANIMA; + } if (tensor_storage.name.find("model.diffusion_model.double_stream_modulation_img.lin.weight") != std::string::npos) { is_flux2 = true; } diff --git a/otherarch/sdcpp/model.h b/otherarch/sdcpp/model.h index 66b347ab8d4..afa20e8c626 100644 --- a/otherarch/sdcpp/model.h +++ b/otherarch/sdcpp/model.h @@ -45,6 +45,7 @@ enum SDVersion { VERSION_WAN2_2_I2V, VERSION_WAN2_2_TI2V, VERSION_QWEN_IMAGE, + VERSION_ANIMA, VERSION_FLUX2, VERSION_FLUX2_KLEIN, VERSION_Z_IMAGE, @@ -122,6 +123,13 @@ static inline bool sd_version_is_qwen_image(SDVersion version) { return false; } +static inline bool sd_version_is_anima(SDVersion version) { + if (version == VERSION_ANIMA) { + return true; + } + return false; +} + static inline bool sd_version_is_z_image(SDVersion version) { if (version == VERSION_Z_IMAGE) { return true; @@ -146,6 +154,7 @@ static inline bool 
sd_version_is_dit(SDVersion version) { sd_version_is_sd3(version) || sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + sd_version_is_anima(version) || sd_version_is_z_image(version)) { return true; } diff --git a/otherarch/sdcpp/name_conversion.cpp b/otherarch/sdcpp/name_conversion.cpp index d3e863b8a86..3b3abfb63e7 100644 --- a/otherarch/sdcpp/name_conversion.cpp +++ b/otherarch/sdcpp/name_conversion.cpp @@ -653,6 +653,14 @@ std::string convert_diffusers_dit_to_original_lumina2(std::string name) { return name; } +std::string convert_other_dit_to_original_anima(std::string name) { + static const std::string anima_net_prefix = "net."; + if (!starts_with(name, anima_net_prefix)) { + name = anima_net_prefix + name; + } + return name; +} + std::string convert_diffusion_model_name(std::string name, std::string prefix, SDVersion version) { if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { name = convert_diffusers_unet_to_original_sd1(name); @@ -664,6 +672,8 @@ std::string convert_diffusion_model_name(std::string name, std::string prefix, S name = convert_diffusers_dit_to_original_flux(name); } else if (sd_version_is_z_image(version)) { name = convert_diffusers_dit_to_original_lumina2(name); + } else if (sd_version_is_anima(version)) { + name = convert_other_dit_to_original_anima(name); } return name; } diff --git a/otherarch/sdcpp/qwen_image.hpp b/otherarch/sdcpp/qwen_image.hpp index 3044eb45680..2c70344cc4c 100644 --- a/otherarch/sdcpp/qwen_image.hpp +++ b/otherarch/sdcpp/qwen_image.hpp @@ -3,9 +3,8 @@ #include -#include "common.hpp" +#include "common_block.hpp" #include "flux.hpp" -#include "ggml_extend.hpp" namespace Qwen { constexpr int QWEN_IMAGE_GRAPH_SIZE = 20480; @@ -390,69 +389,6 @@ namespace Qwen { blocks["proj_out"] = std::shared_ptr(new Linear(inner_dim, params.patch_size * params.patch_size * params.out_channels)); } - struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { - int64_t W = 
x->ne[0]; - int64_t H = x->ne[1]; - - int pad_h = (params.patch_size - H % params.patch_size) % params.patch_size; - int pad_w = (params.patch_size - W % params.patch_size) % params.patch_size; - x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled); - return x; - } - - struct ggml_tensor* patchify(struct ggml_context* ctx, - struct ggml_tensor* x) { - // x: [N, C, H, W] - // return: [N, h*w, C * patch_size * patch_size] - int64_t N = x->ne[3]; - int64_t C = x->ne[2]; - int64_t H = x->ne[1]; - int64_t W = x->ne[0]; - int64_t p = params.patch_size; - int64_t h = H / params.patch_size; - int64_t w = W / params.patch_size; - - GGML_ASSERT(h * p == H && w * p == W); - - x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, w, p, p] - x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, h*w, C, p*p] - x = ggml_reshape_3d(ctx, x, p * p * C, w * h, N); // [N, h*w, C*p*p] - return x; - } - - struct ggml_tensor* process_img(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { - x = pad_to_patch_size(ctx, x); - x = patchify(ctx->ggml_ctx, x); - return x; - } - - struct ggml_tensor* unpatchify(struct ggml_context* ctx, - struct ggml_tensor* x, - int64_t h, - int64_t w) { - // x: [N, h*w, C*patch_size*patch_size] - // return: [N, C, H, W] - int64_t N = x->ne[2]; - int64_t C = x->ne[0] / params.patch_size / params.patch_size; - int64_t H = h * params.patch_size; - int64_t W = w * params.patch_size; - int64_t p = params.patch_size; - - GGML_ASSERT(C * p * p == x->ne[0]); - - x = ggml_reshape_4d(ctx, x, p * p, C, w * h, N); // [N, h*w, C, p*p] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N, C, h*w, p*p] - x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p] - x = 
ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p] - - return x; - } - struct ggml_tensor* forward_orig(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* timestep, @@ -468,7 +404,7 @@ namespace Qwen { auto t_emb = time_text_embed->forward(ctx, timestep); if (params.zero_cond_t) { - auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros(ctx->ggml_ctx, timestep->ne[0], timestep->ne[1], timestep->ne[2], timestep->ne[3])); + auto t_emb_0 = time_text_embed->forward(ctx, ggml_ext_zeros_like(ctx->ggml_ctx, timestep)); t_emb = ggml_concat(ctx->ggml_ctx, t_emb, t_emb_0, 1); } auto img = img_in->forward(ctx, x); @@ -512,19 +448,16 @@ namespace Qwen { int64_t C = x->ne[2]; int64_t N = x->ne[3]; - auto img = process_img(ctx, x); + auto img = DiT::pad_and_patchify(ctx, x, params.patch_size, params.patch_size); int64_t img_tokens = img->ne[1]; if (ref_latents.size() > 0) { for (ggml_tensor* ref : ref_latents) { - ref = process_img(ctx, ref); + ref = DiT::pad_and_patchify(ctx, ref, params.patch_size, params.patch_size); img = ggml_concat(ctx->ggml_ctx, img, ref, 1); } } - int64_t h_len = ((H + (params.patch_size / 2)) / params.patch_size); - int64_t w_len = ((W + (params.patch_size / 2)) / params.patch_size); - auto out = forward_orig(ctx, img, timestep, context, pe, modulate_index); // [N, h_len*w_len, ph*pw*C] if (out->ne[1] > img_tokens) { @@ -533,11 +466,7 @@ namespace Qwen { out = ggml_cont(ctx->ggml_ctx, ggml_permute(ctx->ggml_ctx, out, 0, 2, 1, 3)); // [N, h*w, C * patch_size * patch_size] } - out = unpatchify(ctx->ggml_ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w] - - // slice - out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H); // [N, C, H, W + pad_w] - out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W); // [N, C, H, W] + out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, params.patch_size, params.patch_size); // [N, C, H, W] return out; } diff --git a/otherarch/sdcpp/rope.hpp b/otherarch/sdcpp/rope.hpp index 
45e88c831c8..b26e4fccd30 100644 --- a/otherarch/sdcpp/rope.hpp +++ b/otherarch/sdcpp/rope.hpp @@ -43,7 +43,7 @@ namespace Rope { __STATIC_INLINE__ std::vector> rope(const std::vector& pos, int dim, - int theta, + float theta, const std::vector& axis_wrap_dims = {}) { assert(dim % 2 == 0); int half_dim = dim / 2; @@ -167,7 +167,7 @@ namespace Rope { __STATIC_INLINE__ std::vector embed_nd(const std::vector>& ids, int bs, - int theta, + const std::vector& axis_thetas, const std::vector& axes_dim, const std::vector>& wrap_dims = {}) { std::vector> trans_ids = transpose(ids); @@ -188,8 +188,12 @@ namespace Rope { if (!wrap_dims.empty() && i < (int)wrap_dims.size()) { axis_wrap_dims = wrap_dims[i]; } + float axis_theta = 10000.0f; + if (!axis_thetas.empty()) { + axis_theta = axis_thetas[std::min(i, axis_thetas.size() - 1)]; + } std::vector> rope_emb = - rope(trans_ids[i], axes_dim[i], theta, axis_wrap_dims); // [bs*pos_len, axes_dim[i]/2 * 2 * 2] + rope(trans_ids[i], axes_dim[i], axis_theta, axis_wrap_dims); // [bs*pos_len, axes_dim[i]/2 * 2 * 2] for (int b = 0; b < bs; ++b) { for (int j = 0; j < pos_len; ++j) { for (int k = 0; k < rope_emb[0].size(); ++k) { @@ -203,6 +207,15 @@ namespace Rope { return flatten(emb); } + __STATIC_INLINE__ std::vector embed_nd(const std::vector>& ids, + int bs, + float theta, + const std::vector& axes_dim, + const std::vector>& wrap_dims = {}) { + std::vector axis_thetas(axes_dim.size(), theta); + return embed_nd(ids, bs, axis_thetas, axes_dim, wrap_dims); + } + __STATIC_INLINE__ std::vector> gen_refs_ids(int patch_size, int bs, int axes_dim_num, @@ -332,7 +345,7 @@ namespace Rope { } } } - return embed_nd(ids, bs, theta, axes_dim, wrap_dims); + return embed_nd(ids, bs, static_cast(theta), axes_dim, wrap_dims); } __STATIC_INLINE__ std::vector> gen_qwen_image_ids(int h, @@ -421,7 +434,7 @@ namespace Rope { } } } - return embed_nd(ids, bs, theta, axes_dim, wrap_dims); + return embed_nd(ids, bs, static_cast(theta), axes_dim, wrap_dims); } 
__STATIC_INLINE__ std::vector> gen_vid_ids(int t, @@ -475,7 +488,7 @@ namespace Rope { int theta, const std::vector& axes_dim) { std::vector> ids = gen_vid_ids(t, h, w, pt, ph, pw, bs); - return embed_nd(ids, bs, theta, axes_dim); + return embed_nd(ids, bs, static_cast(theta), axes_dim); } __STATIC_INLINE__ std::vector> gen_qwen2vl_ids(int grid_h, @@ -511,7 +524,7 @@ namespace Rope { int theta, const std::vector& axes_dim) { std::vector> ids = gen_qwen2vl_ids(grid_h, grid_w, merge_size, window_index); - return embed_nd(ids, 1, theta, axes_dim); + return embed_nd(ids, 1, static_cast(theta), axes_dim); } __STATIC_INLINE__ int bound_mod(int a, int m) { @@ -584,7 +597,7 @@ namespace Rope { } } - return embed_nd(ids, bs, theta, axes_dim, wrap_dims); + return embed_nd(ids, bs, static_cast(theta), axes_dim, wrap_dims); } __STATIC_INLINE__ struct ggml_tensor* apply_rope(struct ggml_context* ctx, diff --git a/otherarch/sdcpp/sdtype_adapter.cpp b/otherarch/sdcpp/sdtype_adapter.cpp index 9fdb2747ce9..9491084d912 100644 --- a/otherarch/sdcpp/sdtype_adapter.cpp +++ b/otherarch/sdcpp/sdtype_adapter.cpp @@ -80,8 +80,8 @@ struct SDParams { bool chroma_use_dit_mask = true; std::vector lora_paths; - std::vector lora_specs; - uint32_t lora_count; + std::vector lora_multipliers; + bool lora_dynamic = false; }; //shared @@ -208,14 +208,12 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { set_sd_quiet(sd_is_quiet); executable_path = inputs.executable_path; std::string taesdpath = ""; - std::vector lorafilenames; - for(int i=0;i lora_paths; + std::vector lora_multipliers; + for(int i=0;i= 0 && inputs.lora_apply_mode <= 2) { + lora_apply_mode = inputs.lora_apply_mode; + } + else if(inputs.lora_apply_mode == 3) { + lora_dynamic = true; + } - if(lorafilenames.size()>0) + if(lora_paths.size() > 0) { - for(int i=0;iclip_l_path = clip1_filename; sd_params->clip_g_path = clip2_filename; sd_params->stacked_id_embeddings_path = photomaker_filename; - sd_params->lora_paths = 
lorafilenames; + sd_params->lora_paths = lora_paths; + sd_params->lora_multipliers = lora_multipliers; + sd_params->lora_dynamic = lora_dynamic; //if t5 is set, and model is a gguf, load it as a diffusion model path bool endswithgguf = (sd_params->model_path.rfind(".gguf") == sd_params->model_path.size() - 5); if((sd_params->t5xxl_path!="" || sd_params->clip_l_path!="" || sd_params->clip_g_path!="") && endswithgguf) @@ -416,21 +425,22 @@ bool sdtype_load_model(const sd_load_model_inputs inputs) { std::filesystem::path mpath(inputs.model_filename); sdmodelfilename = mpath.filename().string(); - sd_params->lora_specs.clear(); - sd_params->lora_specs.reserve(lora_filenames_max*2); + // preload the LoRAs with the initial multipliers + std::vector lora_specs; for(int i=0;ilora_paths.size();++i) { + if (!lora_dynamic && sd_params->lora_multipliers[i] == 0.) + continue; sd_lora_t spec = {}; spec.path = sd_params->lora_paths[i].c_str(); - spec.multiplier = inputs.lora_multiplier; - sd_params->lora_specs.push_back(spec); + spec.multiplier = sd_params->lora_multipliers[i]; + lora_specs.push_back(spec); } - if(sd_params->lora_specs.size()>0 && inputs.lora_multiplier>0) + if(lora_specs.size()>0) { - printf("\nApply %d LoRAs...\n",sd_params->lora_specs.size()); - sd_params->lora_count = sd_params->lora_specs.size(); - sd_ctx->sd->apply_loras(sd_params->lora_specs.data(), sd_params->lora_count); + printf(" applying %d LoRAs...\n", lora_specs.size()); + sd_ctx->sd->apply_loras(lora_specs.data(), lora_specs.size()); } input_extraimage_buffers.reserve(max_extra_images); @@ -1034,10 +1044,34 @@ sd_generation_outputs sdtype_generate(const sd_generation_inputs inputs) params.vae_tiling_params.enabled = dotile; params.batch_count = 1; - // needs to be "reapplied" because sdcpp tracks previously applied LoRAs - // and weights, and apply/unapply the differences at each gen - params.loras = sd_params->lora_specs.data(); - params.lora_count = sd_params->lora_count; + std::vector 
lora_specs; + for(size_t i=0;ilora_paths.size();++i) + { + float multiplier = sd_params->lora_multipliers[i]; + if (sd_params->lora_dynamic) { + multiplier = i < inputs.lora_len ? inputs.lora_multipliers[i] : 0.; + } + if (multiplier != 0.f) { + sd_lora_t spec = {}; + spec.path = sd_params->lora_paths[i].c_str(); + spec.multiplier = multiplier; + lora_specs.push_back(spec); + } + } + if(!sd_is_quiet && sddebugmode==1) { + if (lora_specs.size() > 0) { + printf("Applying LoRAs:\n"); + for(size_t i=0;i> diffusion_lora_models; std::vector> first_stage_lora_models; bool apply_lora_immediately = false; + std::map> kcpp_lora_cache; std::string taesd_path; bool use_tiny_autoencoder = false; @@ -310,15 +312,30 @@ class StableDiffusionGGML { } } + if (tempver == VERSION_ANIMA && + strlen(SAFE_STR(sd_ctx_params->model_path)) > 0 && + strlen(SAFE_STR(sd_ctx_params->diffusion_model_path)) == 0 && + !model_loader.has_diffusion_model_tensors() + ) + { + LOG_INFO("Anima: SD Diffusion Model tensors missing! 
Fallback trying alternative tensor names...\n"); + if (!model_loader.init_from_file(sd_ctx_params->model_path, "model.diffusion_model.")) { + LOG_WARN("loading diffusion model from '%s' failed", sd_ctx_params->model_path); + } + tempver = model_loader.get_sd_version(); + } + bool iswan = (tempver==VERSION_WAN2 || tempver==VERSION_WAN2_2_I2V || tempver==VERSION_WAN2_2_TI2V); bool isqwenimg = (tempver==VERSION_QWEN_IMAGE); bool iszimg = (tempver==VERSION_Z_IMAGE); bool isflux2 = (tempver==VERSION_FLUX2); bool isflux2k = (tempver==VERSION_FLUX2_KLEIN); bool is_ovis = (tempver==VERSION_OVIS_IMAGE); + bool is_anima = (tempver==VERSION_ANIMA); + bool conditioner_is_llm = (isqwenimg||iszimg||isflux2||isflux2k||is_ovis||is_anima); //kcpp qol fallback: if qwen image, and they loaded the qwen2vl llm as t5 by mistake - if((isqwenimg||iszimg||isflux2||isflux2k||is_ovis) && t5_path_fixed!="") + if(conditioner_is_llm && t5_path_fixed!="") { if(clipl_path_fixed=="" && clipg_path_fixed=="") { @@ -350,7 +367,7 @@ class StableDiffusionGGML { prefix = "cond_stage_model.transformer."; LOG_INFO("swap clip_vision from '%s'", clipl_path_fixed.c_str()); } - if(isqwenimg||iszimg||isflux2||isflux2k||is_ovis) + if(conditioner_is_llm) { prefix = "text_encoders.llm."; LOG_INFO("swap llm from '%s'", clipl_path_fixed.c_str()); @@ -452,7 +469,7 @@ class StableDiffusionGGML { { to_replace = "taesd_f2.embd"; } - else if((sd_version_is_wan(version) && version != VERSION_WAN2_2_TI2V)||sd_version_is_qwen_image(version)) + else if((sd_version_is_wan(version) && version != VERSION_WAN2_2_TI2V)||sd_version_is_qwen_image(version)||sd_version_is_anima(version)) { to_replace = "taesd_w21.embd"; } @@ -545,6 +562,7 @@ class StableDiffusionGGML { shift_factor = 0.1159f; } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + sd_version_is_anima(version) || sd_version_is_flux2(version)) { scale_factor = 1.0f; shift_factor = 0.f; @@ -675,6 +693,14 @@ class StableDiffusionGGML { 
"model.diffusion_model", version, sd_ctx_params->qwen_image_zero_cond_t); + } else if (sd_version_is_anima(version)) { + cond_stage_model = std::make_shared(clip_backend, + offload_params_to_cpu, + tensor_storage_map); + diffusion_model = std::make_shared(backend, + offload_params_to_cpu, + tensor_storage_map, + "model.diffusion_model"); } else if (sd_version_is_z_image(version)) { cond_stage_model = std::make_shared(clip_backend, offload_params_to_cpu, @@ -737,7 +763,7 @@ class StableDiffusionGGML { } if (!(use_tiny_autoencoder || version == VERSION_SDXS) || tae_preview_only) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { + if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { first_stage_model = std::make_shared(vae_backend, offload_params_to_cpu, tensor_storage_map, @@ -775,7 +801,7 @@ class StableDiffusionGGML { } } if (use_tiny_autoencoder || version == VERSION_SDXS) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { + if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { tae_first_stage = std::make_shared(vae_backend, offload_params_to_cpu, tensor_storage_map, @@ -1051,6 +1077,7 @@ class StableDiffusionGGML { } else if (sd_version_is_sd3(version) || sd_version_is_wan(version) || sd_version_is_qwen_image(version) || + sd_version_is_anima(version) || sd_version_is_z_image(version)) { pred_type = FLOW_PRED; if (sd_version_is_wan(version)) { @@ -1167,7 +1194,23 @@ class StableDiffusionGGML { std::shared_ptr load_lora_model_from_file(const std::string& lora_id, float multiplier, ggml_backend_t backend, + std::string stage = "", LoraModel::filter_t lora_tensor_filter = nullptr) { + // kcpp + // first check the cache + bool kcpp_at_runtime = (stage != ""); + std::string lora_key = "|" + stage + "|" + lora_id; + if (kcpp_at_runtime) { + auto it = kcpp_lora_cache.find(lora_key); + if (it != kcpp_lora_cache.end()) { + if 
(it->second) { + it->second->multiplier = multiplier; + } + return it->second; + } + } + // by construction, kcpp will always find the preloaded LoRAs on the cache + std::string lora_path = lora_id; static std::string high_noise_tag = "|high_noise|"; bool is_high_noise = false; @@ -1179,10 +1222,16 @@ class StableDiffusionGGML { auto lora = std::make_shared(lora_id, backend, lora_path, is_high_noise ? "model.high_noise_" : "", version); if (!lora->load_from_file(n_threads, lora_tensor_filter)) { LOG_WARN("load lora tensors from %s failed", lora_path.c_str()); - return nullptr; + // also cache negatives to avoid I/O at runtime + lora = nullptr; + if (kcpp_at_runtime) + kcpp_lora_cache[lora_key] = lora; + return lora; } lora->multiplier = multiplier; + if (kcpp_at_runtime) + kcpp_lora_cache[lora_key] = lora; return lora; } @@ -1234,6 +1283,18 @@ class StableDiffusionGGML { cond_stage_lora_models.clear(); diffusion_lora_models.clear(); first_stage_lora_models.clear(); + if (cond_stage_model) { + cond_stage_model->set_weight_adapter(nullptr); + } + if (diffusion_model) { + diffusion_model->set_weight_adapter(nullptr); + } + if (high_noise_diffusion_model) { + high_noise_diffusion_model->set_weight_adapter(nullptr); + } + if (first_stage_model) { + first_stage_model->set_weight_adapter(nullptr); + } if (lora_state.empty()) { return; } @@ -1261,7 +1322,7 @@ class StableDiffusionGGML { const std::string& lora_id = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_id, multiplier, clip_backend, "cond_stage", lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); cond_stage_lora_models.push_back(lora); @@ -1293,7 +1354,7 @@ class StableDiffusionGGML { const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, backend, 
lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, backend, "diffusion", lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); diffusion_lora_models.push_back(lora); @@ -1329,7 +1390,7 @@ class StableDiffusionGGML { const std::string& lora_name = kv.first; float multiplier = kv.second; - auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, lora_tensor_filter); + auto lora = load_lora_model_from_file(lora_name, multiplier, vae_backend, "first_stage", lora_tensor_filter); if (lora && !lora->lora_tensors.empty()) { lora->preprocess_lora_tensors(tensors); first_stage_lora_models.push_back(lora); @@ -1650,7 +1711,7 @@ class StableDiffusionGGML { } else if (sd_version_is_flux(version) || sd_version_is_z_image(version)) { latent_rgb_proj = flux_latent_rgb_proj; latent_rgb_bias = flux_latent_rgb_bias; - } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) { + } else if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { latent_rgb_proj = wan_21_latent_rgb_proj; latent_rgb_bias = wan_21_latent_rgb_bias; } else { @@ -2131,6 +2192,9 @@ class StableDiffusionGGML { shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma); timesteps_vec.assign(1, (float)shifted_t); + } else if (sd_version_is_anima(version)) { + // Anima uses normalized flow timesteps. 
+ timesteps_vec.assign(1, t / static_cast(TIMESTEPS)); } else if (sd_version_is_z_image(version)) { timesteps_vec.assign(1, 1000.f - t); } else { @@ -2542,7 +2606,7 @@ class StableDiffusionGGML { } void process_latent_in(ggml_tensor* latent) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) { + if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) { int channel_dim = sd_version_is_flux2(version) ? 2 : 3; std::vector latents_mean_vec; std::vector latents_std_vec; @@ -2581,7 +2645,7 @@ class StableDiffusionGGML { } void process_latent_out(ggml_tensor* latent) { - if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_flux2(version)) { + if (sd_version_is_wan(version) || sd_version_is_qwen_image(version) || sd_version_is_anima(version) || sd_version_is_flux2(version)) { int channel_dim = sd_version_is_flux2(version) ? 2 : 3; std::vector latents_mean_vec; std::vector latents_std_vec; @@ -2659,7 +2723,7 @@ class StableDiffusionGGML { // TODO wan2.2 vae support? 
int64_t ne2; int64_t ne3; - if (sd_version_is_qwen_image(version)) { + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { ne2 = 1; ne3 = C * x->ne[3]; } else { @@ -2677,7 +2741,7 @@ class StableDiffusionGGML { result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3); } - if (sd_version_is_qwen_image(version)) { + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); } @@ -2750,6 +2814,7 @@ class StableDiffusionGGML { ggml_tensor* latent; if (use_tiny_autoencoder || sd_version_is_qwen_image(version) || + sd_version_is_anima(version) || sd_version_is_wan(version) || sd_version_is_flux2(version) || version == VERSION_CHROMA_RADIANCE) { @@ -2769,7 +2834,7 @@ class StableDiffusionGGML { if (!use_tiny_autoencoder) { process_latent_in(latent); } - if (sd_version_is_qwen_image(version)) { + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { latent = ggml_reshape_4d(work_ctx, latent, latent->ne[0], latent->ne[1], latent->ne[3], 1); } return latent; @@ -2807,7 +2872,7 @@ class StableDiffusionGGML { } int64_t t0 = ggml_time_ms(); if (!use_tiny_autoencoder) { - if (sd_version_is_qwen_image(version)) { + if (sd_version_is_qwen_image(version) || sd_version_is_anima(version)) { x = ggml_reshape_4d(work_ctx, x, x->ne[0], x->ne[1], 1, x->ne[2] * x->ne[3]); } process_latent_out(x); diff --git a/otherarch/sdcpp/unet.hpp b/otherarch/sdcpp/unet.hpp index 2dd79e0e197..e0fd4c52761 100644 --- a/otherarch/sdcpp/unet.hpp +++ b/otherarch/sdcpp/unet.hpp @@ -1,8 +1,7 @@ #ifndef __UNET_HPP__ #define __UNET_HPP__ -#include "common.hpp" -#include "ggml_extend.hpp" +#include "common_block.hpp" #include "model.h" /*==================================================== UnetModel =====================================================*/ diff --git a/otherarch/sdcpp/vae.hpp b/otherarch/sdcpp/vae.hpp index c627616c210..7ccba6eed33 100644 --- 
a/otherarch/sdcpp/vae.hpp +++ b/otherarch/sdcpp/vae.hpp @@ -1,8 +1,7 @@ #ifndef __VAE_HPP__ #define __VAE_HPP__ -#include "common.hpp" -#include "ggml_extend.hpp" +#include "common_block.hpp" /*================================================== AutoEncoderKL ===================================================*/ diff --git a/otherarch/sdcpp/wan.hpp b/otherarch/sdcpp/wan.hpp index 90de3bdd161..d94fbd482a5 100644 --- a/otherarch/sdcpp/wan.hpp +++ b/otherarch/sdcpp/wan.hpp @@ -5,9 +5,8 @@ #include #include -#include "common.hpp" +#include "common_block.hpp" #include "flux.hpp" -#include "ggml_extend.hpp" #include "rope.hpp" #include "vae.hpp" diff --git a/otherarch/sdcpp/z_image.hpp b/otherarch/sdcpp/z_image.hpp index cee23833aa7..8f405a590b7 100644 --- a/otherarch/sdcpp/z_image.hpp +++ b/otherarch/sdcpp/z_image.hpp @@ -346,69 +346,6 @@ namespace ZImage { blocks["final_layer"] = std::make_shared(z_image_params.hidden_size, z_image_params.patch_size, z_image_params.out_channels); } - struct ggml_tensor* pad_to_patch_size(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { - int64_t W = x->ne[0]; - int64_t H = x->ne[1]; - - int pad_h = (z_image_params.patch_size - H % z_image_params.patch_size) % z_image_params.patch_size; - int pad_w = (z_image_params.patch_size - W % z_image_params.patch_size) % z_image_params.patch_size; - x = ggml_ext_pad(ctx->ggml_ctx, x, pad_w, pad_h, 0, 0, ctx->circular_x_enabled, ctx->circular_y_enabled); - return x; - } - - struct ggml_tensor* patchify(struct ggml_context* ctx, - struct ggml_tensor* x) { - // x: [N, C, H, W] - // return: [N, h*w, patch_size*patch_size*C] - int64_t N = x->ne[3]; - int64_t C = x->ne[2]; - int64_t H = x->ne[1]; - int64_t W = x->ne[0]; - int64_t p = z_image_params.patch_size; - int64_t h = H / z_image_params.patch_size; - int64_t w = W / z_image_params.patch_size; - - GGML_ASSERT(h * p == H && w * p == W); - - x = ggml_reshape_4d(ctx, x, p, w, p, h * C * N); // [N*C*h, p, w, p] - x = ggml_cont(ctx, ggml_permute(ctx, 
x, 0, 2, 1, 3)); // [N*C*h, w, p, p] - x = ggml_reshape_4d(ctx, x, p * p, w * h, C, N); // [N, C, h*w, p*p] - x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 2, 0, 1, 3)); // [N, h*w, C, p*p] - x = ggml_reshape_3d(ctx, x, C * p * p, w * h, N); // [N, h*w, p*p*C] - return x; - } - - struct ggml_tensor* process_img(GGMLRunnerContext* ctx, - struct ggml_tensor* x) { - x = pad_to_patch_size(ctx, x); - x = patchify(ctx->ggml_ctx, x); - return x; - } - - struct ggml_tensor* unpatchify(struct ggml_context* ctx, - struct ggml_tensor* x, - int64_t h, - int64_t w) { - // x: [N, h*w, patch_size*patch_size*C] - // return: [N, C, H, W] - int64_t N = x->ne[2]; - int64_t C = x->ne[0] / z_image_params.patch_size / z_image_params.patch_size; - int64_t H = h * z_image_params.patch_size; - int64_t W = w * z_image_params.patch_size; - int64_t p = z_image_params.patch_size; - - GGML_ASSERT(C * p * p == x->ne[0]); - - x = ggml_reshape_4d(ctx, x, C, p * p, w * h, N); // [N, h*w, p*p, C] - x = ggml_cont(ctx, ggml_ext_torch_permute(ctx, x, 1, 2, 0, 3)); // [N, C, h*w, p*p] - x = ggml_reshape_4d(ctx, x, p, p, w, h * C * N); // [N*C*h, w, p, p] - x = ggml_cont(ctx, ggml_permute(ctx, x, 0, 2, 1, 3)); // [N*C*h, p, w, p] - x = ggml_reshape_4d(ctx, x, W, H, C, N); // [N, C, h*p, w*p] - - return x; - } - struct ggml_tensor* forward_core(GGMLRunnerContext* ctx, struct ggml_tensor* x, struct ggml_tensor* timestep, @@ -495,27 +432,22 @@ namespace ZImage { int64_t C = x->ne[2]; int64_t N = x->ne[3]; - auto img = process_img(ctx, x); + int patch_size = z_image_params.patch_size; + + auto img = DiT::pad_and_patchify(ctx, x, patch_size, patch_size, false); uint64_t n_img_token = img->ne[1]; if (ref_latents.size() > 0) { for (ggml_tensor* ref : ref_latents) { - ref = process_img(ctx, ref); + ref = DiT::pad_and_patchify(ctx, ref, patch_size, patch_size, false); img = ggml_concat(ctx->ggml_ctx, img, ref, 1); } } - int64_t h_len = ((H + (z_image_params.patch_size / 2)) / z_image_params.patch_size); - 
int64_t w_len = ((W + (z_image_params.patch_size / 2)) / z_image_params.patch_size); - auto out = forward_core(ctx, img, timestep, context, pe); - out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token); // [N, n_img_token, ph*pw*C] - out = unpatchify(ctx->ggml_ctx, out, h_len, w_len); // [N, C, H + pad_h, W + pad_w] - - // slice - out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, H); // [N, C, H, W + pad_w] - out = ggml_ext_slice(ctx->ggml_ctx, out, 0, 0, W); // [N, C, H, W] + out = ggml_ext_slice(ctx->ggml_ctx, out, 1, 0, n_img_token); // [N, n_img_token, ph*pw*C] + out = DiT::unpatchify_and_crop(ctx->ggml_ctx, out, H, W, patch_size, patch_size, false); // [N, C, H, W] out = ggml_ext_scale(ctx->ggml_ctx, out, -1.f); diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp index 372d1c26eb6..1834a631d69 100644 --- a/otherarch/utils.cpp +++ b/otherarch/utils.cpp @@ -366,23 +366,6 @@ std::vector> split_big_vector(const std::vector& big_arr, return small_arrs; } -std::vector> split_big_vector_in_two(const std::vector& big_arr, size_t chunk_size) -{ - std::vector> result; - if (chunk_size == 0 || big_arr.empty()) - return result; - - if (big_arr.size() <= chunk_size) { - // Only one chunk (all elements) - result.emplace_back(big_arr); - return result; - } - size_t split_point = big_arr.size() - chunk_size; - result.emplace_back(big_arr.begin(), big_arr.begin() + split_point); // First big chunk - result.emplace_back(big_arr.begin() + split_point, big_arr.end()); // Last chunk (size <= chunk_size) - return result; -} - std::vector resample_wav(const std::vector & input, uint32_t input_rate, uint32_t output_rate) { if (input.empty() || input_rate == 0 || output_rate == 0) return {}; diff --git a/otherarch/utils.h b/otherarch/utils.h index 4482cc90387..b5137c08e20 100644 --- a/otherarch/utils.h +++ b/otherarch/utils.h @@ -62,8 +62,6 @@ std::string kcpp_base64_encode(const std::string &data); std::string get_timestamp_str(); std::vector> split_big_vector(const std::vector& 
big_arr, size_t chunk_size); -std::vector> split_big_vector_in_two(const std::vector& big_arr, size_t chunk_size); - std::vector resample_wav(const std::vector& input, uint32_t input_rate, uint32_t output_rate); std::vector mix_planar_stereo_to_mono(const float* audio, int T_audio);