
Commit 1ec9145

cont : alternative impl
1 parent 18e1fd2 commit 1ec9145

6 files changed: 79 additions & 101 deletions


src/llama-memory-hybrid-iswa.cpp

Lines changed: 3 additions & 4 deletions
@@ -3,7 +3,6 @@
 #include "llama-impl.h"
 #include "llama-model.h"
 #include "llama-context.h"
-#include <limits>
 
 //
 // llama_memory_hybrid_iswa
@@ -137,10 +136,10 @@ void llama_memory_hybrid_iswa::clear(bool data) {
 }
 
 bool llama_memory_hybrid_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    // try removing from the recurrent cache first since it may fail;
+    // if it does, the cache will not have been mutated
     if (!mem_recr->seq_rm(seq_id, p0, p1)) {
-        mem_recr->seq_rm(seq_id, 0, std::numeric_limits<llama_pos>::max());
-        mem_attn->seq_rm(seq_id, p0, p1);
-        return false; // this should always fail, since we cannot truncate recurrent
+        return false;
     }
     return mem_attn->seq_rm(seq_id, p0, p1);
 }
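
Note: the alternative implementation relies on an ordering contract: a failed partial removal on the recurrent cache must leave that cache untouched, so the hybrid cache can simply propagate the failure to the caller. A minimal sketch of that control flow, using stand-in types rather than the real llama.cpp classes:

    // ordering contract behind the new hybrid seq_rm (stand-in types, not llama.cpp)
    struct cache_stub {
        bool can_remove = true;  // whether a partial removal is possible

        bool seq_rm(int p0, int p1) {
            if (!can_remove) {
                return false;    // must NOT mutate any state on failure
            }
            (void) p0; (void) p1; // removal of [p0, p1) elided in this sketch
            return true;
        }
    };

    // try the fallible cache first; touch the second cache only on success
    bool hybrid_seq_rm(cache_stub & recr, cache_stub & attn, int p0, int p1) {
        if (!recr.seq_rm(p0, p1)) {
            return false;        // neither cache has been mutated
        }
        return attn.seq_rm(p0, p1);
    }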

src/llama-memory-recurrent.cpp

Lines changed: 2 additions & 13 deletions
@@ -163,7 +163,7 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
             const auto & cell = cells[tail_id];
             // partial intersection is invalid if it includes the final pos
             if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
-                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
+                //printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false, p0 = %d, cell.pos = %d, p1 = %d\n", p0, cell.pos, p1);
                 return false;
             }
             // invalidate tails which will be cleared
@@ -599,21 +599,10 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
     // update the pos of the used seqs
     for (uint32_t s = 0; s < n_seqs; ++s) {
         const uint32_t i = s*n_seq_tokens;
+        const llama_pos last_pos = ubatch.pos[i + n_seq_tokens - 1];
         const int32_t cell_id = s + min;
         auto & cell = cells[cell_id];
 
-        // The temporal plane may have the same value for all image tokens, so we need the max across ALL planes to get the true sequence position.
-        llama_pos last_pos = ubatch.pos[i + n_seq_tokens - 1];
-
-        // For M-RoPE image/audio embeddings, positions are stored in multiple planes. The temporal plane may have the same value for all tokens, so scan all planes for the true max.
-        if (ubatch.n_pos > 1 && ubatch.embd != nullptr) {
-            for (uint32_t p = 0; p < ubatch.n_pos; ++p) {
-                for (uint32_t t = 0; t < n_seq_tokens; ++t) {
-                    last_pos = std::max(last_pos, ubatch.pos[p * ubatch.n_tokens + i + t]);
-                }
-            }
-        }
-
         if (cell.pos >= 0 && last_pos != cell.pos + (llama_pos) n_seq_tokens) {
             // What should happen when the pos backtracks or skips a value?
             // Clearing the state mid-batch would require special-casing which isn't done.
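
Note: with the multi-plane scan removed, `find_slot` reads `last_pos` straight from the final token of the primary position plane and keeps the existing contiguity check. A self-contained illustration of that check, with made-up values rather than anything from the commit:

    // contiguity check performed per sequence in find_slot (illustrative values)
    #include <cstdio>

    int main() {
        const int cell_pos     = 7;   // last position already stored in the cell
        const int n_seq_tokens = 4;   // tokens this ubatch adds to the sequence
        const int last_pos     = 11;  // position of the ubatch's final token

        // contiguous iff the batch extends the cell by exactly n_seq_tokens;
        // otherwise the pos backtracked or skipped a value
        if (cell_pos >= 0 && last_pos != cell_pos + n_seq_tokens) {
            printf("non-contiguous: pos backtracked or skipped\n");
        } else {
            printf("contiguous: %d + %d == %d\n", cell_pos, n_seq_tokens, last_pos);
        }
        return 0;
    }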

tools/server/server-common.cpp

Lines changed: 43 additions & 17 deletions
@@ -231,19 +231,47 @@ server_tokens::server_tokens(mtmd::input_chunks & mtmd_chunks, bool has_mtmd) :
 server_tokens::server_tokens(const llama_tokens & tokens, bool has_mtmd) : has_mtmd(has_mtmd), tokens(tokens) {
 }
 
-llama_pos server_tokens::pos_next() const {
+llama_pos server_tokens::pos_next(int64_t n_tokens) const {
     if (!has_mtmd) {
-        return tokens.size();
+        if (n_tokens < 0) {
+            return tokens.size();
+        }
+
+        return n_tokens;
     }
 
-    llama_pos res = tokens.size();
+    if (n_tokens < 0) {
+        llama_pos res = tokens.size();
 
-    for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) {
-        const auto & chunk = it->second;
-        res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get());
+        for (auto it = map_idx_to_media.begin(); it != map_idx_to_media.end(); ++it) {
+            const auto & chunk = it->second;
+            res += mtmd_input_chunk_get_n_pos(chunk.get()) - mtmd_input_chunk_get_n_tokens(chunk.get());
+        }
+
+        return res;
     }
 
-    return res;
+    int64_t   idx = 0;
+    llama_pos pos = 0;
+
+    GGML_ASSERT(n_tokens <= (int64_t) tokens.size());
+
+    while (idx < n_tokens) {
+        auto media_it = map_idx_to_media.find(idx);
+        if (media_it != map_idx_to_media.end()) {
+            const auto & chunk = media_it->second;
+            const llama_pos n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
+            const size_t    n_tok = mtmd_input_chunk_get_n_tokens(chunk.get());
+
+            pos += n_pos;
+            idx += n_tok;
+        } else {
+            pos++;
+            idx++;
+        }
+    }
+
+    return pos;
 }
 
 size_t server_tokens::tokens_up_to_pos(llama_pos max_pos) const {
@@ -252,27 +280,25 @@ size_t server_tokens::tokens_up_to_pos(llama_pos max_pos) const {
     }
 
     size_t idx = 0;
-    llama_pos current_pos = 0;
+    llama_pos pos = 0;
 
     while (idx < tokens.size()) {
        auto media_it = map_idx_to_media.find(idx);
        if (media_it != map_idx_to_media.end()) {
            const auto & chunk = media_it->second;
            const llama_pos n_pos = mtmd_input_chunk_get_n_pos(chunk.get());
-           const size_t n_tok = mtmd_input_chunk_get_n_tokens(chunk.get());
+           const size_t    n_tok = mtmd_input_chunk_get_n_tokens(chunk.get());
 
-           if (current_pos + n_pos > max_pos + 1) {
-               break;
-           }
-           current_pos += n_pos;
+           pos += n_pos;
            idx += n_tok;
        } else {
-           if (current_pos > max_pos) {
-               break;
-           }
-           current_pos++;
+           pos++;
            idx++;
        }
+
+       if (pos > max_pos) {
+           break;
+       }
    }
 
    return idx;
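
Note: the token-count overload exists because, with mtmd media in the prompt, token index and sequence position diverge: a media chunk can occupy a different number of positions than tokens (with M-RoPE, an image's tokens can share temporal positions). A runnable sketch of the same walk over a hypothetical chunk layout:

    // token index vs. position with a media chunk (hypothetical sizes)
    #include <cstdio>
    #include <map>
    #include <utility>

    int main() {
        // token index where a media chunk starts -> { n_tokens, n_pos },
        // e.g. an image occupying 64 tokens but only 8 positions
        std::map<long, std::pair<long, long>> media = { { 3, { 64, 8 } } };

        const long n_tokens = 70;  // 3 text + 64 image + 3 text tokens
        long idx = 0;
        long pos = 0;

        while (idx < n_tokens) {
            auto it = media.find(idx);
            if (it != media.end()) {
                idx += it->second.first;   // advance by the chunk's token count
                pos += it->second.second;  // ... but only by its position count
            } else {
                idx++;                     // a text token takes one position
                pos++;
            }
        }

        printf("pos_next(%ld) = %ld\n", n_tokens, pos);  // 3 + 8 + 3 = 14
        return 0;
    }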

tools/server/server-common.h

Lines changed: 6 additions & 3 deletions
@@ -167,11 +167,14 @@ struct server_tokens {
     // for debugging
     std::string str() const;
 
-    llama_pos pos_next() const;
-    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
+    // the next position after n_tokens; if n_tokens < 0, return the next position after all tokens
+    llama_pos pos_next(int64_t n_tokens = -1) const;
 
+    // number of tokens with position <= max_pos
     size_t tokens_up_to_pos(llama_pos max_pos) const;
-
+
+    const mtmd::input_chunk_ptr & find_chunk(size_t idx) const;
+
     void push_back(llama_token tok);
 
     // will create a copy of the chunk if it contains non-text data

tools/server/server-context.cpp

Lines changed: 25 additions & 63 deletions
@@ -77,6 +77,7 @@ struct server_slot {
     size_t last_nl_pos = 0;
 
     std::string generated_text;
+    std::string debug_generated_text;
     llama_tokens generated_tokens;
 
     // idx of draft tokens in the main batch
@@ -425,7 +426,7 @@
 
        if (!only_metrics) {
            res["prompt"]    = ptask->tokens.detokenize(ctx, true);
-           res["generated"] = generated_text;
+           res["generated"] = generated_text.empty() ? debug_generated_text : generated_text;
        }
    }
 
@@ -1441,7 +1442,13 @@
        res->id      = slot.task->id;
        res->id_slot = slot.id;
 
-       res->index = slot.task->index;
+       res->index = slot.task->index;
+
+       // keep copy of last generated text for debugging purposes
+       if (slots_debug) {
+           slot.debug_generated_text = slot.generated_text;
+       }
+
        // in stream mode, content and tokens are already in last partial chunk
        if (slot.task->params.stream) {
            res->content = "";
@@ -2275,14 +2282,14 @@
                    n_past = 0;
                }
 
+               llama_pos pos_next = slot.prompt.tokens.pos_next(n_past);
+
                // note: when n_swa == 0, the model does not use SWA, which is equivalent to a window of 1
                const auto n_swa = std::max(1, llama_model_n_swa(model));
 
                // the largest pos_min required for a checkpoint to be useful
-               const auto pos_min_thold = std::max(0, n_past - n_swa);
+               const auto pos_min_thold = std::max(0, pos_next - n_swa);
 
-               // note: disallow with mtmd contexts for now
-               // https://github.com/ggml-org/llama.cpp/issues/17043
                if (n_past > 0 && n_past < slot.prompt.n_tokens()) {
                    const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx), slot.id);
                    if (pos_min == -1) {
@@ -2334,9 +2341,6 @@
                    }
 
                    if (pos_min > pos_min_thold) {
-                       // Removed assert. This is a partial fix
-
-
                        SLT_WRN(slot, "n_past = %d, slot.prompt.tokens.size() = %d, seq_id = %d, pos_min = %d, n_swa = %d\n", n_past, (int) slot.prompt.tokens.size(), slot.id, pos_min, n_swa);
 
                        // search for a context checkpoint
@@ -2361,14 +2365,16 @@
                                do_reset = true;
                                //printf("[DEBUG] `do_reset` was set to `true` after failing to restore a checkpoint");
                            } else {
-                               n_past = std::min(n_past, std::max(it->pos_min + 1, it->pos_max));
+                               pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
+                               n_past   = slot.prompt.tokens.tokens_up_to_pos(pos_next);
                                SLT_WRN(slot, "restored context checkpoint (pos_min = %d, pos_max = %d, size = %.3f MiB)\n", it->pos_min, it->pos_max, (float) checkpoint_size / 1024 / 1024);
                            }
                        }
 
                        if (do_reset) {
                            SLT_WRN(slot, "forcing full prompt re-processing due to lack of cache data (likely due to SWA or hybrid/recurrent memory, see %s)\n",
                                    "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+                           pos_next = 0;
                            n_past = 0;
                        }
                    }
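
Note: the restore path now clamps in position space and maps the result back to a token count with `tokens_up_to_pos`, so `n_past` always lands on a chunk boundary (chunks are never split). A free-standing sketch of that round trip, using a hand-rolled `tokens_up_to_pos` stub for the hypothetical prompt from the earlier example (3 text tokens, a 64-token/8-position media chunk, then more text):

    // clamp in position space, then convert back to a token count (sketch)
    #include <algorithm>
    #include <cstdio>

    // stub consistent with the new loop: text before the chunk at positions
    // 0..2, the chunk at positions 3..10, text from position 11 onward
    static long tokens_up_to_pos(long max_pos) {
        if (max_pos < 3)  return max_pos + 1;  // inside the leading text
        if (max_pos < 11) return 3 + 64;       // whole chunk, never split
        return 67 + (max_pos - 10);            // text after the chunk
    }

    int main() {
        long pos_next = 14;                    // e.g. pos_next(n_past) from above
        const long ckpt_pos_min = 5;           // hypothetical checkpoint bounds
        const long ckpt_pos_max = 9;

        // clamp the resume position to what the checkpoint covers ...
        pos_next = std::min(pos_next, std::max(ckpt_pos_min + 1, ckpt_pos_max));
        // ... then convert back to a token count on a chunk boundary
        const long n_past = tokens_up_to_pos(pos_next);

        printf("pos_next = %ld -> n_past = %ld\n", pos_next, n_past);  // 9 -> 67
        return 0;
    }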
@@ -2395,17 +2401,10 @@
                    SLT_WRN(slot, "n_past was set to %d\n", n_past);
                }
 
-
+               slot.n_prompt_tokens_cache = n_past;
                slot.n_prompt_tokens_processed = 0;
 
-               if (slot.prompt.tokens.has_mtmd) {
-                   const int n_tokens_keep = (int)slot.prompt.tokens.tokens_up_to_pos(n_past);
-                   slot.n_prompt_tokens_cache = n_tokens_keep;
-                   slot.prompt.tokens.keep_first(n_tokens_keep);
-               } else {
-                   slot.n_prompt_tokens_cache = n_past;
-                   slot.prompt.tokens.keep_first(n_past);
-               }
+               slot.prompt.tokens.keep_first(n_past);
 
                // send initial 0% progress update if needed
                // this is to signal the client that the request has started processing
@@ -2427,53 +2426,14 @@
                SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0);
 
                if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) {
-                   // hybrid model: recurrent partial removal failed.
-                   // find a checkpoint to restore recurrent state from,
-                   // then truncate attention KV to checkpoint position (preserving image KV).
-                   bool recovered = false;
-
-                   if (!slot.prompt.checkpoints.empty()) {
-                       for (auto it = slot.prompt.checkpoints.rbegin(); it != slot.prompt.checkpoints.rend(); ++it) {
-                           if (std::max(it->pos_min, it->pos_max) >= p0) {
-                               continue; // checkpoint is past truncation point
-                           }
-
-                           // truncate attention KV to checkpoint position (and clear recurrent).
-                           // this call will "fail" (return false) because recurrent can't do
-                           // partial removal, but the hybrid seq_rm internally handles it:
-                           // - clears recurrent fully
-                           // - truncates attention from checkpoint pos_max onward
-                           const llama_pos checkpoint_pos = std::max(it->pos_min, it->pos_max);
-                           llama_memory_seq_rm(llama_get_memory(ctx), slot.id, checkpoint_pos, -1);
+                   SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0);
 
-                           const size_t checkpoint_size = it->data.size();
-                           const size_t n = llama_state_seq_set_data_ext(ctx, it->data.data(), checkpoint_size, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+                   slot.prompt_clear(true);
 
-                           if (n == checkpoint_size) {
-                               const int n_past_new = (int)slot.prompt.tokens.tokens_up_to_pos(checkpoint_pos);
-
-                               SLT_WRN(slot, "recovered recurrent state from checkpoint (pos_min = %d, pos_max = %d, n_tokens = %d), n_past: %d -> %d\n",
-                                       it->pos_min, it->pos_max, it->n_tokens_cached, slot.prompt.n_tokens(), n_past_new);
-
-                               slot.prompt.tokens.keep_first(n_past_new);
-                               slot.n_prompt_tokens_cache = n_past_new;
-                               recovered = true;
-                               break;
-                           }
-                       }
-                   }
-
-                   if (!recovered) {
-                       SLT_WRN(slot, "failed to recover recurrent state - clearing the memory%s\n", "");
-
-                       llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1);
-
-                       auto saved_checkpoints = std::move(slot.prompt.checkpoints);
-                       slot.prompt_clear(true);
-                       slot.n_prompt_tokens_cache = 0;
-                       slot.prompt.checkpoints = std::move(saved_checkpoints);
-                   }
+                   // there is no common part left
+                   slot.n_prompt_tokens_cache = 0;
                }
+
                // check if we should process the image
                if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) {
                    // process the image
@@ -2604,7 +2564,6 @@
                auto & cur = slot.prompt.checkpoints.emplace_back(server_prompt_checkpoint{
                    /*.pos_min = */ pos_min,
                    /*.pos_max = */ pos_max,
-                   /*.n_tokens_cached = */ slot.prompt.n_tokens(),
                    /*.data = */ std::vector<uint8_t>(checkpoint_size),
                });
 
@@ -2951,6 +2910,9 @@ server_context_meta server_context::get_meta() const {
        /* fim_pre_token */ llama_vocab_fim_pre(impl->vocab),
        /* fim_sub_token */ llama_vocab_fim_suf(impl->vocab),
        /* fim_mid_token */ llama_vocab_fim_mid(impl->vocab),
+       /* fim_pad_token */ llama_vocab_fim_pad(impl->vocab),
+       /* fim_rep_token */ llama_vocab_fim_rep(impl->vocab),
+       /* fim_sep_token */ llama_vocab_fim_sep(impl->vocab),
 
        /* model_vocab_type */ llama_vocab_type(impl->vocab),
        /* model_vocab_n_tokens */ llama_vocab_n_tokens(impl->vocab),

tools/server/server-task.h

Lines changed: 0 additions & 1 deletion
@@ -556,7 +556,6 @@ struct server_task_result_apply_lora : server_task_result {
 struct server_prompt_checkpoint {
     llama_pos pos_min;
     llama_pos pos_max;
-    int n_tokens_cached;
 
     std::vector<uint8_t> data;
 
