Merge remote-tracking branch 'upstream/master' into bleedingedge

stvoler · stvoler · commit fc1d804b8c65 · 2025-07-24T14:01:22.000+04:00
diff --git a/clip.hpp b/clip.hpp
@@ -732,7 +732,7 @@ class CLIPTextModel : public GGMLBlock {
             auto text_projection = params["text_projection"];
             ggml_tensor* pooled  = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);
             if (text_projection != NULL) {
-                pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);
+                pooled           = ggml_nn_linear(ctx, pooled, text_projection, NULL);
             } else {
                 LOG_DEBUG("Missing text_projection matrix, assuming identity...");
             }
diff --git a/conditioner.hpp b/conditioner.hpp
@@ -896,7 +896,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                 }
 
                 if (chunk_idx == 0) {
-                    auto it       = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+                    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
                     max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
                     clip_l->compute(n_threads,
                                     input_ids,
@@ -907,13 +907,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     &pooled_l,
                                     work_ctx);
                 }
-            } else {
-                chunk_hidden_states_l = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 768, chunk_len);
-                ggml_set_f32(chunk_hidden_states_l, 0.f);
-                if (chunk_idx == 0) {
-                    pooled_l = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 768);
-                    ggml_set_f32(pooled_l, 0.f);
-                }
             }
 
             // clip_g
@@ -952,7 +945,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                 }
 
                 if (chunk_idx == 0) {
-                    auto it       = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
+                    auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_g_tokenizer.EOS_TOKEN_ID);
                     max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
                     clip_g->compute(n_threads,
                                     input_ids,
@@ -963,13 +956,6 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     &pooled_g,
                                     work_ctx);
                 }
-            } else {
-                chunk_hidden_states_g = ggml_new_tensor_2d(work_ctx, GGML_TYPE_F32, 1280, chunk_len);
-                ggml_set_f32(chunk_hidden_states_g, 0.f);
-                if (chunk_idx == 0) {
-                    pooled_g = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1280);
-                    ggml_set_f32(pooled_g, 0.f);
-                }
             }
 
             // t5
@@ -1264,18 +1250,18 @@ struct FluxCLIPEmbedder : public Conditioner {
                     auto input_ids       = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
                     size_t max_token_idx = 0;
 
-                    auto it       = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
-                    max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), clip_l_tokenizer.EOS_TOKEN_ID);
+                max_token_idx = std::min<size_t>(std::distance(chunk_tokens.begin(), it), chunk_tokens.size() - 1);
+                
+                clip_l->compute(n_threads,
+                                input_ids,
+                                0,
+                                NULL,
+                                max_token_idx,
+                                true,
+                                &pooled,
+                                work_ctx);
 
-                    clip_l->compute(n_threads,
-                                    input_ids,
-                                    0,
-                                    NULL,
-                                    max_token_idx,
-                                    true,
-                                    &pooled,
-                                    work_ctx);
-                }
             }
 
             // t5

Original file line number	Diff line number	Diff line change
`@@ -732,7 +732,7 @@ class CLIPTextModel : public GGMLBlock {`
`732`	`732`	`auto text_projection = params["text_projection"];`
`733`	`733`	`ggml_tensor* pooled = ggml_view_1d(ctx, x, hidden_size, x->nb[1] * max_token_idx);`
`734`	`734`	`if (text_projection != NULL) {`
`735`		`- pooled = ggml_nn_linear(ctx, pooled, text_projection, NULL);`
	`735`	`+ pooled           = ggml_nn_linear(ctx, pooled, text_projection, NULL);`
`736`	`736`	`} else {`
`737`	`737`	`LOG_DEBUG("Missing text_projection matrix, assuming identity...");`
`738`	`738`	`}`