@@ -61,30 +61,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
6161 const String2GGMLType& tensor_types,
6262 const std::string& embd_dir,
6363 SDVersion version = VERSION_SD1,
64- PMVersion pv = PM_VERSION_1,
65- int clip_skip = -1 )
64+ PMVersion pv = PM_VERSION_1)
6665 : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407 ), embd_dir(embd_dir) {
66+ bool force_clip_f32 = embd_dir.size () > 0 ;
6767 if (sd_version_is_sd1 (version)) {
68- text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14);
68+ text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, true , force_clip_f32 );
6969 } else if (sd_version_is_sd2 (version)) {
70- text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " cond_stage_model.transformer.text_model" , OPEN_CLIP_VIT_H_14);
70+ text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " cond_stage_model.transformer.text_model" , OPEN_CLIP_VIT_H_14, true , force_clip_f32 );
7171 } else if (sd_version_is_sdxl (version)) {
72- text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, false );
73- text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " cond_stage_model.1.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, false );
74- }
75- set_clip_skip (clip_skip);
76- }
77-
78- void set_clip_skip (int clip_skip) {
79- if (clip_skip <= 0 ) {
80- clip_skip = 1 ;
81- if (sd_version_is_sd2 (version) || sd_version_is_sdxl (version)) {
82- clip_skip = 2 ;
83- }
84- }
85- text_model->set_clip_skip (clip_skip);
86- if (sd_version_is_sdxl (version)) {
87- text_model2->set_clip_skip (clip_skip);
72+ text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " cond_stage_model.transformer.text_model" , OPENAI_CLIP_VIT_L_14, false , force_clip_f32);
73+ text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " cond_stage_model.1.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, false , force_clip_f32);
8874 }
8975 }
9076
@@ -412,7 +398,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
412398 int height,
413399 int adm_in_channels = -1 ,
414400 bool zero_out_masked = false ) {
415- set_clip_skip (clip_skip);
416401 int64_t t0 = ggml_time_ms ();
417402 struct ggml_tensor * hidden_states = NULL ; // [N, n_token, hidden_size]
418403 struct ggml_tensor * chunk_hidden_states = NULL ; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@@ -421,6 +406,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
421406 struct ggml_tensor * pooled = NULL ;
422407 std::vector<float > hidden_states_vec;
423408
409+ if (clip_skip <= 0 ) {
410+ clip_skip = (sd_version_is_sd2 (version) || sd_version_is_sdxl (version)) ? 2 : 1 ;
411+ }
412+
424413 size_t chunk_len = 77 ;
425414 size_t chunk_count = tokens.size () / chunk_len;
426415 for (int chunk_idx = 0 ; chunk_idx < chunk_count; chunk_idx++) {
@@ -455,6 +444,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
455444 token_embed_custom.data (),
456445 max_token_idx,
457446 false ,
447+ clip_skip,
458448 &chunk_hidden_states1,
459449 work_ctx);
460450 if (sd_version_is_sdxl (version)) {
@@ -464,6 +454,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
464454 token_embed_custom.data (),
465455 max_token_idx,
466456 false ,
457+ clip_skip,
467458 &chunk_hidden_states2, work_ctx);
468459 // concat
469460 chunk_hidden_states = ggml_tensor_concat (work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0 );
@@ -475,6 +466,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
475466 token_embed_custom.data (),
476467 max_token_idx,
477468 true ,
469+ clip_skip,
478470 &pooled,
479471 work_ctx);
480472 }
@@ -669,21 +661,11 @@ struct SD3CLIPEmbedder : public Conditioner {
669661
670662 SD3CLIPEmbedder (ggml_backend_t backend,
671663 bool offload_params_to_cpu,
672- const String2GGMLType& tensor_types = {},
673- int clip_skip = -1 )
664+ const String2GGMLType& tensor_types = {})
674665 : clip_g_tokenizer(0 ) {
675666 clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, false );
676667 clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " text_encoders.clip_g.transformer.text_model" , OPEN_CLIP_VIT_BIGG_14, false );
677668 t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, " text_encoders.t5xxl.transformer" );
678- set_clip_skip (clip_skip);
679- }
680-
681- void set_clip_skip (int clip_skip) {
682- if (clip_skip <= 0 ) {
683- clip_skip = 2 ;
684- }
685- clip_l->set_clip_skip (clip_skip);
686- clip_g->set_clip_skip (clip_skip);
687669 }
688670
689671 void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) {
@@ -780,14 +762,17 @@ struct SD3CLIPEmbedder : public Conditioner {
780762 std::vector<std::pair<std::vector<int >, std::vector<float >>> token_and_weights,
781763 int clip_skip,
782764 bool zero_out_masked = false ) {
783- set_clip_skip (clip_skip);
784765 auto & clip_l_tokens = token_and_weights[0 ].first ;
785766 auto & clip_l_weights = token_and_weights[0 ].second ;
786767 auto & clip_g_tokens = token_and_weights[1 ].first ;
787768 auto & clip_g_weights = token_and_weights[1 ].second ;
788769 auto & t5_tokens = token_and_weights[2 ].first ;
789770 auto & t5_weights = token_and_weights[2 ].second ;
790771
772+ if (clip_skip <= 0 ) {
773+ clip_skip = 2 ;
774+ }
775+
791776 int64_t t0 = ggml_time_ms ();
792777 struct ggml_tensor * hidden_states = NULL ; // [N, n_token*2, 4096]
793778 struct ggml_tensor * chunk_hidden_states = NULL ; // [n_token*2, 4096]
@@ -818,6 +803,7 @@ struct SD3CLIPEmbedder : public Conditioner {
818803 NULL ,
819804 max_token_idx,
820805 false ,
806+ clip_skip,
821807 &chunk_hidden_states_l,
822808 work_ctx);
823809 {
@@ -845,6 +831,7 @@ struct SD3CLIPEmbedder : public Conditioner {
845831 NULL ,
846832 max_token_idx,
847833 true ,
834+ clip_skip,
848835 &pooled_l,
849836 work_ctx);
850837 }
@@ -866,6 +853,7 @@ struct SD3CLIPEmbedder : public Conditioner {
866853 NULL ,
867854 max_token_idx,
868855 false ,
856+ clip_skip,
869857 &chunk_hidden_states_g,
870858 work_ctx);
871859
@@ -894,6 +882,7 @@ struct SD3CLIPEmbedder : public Conditioner {
894882 NULL ,
895883 max_token_idx,
896884 true ,
885+ clip_skip,
897886 &pooled_g,
898887 work_ctx);
899888 }
@@ -1017,18 +1006,9 @@ struct FluxCLIPEmbedder : public Conditioner {
10171006
10181007 FluxCLIPEmbedder (ggml_backend_t backend,
10191008 bool offload_params_to_cpu,
1020- const String2GGMLType& tensor_types = {},
1021- int clip_skip = -1 ) {
1009+ const String2GGMLType& tensor_types = {}) {
10221010 clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, " text_encoders.clip_l.transformer.text_model" , OPENAI_CLIP_VIT_L_14, true );
10231011 t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, " text_encoders.t5xxl.transformer" );
1024- set_clip_skip (clip_skip);
1025- }
1026-
1027- void set_clip_skip (int clip_skip) {
1028- if (clip_skip <= 0 ) {
1029- clip_skip = 2 ;
1030- }
1031- clip_l->set_clip_skip (clip_skip);
10321012 }
10331013
10341014 void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) {
@@ -1109,12 +1089,15 @@ struct FluxCLIPEmbedder : public Conditioner {
11091089 std::vector<std::pair<std::vector<int >, std::vector<float >>> token_and_weights,
11101090 int clip_skip,
11111091 bool zero_out_masked = false ) {
1112- set_clip_skip (clip_skip);
11131092 auto & clip_l_tokens = token_and_weights[0 ].first ;
11141093 auto & clip_l_weights = token_and_weights[0 ].second ;
11151094 auto & t5_tokens = token_and_weights[1 ].first ;
11161095 auto & t5_weights = token_and_weights[1 ].second ;
11171096
1097+ if (clip_skip <= 0 ) {
1098+ clip_skip = 2 ;
1099+ }
1100+
11181101 int64_t t0 = ggml_time_ms ();
11191102 struct ggml_tensor * hidden_states = NULL ; // [N, n_token, 4096]
11201103 struct ggml_tensor * chunk_hidden_states = NULL ; // [n_token, 4096]
@@ -1143,6 +1126,7 @@ struct FluxCLIPEmbedder : public Conditioner {
11431126 NULL ,
11441127 max_token_idx,
11451128 true ,
1129+ clip_skip,
11461130 &pooled,
11471131 work_ctx);
11481132 }
@@ -1241,17 +1225,13 @@ struct T5CLIPEmbedder : public Conditioner {
12411225 T5CLIPEmbedder (ggml_backend_t backend,
12421226 bool offload_params_to_cpu,
12431227 const String2GGMLType& tensor_types = {},
1244- int clip_skip = -1 ,
12451228 bool use_mask = false ,
12461229 int mask_pad = 1 ,
12471230 bool is_umt5 = false )
12481231 : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
12491232 t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, " text_encoders.t5xxl.transformer" , is_umt5);
12501233 }
12511234
1252- void set_clip_skip (int clip_skip) {
1253- }
1254-
12551235 void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) {
12561236 t5->get_param_tensors (tensors, " text_encoders.t5xxl.transformer" );
12571237 }
0 commit comments