
Commit 48956ff

feat: reduce CLIP memory usage with no embeddings (#768)
1 parent ddc4a18 commit 48956ff

File tree

3 files changed, +53 -76 lines

clip.hpp

Lines changed: 25 additions & 26 deletions
@@ -548,9 +548,15 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
-        enum ggml_type token_wtype = GGML_TYPE_F32;
+        enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -560,10 +566,12 @@ class CLIPEmbeddings : public GGMLBlock {
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32 = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }
 
     struct ggml_tensor* get_token_embed_weight() {
@@ -678,12 +686,11 @@ class CLIPTextModel : public GGMLBlock {
     int32_t n_head = 12;
     int32_t n_layer = 12;  // num_hidden_layers
     int32_t projection_dim = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip = -1;
     bool with_final_ln = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                   bool with_final_ln = true,
-                  int clip_skip_value = -1)
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -696,20 +703,12 @@ class CLIPTextModel : public GGMLBlock {
             n_head = 20;
             n_layer = 32;
         }
-        set_clip_skip(clip_skip_value);
 
-        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
@@ -720,7 +719,8 @@ class CLIPTextModel : public GGMLBlock {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         // input_ids: [N, n_token]
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -889,19 +889,15 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
                         bool with_final_ln = true,
-                        int clip_skip_value = -1)
-        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, clip_skip_value) {
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend, offload_params_to_cpu), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
     std::string get_desc() {
         return "clip";
     }
 
-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -911,22 +907,24 @@ struct CLIPTextModelRunner : public GGMLRunner {
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         size_t N = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
             GGML_ASSERT(input_ids->ne[0] % model.n_token == 0);
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
 
-        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx = 0,
-                                    bool return_pooled = false) {
+                                    bool return_pooled = false,
+                                    int clip_skip = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         input_ids = to_backend(input_ids);
@@ -945,7 +943,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, runtime_backend, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -958,10 +956,11 @@ struct CLIPTextModelRunner : public GGMLRunner {
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
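
Net effect of the clip.hpp changes: clip_skip is no longer baked into CLIPTextModel at construction (the set_clip_skip plumbing is removed) but is threaded as an argument through compute() -> build_graph() -> forward(), and the freed constructor slot now carries force_clip_f32, which decides whether token_embedding.weight must be materialized as F32. A minimal caller sketch of the reworked API follows; it is an illustration only, and the leading n_threads parameter of compute(), the variable names, and the surrounding setup are assumptions, not shown in the hunks above:

    // Hypothetical usage sketch of the reworked CLIPTextModelRunner API.
    CLIPTextModelRunner runner(backend, offload_params_to_cpu, tensor_types,
                               "cond_stage_model.transformer.text_model",
                               OPENAI_CLIP_VIT_L_14,
                               /*with_final_ln=*/true,
                               /*force_clip_f32=*/false);  // keep quantized token embeddings

    // clip_skip now travels with each call; -1 leaves the default to the caller.
    runner.compute(n_threads, input_ids,
                   /*num_custom_embeddings=*/0,
                   /*custom_embeddings_data=*/NULL,
                   max_token_idx,
                   /*return_pooled=*/false,
                   /*clip_skip=*/2,
                   &hidden_states, work_ctx);

Because clip_skip is per call rather than per model, the same runner can serve requests with different clip-skip settings without rebuilding any state.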

conditioner.hpp

Lines changed: 28 additions & 48 deletions
@@ -61,30 +61,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
-                                      PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
-        }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
 
@@ -412,7 +398,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                  int height,
                                  int adm_in_channels = -1,
                                  bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;  // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@@ -421,6 +406,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         struct ggml_tensor* pooled = NULL;
         std::vector<float> hidden_states_vec;
 
+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         size_t chunk_len = 77;
         size_t chunk_count = tokens.size() / chunk_len;
         for (int chunk_idx = 0; chunk_idx < chunk_count; chunk_idx++) {
@@ -455,6 +444,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                 token_embed_custom.data(),
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states1,
                                 work_ctx);
             if (sd_version_is_sdxl(version)) {
@@ -464,6 +454,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                      token_embed_custom.data(),
                                      max_token_idx,
                                      false,
+                                     clip_skip,
                                      &chunk_hidden_states2, work_ctx);
                 // concat
                 chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -475,6 +466,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                          token_embed_custom.data(),
                                          max_token_idx,
                                          true,
+                                         clip_skip,
                                          &pooled,
                                          work_ctx);
             }
@@ -669,21 +661,11 @@ struct SD3CLIPEmbedder : public Conditioner {
 
     SD3CLIPEmbedder(ggml_backend_t backend,
                     bool offload_params_to_cpu,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -780,14 +762,17 @@ struct SD3CLIPEmbedder : public Conditioner {
                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                          int clip_skip,
                          bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens = token_and_weights[1].first;
         auto& clip_g_weights = token_and_weights[1].second;
         auto& t5_tokens = token_and_weights[2].first;
         auto& t5_weights = token_and_weights[2].second;
 
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;  // [N, n_token*2, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token*2, 4096]
@@ -818,6 +803,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_l,
                                 work_ctx);
             {
@@ -845,6 +831,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     NULL,
                                     max_token_idx,
                                     true,
+                                    clip_skip,
                                     &pooled_l,
                                     work_ctx);
             }
@@ -866,6 +853,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 false,
+                                clip_skip,
                                 &chunk_hidden_states_g,
                                 work_ctx);
 
@@ -894,6 +882,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                     NULL,
                                     max_token_idx,
                                     true,
+                                    clip_skip,
                                     &pooled_g,
                                     work_ctx);
             }
@@ -1017,18 +1006,9 @@ struct FluxCLIPEmbedder : public Conditioner {
 
     FluxCLIPEmbedder(ggml_backend_t backend,
                      bool offload_params_to_cpu,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1109,12 +1089,15 @@ struct FluxCLIPEmbedder : public Conditioner {
                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                          int clip_skip,
                          bool zero_out_masked = false) {
-        set_clip_skip(clip_skip);
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens = token_and_weights[1].first;
         auto& t5_weights = token_and_weights[1].second;
 
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;  // [N, n_token, 4096]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, 4096]
@@ -1143,6 +1126,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                 NULL,
                                 max_token_idx,
                                 true,
+                                clip_skip,
                                 &pooled,
                                 work_ctx);
         }
@@ -1241,17 +1225,13 @@ struct T5CLIPEmbedder : public Conditioner {
     T5CLIPEmbedder(ggml_backend_t backend,
                    bool offload_params_to_cpu,
                    const String2GGMLType& tensor_types = {},
-                   int clip_skip = -1,
                    bool use_mask = false,
                    int mask_pad = 1,
                    bool is_umt5 = false)
         : use_mask(use_mask), mask_pad(mask_pad), t5_tokenizer(is_umt5) {
         t5 = std::make_shared<T5Runner>(backend, offload_params_to_cpu, tensor_types, "text_encoders.t5xxl.transformer", is_umt5);
     }
 
-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
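
With the set_clip_skip helpers deleted, each conditioner now resolves the clip_skip default inline at encode time: FrozenCLIPEmbedderWithCustomWords picks 2 for SD2/SDXL and 1 otherwise, SD3CLIPEmbedder and FluxCLIPEmbedder always fall back to 2, and the T5-only embedder needs no value at all. The shared rule, factored into a free function purely for illustration (this helper does not exist in the codebase):

    // Hypothetical helper mirroring the inline defaults this commit adds.
    static int resolve_clip_skip(int clip_skip, bool use_penultimate_default) {
        if (clip_skip > 0) {
            return clip_skip;  // an explicit user value always wins
        }
        // SD2/SDXL (and the SD3/Flux CLIP encoders) default to the
        // penultimate layer (2); SD1 defaults to the last layer (1).
        return use_penultimate_default ? 2 : 1;
    }

Note also the behavioral subtlety in the first hunk of this file: force_clip_f32 is derived from embd_dir.size() > 0, so the F32 token-embedding table (needed to concatenate custom textual-inversion embeddings onto token_embedding.weight) is only allocated when an embedding directory is actually supplied. That is the memory saving the commit title refers to.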
