diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 6ab7f486ed73e..7d42586b83026 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -203,6 +203,8 @@ def from_model_architecture(model_architecture): return CodeShellModel if model_architecture == "OrionForCausalLM": return OrionModel + if model_architecture == "InternLM2ForCausalLM": + return InternLM2Model return Model def _is_model_safetensors(self) -> bool: @@ -254,6 +256,8 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: return gguf.MODEL_ARCH.CODESHELL if arch == "OrionForCausalLM": return gguf.MODEL_ARCH.ORION + if arch == "InternLM2ForCausalLM": + return gguf.MODEL_ARCH.INTERNLM2 raise NotImplementedError(f'Architecture "{arch}" not supported!') @@ -1344,6 +1348,147 @@ def write_tensors(self): self.gguf_writer.add_tensor("output.weight", data) print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") +class InternLM2Model(Model): + def set_vocab(self): + # (TODO): Is there a better way? + # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character + # \x00 specially and convert it into an emoji character to prevent it from being mistakenly + # recognized as an empty string in C++. + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + tokens: list[bytes] = [] + scores: list[float] = [] + toktypes: list[int] = [] + + if not tokenizer_path.is_file(): + print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + sys.exit(1) + + tokenizer = SentencePieceProcessor(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + for token_id in range(vocab_size): + piece = tokenizer.id_to_piece(token_id) + text = piece.encode("utf-8") + score = tokenizer.get_score(token_id) + if text == b"\x00": + # (TODO): fixme + # Hack here and replace the \x00 characters. + print(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉" + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.is_unknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.is_control(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.is_unused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.is_byte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + tokens.append(key.encode("utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + self.gguf_writer.add_name("InternLM2") + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) + self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) + + def post_write_tensors(self, tensor_map, name, data_torch): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + data = data_torch.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + self.gguf_writer.add_tensor(new_name, data) + + def write_tensors(self): + from einops import rearrange + + num_heads = self.hparams.get("num_attention_heads") + num_kv_heads = self.hparams.get("num_key_value_heads") + hidden_size = self.hparams.get("hidden_size") + q_per_kv = num_heads // num_kv_heads + head_dim = hidden_size // num_heads + num_groups = num_heads // q_per_kv + + block_count = self.hparams["num_hidden_layers"] + model_kv = dict(self.get_tensors()) + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" + for name, data_torch in model_kv.items(): + # we don't need these + if name.endswith(".rotary_emb.inv_freq"): + continue + plora_tensor = True if "Plora" in name else False + if re.match(qkv_pattern, name) and not plora_tensor: + bid = re.findall(qkv_pattern, name)[0] + qkv = data_torch + qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv+2, i=head_dim) + q, k, v = qkv[...,:q_per_kv,:], qkv[...,q_per_kv:q_per_kv+1,:], qkv[...,q_per_kv+1:q_per_kv+2,:] + q = rearrange(q, " o g n i -> o (g n i)").T + k = rearrange(k, " o g n i -> o (g n i)").T + v = rearrange(v, " o g n i -> o (g n i)").T + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) + self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) + else: + self.post_write_tensors(tensor_map, name, data_torch) + + ###### CONVERSION LOGIC ###### diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py index f5a3c9b46f9e3..33d9270426ba5 100644 --- a/examples/llava/convert-image-encoder-to-gguf.py +++ b/examples/llava/convert-image-encoder-to-gguf.py @@ -41,6 +41,9 @@ def get_tensor_name(name: str) -> str: if "mm_projector" in name: return name.replace("model.mm_projector", "mm") + + if "vision_proj" in name: + return name.replace("vision_proj", "mm") return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln") @@ -80,12 +83,13 @@ def bytes_to_unicode(): help="Save a vision-only model. It can't be used to encode texts") ap.add_argument("--clip_model_is_vision", action="store_true", required=False, help="The clip model is a pure vision model (ShareGPT4V vision extract for example)") +ap.add_argument("--clip_model_is_openclip", action="store_true", required=False, + help="The clip model is from openclip (for ViT-SO400M type))") ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.") ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp") -ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values") -ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values") ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None) # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711 +# Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5 default_image_mean = [0.48145466, 0.4578275, 0.40821073] default_image_std = [0.26862954, 0.26130258, 0.27577711] ap.add_argument('--image_mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None) @@ -105,7 +109,7 @@ def bytes_to_unicode(): # output in the same directory as the model if output_dir is None dir_model = args.model_dir -if args.clip_model_is_vision: +if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip: vocab = None tokens = None else: @@ -133,7 +137,7 @@ def bytes_to_unicode(): if args.use_f32: ftype = 0 -if args.clip_model_is_vision: +if args.clip_model_is_vision or args.clip_model_is_openclip: model = CLIPVisionModel.from_pretrained(dir_model) processor = None else: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f5c933a4176e4..aa99544f57c59 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -102,6 +102,7 @@ class MODEL_ARCH(IntEnum): PLAMO = auto() CODESHELL = auto() ORION = auto() + INTERNLM2 = auto() class MODEL_TENSOR(IntEnum): @@ -132,6 +133,21 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_NORM = auto() ATTN_K_NORM = auto() + ATTN_QKV_LORA_A = auto() + ATTN_QKV_LORA_B = auto() + ATTN_OUT_LORA_A = auto() + ATTN_OUT_LORA_B = auto() + FFN_UP_LORA_A = auto() + FFN_UP_LORA_B = auto() + FFN_GATE_LORA_A = auto() + FFN_GATE_LORA_B = auto() + FFN_DOWN_LORA_A = auto() + FFN_DOWN_LORA_B = auto() + + + + + MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.LLAMA: "llama", @@ -153,6 +169,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -182,6 +199,18 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", + + MODEL_TENSOR.ATTN_QKV_LORA_A : "blk.{bid}.attn_qkv_lora_a", + MODEL_TENSOR.ATTN_QKV_LORA_B : "blk.{bid}.attn_qkv_lora_b", + MODEL_TENSOR.ATTN_OUT_LORA_A : "blk.{bid}.attn_out_lora_a", + MODEL_TENSOR.ATTN_OUT_LORA_B : "blk.{bid}.attn_out_lora_b", + MODEL_TENSOR.FFN_UP_LORA_A : "blk.{bid}.ffn_up_lora_a", + MODEL_TENSOR.FFN_UP_LORA_B : "blk.{bid}.ffn_up_lora_b", + MODEL_TENSOR.FFN_GATE_LORA_A : "blk.{bid}.ffn_gate_lora_a", + MODEL_TENSOR.FFN_GATE_LORA_B : "blk.{bid}.ffn_gate_lora_b", + MODEL_TENSOR.FFN_DOWN_LORA_A : "blk.{bid}.ffn_down_lora_a", + MODEL_TENSOR.FFN_DOWN_LORA_B : "blk.{bid}.ffn_down_lora_b", + } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -446,6 +475,32 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.INTERNLM2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + + MODEL_TENSOR.ATTN_QKV_LORA_A, + MODEL_TENSOR.ATTN_QKV_LORA_B, + MODEL_TENSOR.ATTN_OUT_LORA_A, + MODEL_TENSOR.ATTN_OUT_LORA_B, + MODEL_TENSOR.FFN_UP_LORA_A, + MODEL_TENSOR.FFN_UP_LORA_B, + MODEL_TENSOR.FFN_GATE_LORA_A, + MODEL_TENSOR.FFN_GATE_LORA_B, + MODEL_TENSOR.FFN_DOWN_LORA_A, + MODEL_TENSOR.FFN_DOWN_LORA_B, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index de177af137714..074e7dd32dba4 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -19,6 +19,7 @@ class TensorNameMap: "language_model.embedding.word_embeddings", # persimmon "wte", # gpt2 "transformer.embd.wte", # phi2 + "model.tok_embeddings", # internlm2 ), # Token type embeddings @@ -42,7 +43,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen - "output", # llama-pth bloom + "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 ), @@ -51,7 +52,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT_NORM: ( "gpt_neox.final_layer_norm", # gptneox "transformer.ln_f", # gpt2 gpt-j falcon - "model.norm", # llama-hf baichuan + "model.norm", # llama-hf baichuan internlm2 "norm", # llama-pth "embeddings.LayerNorm", # bert "transformer.norm_f", # mpt @@ -84,6 +85,7 @@ class TensorNameMap: "h.{bid}.ln_1", # gpt2 "transformer.h.{bid}.ln", # phi2 "model.layers.layers.{bid}.norm", # plamo + "model.layers.{bid}.attention_norm", # internlm2 ), # Attention norm 2 @@ -111,6 +113,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.query", # bert "transformer.h.{bid}.attn.q_proj", # gpt-j "model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq" # internlm2 ), # Attention key @@ -120,6 +123,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.key", # bert "transformer.h.{bid}.attn.k_proj", # gpt-j "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk" # internlm2 ), # Attention value @@ -129,6 +133,13 @@ class TensorNameMap: "encoder.layer.{bid}.attention.self.value", # bert "transformer.h.{bid}.attn.v_proj", # gpt-j "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv" # internlm2 + ), + MODEL_TENSOR.ATTN_QKV_LORA_A: ( + "model.layers.{bid}.attention.wqkv.Plora_A", # internlm2 + ), + MODEL_TENSOR.ATTN_QKV_LORA_B: ( + "model.layers.{bid}.attention.wqkv.Plora_B", # internlm2 ), # Attention output @@ -147,6 +158,13 @@ class TensorNameMap: "h.{bid}.attn.c_proj", # gpt2 "transformer.h.{bid}.mixer.out_proj", # phi2 "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 + ), + MODEL_TENSOR.ATTN_OUT_LORA_A: ( + "model.layers.{bid}.attention.wo.Plora_A", # internlm2 + ), + MODEL_TENSOR.ATTN_OUT_LORA_B: ( + "model.layers.{bid}.attention.wo.Plora_B", # internlm2 ), # Rotary embeddings @@ -169,6 +187,7 @@ class TensorNameMap: "language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon "model.layers.{bid}.ln2", # yi "h.{bid}.ln_2", # gpt2 + "model.layers.{bid}.ffn_norm", # internlm2 ), MODEL_TENSOR.FFN_GATE_INP: ( @@ -194,6 +213,13 @@ class TensorNameMap: "transformer.h.{bid}.mlp.fc1", # phi2 "model.layers.{bid}.mlp.fc1", # phi2 "model.layers.layers.{bid}.mlp.up_proj", # plamo + "model.layers.{bid}.feed_forward.w3", # internlm2 + ), + MODEL_TENSOR.FFN_UP_LORA_A: ( + "model.layers.{bid}.feed_forward.w3.Plora_A", # internlm2 + ), + MODEL_TENSOR.FFN_UP_LORA_B: ( + "model.layers.{bid}.feed_forward.w3.Plora_B", # internlm2 ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -212,6 +238,13 @@ class TensorNameMap: "layers.{bid}.feed_forward.w1", # llama-pth "transformer.h.{bid}.mlp.w2", # qwen "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + ), + MODEL_TENSOR.FFN_GATE_LORA_A: ( + "model.layers.{bid}.feed_forward.w1.Plora_A", # internlm2 + ), + MODEL_TENSOR.FFN_GATE_LORA_B: ( + "model.layers.{bid}.feed_forward.w1.Plora_B", # internlm2 ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -236,6 +269,13 @@ class TensorNameMap: "transformer.h.{bid}.mlp.fc2", # phi2 "model.layers.{bid}.mlp.fc2", # phi2 "model.layers.layers.{bid}.mlp.down_proj", # plamo + "model.layers.{bid}.feed_forward.w2", # internlm2 + ), + MODEL_TENSOR.FFN_DOWN_LORA_A: ( + "model.layers.{bid}.feed_forward.w2.Plora_A", # internlm2 + ), + MODEL_TENSOR.FFN_DOWN_LORA_B: ( + "model.layers.{bid}.feed_forward.w2.Plora_B", # internlm2 ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/llama.cpp b/llama.cpp index bb23689fac0e0..f711023c2e865 100644 --- a/llama.cpp +++ b/llama.cpp @@ -204,6 +204,7 @@ enum llm_arch { LLM_ARCH_PLAMO, LLM_ARCH_CODESHELL, LLM_ARCH_ORION, + LLM_ARCH_INTERNLM2, LLM_ARCH_UNKNOWN, }; @@ -226,6 +227,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, }; enum llm_kv { @@ -372,6 +374,18 @@ enum llm_tensor { LLM_TENSOR_FFN_UP_EXP, LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, + + // dynamic/partial LORA tensors + LLM_TENSOR_ATTN_QKV_LORA_A, + LLM_TENSOR_ATTN_QKV_LORA_B, + LLM_TENSOR_ATTN_OUT_LORA_A, + LLM_TENSOR_ATTN_OUT_LORA_B, + LLM_TENSOR_FFN_UP_LORA_A, + LLM_TENSOR_FFN_UP_LORA_B, + LLM_TENSOR_FFN_GATE_LORA_A, + LLM_TENSOR_FFN_GATE_LORA_B, + LLM_TENSOR_FFN_DOWN_LORA_A, + LLM_TENSOR_FFN_DOWN_LORA_B, }; static std::map> LLM_TENSOR_NAMES = { @@ -664,12 +678,37 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, + { + LLM_ARCH_INTERNLM2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + // dynamic/partial LORA tensors + { LLM_TENSOR_ATTN_QKV_LORA_A, "blk.%d.attn_qkv_lora_a" }, + { LLM_TENSOR_ATTN_QKV_LORA_B, "blk.%d.attn_qkv_lora_b" }, + { LLM_TENSOR_ATTN_OUT_LORA_A, "blk.%d.attn_out_lora_a" }, + { LLM_TENSOR_ATTN_OUT_LORA_B, "blk.%d.attn_out_lora_b" }, + { LLM_TENSOR_FFN_UP_LORA_A, "blk.%d.ffn_up_lora_a" }, + { LLM_TENSOR_FFN_UP_LORA_B, "blk.%d.ffn_up_lora_b" }, + { LLM_TENSOR_FFN_GATE_LORA_A, "blk.%d.ffn_gate_lora_a" }, + { LLM_TENSOR_FFN_GATE_LORA_B, "blk.%d.ffn_gate_lora_b" }, + { LLM_TENSOR_FFN_DOWN_LORA_A, "blk.%d.ffn_down_lora_a" }, + { LLM_TENSOR_FFN_DOWN_LORA_B, "blk.%d.ffn_down_lora_b" }, }, }, - { LLM_ARCH_UNKNOWN, { @@ -1377,6 +1416,7 @@ enum e_model { MODEL_13B, MODEL_14B, MODEL_15B, + MODEL_20B, MODEL_30B, MODEL_34B, MODEL_40B, @@ -1529,6 +1569,19 @@ struct llama_layer { struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_act; + + // intern_ml ploras: + struct ggml_tensor * attn_qkv_lora_a; // qkv + struct ggml_tensor * attn_qkv_lora_b; + struct ggml_tensor * attn_out_lora_a; // wo + struct ggml_tensor * attn_out_lora_b; + struct ggml_tensor * ffn_up_lora_a; // w3 + struct ggml_tensor * ffn_up_lora_b; + struct ggml_tensor * ffn_gate_lora_a; // w1 + struct ggml_tensor * ffn_gate_lora_b; + struct ggml_tensor * ffn_down_lora_a; // w2 + struct ggml_tensor * ffn_down_lora_b; + }; struct llama_kv_cell { @@ -2367,7 +2420,6 @@ struct llama_model_loader { case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; - case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -2715,7 +2767,6 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; - case LLAMA_FTYPE_MOSTLY_Q3_K_XS:return "Q3_K - Extra small"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; default: return "unknown, may not work"; @@ -2731,6 +2782,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_13B: return "13B"; case MODEL_14B: return "14B"; case MODEL_15B: return "15B"; + case MODEL_20B: return "20B"; case MODEL_30B: return "30B"; case MODEL_34B: return "34B"; case MODEL_40B: return "40B"; @@ -2743,6 +2795,14 @@ static const char * llama_model_type_name(e_model type) { default: return "?B"; } } +static const char * llama_model_vocab_type_name(enum llama_vocab_type type){ + switch (type) { + case LLAMA_VOCAB_TYPE_SPM: return "SPM"; + case LLAMA_VOCAB_TYPE_BPE: return "BPE"; + default: return "unknown"; + } +} + static void llm_load_arch(llama_model_loader & ml, llama_model & model) { model.arch = ml.get_arch(); @@ -3006,6 +3066,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_INTERNLM2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 48: model.type = e_model::MODEL_20B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -3013,7 +3082,7 @@ static void llm_load_hparams( } // TODO: This should probably be in llama.h -static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false); +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false, bool add_space_prefix = true); static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( @@ -3269,7 +3338,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { // hparams LLAMA_LOG_INFO("%s: format = %s\n", __func__, llama_file_version_name(ml.fver)); LLAMA_LOG_INFO("%s: arch = %s\n", __func__, LLM_ARCH_NAMES.at(model.arch).c_str()); - LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, vocab.type == LLAMA_VOCAB_TYPE_SPM ? "SPM" : "BPE"); // TODO: fix + LLAMA_LOG_INFO("%s: vocab type = %s\n", __func__, llama_model_vocab_type_name(vocab.type)); LLAMA_LOG_INFO("%s: n_vocab = %u\n", __func__, hparams.n_vocab); LLAMA_LOG_INFO("%s: n_merges = %u\n", __func__, (int) vocab.bpe_ranks.size()); LLAMA_LOG_INFO("%s: n_ctx_train = %u\n", __func__, hparams.n_ctx_train); @@ -4018,8 +4087,54 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_INTERNLM2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + // layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + // plora support - if available + if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_QKV_LORA_A, "weight", i).c_str()) >= 0) { + LLAMA_LOG_INFO("Found LORA tensors for layer %d\n", i); + // this is not correct {n_embd, hparams.n_head * hparams.n_head_kv} + layer.attn_qkv_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV_LORA_A, "weight", i), {n_embd, hparams.n_head * hparams.n_head_kv}); + layer.attn_qkv_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV_LORA_B, "weight", i), {hparams.n_head * hparams.n_head_kv, n_embd + 2*n_embd_gqa}); + + + layer.attn_out_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT_LORA_A, "weight", i), {n_embd, 256}); // wo + layer.attn_out_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT_LORA_B, "weight", i), {256, n_embd}); + layer.ffn_up_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_LORA_A, "weight", i), {n_embd, 256}); // w3 + layer.ffn_up_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_LORA_B, "weight", i), {256, n_ff}); + layer.ffn_gate_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_LORA_A, "weight", i), {n_embd, 256}); // w1 + layer.ffn_gate_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_LORA_B, "weight", i), {256, n_ff}); + layer.ffn_down_lora_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_LORA_A, "weight", i), {n_ff, 256}); // w2 + layer.ffn_down_lora_b = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_LORA_B, "weight", i), {256, n_embd}); + } + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -6589,6 +6704,126 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_internlm2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + // shift the entire K-cache if needed + if (do_rope_shift) { + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_ctx, n_tokens, kv_head, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + }; static struct ggml_cgraph * llama_build_graph( @@ -6746,6 +6981,10 @@ static struct ggml_cgraph * llama_build_graph( case LLM_ARCH_ORION: { result = llm.build_orion(); + } break; + case LLM_ARCH_INTERNLM2: + { + result = llm.build_internlm2(); } break; default: GGML_ASSERT(false); @@ -6772,6 +7011,7 @@ static int llama_decode_internal( if (n_tokens == 0) { LLAMA_LOG_ERROR("%s: n_tokens == 0", __func__); + fprintf(stderr, "%s: n_tokens == 0\n", __func__); return -1; } @@ -6840,7 +7080,7 @@ static int llama_decode_internal( } if (!llama_kv_cache_find_slot(kv_self, batch)) { - return 1; + return 1; } // a heuristic, to avoid attending the full cache if it is not yet utilized @@ -6878,6 +7118,11 @@ static int llama_decode_internal( n_threads = std::min(4, n_threads); } + const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1; + if ((ggml_cpu_has_cublas() || ggml_cpu_has_vulkan()) && fully_offloaded) { + n_threads = 1; + } + #ifdef GGML_USE_MPI const int64_t n_layer = hparams.n_layer; ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer); @@ -7653,7 +7898,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< } } -static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) { +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special, bool add_space_prefix) { std::vector output; // OG tokenizer behavior: @@ -7689,7 +7934,9 @@ static std::vector llama_tokenize_internal(const llama_vocab & // auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); if (&fragment == &fragment_buffer.front()) { - raw_text = " " + raw_text; // prefix with space if the first token is not special + if (add_space_prefix){ + raw_text = " " + raw_text; // prefix with space if the first token is not special + } } #ifdef PRETOKENIZERDEBUG @@ -8703,7 +8950,6 @@ void llama_sample_classifier_free_guidance( llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) { GGML_ASSERT(ctx); - auto N = float(llama_n_vocab(llama_get_model(ctx))); int64_t t_start_sample_us; t_start_sample_us = ggml_time_us(); @@ -9234,13 +9480,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (new_type != GGML_TYPE_Q8_0) { new_type = GGML_TYPE_Q6_K; } - } else if (name == "token_embd.weight") { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { - new_type = GGML_TYPE_Q2_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_Q4_K; - } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; @@ -9251,6 +9490,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K; ++qs.i_ffn_down; } + else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K; } else if (name.find("attn_v.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; @@ -9258,9 +9498,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && qs.model.hparams.n_gqa() >= 4) { - new_type = GGML_TYPE_Q4_K; - } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } @@ -9298,9 +9535,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } - //else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - // if (i_layer < n_layer/8) new_type = GGML_TYPE_Q5_K; - //} else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K @@ -9332,14 +9566,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty } else if (name.find("attn_output.weight") != std::string::npos) { if (arch != LLM_ARCH_FALCON) { if (qs.model.hparams.n_expert == 8) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { new_type = GGML_TYPE_Q5_K; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } @@ -9382,8 +9615,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty bool convert_incompatible_tensor = false; if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || - new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || - new_type == GGML_TYPE_IQ3_XXS) { + new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -9397,7 +9629,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty switch (new_type) { case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break; case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break; case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break; @@ -9439,7 +9670,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break; - case LLAMA_FTYPE_MOSTLY_IQ3_XXS:quantized_type = GGML_TYPE_IQ3_XXS; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -9545,7 +9775,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s struct ggml_tensor * tensor = ml.get_tensor_meta(i); const std::string name = ggml_get_name(tensor); - + if (!ml.use_mmap) { if (read_data.size() < ggml_nbytes(tensor)) { read_data.resize(ggml_nbytes(tensor)); @@ -11266,7 +11496,8 @@ int32_t llama_tokenize( int32_t n_max_tokens, bool add_bos, bool special) { - auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special); + bool add_space_prefix = model->arch == LLM_ARCH_INTERNLM2; + auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special, add_space_prefix); if (n_max_tokens < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); @@ -11281,7 +11512,7 @@ int32_t llama_tokenize( } static std::string llama_decode_text(const std::string & text) { - std::string decoded_text; + std::string decoded_text; auto unicode_sequences = codepoints_from_utf8(text); for (auto& unicode_sequence : unicode_sequences) { decoded_text += unicode_to_bytes_bpe(codepoint_to_utf8(unicode_sequence)); @@ -11292,7 +11523,7 @@ static std::string llama_decode_text(const std::string & text) { // does not write null-terminator to buf int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) { - if (0 <= token && token < llama_n_vocab(model)) { + if (0 <= token && token < llama_n_vocab(model)) { switch (llama_vocab_get_type(model->vocab)) { case LLAMA_VOCAB_TYPE_SPM: { // NOTE: we accept all unsupported token types,