27 commits
10c830c
Create llava-survery-v2.py
cmp-nct Feb 1, 2024
97dda1e
Update convert-image-encoder-to-gguf.py
cmp-nct Feb 1, 2024
8ebdaec
Update convert-image-encoder-to-gguf.py
cmp-nct Feb 1, 2024
1f9367c
Rename llava-survery-v2.py to llava-surgery-v2.py
cmp-nct Feb 1, 2024
a27b9a4
Update convert-image-encoder-to-gguf.py
cmp-nct Feb 2, 2024
440b2ae
Update convert-image-encoder-to-gguf.py
cmp-nct Feb 2, 2024
35b7a7a
Update llava-surgery-v2.py
cmp-nct Feb 2, 2024
37a147e
Clip: Bugfix for normalization (it did not loat the 3 std and mean va…
cmp-nct Feb 8, 2024
7dcadb4
whitespace corrections
Feb 11, 2024
7107b90
ws
Feb 11, 2024
51e60c9
Tensors are now properly permuted.
Feb 12, 2024
60c5f46
ws
Feb 12, 2024
0dd6c9d
added verbose_prompt support into cli
Feb 12, 2024
3a72267
moved llava functions to llava.cpp, made clip.h C compatible API, rep…
Feb 12, 2024
07f5cd7
ws
Feb 12, 2024
6b8d69b
convert : skip unknown tensors (need for LLaVA)
ggerganov Feb 13, 2024
a284885
llava : update readme
ggerganov Feb 13, 2024
65ec518
llava : fix compile warnings
ggerganov Feb 13, 2024
a20c071
Merge remote-tracking branch 'origin/master' into HEAD
ggerganov Feb 13, 2024
997dd1f
llava : style
ggerganov Feb 13, 2024
9d166b0
convert : add --skip-unknown CLI arg
ggerganov Feb 13, 2024
c92431a
server : remove clip structs
ggerganov Feb 13, 2024
c9874dd
bugfix for non llava-1.6
cmp-nct Feb 14, 2024
7974ff7
clip : minor code rearrange
ggerganov Feb 14, 2024
6727cfd
llava : update readme a bit
ggerganov Feb 14, 2024
760a214
Update README.md
cmp-nct Feb 14, 2024
502d7c0
bugfix image number
Feb 14, 2024
37 changes: 21 additions & 16 deletions convert.py
@@ -1173,7 +1173,7 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM
for (name, tensor) in model.items()}


def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel:
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))

@@ -1199,7 +1199,11 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
for name, lazy_tensor in model.items():
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
if name_new is None:
raise Exception(f"Unexpected tensor name: {name}")
if skip_unknown:
print(f"Unexpected tensor name: {name} - skipping")
continue
else:
raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")

if tensor_type in should_skip:
print(f"skipping tensor {name_new}")
@@ -1377,19 +1381,20 @@ def main(args_in: list[str] | None = None) -> None:
output_choices.append("q8_0")
vocab_types = ["spm", "bpe", "hfft"]
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)")
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default=DEFAULT_CONCURRENCY)
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")

args = parser.parse_args(args_in)
if args.awq_path:
@@ -1461,7 +1466,7 @@ def main(args_in: list[str] | None = None) -> None:
print(f"Special vocab info: {special_vocab}")

model = model_plus.model
model = convert_model_names(model, params)
model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
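For context, the new `--skip-unknown` flag shown above is intended for checkpoints that contain tensors `convert.py` does not recognize (as with LLaVA). A minimal usage sketch, with a placeholder model path:

```sh
# Placeholder model directory; --skip-unknown makes convert.py print and
# skip unrecognized tensor names instead of raising an exception.
python ./convert.py ../llava-v1.6-vicuna-7b --skip-unknown
```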
52 changes: 48 additions & 4 deletions examples/llava/README.md
@@ -1,10 +1,12 @@
# LLaVA

Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants.
Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants,
as well as [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants.

The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
models are available.
For llava-1.6, a variety of prepared gguf models are available as well: [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf)

After the API is confirmed, more models will be supported / uploaded.

@@ -18,10 +20,11 @@ After building, run: `./llava-cli` to see the usage. For example:
```

**note**: A lower temperature like 0.1 is recommended for better quality. Add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, use the `-ngl` flag just as usual.
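As an illustration of the two notes above, a typical invocation might look like the following sketch (model, projector, and image paths are placeholders):

```sh
# Placeholder paths; --temp 0.1 lowers the sampling temperature and
# -ngl offloads layers to the GPU, just like in the other examples.
./llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \
    --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \
    --image ./an-image.jpg \
    --temp 0.1 -ngl 35
```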

## Model conversion
## LLaVA 1.5

- Clone `llava-v15-7b` and `clip-vit-large-patch14-336` locally:
- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example:

```sh
git clone https://huggingface.co/liuhaotian/llava-v1.5-7b
@@ -55,8 +58,49 @@ python ./convert.py ../llava-v1.5-7b

Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.

## LLaVA 1.6 gguf conversion

1) Back up your pth/safetensor model files, as llava-surgery modifies them
2) Run `python llava-surgery-v2.py -C -m /path/to/hf-model`, which supports llava-1.5 variants as well, both pytorch and safetensor models:
- you will find a `llava.projector` and a `llava.clip` file in your model directory
3) Copy the `llava.clip` file into a subdirectory (e.g. `vit`), rename it to `pytorch_model.bin`, and add a fitting ViT configuration to the directory (https://huggingface.co/cmp-nct/llava-1.6-gguf/blob/main/config.json)
4) Create the visual gguf model: `python ./examples/llava/convert-image-encoder-to-gguf.py -m ../path/to/vit --llava-projector ../path/to/llava.projector --output-dir ../path/to/output --clip_model_is_vision`
- This is similar to llava-1.5; the difference is that we tell the encoder that we are working with the pure vision model part of CLIP
5) Everything else works as usual: run `convert.py` on the HF model and quantize as needed (a command sketch follows after the notes below)
**note** llava-1.6 needs more context than llava-1.5; at least 3000 tokens are needed (just run it with `-c 4096`)
**note** llava-1.6 greatly benefits from batched prompt processing (the defaults work)
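The steps above as a rough shell sketch; all paths are placeholders, and `--skip-unknown` is included because the surgery can leave LLaVA-specific tensors that `convert.py` would otherwise reject:

```sh
# 1)-2) back up the checkpoint, then run the surgery script on it
python ./examples/llava/llava-surgery-v2.py -C -m ../llava-v1.6-vicuna-7b

# 3) move the extracted CLIP part into its own directory with a fitting ViT config
mkdir ../llava-v1.6-vicuna-7b/vit
cp ../llava-v1.6-vicuna-7b/llava.clip ../llava-v1.6-vicuna-7b/vit/pytorch_model.bin
cp ./vit-config.json ../llava-v1.6-vicuna-7b/vit/config.json   # see the config linked above

# 4) build the visual gguf model from the CLIP part and the projector
python ./examples/llava/convert-image-encoder-to-gguf.py \
    -m ../llava-v1.6-vicuna-7b/vit \
    --llava-projector ../llava-v1.6-vicuna-7b/llava.projector \
    --output-dir ../llava-v1.6-vicuna-7b \
    --clip_model_is_vision

# 5) convert the language model part as usual, then quantize if needed
python ./convert.py ../llava-v1.6-vicuna-7b --skip-unknown
```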

## llava-cli templating and llava-1.6 prompting

llava-1.5 models all use the same vicuna prompt; here you can just add your image question, e.g. `-p "Provide a full description."`
For llava-1.6 models which are not vicuna-based (Mistral and Yi) you need to adapt the system prompt as well as the user prompt; for this purpose llava-cli has a basic templating system:

**For Mistral, using the llava-cli binary:**
Add this: `-p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"`
The Mistral template for llava-1.6 appears to use no system prompt and USER/ASSISTANT roles.
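Putting the Mistral template into a full command, a sketch might look like this (paths are placeholders):

```sh
# Placeholder paths; -e lets llava-cli interpret the \n escapes in the prompt.
./llava-cli -m ../llava-v1.6-mistral-7b/ggml-model-f16.gguf \
    --mmproj ../llava-v1.6-mistral-7b/mmproj-model-f16.gguf \
    --image ./an-image.jpg \
    -c 4096 --temp 0.1 -ngl 35 \
    -e -p "<image>\nUSER:\nProvide a full description.\nASSISTANT:\n"
```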

**For the 34B this should work:**
Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nProvide a full description.<|im_end|><|im_start|>assistant\n`


## How to know if you are running in llava-1.5 or llava-1.6 mode

When running llava-cli you will see visual information right before the prompt is processed:

**Llava-1.5:**
`encode_image_with_clip: image embedding created: 576 tokens`

**Llava-1.6 (anything above 576):**
`encode_image_with_clip: image embedding created: 2880 tokens`


Alternatively, just note how many "tokens" have been used for your prompt; llava-1.6 will also show 1000+ tokens.




## TODO

- [ ] Support non-CPU backend for the image encoding part.
- [x] Support non-CPU backend for the image encoding part.
- [ ] Support different sampling methods.
- [ ] Support more model variants.