
Commit 3ce7e8f

Authored by XiaotaoChen and Chenxiaotao03

llava : MobileVLM support (#4954)

* MobileVLM native implementation
* delete depthwise_conv_2d and permute_cpy related code, replace the two with existing functions, and optimize the ldp definition; support the LLAMA_PERF option for CMake
* move the android script to the examples/llava directory
* fix the editor config checks

Co-authored-by: Chenxiaotao03 <[email protected]>

1 parent b2d80e1 commit 3ce7e8f

File tree

8 files changed: +737 −24 lines changed


CMakeLists.txt

Lines changed: 7 additions & 0 deletions

```diff
@@ -108,6 +108,13 @@ option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_SERVER "llama: build server example" ON)
+
+# add perf arguments
+option(LLAMA_PERF "llama: enable perf" OFF)
+if (LLAMA_PERF)
+    add_definitions(-DGGML_PERF)
+endif()
+
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
```
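The new option is off by default; to compile the `GGML_PERF` instrumentation in, pass the flag at configure time. A minimal sketch, assuming an out-of-tree `build` directory:

```sh
# configure and build with GGML perf counters enabled (LLAMA_PERF is OFF by default)
mkdir build && cd build
cmake -DLLAMA_PERF=ON ..
make -j4
```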

examples/llava/MobileVLM-README.md

Lines changed: 131 additions & 0 deletions
# MobileVLM

Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.

For more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM).

The implementation is based on llava, and is compatible with both llava and MobileVLM. The usage is basically the same as llava.

## Usage
Build with cmake, or run `make llava-cli` to build it.

After building, run `./llava-cli` to see the usage. For example:

```sh
./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
    --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
    --image path/to/an/image.jpg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
```
## Model conversion

1. Clone `MobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:

```sh
git clone https://huggingface.co/mtgv/MobileVLM-1.7B

git clone https://huggingface.co/openai/clip-vit-large-patch14-336
```

2. Use `llava-surgery.py` to split the LLaVA model into its LLaMA and multimodal projector constituents:

```sh
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
```
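The surgery step should leave a `llava.projector` file next to the model weights, which step 3 consumes; a quick sanity check (path assumed from the steps above):

```sh
# verify the projector extracted by llava-surgery.py exists (path assumed)
ls -l path/to/MobileVLM-1.7B/llava.projector
```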
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:

```sh
python ./examples/llava/convert-image-encoder-to-gguf.py \
    -m path/to/clip-vit-large-patch14-336 \
    --llava-projector path/to/MobileVLM-1.7B/llava.projector \
    --output-dir path/to/MobileVLM-1.7B \
    --projector-type ldp
```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

```sh
python ./convert.py path/to/MobileVLM-1.7B
```

5. Use `quantize` to convert the LLaMA part's data type from `fp16` to `q4_k`:

```sh
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
```

Now both the LLaMA part and the image encoder are in the `MobileVLM-1.7B` directory.
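After these five steps the directory should hold the files the usage example at the top refers to; a quick listing, with names taken from the steps above:

```sh
ls path/to/MobileVLM-1.7B
# expected to include, alongside the original model files:
#   ggml-model-f16.gguf   (step 4, convert.py)
#   ggml-model-q4_k.gguf  (step 5, quantize)
#   mmproj-model-f16.gguf (step 3, image encoder + ldp projector)
```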
## Android compile and run
### Compile
Refer to `examples/llava/android/build_64.sh`:

```sh
mkdir examples/llava/android/build_64
cd examples/llava/android/build_64
../build_64.sh
```

### Run on Android
Refer to `examples/llava/android/adb_run.sh`; modify the resource `name` and `path` variables to match your setup.
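Note that the run script only pushes the test image and the binary by default (the model uploads are commented out in `adb_run.sh`), so the converted models need to be on the device first; a one-time push, with local paths assumed from the conversion steps above:

```sh
# one-time upload of the converted models to the device (local paths assumed)
adb push path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf /data/local/tmp/
adb push path/to/MobileVLM-1.7B/mmproj-model-f16.gguf /data/local/tmp/
```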
## Some results on Android with a `Snapdragon 888` chip
### Case 1
**input**
```sh
/data/local/tmp/llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
    --image /data/local/tmp/demo.jpg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
```
**output**
```sh
encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
Susan Wise Bauer
llama_print_timings: load time = 23574.72 ms
llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
llama_print_timings: total time = 34731.93 ms
```
### Case 2
**input**
```sh
/data/local/tmp/llava-cli \
    -m /data/local/tmp/ggml-model-q4_k.gguf \
    --mmproj /data/local/tmp/mmproj-model-f16.gguf \
    -t 4 \
    --image /data/local/tmp/cat.jpeg \
    -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
```

**output**
```sh
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
The image depicts a cat sitting in the grass near some tall green plants.
llama_print_timings: load time = 23257.32 ms
llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
llama_print_timings: total time = 34570.79 ms
```
## Minor shortcomings
The `n_patch` of the `ldp` output is 1/4 of its input. As a quick implementation, we uniformly modified the `clip_n_patches` function to return a quarter of the original value. When measuring time consumption, the reported per-patch time is therefore 4 times larger than the real cost.
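For example, the `146.87 ms per image patch` reported in both cases above is computed against the quartered patch count, so the real per-patch encoding cost is roughly 146.87 / 4 ≈ 36.7 ms.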
## TODO

- [ ] Support non-CPU backends for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
- [ ] Optimize LDP projector performance

    - Optimize the structure definition to avoid unnecessary memory rearrangements and reduce the use of `ggml_permute_cpy`
    - Optimize operator implementations (ARM CPU/NVIDIA GPU): e.g. depthwise conv, hardswish, hardsigmoid, etc.

- [ ] Run MobileVLM on `Jetson Orin`
- [ ] Support more model variants, such as `MobileVLM-3B`

## Contributors
```sh
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
```

examples/llava/android/adb_run.sh

Lines changed: 53 additions & 0 deletions
```sh
#!/bin/bash

model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
projector_name="mmproj-model-f16.gguf"
llama_name="ggml-model-q4_k.gguf"
img_dir="/Users/cxt/model/llm"
img_name="demo.jpg"
prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
# img_name="cat.jpeg"
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"

program_dir="build_64/bin"
binName="llava-cli"
n_threads=4


deviceDir="/data/local/tmp"
saveDir="output"
if [ ! -d ${saveDir} ]; then
    mkdir ${saveDir}
fi


function android_run() {
    # # copy resource into device
    # adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
    # adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
    adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
    # copy program into device
    adb push ${program_dir}/${binName} ${deviceDir}/${binName}
    adb shell "chmod 0777 ${deviceDir}/${binName}"

    # run
    adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
        -m ${deviceDir}/${llama_name} \
        --mmproj ${deviceDir}/${projector_name} \
        -t ${n_threads} \
        --image ${deviceDir}/${img_name} \
        -p \"${prompt}\" \
        > ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
    adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
        -m ${deviceDir}/${llama_name} \
        --mmproj ${deviceDir}/${projector_name} \
        -t ${n_threads} \
        --image ${deviceDir}/${img_name} \
        -p \"${prompt}\" \
        >> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
    adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
}

android_run

echo "android_run is Done!"
```

examples/llava/android/build_64.sh

Lines changed: 8 additions & 0 deletions
```sh
#!/bin/bash
cmake ../../../../ \
    -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
    -DCMAKE_BUILD_TYPE=Release \
    -DANDROID_ABI="arm64-v8a" \
    -DANDROID_PLATFORM=android-23 $1

make -j4
```
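The script assumes `$ANDROID_NDK` points at an installed Android NDK and that it is invoked from a build directory one level below `examples/llava/android/` (hence the `../../../../` back to the repository root); a sketch, with the NDK path assumed:

```sh
export ANDROID_NDK=/path/to/android-ndk   # assumed NDK install location
mkdir -p examples/llava/android/build_64
cd examples/llava/android/build_64
../build_64.sh
```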
