We already provide example pre-trained prediction models in the moe-predict-models
folder.
You can also train your own prediction models by following the instructions below.
# Step 1: collect router logits by running batched inference with llama.cpp.
# On memory-constrained GPUs, you may need llama.cpp to offload some layers
# to the CPU using the `--gpu-layers` option.
# NOTE(review): SPARSE_CACHE_ENABLE_TRACE=0 is set while --trace-logits 1 is
# passed — presumably the env var controls a separate (cache) trace, not the
# logits trace; confirm against the llama.cpp fork's docs.
# Output: trace files written under --trace-dump-path, consumed by Step 2 below.
SPARSE_CACHE_ENABLE_TRACE=0 /code/moe/llama.cpp/build/bin/llama-parallel \
-m /code/huggingface-gguf/DeepSeek-V2-Lite-Chat/f16/main.gguf \
-ngl 80 -ub 2048 -nocb -ns 10000 -n 512 -c 2048 \
-f /code/sparse-llm-cache-scripts/dataset/chatgpt-prompts/prompt_list.txt \
--moe-cache 0 --expert_buft 0 -fa --delay-escape \
--trace-logits 1 --trace-num-entries 100000 \
--trace-dump-path /code/moe/moe-traces/logits-traces/DeepSeek-V2-Lite-Chat-chatgpt-prompts
# Step 2: train the prediction model from the logit traces collected above.
# --logits_path must match the --trace-dump-path used in Step 1;
# --predict_model_path is where the trained model is written (HF-cache-style
# directory name). Remaining flags are training hyperparameters
# (threshold / window / learning rate / input normalization) — see
# train_predict_model.py for their exact semantics.
python3 train_predict_model.py \
--logits_path /code/moe/moe-traces/logits-traces/DeepSeek-V2-Lite-Chat-chatgpt-prompts \
--predict_model_path /code/sparse-llm-cache-scripts/moe-predict-models/models--deepseek-ai--DeepSeek-V2-Lite-Chat \
--predict_output gate \
--print_loss \
--threshold 0.0005 \
--window 4 \
--lr 0.0005 \
--input_norm_method std