Skip to content

Commit 76a0b6e

Browse files
committed
llama : don't zero-init vectors in quantize -> 7.4% faster
1 parent a95aa21 commit 76a0b6e

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

llama.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4639,8 +4639,14 @@ void llama_beam_search(llama_context * ctx,
46394639
// quantization
46404640
//
46414641

4642+
// A trivial wrapper whose default constructor deliberately does NOT
// initialize `value`. std::vector<T>::resize() value-initializes every
// new element (zero-fill for arithmetic types); for large scratch
// buffers that are fully overwritten before being read, that zeroing is
// pure overhead. std::vector<no_init<U>> skips it, which is where the
// commit's quantization speedup comes from.
template <typename U>
struct no_init {
    U value; // intentionally left uninitialized by the default ctor
    no_init() {} // empty on purpose: suppress value-initialization of `value`
};
4647+
46424648
static void llama_convert_tensor_internal(
4643-
struct ggml_tensor * tensor, std::vector<float> & output, std::vector<std::thread> & workers,
4649+
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
46444650
const size_t nelements, const int nthread
46454651
) {
46464652
if (output.size() < nelements) {
@@ -4895,9 +4901,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
48954901

48964902
int idx = 0;
48974903

4898-
std::vector<uint8_t> read_data;
4899-
std::vector<uint8_t> work;
4900-
std::vector<float> f32_conv_buf;
4904+
std::vector<no_init<uint8_t>> read_data;
4905+
std::vector<no_init<uint8_t>> work;
4906+
std::vector<no_init<float>> f32_conv_buf;
49014907

49024908
// populate the original tensors so we get an initial meta data
49034909
for (int i = 0; i < ml->n_tensors; ++i) {

0 commit comments

Comments
 (0)