Skip to content

Commit e0680ac

Browse files
committed
llama : don't zero-init vectors in quantize -> 7.4% faster
1 parent a42435a commit e0680ac

File tree

1 file changed

+10
-4
lines changed

1 file changed

+10
-4
lines changed

llama.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4640,8 +4640,14 @@ void llama_beam_search(llama_context * ctx,
46404640
// quantization
46414641
//
46424642

4643+
// Wrapper that opts its contents out of zero-initialization.
// Because the default constructor below is *user-provided* (not `= default`),
// value-initialization of a no_init<T> — e.g. when std::vector<no_init<T>>
// constructs/resizes elements — performs no initialization of `value`,
// skipping the memset that std::vector<T> would otherwise do.
// Per the commit message, this makes quantization ~7.4% faster.
// NOTE: `value` is garbage until explicitly written; callers must fully
// overwrite the buffer before reading it.
template <typename T>
4644+
struct no_init {
4645+
T value; // deliberately left uninitialized by the constructor below
4646+
no_init() { /* do nothing */ } // user-provided empty ctor suppresses zero-init
4647+
};
4648+
46434649
static void llama_convert_tensor_internal(
4644-
struct ggml_tensor * tensor, std::vector<float> & output, std::vector<std::thread> & workers,
4650+
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
46454651
const size_t nelements, const int nthread
46464652
) {
46474653
if (output.size() < nelements) {
@@ -4899,9 +4905,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
48994905

49004906
int idx = 0;
49014907

4902-
std::vector<uint8_t> read_data;
4903-
std::vector<uint8_t> work;
4904-
std::vector<float> f32_conv_buf;
4908+
std::vector<no_init<uint8_t>> read_data;
4909+
std::vector<no_init<uint8_t>> work;
4910+
std::vector<no_init<float>> f32_conv_buf;
49054911

49064912
// populate the original tensors so we get an initial meta data
49074913
for (int i = 0; i < ml->n_tensors; ++i) {

0 commit comments

Comments
 (0)