@@ -1480,37 +1480,38 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 //
 
 void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates) {
+    printf("llama_sample_softmax\n"); fflush(stdout);
     assert(candidates->size > 0);
-    printf("llama_sample_softmax\n");
 
     const int64_t t_start_sample_us = ggml_time_us();
 
+    printf("llama_sample_softmax 1\n"); fflush(stdout);
     // Sort the logits in descending order
     if (!candidates->sorted) {
         std::sort(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
             return a.logit > b.logit;
         });
         candidates->sorted = true;
     }
-    printf("llama_sample_softmax 2\n");
+    printf("llama_sample_softmax 2\n"); fflush(stdout);
 
     float max_l = candidates->data[0].logit;
-    printf("max_l = %f\n", max_l);
-    fflush(stdout);
+    printf("max_l = %f\n", max_l); fflush(stdout);
     float cum_sum = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
-        printf("i = %d, logit = %f\n", i, candidates->data[i].logit);
-        fflush(stdout);
+        printf("i = %d, logit = %f\n", i, candidates->data[i].logit); fflush(stdout);
         float p = expf(candidates->data[i].logit - max_l);
+        printf("p = %f\n", p); fflush(stdout);
         candidates->data[i].p = p;
         cum_sum += p;
+        printf("cum_sum = %f\n", cum_sum); fflush(stdout);
     }
     printf("cum_sum = %f\n", cum_sum);
     fflush(stdout);
     for (size_t i = 0; i < candidates->size; ++i) {
-        printf("i = %d, p = %f\n", i, candidates->data[i].logit);
-        fflush(stdout);
+        printf("i = %d, p = %f\n", i, candidates->data[i].p); fflush(stdout);
         candidates->data[i].p /= cum_sum;
+        printf("p = %f\n", candidates->data[i].p); fflush(stdout);
     }
 
     if (ctx) {
@@ -1521,26 +1522,35 @@ void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * c
 void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep) {
     const int64_t t_start_sample_us = ggml_time_us();
 
+    printf("llama_sample_top_k\n"); fflush(stdout);
     k = std::max(k, (int) min_keep);
     k = std::min(k, (int) candidates->size);
+    printf("llama_sample_top_k 2\n"); fflush(stdout);
 
     // Sort scores in descending order
     if (!candidates->sorted) {
+        printf("llama_sample_top_k 3\n"); fflush(stdout);
         auto comp = [](const llama_token_data & a, const llama_token_data & b) {
+            printf("llama_sample_top_k 4\n"); fflush(stdout);
             return a.logit > b.logit;
         };
         if (k == (int) candidates->size) {
+            printf("llama_sample_top_k 5\n"); fflush(stdout);
             std::sort(candidates->data, candidates->data + candidates->size, comp);
         } else {
+            printf("llama_sample_top_k 6\n"); fflush(stdout);
             std::partial_sort(candidates->data, candidates->data + k, candidates->data + candidates->size, comp);
         }
+        printf("llama_sample_top_k 7\n"); fflush(stdout);
         candidates->sorted = true;
     }
     candidates->size = k;
+    printf("llama_sample_top_k 8\n"); fflush(stdout);
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
     }
+    printf("llama_sample_top_k 9\n"); fflush(stdout);
 }
 
 void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep) {
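For reference, the values the new prints trace in llama_sample_softmax follow the standard numerically stable softmax: max_l is the largest logit, each p is expf(logit - max_l) so the exponential cannot overflow, cum_sum accumulates those exponentials, and every p is finally divided by cum_sum. Below is a minimal standalone sketch of that computation, not code from this commit; the Candidate struct and softmax_sketch function are hypothetical names used only for illustration.

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical stand-in for llama_token_data: one candidate token's logit and probability.
struct Candidate { float logit; float p; };

static void softmax_sketch(std::vector<Candidate> & cands) {
    // Sort by logit, descending, so cands[0] holds the maximum logit (max_l in the prints).
    std::sort(cands.begin(), cands.end(),
              [](const Candidate & a, const Candidate & b) { return a.logit > b.logit; });
    const float max_l = cands[0].logit;   // subtracting this keeps expf() from overflowing
    float cum_sum = 0.0f;
    for (auto & c : cands) {
        c.p = expf(c.logit - max_l);      // unnormalized probability (p in the prints)
        cum_sum += c.p;                   // running sum (cum_sum in the prints)
    }
    for (auto & c : cands) {
        c.p /= cum_sum;                   // normalize so the probabilities sum to 1
    }
}

int main() {
    std::vector<Candidate> cands = {{1.0f, 0.0f}, {2.0f, 0.0f}, {0.5f, 0.0f}};
    softmax_sketch(cands);
    for (const auto & c : cands) {
        printf("logit = %f, p = %f\n", c.logit, c.p);   // same quantities the debug prints show
    }
    return 0;
}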