@@ -16533,7 +16533,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
     }
 }
 
-static void llama_graph_compute(
+static enum ggml_status llama_graph_compute(
           llama_context & lctx,
             ggml_cgraph * gf,
                     int   n_threads,
@@ -16555,9 +16555,11 @@ static void llama_graph_compute(
     }
 #endif
 
-    ggml_backend_sched_graph_compute_async(lctx.sched, gf);
+    auto status = ggml_backend_sched_graph_compute_async(lctx.sched, gf);
 
     // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
+
+    return status;
 }
 
 // decode a batch of tokens by evaluating the transformer
@@ -16739,7 +16741,18 @@ static int llama_decode_internal(
 
         llama_set_inputs(lctx, ubatch);
 
-        llama_graph_compute(lctx, gf, n_threads, threadpool);
+        const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+        switch (compute_status) {
+            case GGML_STATUS_SUCCESS:
+                break;
+            case GGML_STATUS_ABORTED:
+                return 2;
+            case GGML_STATUS_ALLOC_FAILED:
+                return -2;
+            case GGML_STATUS_FAILED:
+            default:
+                return -3;
+        }
 
         // update the kv ring buffer
         {
@@ -16959,7 +16972,18 @@ static int llama_encode_internal(
 
     llama_set_inputs(lctx, ubatch);
 
-    llama_graph_compute(lctx, gf, n_threads, threadpool);
+    const auto compute_status = llama_graph_compute(lctx, gf, n_threads, threadpool);
+    switch (compute_status) {
+        case GGML_STATUS_SUCCESS:
+            break;
+        case GGML_STATUS_ABORTED:
+            return 2;
+        case GGML_STATUS_ALLOC_FAILED:
+            return -2;
+        case GGML_STATUS_FAILED:
+        default:
+            return -3;
+    }
 
     // extract embeddings
     if (embd) {
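
For context only (not part of the commit): a minimal, hypothetical sketch of how a caller might map the new non-zero return codes, once they propagate out of `llama_decode_internal()` through the public `llama_decode()` API. The `decode_or_report()` helper and its messages are illustrative assumptions; only the code-to-status mapping comes from the diff above.

```cpp
// Hypothetical caller-side handling of the return codes introduced above.
// Assumes llama_decode() forwards the value returned by llama_decode_internal().
#include "llama.h"
#include <cstdio>

static bool decode_or_report(llama_context * ctx, llama_batch batch) {
    const int ret = llama_decode(ctx, batch);
    switch (ret) {
        case 0:
            return true;  // GGML_STATUS_SUCCESS
        case 2:
            fprintf(stderr, "decode aborted (GGML_STATUS_ABORTED)\n");
            return false;
        case -2:
            fprintf(stderr, "compute allocation failed (GGML_STATUS_ALLOC_FAILED)\n");
            return false;
        case -3:
            fprintf(stderr, "graph compute failed (GGML_STATUS_FAILED)\n");
            return false;
        default:
            fprintf(stderr, "llama_decode returned %d\n", ret);
            return false;
    }
}
```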