From 697c769b6422b7084f7c815c5a84bcff50f240f3 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto
Date: Sun, 21 Jan 2024 14:59:48 +0100
Subject: [PATCH] fix(llama.cpp): enable cont batching when parallel is set
 (#1622)

Signed-off-by: Ettore Di Giacinto
---
 backend/cpp/llama/grpc-server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/cpp/llama/grpc-server.cpp b/backend/cpp/llama/grpc-server.cpp
index 3bbf7ce06ecb..76a82a33dc86 100644
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2465,10 +2465,10 @@ static void params_parse(const backend::ModelOptions* request,
     const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
     if (env_parallel != NULL) {
         params.n_parallel = std::stoi(env_parallel);
+        params.cont_batching = true;
     } else {
         params.n_parallel = 1;
     }
-    params.cont_batching = true;
     // TODO: Add yarn

     if (!request->tensorsplit().empty()) {
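
For context, a minimal standalone sketch of the behavior this patch produces: continuous batching is enabled only when the LLAMACPP_PARALLEL environment variable is set, rather than unconditionally. The ServerParams struct and parse_parallel helper below are simplified stand-ins for illustration, not the actual gpt_params type or params_parse function from the llama.cpp backend.

#include <cstdlib>
#include <iostream>
#include <string>

// Simplified stand-in for the server params struct; only the two fields
// touched by this patch are modeled here.
struct ServerParams {
    int  n_parallel    = 1;
    bool cont_batching = false;
};

// Mirrors the post-patch logic: continuous batching is turned on only when
// LLAMACPP_PARALLEL is present, instead of being set for every run.
static void parse_parallel(ServerParams &params) {
    const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
    if (env_parallel != NULL) {
        params.n_parallel    = std::stoi(env_parallel);
        params.cont_batching = true;
    } else {
        params.n_parallel    = 1;
    }
}

int main() {
    ServerParams params;
    parse_parallel(params);
    std::cout << "n_parallel=" << params.n_parallel
              << " cont_batching=" << std::boolalpha << params.cont_batching
              << std::endl;
    return 0;
}

Running this sketch with LLAMACPP_PARALLEL=4 would print "n_parallel=4 cont_batching=true", while leaving the variable unset keeps a single slot with continuous batching disabled, which is the intent of moving the assignment inside the if branch.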