Parallel processing of runners #4380
DenisStefanAndrei asked this question in Q&A (unanswered)
Hello. I am running the following service:
```python
import asyncio
import logging
from time import time

import bentoml
from bentoml.io import JSON, Text
from openllm import LLM

# Forward BentoML logs to stderr for easier debugging
ch = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
ch.setFormatter(formatter)
bentoml_logger = logging.getLogger("bentoml")
bentoml_logger.addHandler(ch)
bentoml_logger.setLevel(logging.DEBUG)


class LLMRunnable(bentoml.Runnable):
    SUPPORTED_RESOURCES = ("nvidia.com/gpu",)
    SUPPORTS_CPU_MULTI_THREADING = True

    def __init__(self, model):
        self.llm = LLM(
            model,
            backend="vllm",
            quantize="awq",
            max_model_len=3000,
            gpu_memory_utilization=0.5,
        )

    # Runnable method exposed so the API below can call .generate.async_run();
    # assumes openllm's async LLM.generate API
    @bentoml.Runnable.method(batchable=False)
    async def generate(self, prompt: str):
        return await self.llm.generate(prompt)


mistral_awq_instruct = bentoml.Runner(
    LLMRunnable,
    name="my_runner_1",
    runnable_init_params={
        "model": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
    },
)
mistral_2 = bentoml.Runner(
    LLMRunnable,
    name="mistral_runner_2",
    runnable_init_params={"model": "TheBloke/Mistral-7B-Instruct-v0.2-AWQ"},
)

svc = bentoml.Service("mistral_service", runners=[mistral_awq_instruct, mistral_2])


@svc.api(input=Text(), output=JSON())
async def generate(prompt: str) -> dict:
    t1 = time()
    print("generate endpoint")
    output = await mistral_awq_instruct.generate.async_run(prompt)
    print("execution time is: ", time() - t1)
    return output
```
I make 4 parallel requests to try to use the 2 runners available in the service, but only one of them is being used.
How can I take advantage of both runners in this service to get better inference time?
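For reference, the parallel requests are issued roughly like the sketch below. This is a minimal example, not my exact client code; the endpoint URL and port 3000 are just the default BentoML serving address.

```python
# Minimal sketch of the 4 parallel requests (client side).
# The URL/port are assumptions based on the default `bentoml serve` setup.
import asyncio

import httpx


async def call_generate(client: httpx.AsyncClient, prompt: str):
    # POST the raw text prompt to the /generate endpoint defined by the service
    response = await client.post("http://localhost:3000/generate", content=prompt)
    return response.json()


async def main():
    prompts = ["prompt 1", "prompt 2", "prompt 3", "prompt 4"]
    async with httpx.AsyncClient(timeout=None) as client:
        # Fire all 4 requests concurrently
        results = await asyncio.gather(*(call_generate(client, p) for p in prompts))
    print(results)


asyncio.run(main())
```

Even with the requests sent concurrently like this, only `my_runner_1` ever receives work, since the endpoint always calls `mistral_awq_instruct`.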