Add OpenVINO as additional backend (#50)
DanielHirschTNG committed Jan 27, 2025
1 parent 0baf9e3 commit 007fb89
Showing 21 changed files with 701 additions and 8 deletions.
14 changes: 14 additions & 0 deletions OpenVINO/.gitignore
@@ -0,0 +1,14 @@
.vscode/
__pycache__/
models/llm/
temp/
test/
dist/
build/
cache/
test/
env/

!tools/*.exe
llm_cache/
TinyLlama-*
4 changes: 4 additions & 0 deletions OpenVINO/model_config.py
@@ -0,0 +1,4 @@
openVINOConfig = {
"openvino": "../service/models/llm/openvino",
}

171 changes: 171 additions & 0 deletions OpenVINO/openvino_adapter.py
@@ -0,0 +1,171 @@
import threading
from queue import Empty, Queue
import json
import traceback
from typing import Dict, List, Callable
#from model_downloader import NotEnoughDiskSpaceException, DownloadException
#from psutil._common import bytes2human
from openvino_interface import LLMInterface
from openvino_params import LLMParams


RAG_PROMPT_FORMAT = "Answer the questions based on the information below. \n{context}\n\nQuestion: {prompt}"

class LLM_SSE_Adapter:
msg_queue: Queue
finish: bool
    signal: threading.Event
llm_interface: LLMInterface
should_stop: bool

def __init__(self, llm_interface: LLMInterface):
self.msg_queue = Queue(-1)
self.finish = False
        self.signal = threading.Event()
self.llm_interface = llm_interface
self.should_stop = False

def put_msg(self, data):
self.msg_queue.put_nowait(data)
        self.signal.set()

def load_model_callback(self, event: str):
data = {"type": "load_model", "event": event}
self.put_msg(data)

def text_in_callback(self, msg: str):
data = {"type": "text_in", "value": msg}
self.put_msg(data)

def text_out_callback(self, msg: str, type=1):
data = {"type": "text_out", "value": msg, "dtype": type}
self.put_msg(data)

def first_latency_callback(self, first_latency: str):
data = {"type": "first_token_latency", "value": first_latency}
self.put_msg(data)

def after_latency_callback(self, after_latency: str):
data = {"type": "after_token_latency", "value": after_latency}
self.put_msg(data)

def sr_latency_callback(self, sr_latency: str):
data = {"type": "sr_latency", "value": sr_latency}
self.put_msg(data)

def error_callback(self, ex: Exception):
if (
isinstance(ex, NotImplementedError)
and ex.__str__() == "Access to repositories lists is not implemented."
):
self.put_msg(
{
"type": "error",
"err_type": "repositories_not_found",
}
)
# elif isinstance(ex, NotEnoughDiskSpaceException):
# self.put_msg(
# {
# "type": "error",
# "err_type": "not_enough_disk_space",
# "need": bytes2human(ex.requires_space),
# "free": bytes2human(ex.free_space),
# }
# )
# elif isinstance(ex, DownloadException):
# self.put_msg({"type": "error", "err_type": "download_exception"})
# # elif isinstance(ex, llm_biz.StopGenerateException):
# # pass
elif isinstance(ex, RuntimeError):
self.put_msg({"type": "error", "err_type": "runtime_error"})
else:
self.put_msg({"type": "error", "err_type": "unknow_exception"})
print(f"exception:{str(ex)}")

def text_conversation(self, params: LLMParams):
thread = threading.Thread(
target=self.text_conversation_run,
args=[params],
)
thread.start()
return self.generator()


def stream_function(self, stream):
for output in stream:
if self.llm_interface.stop_generate:
self.llm_interface.stop_generate = False
break

self.text_out_callback(output)
self.put_msg({"type": "finish"})

def text_conversation_run(
self,
params: LLMParams,
):
try:
self.llm_interface.load_model(params, callback=self.load_model_callback)

prompt = params.prompt
full_prompt = convert_prompt(prompt)
self.llm_interface.create_chat_completion(full_prompt, self.text_out_callback)

except Exception as ex:
traceback.print_exc()
self.error_callback(ex)
finally:
self.finish = True
            self.signal.set()

def generator(self):
while True:
while not self.msg_queue.empty():
try:
data = self.msg_queue.get_nowait()
msg = f"data:{json.dumps(data)}\0"
print(msg)
yield msg
                except Empty:
break
if not self.finish:
                self.signal.clear()
                self.signal.wait()
else:
break


_default_prompt = {
"role": "system",
"content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user. Please keep the output text language the same as the user input.",
}

def convert_prompt(prompt: List[Dict[str, str]]):
    chat_history = [_default_prompt]
    prompt_len = len(prompt)
    for i, item in enumerate(prompt):
        chat_history.append({"role": "user", "content": item.get("question")})
        if i < prompt_len - 1:
            chat_history.append(
                {"role": "assistant", "content": item.get("answer")}
            )
    return chat_history


def process_rag(
prompt: str,
device: str,
text_out_callback: Callable[[str, int], None] = None,
):
import rag
rag.to(device)
query_success, context, rag_source = rag.query(prompt)
if query_success:
print("rag query input\r\n{}output:\r\n{}".format(prompt, context))
prompt = RAG_PROMPT_FORMAT.format(prompt=prompt, context=context)
if text_out_callback is not None:
text_out_callback(rag_source, 2)
return prompt
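
For orientation, here is a minimal sketch of how the new adapter could be driven end to end. The OpenVino backend class is added in openvino_backend.py below; the model repo id and the prompt contents are placeholder assumptions, since the service-side wiring is not part of this diff.

from openvino_adapter import LLM_SSE_Adapter
from openvino_backend import OpenVino
from openvino_params import LLMParams

# Illustrative wiring only; the repo id and prompt values are placeholders.
backend = OpenVino()
adapter = LLM_SSE_Adapter(backend)

params = LLMParams(
    prompt=[{"question": "What is OpenVINO?", "answer": ""}],
    device=0,
    enable_rag=False,
    model_repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # placeholder id
)

# text_conversation() starts generation on a worker thread and returns the
# SSE generator; each yielded chunk is a "data:<json>" record ending in "\0".
for chunk in adapter.text_conversation(params):
    print(chunk, end="")
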
57 changes: 57 additions & 0 deletions OpenVINO/openvino_backend.py
@@ -0,0 +1,57 @@
from typing import Dict, List, Callable
from os import path

Check failure on line 2 in OpenVINO/openvino_backend.py (GitHub Actions / ruff): F401 `os.path` imported but unused
from openvino_interface import LLMInterface
import sys

Check failure on line 4 in OpenVINO/openvino_backend.py (GitHub Actions / ruff): F401 `sys` imported but unused
import openvino_genai
from openvino_params import LLMParams
import model_config

Check failure on line 7 in OpenVINO/openvino_backend.py (GitHub Actions / ruff): F401 `model_config` imported but unused
import gc

class OpenVino(LLMInterface):
def __init__(self):
self._model = None
self.stop_generate = False
self._last_repo_id = None

def load_model(self, params: LLMParams, callback: Callable[[str], None] = None):
model_repo_id = params.model_repo_id
if self._model is None or self._last_repo_id != model_repo_id:
if callback is not None:
callback("start")
self.unload_model()

#model_base_path = model_config.openVINOConfig.get("openvino")
#namespace, repo, *model = model_repo_id.split("/")
#model_path = path.abspath(path.join(model_base_path,"---".join([namespace, repo]), "---".join(model)))
model_path = r".\TinyLlama-1.1B-Chat-v1.0"

print(params.model_repo_id)
            enable_compile_cache = {"CACHE_DIR": "llm_cache"}
self._model = openvino_genai.LLMPipeline(model_path, "GPU", **enable_compile_cache)

self._tokenizer = self._model.get_tokenizer()

self._last_repo_id = model_repo_id
if callback is not None:
callback("finish")

self.config = openvino_genai.GenerationConfig()
self.config.max_new_tokens = 100
print("Model loaded")

def create_chat_completion(self, messages: List[Dict[str, str]], streamer: Callable[[str], None]):
tokenized_input = self._tokenizer.apply_chat_template(messages, add_generation_prompt=True)
print(tokenized_input)
return self._model.generate(tokenized_input, self.config, streamer)


def unload_model(self):
if self._model is not None:
#self._model.close()
del self._model
gc.collect()
self._model = None

def get_backend_type(self):
return "openvino"
24 changes: 24 additions & 0 deletions OpenVINO/openvino_interface.py
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod
from typing import Callable, Dict, List, Optional
from openvino_params import LLMParams

class LLMInterface(ABC):
stop_generate: bool
_model: Optional[object]

@abstractmethod
def load_model(self, params: LLMParams, **kwargs):
pass

@abstractmethod
def unload_model(self):
pass

@abstractmethod
    def create_chat_completion(self, messages: List[Dict[str, str]], streamer: Callable[[str], None]):
pass

@abstractmethod
def get_backend_type(self):
pass

15 changes: 15 additions & 0 deletions OpenVINO/openvino_params.py
@@ -0,0 +1,15 @@
from typing import Dict, List

class LLMParams:
prompt: List[Dict[str, str]]
device: int
enable_rag: bool
model_repo_id: str

def __init__(
self, prompt: list, device: int, enable_rag: bool, model_repo_id: str
) -> None:
self.prompt = prompt
self.device = device
self.enable_rag = enable_rag
self.model_repo_id = model_repo_id

0 comments on commit 007fb89
