Add OpenVINO as additional backend (#50)
DanielHirschTNG committed Jan 27, 2025
1 parent 0baf9e3 commit 007fb89
Showing 21 changed files with 701 additions and 8 deletions.
14 changes: 14 additions & 0 deletions OpenVINO/.gitignore
@@ -0,0 +1,14 @@
.vscode/
__pycache__/
models/llm/
temp/
test/
dist/
build/
cache/
test/
env/

!tools/*.exe
llm_cache/
TinyLlama-*
4 changes: 4 additions & 0 deletions OpenVINO/model_config.py
@@ -0,0 +1,4 @@
openVINOConfig = {
"openvino": "../service/models/llm/openvino",
}

171 changes: 171 additions & 0 deletions OpenVINO/openvino_adapter.py
@@ -0,0 +1,171 @@
import threading
from queue import Empty, Queue
import json
import traceback
from typing import Dict, List, Callable
#from model_downloader import NotEnoughDiskSpaceException, DownloadException
#from psutil._common import bytes2human
from openvino_interface import LLMInterface
from openvino_params import LLMParams


RAG_PROMPT_FORMAT = "Answer the questions based on the information below. \n{context}\n\nQuestion: {prompt}"

class LLM_SSE_Adapter:
msg_queue: Queue
finish: bool
    signal: threading.Event
llm_interface: LLMInterface
should_stop: bool

def __init__(self, llm_interface: LLMInterface):
self.msg_queue = Queue(-1)
self.finish = False
        self.signal = threading.Event()
self.llm_interface = llm_interface
self.should_stop = False

def put_msg(self, data):
self.msg_queue.put_nowait(data)
        self.signal.set()

def load_model_callback(self, event: str):
data = {"type": "load_model", "event": event}
self.put_msg(data)

def text_in_callback(self, msg: str):
data = {"type": "text_in", "value": msg}
self.put_msg(data)

def text_out_callback(self, msg: str, type=1):
data = {"type": "text_out", "value": msg, "dtype": type}
self.put_msg(data)

def first_latency_callback(self, first_latency: str):
data = {"type": "first_token_latency", "value": first_latency}
self.put_msg(data)

def after_latency_callback(self, after_latency: str):
data = {"type": "after_token_latency", "value": after_latency}
self.put_msg(data)

def sr_latency_callback(self, sr_latency: str):
data = {"type": "sr_latency", "value": sr_latency}
self.put_msg(data)

def error_callback(self, ex: Exception):
if (
isinstance(ex, NotImplementedError)
and ex.__str__() == "Access to repositories lists is not implemented."
):
self.put_msg(
{
"type": "error",
"err_type": "repositories_not_found",
}
)
# elif isinstance(ex, NotEnoughDiskSpaceException):
# self.put_msg(
# {
# "type": "error",
# "err_type": "not_enough_disk_space",
# "need": bytes2human(ex.requires_space),
# "free": bytes2human(ex.free_space),
# }
# )
# elif isinstance(ex, DownloadException):
# self.put_msg({"type": "error", "err_type": "download_exception"})
# # elif isinstance(ex, llm_biz.StopGenerateException):
# # pass
elif isinstance(ex, RuntimeError):
self.put_msg({"type": "error", "err_type": "runtime_error"})
else:
self.put_msg({"type": "error", "err_type": "unknow_exception"})
print(f"exception:{str(ex)}")

def text_conversation(self, params: LLMParams):
thread = threading.Thread(
target=self.text_conversation_run,
args=[params],
)
thread.start()
return self.generator()


def stream_function(self, stream):
for output in stream:
if self.llm_interface.stop_generate:
self.llm_interface.stop_generate = False
break

self.text_out_callback(output)
self.put_msg({"type": "finish"})

def text_conversation_run(
self,
params: LLMParams,
):
try:
self.llm_interface.load_model(params, callback=self.load_model_callback)

prompt = params.prompt
full_prompt = convert_prompt(prompt)
self.llm_interface.create_chat_completion(full_prompt, self.text_out_callback)

except Exception as ex:
traceback.print_exc()
self.error_callback(ex)
finally:
self.finish = True
            self.signal.set()

def generator(self):
while True:
while not self.msg_queue.empty():
try:
data = self.msg_queue.get_nowait()
msg = f"data:{json.dumps(data)}\0"
print(msg)
yield msg
                except Empty:
break
if not self.finish:
                self.signal.clear()
                self.signal.wait()
else:
break


_default_prompt = {
"role": "system",
"content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user. Please keep the output text language the same as the user input.",
}

def convert_prompt(prompt: List[Dict[str, str]]):
    chat_history = [_default_prompt]
    prompt_len = len(prompt)
    for i, item in enumerate(prompt):
        chat_history.append({"role": "user", "content": item.get("question")})
        if i < prompt_len - 1:
            chat_history.append(
                {"role": "assistant", "content": item.get("answer")}
            )
    return chat_history


def process_rag(
prompt: str,
device: str,
text_out_callback: Callable[[str, int], None] = None,
):
import rag
rag.to(device)
query_success, context, rag_source = rag.query(prompt)
if query_success:
print("rag query input\r\n{}output:\r\n{}".format(prompt, context))
prompt = RAG_PROMPT_FORMAT.format(prompt=prompt, context=context)
if text_out_callback is not None:
text_out_callback(rag_source, 2)
return prompt
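
For orientation, here is a minimal sketch of how the new adapter could be driven end to end. The OpenVino backend class is added in openvino_backend.py below; the model repo id and the prompt contents are placeholder assumptions, since the service-side wiring is not part of this diff.

from openvino_adapter import LLM_SSE_Adapter
from openvino_backend import OpenVino
from openvino_params import LLMParams

# Illustrative wiring only; the repo id and prompt values are placeholders.
backend = OpenVino()
adapter = LLM_SSE_Adapter(backend)

params = LLMParams(
    prompt=[{"question": "What is OpenVINO?", "answer": ""}],
    device=0,
    enable_rag=False,
    model_repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # placeholder id
)

# text_conversation() starts generation on a worker thread and returns the
# SSE generator; each yielded chunk is a "data:<json>" record ending in "\0".
for chunk in adapter.text_conversation(params):
    print(chunk, end="")
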
57 changes: 57 additions & 0 deletions OpenVINO/openvino_backend.py
@@ -0,0 +1,57 @@
from typing import Dict, List, Callable
from os import path

Check failure on line 2 in OpenVINO/openvino_backend.py (GitHub Actions / ruff): F401 `os.path` imported but unused
from openvino_interface import LLMInterface
import sys

Check failure on line 4 in OpenVINO/openvino_backend.py (GitHub Actions / ruff): F401 `sys` imported but unused
import openvino_genai
from openvino_params import LLMParams
import model_config

Check failure on line 7 in OpenVINO/openvino_backend.py (GitHub Actions / ruff): F401 `model_config` imported but unused
import gc

class OpenVino(LLMInterface):
def __init__(self):
self._model = None
self.stop_generate = False
self._last_repo_id = None

def load_model(self, params: LLMParams, callback: Callable[[str], None] = None):
model_repo_id = params.model_repo_id
if self._model is None or self._last_repo_id != model_repo_id:
if callback is not None:
callback("start")
self.unload_model()

#model_base_path = model_config.openVINOConfig.get("openvino")
#namespace, repo, *model = model_repo_id.split("/")
#model_path = path.abspath(path.join(model_base_path,"---".join([namespace, repo]), "---".join(model)))
model_path = r".\TinyLlama-1.1B-Chat-v1.0"

print(params.model_repo_id)
            enable_compile_cache = {"CACHE_DIR": "llm_cache"}
self._model = openvino_genai.LLMPipeline(model_path, "GPU", **enable_compile_cache)

self._tokenizer = self._model.get_tokenizer()

self._last_repo_id = model_repo_id
if callback is not None:
callback("finish")

self.config = openvino_genai.GenerationConfig()
self.config.max_new_tokens = 100
print("Model loaded")

def create_chat_completion(self, messages: List[Dict[str, str]], streamer: Callable[[str], None]):
tokenized_input = self._tokenizer.apply_chat_template(messages, add_generation_prompt=True)
print(tokenized_input)
return self._model.generate(tokenized_input, self.config, streamer)


def unload_model(self):
if self._model is not None:
#self._model.close()
del self._model
gc.collect()
self._model = None

def get_backend_type(self):
return "openvino"
24 changes: 24 additions & 0 deletions OpenVINO/openvino_interface.py
@@ -0,0 +1,24 @@
from abc import ABC, abstractmethod
from typing import Callable, Dict, List, Optional
from openvino_params import LLMParams

class LLMInterface(ABC):
stop_generate: bool
_model: Optional[object]

@abstractmethod
def load_model(self, params: LLMParams, **kwargs):
pass

@abstractmethod
def unload_model(self):
pass

@abstractmethod
    def create_chat_completion(self, messages: List[Dict[str, str]], streamer: Callable[[str], None]):
pass

@abstractmethod
def get_backend_type(self):
pass

15 changes: 15 additions & 0 deletions OpenVINO/openvino_params.py
@@ -0,0 +1,15 @@
from typing import Dict, List

class LLMParams:
prompt: List[Dict[str, str]]
device: int
enable_rag: bool
model_repo_id: str

def __init__(
self, prompt: list, device: int, enable_rag: bool, model_repo_id: str
) -> None:
self.prompt = prompt
self.device = device
self.enable_rag = enable_rag
self.model_repo_id = model_repo_id

0 comments on commit 007fb89
