Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: cleanup python backend #294

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

78 changes: 73 additions & 5 deletions backends/grpc-client/src/client.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
/// Single shard Client
use crate::pb::embedding::v1::embedding_service_client::EmbeddingServiceClient;
use crate::pb::embedding::v1::*;
use crate::Result;
use crate::{ClientError, Result};
use grpc_metadata::InjectTelemetryContext;
/// Single shard Client
use tokio::runtime::Runtime;
use tonic::transport::{Channel, Uri};
use tracing::instrument;

/// Text Generation Inference gRPC client
#[derive(Debug, Clone)]
pub struct Client {
pub struct AsyncClient {
stub: EmbeddingServiceClient<Channel>,
}

impl Client {
impl AsyncClient {
/// Returns a client connected to the given url
pub async fn connect(uri: Uri) -> Result<Self> {
let channel = Channel::builder(uri).connect().await?;
Expand All @@ -23,7 +24,8 @@ impl Client {
}

/// Returns a client connected to the given unix socket
pub async fn connect_uds(path: String) -> Result<Self> {
pub async fn connect_uds(path: &str) -> Result<Self> {
let path = path.to_owned();
let channel = Channel::from_shared("http://[::]:50051".to_string())
.unwrap()
.connect_with_connector(tower::service_fn(move |_: Uri| {
Expand Down Expand Up @@ -65,3 +67,69 @@ impl Client {
Ok(response.embeddings)
}
}

/// Blocking wrapper around [`AsyncClient`].
///
/// Owns a private Tokio runtime so callers can use the gRPC client from
/// synchronous code; each public method drives the async call to completion
/// with `block_on` (see the `impl Client` block).
#[derive(Debug)]
pub struct Client {
    // Underlying async gRPC client; cloned for each call in `health`/`embed`.
    async_client: AsyncClient,
    // Current-thread runtime used to `block_on` the async client's futures.
    runtime: Runtime,
}

impl Client {
/// Returns a client connected to the given url
pub fn connect(uri: Uri) -> Result<Self> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.map_err(|err| {
ClientError::Connection(format!("Could not start Tokio runtime: {err}"))
})?;

let async_client = runtime.block_on(AsyncClient::connect(uri))?;

Ok(Self {
async_client,
runtime,
})
}

/// Returns a client connected to the given unix socket
pub fn connect_uds(path: &str) -> Result<Self> {
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.map_err(|err| {
ClientError::Connection(format!("Could not start Tokio runtime: {err}"))
})?;

let async_client = runtime.block_on(AsyncClient::connect_uds(path))?;

Ok(Self {
async_client,
runtime,
})
}

/// Get backend health
#[instrument(skip(self))]
pub fn health(&self) -> Result<HealthResponse> {
self.runtime.block_on(self.async_client.clone().health())
}

#[instrument(skip_all)]
pub fn embed(
&self,
input_ids: Vec<u32>,
token_type_ids: Vec<u32>,
position_ids: Vec<u32>,
cu_seq_lengths: Vec<u32>,
max_length: u32,
) -> Result<Vec<Embedding>> {
self.runtime.block_on(self.async_client.clone().embed(
input_ids,
token_type_ids,
position_ids,
cu_seq_lengths,
max_length,
))
}
}
4 changes: 1 addition & 3 deletions backends/grpc-client/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,7 @@ impl From<Status> for ClientError {

impl From<transport::Error> for ClientError {
fn from(err: transport::Error) -> Self {
let err = Self::Connection(err.to_string());
tracing::error!("{err}");
err
Self::Connection(err.to_string())
}
}

Expand Down
1 change: 0 additions & 1 deletion backends/python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,4 @@ serde = { version = "^1.0", features = ["derive"] }
serde_json = "^1.0"
text-embeddings-backend-core = { path = "../core" }
thiserror = "^1.0"
tokio = { version = "^1.25", features = ["sync"] }
tracing = "^0.1"
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def embed(self, batch: PaddedBatch) -> List[Embedding]:

output = self.model(**kwargs)
embedding = output[0][:, 0]
cpu_results = embedding.view(-1).tolist()
cpu_results = embedding.reshape(-1).tolist()

return [
Embedding(
Expand Down
44 changes: 9 additions & 35 deletions backends/python/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,14 @@
mod logging;
mod management;

use backend_grpc_client::Client;
use nohash_hasher::BuildNoHashHasher;
use std::collections::HashMap;
use text_embeddings_backend_core::{
Backend, BackendError, Batch, Embedding, Embeddings, ModelType, Pool, Predictions,
};
use tokio::runtime::Runtime;

pub struct PythonBackend {
_backend_process: management::BackendProcess,
tokio_runtime: Runtime,
backend_client: Client,
backend_process: management::BackendProcess,
}

impl PythonBackend {
Expand All @@ -38,39 +34,16 @@ impl PythonBackend {
}
};

let backend_process = management::BackendProcess::new(
model_path,
dtype,
&uds_path,
otlp_endpoint,
otlp_service_name,
)?;
let tokio_runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()
.map_err(|err| BackendError::Start(format!("Could not start Tokio runtime: {err}")))?;
let backend_process =
management::BackendProcess::new(model_path, dtype, uds_path, otlp_endpoint, otlp_service_name)?;

let backend_client = tokio_runtime
.block_on(Client::connect_uds(uds_path))
.map_err(|err| {
BackendError::Start(format!("Could not connect to backend process: {err}"))
})?;

Ok(Self {
_backend_process: backend_process,
tokio_runtime,
backend_client,
})
Ok(Self { backend_process })
}
}

impl Backend for PythonBackend {
fn health(&self) -> Result<(), BackendError> {
if self
.tokio_runtime
.block_on(self.backend_client.clone().health())
.is_err()
{
if self.backend_process.client.health().is_err() {
return Err(BackendError::Unhealthy);
}
Ok(())
Expand All @@ -89,14 +62,15 @@ impl Backend for PythonBackend {
let batch_size = batch.len();

let results = self
.tokio_runtime
.block_on(self.backend_client.clone().embed(
.backend_process
.client
.embed(
batch.input_ids,
batch.token_type_ids,
batch.position_ids,
batch.cumulative_seq_lengths,
batch.max_length,
))
)
.map_err(|err| BackendError::Inference(err.to_string()))?;
let pooled_embeddings: Vec<Vec<f32>> = results.into_iter().map(|r| r.values).collect();

Expand Down
32 changes: 19 additions & 13 deletions backends/python/src/management.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
use crate::logging::log_lines;
use backend_grpc_client::Client;
use std::ffi::OsString;
use std::io::{BufRead, BufReader};
use std::os::unix::process::{CommandExt, ExitStatusExt};
Expand All @@ -13,18 +14,19 @@ use text_embeddings_backend_core::BackendError;
#[derive(Debug)]
pub(crate) struct BackendProcess {
inner: Child,
pub client: Client,
}

impl BackendProcess {
pub(crate) fn new(
model_path: String,
dtype: String,
uds_path: &str,
uds_path: String,
otlp_endpoint: Option<String>,
otlp_service_name: String,
) -> Result<Self, BackendError> {
// Get UDS path
let uds = Path::new(uds_path);
let uds = Path::new(&uds_path);

// Clean previous runs
if uds.exists() {
Expand Down Expand Up @@ -87,7 +89,7 @@ impl BackendProcess {
let start_time = Instant::now();
let mut wait_time = Instant::now();

loop {
let client = loop {
// Process exited
if let Some(exit_status) = p.try_wait().unwrap() {
// We read stderr in another thread as it seems that lines() can block in some cases
Expand All @@ -114,18 +116,22 @@ impl BackendProcess {
));
}

// Shard is ready
if uds.exists() {
tracing::info!("Python backend ready in {:?}", start_time.elapsed());
break;
} else if wait_time.elapsed() > Duration::from_secs(10) {
tracing::info!("Waiting for Python backend to be ready...");
wait_time = Instant::now();
}
match Client::connect_uds(&uds_path) {
Ok(client) => {
tracing::info!("Python backend ready in {:?}", start_time.elapsed());
break client;
}
Err(_) if wait_time.elapsed() > Duration::from_secs(10) => {
tracing::info!("Waiting for Python backend to be ready...");
wait_time = Instant::now();
}
_ => {}
};

sleep(Duration::from_millis(5));
}
};

Ok(Self { inner: p })
Ok(Self { inner: p, client })
}
}

Expand Down
Loading