Implement the ModernBert model #459

Open · wants to merge 13 commits into base: main
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -24,7 +24,7 @@ hf-hub = { version = "0.3.2", features = ["tokio", "online"], default-features =
metrics = "0.23"
nohash-hasher = "0.2"
num_cpus = "1.16.0"
tokenizers = { version = "0.19.1", default-features = false, features = ["onig", "esaxx_fast"] }
tokenizers = { version = "0.21.0", default-features = false, features = ["onig", "esaxx_fast"] }
tokio = { version = "1.25", features = ["rt", "rt-multi-thread", "parking_lot", "sync", "signal"] }
tracing = "0.1"
serde = { version = "1.0", features = ["serde_derive"] }
3 changes: 2 additions & 1 deletion README.md
@@ -65,7 +65,7 @@ Ember, GTE and E5. TEI implements many features such as:
#### Text Embeddings

Text Embeddings Inference currently supports Nomic, BERT, CamemBERT, XLM-RoBERTa models with absolute positions, JinaBERT
-model with Alibi positions and Mistral, Alibaba GTE, Qwen2 models with Rope positions, and MPNet.
+model with Alibi positions and Mistral, Alibaba GTE, Qwen2 models with Rope positions, MPNet, and ModernBert.

Below are some examples of the currently supported models:

@@ -82,6 +82,7 @@ Below are some examples of the currently supported models:
| N/A | 0.1B | JinaBERT | [jinaai/jina-embeddings-v2-base-en](https://hf.co/jinaai/jina-embeddings-v2-base-en) |
| N/A | 0.1B | JinaBERT | [jinaai/jina-embeddings-v2-base-code](https://hf.co/jinaai/jina-embeddings-v2-base-code) |
| N/A | 0.1B | MPNet | [sentence-transformers/all-mpnet-base-v2](https://hf.co/sentence-transformers/all-mpnet-base-v2) |
+| N/A | 0.4B | ModernBert | [answerdotai/ModernBERT-large](https://hf.co/answerdotai/ModernBERT-large) |

To explore the list of best performing text embeddings models, visit the
[Massive Text Embedding Benchmark (MTEB) Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
11 changes: 9 additions & 2 deletions backends/candle/src/flash_attn.rs
@@ -32,14 +32,15 @@ pub(crate) fn flash_attn_varlen(
softmax_scale: f32,
causal: bool,
window_size_left: Option<usize>,
+window_size_right: Option<usize>,
) -> Result<Tensor, candle::Error> {
let runtime_compute_cap = get_runtime_compute_cap();

if runtime_compute_cap == 75 {
if alibi_slopes.is_some() {
candle::bail!("Flash attention v1 does not support alibi");
}
-if window_size_left.is_some() {
+if window_size_left.is_some() | window_size_right.is_some() {
candle::bail!("Flash attention v1 does not support attention windowing");
}

@@ -65,7 +66,13 @@
{
use candle_flash_attn::{flash_attn_varlen_alibi_windowed, flash_attn_varlen_windowed};

-let window_size_right = if causal { Some(0) } else { None };
+let window_size_right = if causal {
+    Some(0)
+} else if window_size_right.is_some() {
+    window_size_right
+} else {
+    None
+};

let attention = if let Some(alibi_slopes) = alibi_slopes {
flash_attn_varlen_alibi_windowed(
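A note on how the new `window_size_right` argument is expected to be used: ModernBERT interleaves full-attention layers with bidirectional sliding-window (local) attention layers, and a bidirectional window needs both a left and a right extent, which is why the right side can no longer be hard-wired to `Some(0)` or `None`. The sketch below is illustrative only; the helper name and the `local_attention` value are assumptions, not code from this PR.

```rust
/// Sketch only: derive the window arguments a caller might forward to
/// `flash_attn_varlen`. `local_attention` stands in for a config value such as
/// ModernBERT's 128-token sliding window; a full-attention layer passes `None`
/// on both sides, which keeps the previous behaviour.
fn attention_window(
    use_local_attention: bool,
    local_attention: usize,
) -> (Option<usize>, Option<usize>) {
    if use_local_attention {
        // Bidirectional window: attend up to `local_attention / 2` tokens on each side.
        (Some(local_attention / 2), Some(local_attention / 2))
    } else {
        // Full attention: no windowing in either direction.
        (None, None)
    }
}
```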
21 changes: 16 additions & 5 deletions backends/candle/src/layers/layer_norm.rs
@@ -4,7 +4,7 @@ use candle_nn::VarBuilder;
#[derive(Debug)]
pub struct LayerNorm {
weight: Tensor,
-bias: Tensor,
+bias: Option<Tensor>,
epsilon: f32,
span: tracing::Span,
}
@@ -17,7 +17,8 @@ impl LayerNorm {
.or_else(|_| vb.get(hidden_size, "gamma"))?,
bias: vb
.get(hidden_size, "bias")
.or_else(|_| vb.get(hidden_size, "beta"))?,
.or_else(|_| vb.get(hidden_size, "beta"))
.ok(),
epsilon,
span: tracing::span!(tracing::Level::TRACE, "layer-norm"),
})
@@ -49,7 +50,12 @@ impl LayerNorm {
let hidden_states = hidden_states_normed
.to_dtype(hidden_states_dtype)?
.broadcast_mul(&self.weight)?;
-hidden_states.broadcast_add(&self.bias)
+
+if let Some(bias) = &self.bias {
+    hidden_states.broadcast_add(bias)
+} else {
+    Ok(hidden_states)
+}
}
Device::Cuda(_) => {
#[cfg(feature = "cuda")]
@@ -66,12 +72,17 @@
&hidden_states,
&residual,
&self.weight,
-Some(&self.bias),
+self.bias.as_ref(),
self.epsilon,
)?;
Ok(result)
} else {
-layer_norm(&hidden_states, &self.weight, Some(&self.bias), self.epsilon)
+layer_norm(
+    &hidden_states,
+    &self.weight,
+    self.bias.as_ref(),
+    self.epsilon,
+)
}?;
result.reshape(original_shape)
}
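Since the bias is now optional, the arithmetic may be easier to follow spelled out. The following is a minimal, self-contained sketch of the same idea using plain `f32` slices instead of candle tensors (names and the 1-D shape are illustrative): normalize, scale by `weight`, and add `bias` only when one was loaded, which is what lets bias-free checkpoints such as ModernBERT's pass through this `LayerNorm`.

```rust
/// Sketch only: 1-D layer norm with an optional bias, mirroring the control
/// flow introduced above. Plain slices are used so the math is explicit.
fn layer_norm_1d(x: &[f32], weight: &[f32], bias: Option<&[f32]>, eps: f32) -> Vec<f32> {
    let n = x.len() as f32;
    let mean = x.iter().sum::<f32>() / n;
    let var = x.iter().map(|v| (v - mean) * (v - mean)).sum::<f32>() / n;
    let inv_std = 1.0 / (var + eps).sqrt();
    x.iter()
        .enumerate()
        .map(|(i, v)| {
            let normed = (v - mean) * inv_std * weight[i];
            match bias {
                // Classic LayerNorm: scale then shift.
                Some(b) => normed + b[i],
                // Bias-free LayerNorm, as used by ModernBERT checkpoints.
                None => normed,
            }
        })
        .collect()
}
```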
12 changes: 10 additions & 2 deletions backends/candle/src/lib.rs
@@ -12,8 +12,8 @@ use crate::compute_cap::{
};
use crate::models::{
BertConfig, BertModel, DistilBertConfig, DistilBertModel, GTEConfig, GTEModel, JinaBertModel,
-JinaCodeBertModel, MPNetConfig, MPNetModel, MistralConfig, Model, NomicBertModel, NomicConfig,
-Qwen2Config,
+JinaCodeBertModel, MPNetConfig, MPNetModel, MistralConfig, Model, ModernBertConfig,
+ModernBertModel, NomicBertModel, NomicConfig, Qwen2Config,
};
#[cfg(feature = "cuda")]
use crate::models::{
@@ -63,6 +63,8 @@ enum Config {
Qwen2(Qwen2Config),
#[serde(rename = "mpnet")]
MPNet(MPNetConfig),
+#[serde(rename(deserialize = "modernbert"))]
+ModernBert(ModernBertConfig),
}

pub struct CandleBackend {
@@ -233,6 +235,12 @@ impl CandleBackend {
tracing::info!("Starting MPNet model on {:?}", device);
Ok(Box::new(MPNetModel::load(vb, &config, model_type).s()?))
}
+(Config::ModernBert(config), _) => {
+    tracing::info!("Starting ModernBert model on {:?}", device);
+    Ok(Box::new(
+        ModernBertModel::load(vb, &config, model_type).s()?,
+    ))
+}
#[cfg(feature = "cuda")]
(Config::Bert(config), Device::Cuda(_)) => {
if cfg!(any(feature = "flash-attn", feature = "flash-attn-v1"))
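The `#[serde(rename(deserialize = "modernbert"))]` attribute is what maps a Hugging Face `config.json` with `"model_type": "modernbert"` onto the new variant. Below is a standalone sketch of that mechanism; the `tag = "model_type"` attribute and the single `hidden_size` field are assumptions for illustration and are not taken from this diff.

```rust
use serde::Deserialize;

// Illustrative stand-in for the real ModernBertConfig; only one field is shown.
#[derive(Debug, Deserialize)]
struct ModernBertConfig {
    hidden_size: usize,
}

// Assumed shape of the backend's Config enum: internally tagged on `model_type`.
#[derive(Debug, Deserialize)]
#[serde(tag = "model_type")]
enum Config {
    #[serde(rename(deserialize = "modernbert"))]
    ModernBert(ModernBertConfig),
}

fn main() -> serde_json::Result<()> {
    let raw = r#"{ "model_type": "modernbert", "hidden_size": 1024 }"#;
    let config: Config = serde_json::from_str(raw)?;
    println!("{config:?}"); // Config::ModernBert(ModernBertConfig { hidden_size: 1024 })
    Ok(())
}
```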
1 change: 1 addition & 0 deletions backends/candle/src/models/flash_bert.rs
@@ -104,6 +104,7 @@ impl BertAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_distilbert.rs
@@ -85,6 +85,7 @@ impl DistilBertAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_gte.rs
@@ -87,6 +87,7 @@ impl GTEAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_jina.rs
@@ -106,6 +106,7 @@ impl JinaAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_jina_code.rs
@@ -142,6 +142,7 @@ impl JinaCodeAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_mistral.rs
@@ -105,6 +105,7 @@ impl MistralAttention {
self.softmax_scale,
true,
self.window_size_left,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;
