Implement the ModernBert model #459

Open · wants to merge 13 commits into base: main
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -24,7 +24,7 @@ hf-hub = { version = "0.3.2", features = ["tokio", "online"], default-features =
metrics = "0.23"
nohash-hasher = "0.2"
num_cpus = "1.16.0"
tokenizers = { version = "0.19.1", default-features = false, features = ["onig", "esaxx_fast"] }
tokenizers = { version = "0.21.0", default-features = false, features = ["onig", "esaxx_fast"] }
tokio = { version = "1.25", features = ["rt", "rt-multi-thread", "parking_lot", "sync", "signal"] }
tracing = "0.1"
serde = { version = "1.0", features = ["serde_derive"] }
3 changes: 2 additions & 1 deletion README.md
@@ -65,7 +65,7 @@ Ember, GTE and E5. TEI implements many features such as:
#### Text Embeddings

Text Embeddings Inference currently supports Nomic, BERT, CamemBERT, XLM-RoBERTa models with absolute positions, JinaBERT
-model with Alibi positions and Mistral, Alibaba GTE, Qwen2 models with Rope positions, and MPNet.
+model with Alibi positions and Mistral, Alibaba GTE, Qwen2 models with Rope positions, MPNet, and ModernBert.

Below are some examples of the currently supported models:

@@ -82,6 +82,7 @@ Below are some examples of the currently supported models:
| N/A | 0.1B | JinaBERT | [jinaai/jina-embeddings-v2-base-en](https://hf.co/jinaai/jina-embeddings-v2-base-en) |
| N/A | 0.1B | JinaBERT | [jinaai/jina-embeddings-v2-base-code](https://hf.co/jinaai/jina-embeddings-v2-base-code) |
| N/A | 0.1B | MPNet | [sentence-transformers/all-mpnet-base-v2](https://hf.co/sentence-transformers/all-mpnet-base-v2) |
+| N/A | 0.4B | ModernBert | [answerdotai/ModernBERT-large](https://hf.co/answerdotai/ModernBERT-large) |

To explore the list of best performing text embeddings models, visit the
[Massive Text Embedding Benchmark (MTEB) Leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
11 changes: 9 additions & 2 deletions backends/candle/src/flash_attn.rs
@@ -32,14 +32,15 @@ pub(crate) fn flash_attn_varlen(
softmax_scale: f32,
causal: bool,
window_size_left: Option<usize>,
+window_size_right: Option<usize>,
) -> Result<Tensor, candle::Error> {
let runtime_compute_cap = get_runtime_compute_cap();

if runtime_compute_cap == 75 {
if alibi_slopes.is_some() {
candle::bail!("Flash attention v1 does not support alibi");
}
-if window_size_left.is_some() {
+if window_size_left.is_some() | window_size_right.is_some() {
candle::bail!("Flash attention v1 does not support attention windowing");
}

@@ -65,7 +66,13 @@
{
use candle_flash_attn::{flash_attn_varlen_alibi_windowed, flash_attn_varlen_windowed};

-let window_size_right = if causal { Some(0) } else { None };
+let window_size_right = if causal {
+    Some(0)
+} else if window_size_right.is_some() {
+    window_size_right
+} else {
+    None
+};

let attention = if let Some(alibi_slopes) = alibi_slopes {
flash_attn_varlen_alibi_windowed(
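A note on how the new `window_size_right` argument is expected to be used: ModernBERT interleaves full-attention layers with bidirectional sliding-window (local) attention layers, and a bidirectional window needs both a left and a right extent, which is why the right side can no longer be hard-wired to `Some(0)` or `None`. The sketch below is illustrative only; the helper name and the `local_attention` value are assumptions, not code from this PR.

```rust
/// Sketch only: derive the window arguments a caller might forward to
/// `flash_attn_varlen`. `local_attention` stands in for a config value such as
/// ModernBERT's 128-token sliding window; a full-attention layer passes `None`
/// on both sides, which keeps the previous behaviour.
fn attention_window(
    use_local_attention: bool,
    local_attention: usize,
) -> (Option<usize>, Option<usize>) {
    if use_local_attention {
        // Bidirectional window: attend up to `local_attention / 2` tokens on each side.
        (Some(local_attention / 2), Some(local_attention / 2))
    } else {
        // Full attention: no windowing in either direction.
        (None, None)
    }
}
```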
21 changes: 16 additions & 5 deletions backends/candle/src/layers/layer_norm.rs
@@ -4,7 +4,7 @@ use candle_nn::VarBuilder;
#[derive(Debug)]
pub struct LayerNorm {
weight: Tensor,
-bias: Tensor,
+bias: Option<Tensor>,
epsilon: f32,
span: tracing::Span,
}
@@ -17,7 +17,8 @@ impl LayerNorm {
.or_else(|_| vb.get(hidden_size, "gamma"))?,
bias: vb
.get(hidden_size, "bias")
.or_else(|_| vb.get(hidden_size, "beta"))?,
.or_else(|_| vb.get(hidden_size, "beta"))
.ok(),
epsilon,
span: tracing::span!(tracing::Level::TRACE, "layer-norm"),
})
@@ -49,7 +50,12 @@ impl LayerNorm {
let hidden_states = hidden_states_normed
.to_dtype(hidden_states_dtype)?
.broadcast_mul(&self.weight)?;
-hidden_states.broadcast_add(&self.bias)
+
+if let Some(bias) = &self.bias {
+    hidden_states.broadcast_add(bias)
+} else {
+    Ok(hidden_states)
+}
}
Device::Cuda(_) => {
#[cfg(feature = "cuda")]
@@ -66,12 +72,17 @@
&hidden_states,
&residual,
&self.weight,
-Some(&self.bias),
+self.bias.as_ref(),
self.epsilon,
)?;
Ok(result)
} else {
-layer_norm(&hidden_states, &self.weight, Some(&self.bias), self.epsilon)
+layer_norm(
+    &hidden_states,
+    &self.weight,
+    self.bias.as_ref(),
+    self.epsilon,
+)
}?;
result.reshape(original_shape)
}
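Since the bias is now optional, the arithmetic may be easier to follow spelled out. The following is a minimal, self-contained sketch of the same idea using plain `f32` slices instead of candle tensors (names and the 1-D shape are illustrative): normalize, scale by `weight`, and add `bias` only when one was loaded, which is what lets bias-free checkpoints such as ModernBERT's pass through this `LayerNorm`.

```rust
/// Sketch only: 1-D layer norm with an optional bias, mirroring the control
/// flow introduced above. Plain slices are used so the math is explicit.
fn layer_norm_1d(x: &[f32], weight: &[f32], bias: Option<&[f32]>, eps: f32) -> Vec<f32> {
    let n = x.len() as f32;
    let mean = x.iter().sum::<f32>() / n;
    let var = x.iter().map(|v| (v - mean) * (v - mean)).sum::<f32>() / n;
    let inv_std = 1.0 / (var + eps).sqrt();
    x.iter()
        .enumerate()
        .map(|(i, v)| {
            let normed = (v - mean) * inv_std * weight[i];
            match bias {
                // Classic LayerNorm: scale then shift.
                Some(b) => normed + b[i],
                // Bias-free LayerNorm, as used by ModernBERT checkpoints.
                None => normed,
            }
        })
        .collect()
}
```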
12 changes: 10 additions & 2 deletions backends/candle/src/lib.rs
@@ -12,8 +12,8 @@ use crate::compute_cap::{
};
use crate::models::{
BertConfig, BertModel, DistilBertConfig, DistilBertModel, GTEConfig, GTEModel, JinaBertModel,
-JinaCodeBertModel, MPNetConfig, MPNetModel, MistralConfig, Model, NomicBertModel, NomicConfig,
-Qwen2Config,
+JinaCodeBertModel, MPNetConfig, MPNetModel, MistralConfig, Model, ModernBertConfig,
+ModernBertModel, NomicBertModel, NomicConfig, Qwen2Config,
};
#[cfg(feature = "cuda")]
use crate::models::{
@@ -63,6 +63,8 @@ enum Config {
Qwen2(Qwen2Config),
#[serde(rename = "mpnet")]
MPNet(MPNetConfig),
+#[serde(rename(deserialize = "modernbert"))]
+ModernBert(ModernBertConfig),
}

pub struct CandleBackend {
@@ -233,6 +235,12 @@ impl CandleBackend {
tracing::info!("Starting MPNet model on {:?}", device);
Ok(Box::new(MPNetModel::load(vb, &config, model_type).s()?))
}
+(Config::ModernBert(config), _) => {
+    tracing::info!("Starting ModernBert model on {:?}", device);
+    Ok(Box::new(
+        ModernBertModel::load(vb, &config, model_type).s()?,
+    ))
+}
#[cfg(feature = "cuda")]
(Config::Bert(config), Device::Cuda(_)) => {
if cfg!(any(feature = "flash-attn", feature = "flash-attn-v1"))
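The `#[serde(rename(deserialize = "modernbert"))]` attribute is what maps a Hugging Face `config.json` with `"model_type": "modernbert"` onto the new variant. Below is a standalone sketch of that mechanism; the `tag = "model_type"` attribute and the single `hidden_size` field are assumptions for illustration and are not taken from this diff.

```rust
use serde::Deserialize;

// Illustrative stand-in for the real ModernBertConfig; only one field is shown.
#[derive(Debug, Deserialize)]
struct ModernBertConfig {
    hidden_size: usize,
}

// Assumed shape of the backend's Config enum: internally tagged on `model_type`.
#[derive(Debug, Deserialize)]
#[serde(tag = "model_type")]
enum Config {
    #[serde(rename(deserialize = "modernbert"))]
    ModernBert(ModernBertConfig),
}

fn main() -> serde_json::Result<()> {
    let raw = r#"{ "model_type": "modernbert", "hidden_size": 1024 }"#;
    let config: Config = serde_json::from_str(raw)?;
    println!("{config:?}"); // Config::ModernBert(ModernBertConfig { hidden_size: 1024 })
    Ok(())
}
```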
1 change: 1 addition & 0 deletions backends/candle/src/models/flash_bert.rs
@@ -104,6 +104,7 @@ impl BertAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_distilbert.rs
@@ -85,6 +85,7 @@ impl DistilBertAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_gte.rs
@@ -87,6 +87,7 @@ impl GTEAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_jina.rs
@@ -106,6 +106,7 @@ impl JinaAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_jina_code.rs
@@ -142,6 +142,7 @@ impl JinaCodeAttention {
self.softmax_scale,
false,
None,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;

1 change: 1 addition & 0 deletions backends/candle/src/models/flash_mistral.rs
@@ -105,6 +105,7 @@ impl MistralAttention {
self.softmax_scale,
true,
self.window_size_left,
+None,
)?;
let attention = attention.flatten_from(candle::D::Minus2)?;
