diff --git a/crates/tabby-common/src/index/mod.rs b/crates/tabby-common/src/index/mod.rs index c3676d122ce1..d92798bc2888 100644 --- a/crates/tabby-common/src/index/mod.rs +++ b/crates/tabby-common/src/index/mod.rs @@ -178,6 +178,21 @@ impl IndexSchema { ]) } + /// Build a query to find the document with the given `doc_id`, include chunks. + pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query { + let doc_id_query = TermQuery::new( + Term::from_field_text(self.field_id, doc_id), + tantivy::schema::IndexRecordOption::Basic, + ); + + BooleanQuery::new(vec![ + // Must match the corpus + (Occur::Must, self.corpus_query(corpus)), + // Must match the doc id + (Occur::Must, Box::new(doc_id_query)), + ]) + } + pub fn doc_indexed_after( &self, corpus: &str, @@ -261,21 +276,11 @@ impl IndexSchema { FIELD_ATTRIBUTES, field ))), ), - ]) - } - - /// Build a query to find the document with the given `doc_id`, include chunks. - pub fn doc_query_with_chunks(&self, corpus: &str, doc_id: &str) -> impl Query { - let doc_id_query = TermQuery::new( - Term::from_field_text(self.field_id, doc_id), - tantivy::schema::IndexRecordOption::Basic, - ); - - BooleanQuery::new(vec![ - // Must match the corpus - (Occur::Must, self.corpus_query(corpus)), - // Must match the doc id - (Occur::Must, Box::new(doc_id_query)), + // Exclude chunk documents + ( + Occur::MustNot, + Box::new(ExistsQuery::new_exists_query(FIELD_CHUNK_ID.into())), + ), ]) } diff --git a/crates/tabby-index/src/code/index.rs b/crates/tabby-index/src/code/index.rs index 8012e3b2dcb8..5d0a4966bc4b 100644 --- a/crates/tabby-index/src/code/index.rs +++ b/crates/tabby-index/src/code/index.rs @@ -1,5 +1,6 @@ -use std::{pin::pin, sync::Arc}; +use std::{path::Path, pin::pin, sync::Arc}; +use anyhow::Result; use async_stream::stream; use futures::StreamExt; use ignore::{DirEntry, Walk}; @@ -12,7 +13,7 @@ use super::{ intelligence::{CodeIntelligence, SourceCode}, CodeRepository, }; -use crate::indexer::Indexer; 
+use crate::indexer::{Indexer, TantivyDocBuilder}; // Magic numbers static MAX_LINE_LENGTH_THRESHOLD: usize = 300; @@ -101,7 +102,19 @@ async fn add_changed_documents( let id = SourceCode::to_index_id(&repository.source_id, &key).id; + // Skip if already indexed and has no failed chunks; + // when skipping, we should check if the document needs to be backfilled. if !require_updates(cloned_index.clone(), &id) { + backfill_commit_in_doc_if_needed( + builder.clone(), + cloned_index.clone(), + &id, + repository, + commit, + file.path()).await.unwrap_or_else(|e| { + warn!("Failed to backfill commit for {id}: {e}"); + } + ); continue; } @@ -139,12 +152,7 @@ async fn add_changed_documents( count_docs } -// 1. Backfill if the document is missing the commit field -// 2. Skip if already indexed and has no failed chunks fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool { - if should_backfill(indexer.clone(), id) { - return true; - } if indexer.is_indexed(id) && !indexer.has_failed_chunks(id) { return false; }; @@ -152,9 +160,32 @@ fn require_updates(indexer: Arc<Indexer>, id: &str) -> bool { true } -fn should_backfill(indexer: Arc<Indexer>, id: &str) -> bool { - // v0.23.0 add the commit field to the code document. - !indexer.has_attribute_field(id, code::fields::ATTRIBUTE_COMMIT) +// v0.23.0 added the commit field to the code document. 
+async fn backfill_commit_in_doc_if_needed( + builder: Arc<TantivyDocBuilder<SourceCode>>, + indexer: Arc<Indexer>, + id: &str, + repository: &CodeRepository, + commit: &str, + path: &Path, +) -> Result<()> { + if indexer.has_attribute_field(id, code::fields::ATTRIBUTE_COMMIT) { + return Ok(()); + } + + let code = CodeIntelligence::compute_source_file(repository, commit, path) + .ok_or_else(|| anyhow::anyhow!("Failed to compute source file"))?; + if !is_valid_file(&code) { + anyhow::bail!("Invalid file"); + } + + let origin = indexer.get_doc(id).await?; + indexer.delete_doc(id); + indexer + .add(builder.backfill_doc_attributes(&origin, &code).await) + .await; + + Ok(()) } fn is_valid_file(file: &SourceCode) -> bool { diff --git a/crates/tabby-index/src/indexer.rs b/crates/tabby-index/src/indexer.rs index 0735aa454907..86b58daa23d8 100644 --- a/crates/tabby-index/src/indexer.rs +++ b/crates/tabby-index/src/indexer.rs @@ -167,6 +167,26 @@ impl TantivyDocBuilder { } } } + + pub async fn backfill_doc_attributes( + &self, + origin: &TantivyDocument, + doc: &T, + ) -> TantivyDocument { + let schema = IndexSchema::instance(); + let mut doc = doc! 
{ + schema.field_id => get_text(origin, schema.field_id), + schema.field_source_id => get_text(origin, schema.field_source_id).to_string(), + schema.field_corpus => get_text(origin, schema.field_corpus).to_string(), + schema.field_attributes => self.builder.build_attributes(doc).await, + schema.field_updated_at => get_date(origin, schema.field_updated_at), + }; + if let Some(failed_chunks) = get_number_optional(origin, schema.field_failed_chunks_count) { + doc.add_u64(schema.field_failed_chunks_count, failed_chunks as u64); + } + + doc + } } pub struct Indexer { @@ -197,6 +217,25 @@ impl Indexer { .expect("Failed to add document"); } + pub async fn get_doc(&self, id: &str) -> Result<TantivyDocument> { + let schema = IndexSchema::instance(); + let query = schema.doc_query(&self.corpus, id); + let docs = match self.searcher.search(&query, &TopDocs::with_limit(1)) { + Ok(docs) => docs, + Err(e) => { + debug!("query tantivy error: {}", e); + return Err(e.into()); + } + }; + if docs.is_empty() { + bail!("Document not found: {}", id); + } + + self.searcher + .doc(docs.first().unwrap().1) + .map_err(|e| e.into()) + } + pub fn delete(&self, id: &str) { let schema = IndexSchema::instance(); let _ = self @@ -204,6 +243,13 @@ impl Indexer { .writer .delete_query(Box::new(schema.doc_query_with_chunks(&self.corpus, id))); } + pub fn delete_doc(&self, id: &str) { + let schema = IndexSchema::instance(); + let _ = self + .writer + .delete_query(Box::new(schema.doc_query(&self.corpus, id))); + } + pub fn commit(mut self) { self.writer.commit().expect("Failed to commit changes"); self.writer @@ -369,3 +415,11 @@ impl IndexGarbageCollector { fn get_text(doc: &TantivyDocument, field: schema::Field) -> &str { doc.get_first(field).unwrap().as_str().unwrap() } + +fn get_date(doc: &TantivyDocument, field: schema::Field) -> tantivy::DateTime { + doc.get_first(field).unwrap().as_datetime().unwrap() +} + +fn get_number_optional(doc: &TantivyDocument, field: schema::Field) -> Option<i64> { + doc.get_first(field)?.as_i64() 
+}