From f2eeec96eb6a27af1db40801211b14075f39298d Mon Sep 17 00:00:00 2001 From: "Dmitry Sirakov [Shade]" Date: Thu, 30 Jan 2025 13:10:45 +0300 Subject: [PATCH 1/2] fix start_index in TextSplitter --- libs/text-splitters/langchain_text_splitters/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index 10dd6903ba172..9111920e2b827 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -82,7 +82,7 @@ def create_documents( offset = index + previous_chunk_len - self._chunk_overlap index = text.find(chunk, max(0, offset)) metadata["start_index"] = index - previous_chunk_len = len(chunk) + previous_chunk_len = self.length_function(chunk) new_doc = Document(page_content=chunk, metadata=metadata) documents.append(new_doc) return documents From 8a1e5b571d47ee2b41ffb7282e3bcf4660049985 Mon Sep 17 00:00:00 2001 From: "Dmitry Sirakov [Shade]" Date: Thu, 30 Jan 2025 13:22:58 +0300 Subject: [PATCH 2/2] fix method name --- libs/text-splitters/langchain_text_splitters/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/text-splitters/langchain_text_splitters/base.py b/libs/text-splitters/langchain_text_splitters/base.py index 9111920e2b827..6e6b53a1bd89e 100644 --- a/libs/text-splitters/langchain_text_splitters/base.py +++ b/libs/text-splitters/langchain_text_splitters/base.py @@ -82,7 +82,7 @@ def create_documents( offset = index + previous_chunk_len - self._chunk_overlap index = text.find(chunk, max(0, offset)) metadata["start_index"] = index - previous_chunk_len = self.length_function(chunk) + previous_chunk_len = self._length_function(chunk) new_doc = Document(page_content=chunk, metadata=metadata) documents.append(new_doc) return documents