From 2b09f798e1de1717c6389e5d9f5c34c7c3248be6 Mon Sep 17 00:00:00 2001 From: LIU Yuwei <22045841+Marsman1996@users.noreply.github.com> Date: Wed, 8 Jan 2025 23:19:27 +0800 Subject: [PATCH] community: add init for `UnstructuredHTMLLoader` to solve pathlib paths (#29091) ## Description Add `__init__` for `UnstructuredHTMLLoader` to restrict the input type to `str` or `Path`, and convert the `file_path` argument to `str` before passing it to the parent class, just as `UnstructuredXMLLoader` does. ## Issue Fix #29090 ## Dependencies No changes. --- .../document_loaders/html.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/document_loaders/html.py b/libs/community/langchain_community/document_loaders/html.py index 857142bce96ae..9ea781a7e91cb 100644 --- a/libs/community/langchain_community/document_loaders/html.py +++ b/libs/community/langchain_community/document_loaders/html.py @@ -1,4 +1,5 @@ -from typing import List +from pathlib import Path +from typing import Any, List, Union from langchain_community.document_loaders.unstructured import UnstructuredFileLoader @@ -27,6 +28,23 @@ class UnstructuredHTMLLoader(UnstructuredFileLoader): https://unstructured-io.github.io/unstructured/bricks.html#partition-html """ + def __init__( + self, + file_path: Union[str, Path], + mode: str = "single", + **unstructured_kwargs: Any, + ): + """ + + Args: + file_path: The path to the HTML file to load. + mode: The mode to use when loading the file. Can be one of "single", + "multi", or "all". Default is "single". + **unstructured_kwargs: Any kwargs to pass to the unstructured. + """ + file_path = str(file_path) + super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs) + def _get_elements(self) -> List: from unstructured.partition.html import partition_html