Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance _markitdown.py to support embedding images in markdown #205

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 85 additions & 10 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,24 +698,99 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:

class DocxConverter(HtmlConverter):
"""
Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def sanitize_filename(self, name: str) -> str:
    """Return *name* rewritten so it can be used as a file name on common OSes.

    The transformation is:

    1. Trim surrounding whitespace and collapse every internal run of
       whitespace into a single underscore.
    2. Replace characters that are reserved in file names
       (``\\ / * ? : " < > |``) and ASCII control characters with underscores.
    3. Strip leading/trailing dots and spaces.
    4. Cut the result to at most 251 characters, then strip trailing dots
       and spaces again, because the cut can expose a new trailing dot.

    Args:
        name: Arbitrary text (for example an image's alt text).

    Returns:
        The sanitized name. It may be an empty string when *name* consists
        only of whitespace and dots; callers must handle that case.
    """
    max_length = 251

    # Whitespace (including newlines and tabs) becomes a single underscore.
    name = re.sub(r"\s+", "_", name.strip())

    # Reserved punctuation and control characters (e.g. NUL, which makes
    # open() raise) are each replaced by an underscore.
    name = re.sub(r'[\\/*?:"<>|\x00-\x1f\x7f]', "_", name)

    # Strip first so leading dots do not eat into the length budget, then
    # truncate, then strip the tail again: truncating "aaa.b" right after the
    # dot would otherwise leave a trailing dot.
    return name.strip(" .")[:max_length].rstrip(" .")

def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str:
    """Shorten *name* so that ``name + extension`` fits in *max_length* characters.

    Args:
        name: File name without its extension.
        max_length: Maximum allowed length of the name plus the extension.
        extension: Extension (including the dot) that will be appended later.

    Returns:
        *name* unchanged when it already fits, otherwise its leading part.
        When the extension alone uses up the whole budget the result is an
        empty string.
    """
    # Clamp at zero: a negative bound would turn the slice into "drop the last
    # N characters" instead of "keep nothing".
    max_base_length = max(max_length - len(extension), 0)
    return name[:max_base_length]

def unique_filename(self, base_path: str, max_length: int = 251) -> str:
    """Return a path derived from *base_path* that does not exist yet.

    The directory part of *base_path* is kept exactly as given; only the file
    name is shortened and, if needed, given a ``_1``, ``_2``, ... suffix
    before the extension until ``os.path.exists`` reports the path as free.

    Args:
        base_path: Desired path (directory + file name + extension).
        max_length: Maximum length of the file-name component, extension and
            numeric suffix included.

    Returns:
        A path in the same directory whose file name is at most *max_length*
        characters long and which did not exist at the time of the check.

    Note:
        The existence check and the later file creation are separate steps,
        so another process can still claim the name in between.
    """
    # The limit applies to the file name only. Slicing the full path would
    # cut into the directory when the directory prefix is long and silently
    # point at a different location.
    directory, filename = os.path.split(base_path)
    stem, ext = os.path.splitext(filename)

    def candidate(suffix: str = "") -> str:
        # Reserve room for the suffix and extension; never go below zero.
        room = max(max_length - len(suffix) - len(ext), 0)
        return os.path.join(directory, f"{stem[:room]}{suffix}{ext}")

    unique_path = candidate()
    counter = 1
    while os.path.exists(unique_path):
        unique_path = candidate(f"_{counter}")
        counter += 1

    return unique_path

def convert_image(self, image, output_dir: str) -> dict:
"""Handles image extraction and saving with collision avoidance and length limits."""
os.makedirs(output_dir, exist_ok=True)

image.alt_text = image.alt_text.replace("\n", " ")
raw_name = image.alt_text or f"image_{hash(image)}"
sanitized_name = self.sanitize_filename(raw_name)
truncated_name = self.truncate_filename(sanitized_name, 251, ".png")
image_path = os.path.join(output_dir, truncated_name + ".png")

# Ensure unique filename
image_path = self.unique_filename(image_path)

try:
with image.open() as image_bytes:
with open(image_path, "wb") as img_file:
img_file.write(image_bytes.read())
return {"src": image_path, "alt": image.alt_text}
except Exception:
# Return an empty src if saving fails
return {"src": ""}

# Convert the DOCX file at local_path: mammoth renders it to HTML (saving each
# embedded image through self.convert_image), then the HTML goes through
# self._convert.
# kwargs read here: "file_extension", "style_map", "image_output_dir".
# Returns None when the extension is not ".docx" or when any exception is
# raised during conversion.
# NOTE(review): this span comes from a flattened diff view. The lines from
# `result = None` through `result = self._convert(html_content)` and the lone
# `return result` look like the removed (pre-change) side of the diff, not part
# of the new method. Confirm against the pull request before editing.
def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx":
return None

result = None
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None)

result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value
result = self._convert(html_content)
# NOTE(review): the except clause at the end of this try turns every failure
# (unreadable file, mammoth error, HTML conversion error) into a None return,
# the same value as the "not a DOCX" bail-out above.
try:
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map")
# Directory handed to convert_image for saved images; defaults to "images".
image_output_dir = kwargs.get("image_output_dir", "images")

mammoth_result = mammoth.convert_to_html(
docx_file,
style_map=style_map,
# Route every embedded image through self.convert_image.
convert_image=mammoth.images.inline(
lambda img: self.convert_image(img, image_output_dir)
),
)

return result
html_content = mammoth_result.value
return self._convert(html_content)
except Exception:
return None


class XlsxConverter(HtmlConverter):
Expand Down
Binary file added tests/test_files/test_with_images.docx
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,16 @@
"Yet another comment in the doc. 55yiyi-asd09",
]

# Strings passed to validate_strings together with the conversion result of
# test_with_images.docx. The last entry is a ".png" file name built from an
# image's alt text (spaces and reserved characters turned into underscores).
DOCX_IMAGES_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown.png",
]

PPTX_TEST_STRINGS = [
"2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
Expand Down Expand Up @@ -206,6 +216,10 @@ def test_markitdown_local() -> None:
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
validate_strings(result, DOCX_TEST_STRINGS)

# Test DOCX processing, with images
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_with_images.docx"))
validate_strings(result, DOCX_IMAGES_TEST_STRINGS)

# Test DOCX processing, with comments
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
Expand Down