microsoft · MauroDruwel · Dec 23, 2024 · Dec 23, 2024 · Dec 23, 2024 · Jan 3, 2025
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -698,24 +698,99 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
 
 class DocxConverter(HtmlConverter):
     """
-    Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
+    Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
     """
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def sanitize_filename(self, name: str) -> str:
+        """Sanitizes a string to make it a valid file name across different operating systems (at most 251 characters)."""
+        # Trim the ends, then collapse every run of whitespace into a single underscore
+        name = re.sub(r"\s+", "_", name.strip())
+
+        # Replace invalid characters with underscores
+        name = re.sub(r'[\\/*?:"<>|]', "_", name)
+
+        # Remove leading and trailing dots and spaces
+        name = name.strip(" .")
+
+        # Limit the length of the filename to a reasonable length (e.g., 251 characters).
+        # The cut can land right after a dot, so trailing dots/spaces are stripped once more.
+        max_length = 251
+        name = name[:max_length].rstrip(" .")
+
+        return name
+
+    def truncate_filename(self, name: str, max_length: int, extension: str = "") -> str:
+        """Truncates name so that name + extension is at most max_length characters; returns the (possibly shortened) name."""
+        # Clamp at zero: a negative bound would make the slice drop characters from the end instead of truncating.
+        max_base_length = max(max_length - len(extension), 0)
+        # Slicing is a no-op when the name already fits.
+        return name[:max_base_length]
+
+    def unique_filename(self, base_path: str, max_length: int = 251) -> str:
+        """Returns a path that does not exist yet ("_1", "_2", ... is appended on collision); the file name stays within max_length."""
+        # Only the file name is length-limited: truncating the joined path could cut into the directory part.
+        directory, filename = os.path.split(base_path)
+        base, ext = os.path.splitext(filename)
+
+        counter = 1
+        room = max(max_length - len(ext), 0)  # clamp: a negative bound would slice from the end
+        unique_path = os.path.join(directory, f"{base[:room]}{ext}")
+        while os.path.exists(unique_path):
+            suffix = f"_{counter}"
+            # Ensure base is short enough to add the suffix
+            room = max(max_length - len(suffix) - len(ext), 0)
+            unique_path = os.path.join(directory, f"{base[:room]}{suffix}{ext}")
+            counter += 1
+
+        return unique_path
+
+    def convert_image(self, image, output_dir: str) -> dict:
+        """Saves the image into output_dir (always with a ".png" extension) and returns {"src", "alt"}, or {"src": ""} if saving fails."""
+        os.makedirs(output_dir, exist_ok=True)
+        # An image without a description may carry alt_text=None; fall back to "" so .replace() cannot raise.
+        image.alt_text = (image.alt_text or "").replace("\n", " ")
+        raw_name = image.alt_text or f"image_{hash(image)}"
+        sanitized_name = self.sanitize_filename(raw_name)
+        truncated_name = self.truncate_filename(sanitized_name, 251, ".png")
+        image_path = os.path.join(output_dir, truncated_name + ".png")
+
+        # Ensure unique filename
+        image_path = self.unique_filename(image_path)
+
+        try:
+            with image.open() as image_bytes:
+                with open(image_path, "wb") as img_file:
+                    img_file.write(image_bytes.read())
+            return {"src": image_path, "alt": image.alt_text}
+        except Exception:
+            # Return an empty src if saving fails
+            return {"src": ""}
+
+    def convert(
+        self, local_path: str, **kwargs
+    ) -> Union[None, DocumentConverterResult]:  # None is returned for non-DOCX input and for any failure below
         # Bail if not a DOCX
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".docx":
             return None
 
-        result = None
-        with open(local_path, "rb") as docx_file:
-            style_map = kwargs.get("style_map", None)
-
-            result = mammoth.convert_to_html(docx_file, style_map=style_map)
-            html_content = result.value
-            result = self._convert(html_content)
+        try:
+            with open(local_path, "rb") as docx_file:
+                style_map = kwargs.get("style_map")  # optional; passed to mammoth as-is (None when absent)
+                image_output_dir = kwargs.get("image_output_dir", "images")  # directory handed to convert_image() for saved images
+
+                mammoth_result = mammoth.convert_to_html(
+                    docx_file,
+                    style_map=style_map,
+                    convert_image=mammoth.images.inline(  # presumably invoked by mammoth once per embedded image — see mammoth docs
+                        lambda img: self.convert_image(img, image_output_dir)
+                    ),
+                )
 
-        return result
+                html_content = mammoth_result.value
+                return self._convert(html_content)  # HTML -> result via the inherited _convert helper
+        except Exception:  # NOTE(review): every error (unreadable file, mammoth failure, ...) is reduced to None — confirm callers expect this
+            return None
 
 
 class XlsxConverter(HtmlConverter):

diff --git a/tests/test_files/test_with_images.docx b/tests/test_files/test_with_images.docx
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
@@ -89,6 +89,16 @@
     "Yet another comment in the doc. 55yiyi-asd09",
 ]
 
+DOCX_IMAGES_TEST_STRINGS = [  # checked against the conversion of test_with_images.docx
+    "314b0a30-5b04-470b-b9f7-eed2c2bec74a",
+    "49e168b7-d2ae-407f-a055-2167576f39a1",
+    "## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
+    "# Abstract",
+    "# Introduction",
+    "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
+    "GitHub_-_microsoft_markitdown__Python_tool_for_converting_files_and_office_documents_to_Markdown",  # no ".png": a rerun with the image already on disk saves "..._Markdown_1.png" (assumes validate_strings does substring matching — TODO confirm)
+]
+
 PPTX_TEST_STRINGS = [
     "2cdda5c8-e50e-4db4-b5f0-9722a649f455",
     "04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
@@ -206,6 +216,10 @@ def test_markitdown_local() -> None:
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
     validate_strings(result, DOCX_TEST_STRINGS)
 
+    # Test DOCX processing, with images
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_with_images.docx"))
+    validate_strings(result, DOCX_IMAGES_TEST_STRINGS)
+
     # Test DOCX processing, with comments
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),