diff --git a/NEWS b/NEWS
index 447f685..d84dd79 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,8 @@
+# 1.9.0
+
+* Detect checkboxes, both as complex fields and structured document tags, and
+  convert them to checkbox inputs.
+
 # 1.8.0
 
 * Add style mapping for highlights.
diff --git a/mammoth/docx/body_xml.py b/mammoth/docx/body_xml.py
index 134917f..cc4e9ad 100644
--- a/mammoth/docx/body_xml.py
+++ b/mammoth/docx/body_xml.py
@@ -137,7 +137,10 @@ def read_boolean_element(element):
         if element is None:
             return False
         else:
-            return element.attributes.get("w:val") not in ["false", "0"]
+            return read_boolean_attribute_value(element.attributes.get("w:val"))
+
+    def read_boolean_attribute_value(value):
+        return value not in ["false", "0"]
 
     def read_underline_element(element):
         return element and element.attributes.get("w:val") not in [None, "false", "0", "none"]
@@ -569,7 +572,17 @@ def alternate_content(element):
         return read_child_elements(element.find_child("mc:Fallback"))
 
     def read_sdt(element):
-        return read_child_elements(element.find_child_or_null("w:sdtContent"))
+        checkbox = element.find_child_or_null("w:sdtPr").find_child("wordml:checkbox")
+
+        if checkbox is not None:
+            checked_element = checkbox.find_child("wordml:checked")
+            is_checked = (
+                checked_element is not None and
+                read_boolean_attribute_value(checked_element.attributes.get("wordml:val"))
+            )
+            return _success(documents.checkbox(checked=is_checked))
+        else:
+            return read_child_elements(element.find_child_or_null("w:sdtContent"))
 
     handlers = {
         "w:t": text,
diff --git a/mammoth/docx/office_xml.py b/mammoth/docx/office_xml.py
index bebd04c..a988894 100644
--- a/mammoth/docx/office_xml.py
+++ b/mammoth/docx/office_xml.py
@@ -23,6 +23,10 @@
     ("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
     ("v", "urn:schemas-microsoft-com:vml"),
     ("office-word", "urn:schemas-microsoft-com:office:word"),
+
+    # [MS-DOCX]: Word Extensions to the Office Open XML (.docx) File Format
+    # https://learn.microsoft.com/en-us/openspecs/office_standards/ms-docx/b839fe1f-e1ca-4fa6-8c26-5954d0abbccd
+    ("wordml", "http://schemas.microsoft.com/office/word/2010/wordml"),
 ]
 
 
diff --git a/tests/docx/body_xml_tests.py b/tests/docx/body_xml_tests.py
index 8c1fdf0..8a8f40b 100644
--- a/tests/docx/body_xml_tests.py
+++ b/tests/docx/body_xml_tests.py
@@ -751,6 +751,43 @@ def test_complex_field_checkbox_with_default_0_and_checked_1_is_checked(self):
             is_run(children=is_sequence(is_checkbox(checked=True))),
         )))
 
+    def test_structured_document_tag_checkbox_without_checked_is_not_checked(self):
+        element = xml_element("w:sdt", {}, [
+            xml_element("w:sdtPr", {}, [
+                xml_element("wordml:checkbox"),
+            ]),
+        ])
+
+        result = _read_and_get_document_xml_element(element)
+
+        assert_that(result, is_checkbox(checked=False))
+
+    def test_structured_document_tag_checkbox_with_checked_0_is_not_checked(self):
+        element = xml_element("w:sdt", {}, [
+            xml_element("w:sdtPr", {}, [
+                xml_element("wordml:checkbox", {}, [
+                    xml_element("wordml:checked", {"wordml:val": "0"}),
+                ]),
+            ]),
+        ])
+
+        result = _read_and_get_document_xml_element(element)
+
+        assert_that(result, is_checkbox(checked=False))
+
+    def test_structured_document_tag_checkbox_with_checked_1_is_checked(self):
+        element = xml_element("w:sdt", {}, [
+            xml_element("w:sdtPr", {}, [
+                xml_element("wordml:checkbox", {}, [
+                    xml_element("wordml:checked", {"wordml:val": "1"}),
+                ]),
+            ]),
+        ])
+
+        result = _read_and_get_document_xml_element(element)
+
+        assert_that(result, is_checkbox(checked=True))
+
     def _complex_field_checkbox_paragraph(self, ff_data_children):
         return xml_element("w:p", {}, [
             xml_element("w:r", {}, [