Skip to content

Commit

Permalink
Add support for the strict document format
Browse files Browse the repository at this point in the history
  • Loading branch information
mwilliamson committed Feb 18, 2024
1 parent 414f1cd commit 1db36be
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 3 deletions.
2 changes: 2 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

* Drop support for Python 2.7, Python 3.5 and Python 3.6.

* Add support for the strict document format.

# 1.6.0

* Support merged paragraphs when revisions are tracked.
Expand Down
16 changes: 13 additions & 3 deletions mammoth/docx/office_xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,32 @@


# (prefix, namespace URI) pairs handed to parse_xml() by read().
#
# The WordprocessingML/DrawingML prefixes ("w", "r", "wp", "a", "pic") are
# listed once for the transitional format and once for the strict format, so
# each of those prefixes is paired with both URIs. Every (prefix, URI) pair
# appears exactly once in the list.
# NOTE(review): how parse_xml resolves several URIs for one prefix is not
# visible here -- confirm against its implementation before reordering.
_namespaces = [
    # Transitional format
    ("w", "http://schemas.openxmlformats.org/wordprocessingml/2006/main"),
    ("r", "http://schemas.openxmlformats.org/officeDocument/2006/relationships"),
    ("wp", "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing"),
    ("a", "http://schemas.openxmlformats.org/drawingml/2006/main"),
    ("pic", "http://schemas.openxmlformats.org/drawingml/2006/picture"),

    # Strict format
    ("w", "http://purl.oclc.org/ooxml/wordprocessingml/main"),
    ("r", "http://purl.oclc.org/ooxml/officeDocument/relationships"),
    ("wp", "http://purl.oclc.org/ooxml/drawingml/wordprocessingDrawing"),
    ("a", "http://purl.oclc.org/ooxml/drawingml/main"),
    ("pic", "http://purl.oclc.org/ooxml/drawingml/picture"),

    # Common
    ("content-types", "http://schemas.openxmlformats.org/package/2006/content-types"),
    ("relationships", "http://schemas.openxmlformats.org/package/2006/relationships"),
    ("mc", "http://schemas.openxmlformats.org/markup-compatibility/2006"),
    ("v", "urn:schemas-microsoft-com:vml"),
    ("office-word", "urn:schemas-microsoft-com:office:word"),
]


def read(fileobj):
    """Parse office XML from *fileobj* and return the first collapsed node.

    The file object is parsed with the module's namespace table, the result
    is passed through ``_collapse_alternate_content``, and element ``[0]`` of
    what that returns is handed back to the caller.
    """
    parsed = parse_xml(fileobj, _namespaces)
    collapsed = _collapse_alternate_content(parsed)
    return collapsed[0]


def _collapse_alternate_content(node):
if isinstance(node, XmlElement):
Expand Down
7 changes: 7 additions & 0 deletions tests/mammoth_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,13 @@ def test_can_extract_raw_text():
assert_equal("Apple\n\nBanana\n\n", result.value)


def test_can_read_strict_format():
    """Converting the strict-format fixture yields the expected HTML and no messages."""
    docx_path = generate_test_path("strict-format.docx")
    with open(docx_path, "rb") as docx_file:
        converted = mammoth.convert_to_html(fileobj=docx_file)
        # Check messages first so any conversion warning is reported before
        # the HTML comparison runs.
        assert_equal([], converted.messages)
        assert_equal("<p>Test</p>", converted.value)


def _copy_of_test_data(path):
destination = io.BytesIO()
with open(generate_test_path(path), "rb") as source:
Expand Down
Binary file added tests/test-data/strict-format.docx
Binary file not shown.

0 comments on commit 1db36be

Please sign in to comment.