Skip to content
Advertisement

Find and replace XML content in MS Word Document.xml with Python

I’m following the guide below to search for and replace a keyword in the document.xml of a DOCX file: https://virantha.com/2013/08/16/reading-and-writing-microsoft-word-docx-files-with-python/

I’m trying to find ‘abc’ within the w:instrText tag and replace it with ‘zzzz’

Before
<w:r>
<w:instrText>abc\\data</w:instrText>
</w:r>
After
<w:r>
<w:instrText>zzzz\\data</w:instrText>
</w:r>

Please find my code below but I’m getting an error:

Traceback (most recent call last):
  File "D:\python\search_docx_templates\test.py", line 68, in <module>
    document_target._write_and_close_docx(adjust_xml_content, "hello.docx")
  File "D:\python\search_docx_templates\test.py", line 46, in _write_and_close_docx
    xmlstr = etree.ElementTree.tostring(xml_content, pretty_print=True)
AttributeError: 'cython_function_or_method' object has no attribute 'tostring'
import zipfile
from lxml import etree
import os
import tempfile
import shutil
import re

class DocsWriter:
    """Read and rewrite the ``word/document.xml`` part of a .docx archive.

    The source archive is opened in ``__init__`` and kept on ``self.zipfile``;
    call :meth:`close` (or use the instance as a context manager) to release
    it when done.
    """

    # Archive member that holds the main document body.
    _DOCUMENT_PART = 'word/document.xml'
    # Namespace URI of WordprocessingML elements (the usual ``w:`` prefix).
    _WORD_SCHEMA = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

    def __init__(self, docx_file):
        """Open *docx_file* (a path to a .docx file) for reading."""
        self.docx_file = docx_file
        self.zipfile = zipfile.ZipFile(docx_file)

    def close(self):
        """Close the source archive opened in ``__init__``."""
        self.zipfile.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    def get_word_xml(self):
        """Return the raw bytes of ``word/document.xml``."""
        # A short-lived handle keeps this usable even after close().
        with zipfile.ZipFile(self.docx_file) as archive:
            return archive.read(self._DOCUMENT_PART)

    def _itertext(self, my_etree):
        """Yield ``(node, node.text)`` for every ``w:t`` element in the tree."""
        # tag=etree.Element restricts iteration to real elements
        # (no comments or processing instructions).
        for node in my_etree.iter(tag=etree.Element):
            if self._check_element_is(node, 't'):
                yield (node, node.text)

    def _check_element_is(self, element, type_char):
        """Return True if *element* is the WordprocessingML tag *type_char*."""
        return element.tag == '{%s}%s' % (self._WORD_SCHEMA, type_char)

    @staticmethod
    def get_xml_tree(xml_string):
        """Parse *xml_string* and return its root element.

        Declared static so that both ``DocsWriter.get_xml_tree(s)`` and
        ``instance.get_xml_tree(s)`` work.
        """
        return etree.fromstring(xml_string)

    @staticmethod
    def _serialize(xml_content):
        """Return *xml_content* as bytes ready to be stored in the archive.

        ``bytes`` are passed through, ``str`` is encoded as UTF-8, and
        anything else is treated as an lxml element/tree and serialised.
        """
        if isinstance(xml_content, (bytes, bytearray)):
            return bytes(xml_content)
        if isinstance(xml_content, str):
            return xml_content.encode('utf-8')
        return etree.tostring(
            xml_content,
            pretty_print=True,
            xml_declaration=True,
            encoding='UTF-8',
            standalone=True,
        )

    def _write_and_close_docx(self, xml_content, output_filename):
        """Write a copy of the source docx with a replaced document.xml.

        The original archive is expanded into a temporary directory,
        ``word/document.xml`` is overwritten with *xml_content*, and the
        files are zipped up again as *output_filename*.

        Args:
            xml_content: an lxml element/tree, or already-serialised XML as
                ``bytes`` or ``str``.
            output_filename: path of the .docx file to create.

        The source archive on ``self.zipfile`` is left open so the method
        can be called again; use :meth:`close` to release it.
        """
        payload = self._serialize(xml_content)

        # The context manager removes the directory even if a step fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            self.zipfile.extractall(tmp_dir)

            # etree.tostring() returns bytes, so the file must be binary.
            with open(os.path.join(tmp_dir, self._DOCUMENT_PART), 'wb') as f:
                f.write(payload)

            # Re-add every original member, in the original order and with
            # the compression method it had in the source archive.
            with zipfile.ZipFile(output_filename, 'w') as docx:
                for info in self.zipfile.infolist():
                    docx.write(
                        os.path.join(tmp_dir, info.filename),
                        info.filename,
                        compress_type=info.compress_type,
                    )


# Replace 'abc' with 'zzzz' inside every w:instrText element of the document.
document_target = DocsWriter('inputdoc.docx')
extracted_xml = document_target.get_word_xml()

# Parse the raw bytes instead of decoding them by hand: the XML parser
# honours the encoding declared in the document, so non-ASCII text survives.
adjust_xml_content = etree.fromstring(extracted_xml)

# Only touch the text of w:instrText elements; a regex over the whole XML
# source could also hit tag names, attributes or other content.
instr_text_tag = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}instrText'
for node in adjust_xml_content.iter(instr_text_tag):
    if node.text and 'abc' in node.text:
        node.text = node.text.replace('abc', 'zzzz')

# Hand over the parsed element, which is what the writer serialises.
document_target._write_and_close_docx(adjust_xml_content, "outputdoc.docx")

I need help converting it back to docx with the correct Document.xml

Thanks

Advertisement

Answer

Your XML handling is dubious. You treat the XML as text and try to replace things in the XML source with regex. Never, never do that.

The way to make changes to XML is

  1. parse it into a DOM tree (tree = ET.parse(filename))
  2. manipulate the DOM tree
    1. find nodes, e.g. with XPath (nodes = tree.xpath('...'))
    2. make changes to those nodes, e.g. by updating their text (node.text = '...') – using regex at this point is fine
  3. write the DOM tree (tree.write(filename))

This is better:

from os import path
from shutil import rmtree
from lxml import etree as ET
from zipfile import ZipFile
from tempfile import mkdtemp

class DocxHelper:
    """Read an XML part from a .docx archive and write a modified copy."""

    def __init__(self, docx_file):
        """Remember the path of the source .docx file; nothing is opened yet."""
        self.docx_file = docx_file

    def get_xml(self, filename):
        """Parse the archive member *filename* and return it as an lxml tree."""
        with ZipFile(self.docx_file) as docx:
            with docx.open(filename) as xml:
                # 1. parse into tree
                return ET.parse(xml)

    def set_xml(self, filename, tree, output_filename):
        """Write a copy of the source docx in which *filename* is *tree*.

        Args:
            filename: archive member to replace, e.g. ``word/document.xml``.
            tree: lxml tree to serialise in place of that member.
            output_filename: path of the .docx file to create.

        The source archive is expanded into a temporary directory, which is
        removed again even when a step raises.
        """
        tmp_dir = mkdtemp()
        try:
            with ZipFile(self.docx_file) as docx:
                members = docx.infolist()
                docx.extractall(tmp_dir)

            # 3. write tree
            # Emit an explicit UTF-8 XML declaration, as Word's own parts do.
            tree.write(
                path.join(tmp_dir, filename),
                pretty_print=True,
                xml_declaration=True,
                encoding='UTF-8',
                standalone=True,
            )

            # Re-add every member in its original order, keeping the
            # compression method it had in the source archive.
            with ZipFile(output_filename, 'w') as out:
                for member in members:
                    out.write(
                        path.join(tmp_dir, member.filename),
                        member.filename,
                        compress_type=member.compress_type,
                    )
        finally:
            rmtree(tmp_dir)

Usage

# Example: replace 'abc' with 'zzzz' in every w:instrText of the document.
input_docx = 'inputdoc.docx'
output_docx = 'outputdoc.docx'
filename = 'word/document.xml'

dh = DocxHelper(input_docx)
xml_tree = dh.get_xml(filename)

# 2. manipulate the DOM tree
# XPath needs the w: prefix mapped to the WordprocessingML namespace URI.
ns = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
}
# Select only the w:instrText elements whose text contains the search string
# from the question ('abc'), then rewrite just their text.
for elem in xml_tree.xpath('//w:r/w:instrText[contains(., "abc")]', namespaces=ns):
    elem.text = elem.text.replace('abc', 'zzzz')

dh.set_xml(filename, xml_tree, output_docx)

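The XPath expression used above selects every w:instrText element inside a w:r run whose text contains the search string, as in your sample; adapt it as needed. Note the use of the w: namespace prefix, which has to be mapped to the WordprocessingML namespace URI through the namespaces argument.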

User contributions licensed under: CC BY-SA
3 people found this helpful
Advertisement