I’m following the guide below to search and replace a keyword in the document.xml of a DOCX file: https://virantha.com/2013/08/16/reading-and-writing-microsoft-word-docx-files-with-python/
I’m trying to find ‘abc’ within the w:instrText tag and replace it with ‘zzzz’
Before <w:r> <w:instrText>abc\\data</w:instrText> </w:r> After <w:r> <w:instrText>zzzz\\data</w:instrText> </w:r>
Please find my code below but I’m getting an error:
Traceback (most recent call last):
  File "D:\python\search_docx_templates\test.py", line 68, in <module>
    document_target._write_and_close_docx(adjust_xml_content, "hello.docx")
  File "D:\python\search_docx_templates\test.py", line 46, in _write_and_close_docx
    xmlstr = etree.ElementTree.tostring(xml_content, pretty_print=True)
AttributeError: 'cython_function_or_method' object has no attribute 'tostring'
import zipfile
from lxml import etree
import os
import tempfile
import shutil
import re


class DocsWriter:
    """Read word/document.xml out of a .docx archive and write a modified copy.

    A .docx file is handled here as a zip archive: the constructor opens it
    with ``zipfile.ZipFile`` and the methods read from / re-pack that archive.
    """

    def __init__(self, docx_file):
        """Remember the path of the .docx file and open it as a zip archive."""
        self.docx_file = docx_file
        # NOTE(review): this ZipFile handle is never closed anywhere in this block.
        self.zipfile = zipfile.ZipFile(docx_file)

    def get_word_xml(self):
        """Print the .docx path and return the content of word/document.xml.

        The value comes straight from ``ZipFile.read``, i.e. it is the raw
        (undecoded) bytes of the archive member.
        """
        print(self.docx_file)
        with open(self.docx_file, 'rb') as f:
            # NOTE(review): the local name `zip` shadows the builtin of the same name.
            zip = zipfile.ZipFile(f)
            xml_content = zip.read('word/document.xml')
        return xml_content

    def _itertext(self, my_etree):
        """Yield ``(node, node.text)`` for every WordprocessingML ``t`` element."""
        # Walk the elements of the tree and keep only those whose qualified
        # tag is {word namespace}t.
        for node in my_etree.iter(tag=etree.Element):
            if self._check_element_is(node, 't'):
                yield (node, node.text)

    def _check_element_is(self, element, type_char):
        """Return True if *element*'s tag is ``type_char`` in the Word namespace."""
        word_schema = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
        # lxml spells qualified tags as "{namespace-uri}localname".
        return element.tag == '{%s}%s' % (word_schema,type_char)

    # NOTE(review): defined inside the class without a `self` parameter, so
    # calling it on an instance would bind the instance to `xml_string`.
    # It is not called anywhere in this block.
    def get_xml_tree(xml_string):
        """Parse an XML string with ``etree.fromstring`` and return the result."""
        return etree.fromstring(xml_string)

    def _write_and_close_docx (self, xml_content, output_filename):
        """Write *xml_content* as word/document.xml into a new .docx file.

        Steps:
          1. create a temp directory and expand the original docx zip into it;
          2. serialise *xml_content* over the extracted word/document.xml;
          3. zip every original member up again as *output_filename*;
          4. remove the temp directory.
        """
        tmp_dir = tempfile.mkdtemp()
        self.zipfile.extractall(tmp_dir)

        with open(os.path.join(tmp_dir,'word/document.xml'), 'w') as f:
            # NOTE(review): the script below passes a plain str (the result of
            # re.sub) as xml_content, whereas etree.tostring serialises an
            # element / tree. Also, tostring returns bytes unless told
            # otherwise, while the file is opened in text mode ('w') -- confirm.
            xmlstr = etree.tostring (xml_content, pretty_print=True)
            f.write(xmlstr)

        # Get a list of all the files in the original docx zipfile
        filenames = self.zipfile.namelist()
        # Now, create the new zip file and add all the files into the archive
        zip_copy_filename = output_filename
        with zipfile.ZipFile(zip_copy_filename, "w") as docx:
            for filename in filenames:
                docx.write(os.path.join(tmp_dir,filename), filename)

        # Clean up the temp dir
        # (only reached when none of the statements above raised)
        shutil.rmtree(tmp_dir)


# Driver script: read document.xml, replace 'abc' with 'zzzz', write a new docx.
document_target = DocsWriter('inputdoc.docx')
exctracted_xml = document_target.get_word_xml()
html = exctracted_xml.decode('ISO-8859-1') # encoding may vary!
# The substitution runs over the whole XML source text, not over parsed nodes,
# so adjust_xml_content is a str.
adjust_xml_content = re.sub('abc', 'zzzz', html)
document_target._write_and_close_docx(adjust_xml_content, "outputdoc.docx")
I need help converting it back to docx with the correct Document.xml
Thanks
Advertisement
Answer
Your XML handling is dubious. You treat the XML as text and try to replace things in the XML source with regex. Never, never do that.
The way to make changes to XML is
1. parse it into a DOM tree (`tree = ET.parse(filename)`)
2. manipulate the DOM tree
   - find nodes, e.g. with XPath (`nodes = tree.xpath('...')`)
   - make changes to those nodes, e.g. by updating their text (`node.text = '...'`) – using regex at this point is fine
3. write the DOM tree (`tree.write(filename)`)
This is better:
from os import path
from shutil import rmtree
from lxml import etree as ET
from zipfile import ZIP_DEFLATED, ZipFile
from tempfile import mkdtemp


class DocxHelper:
    """Read an XML part out of a .docx archive and write a modified copy.

    A .docx file is a zip archive; parts such as ``word/document.xml`` are
    addressed by their member name inside that archive.
    """

    def __init__(self, docx_file):
        """Remember the path of the source .docx; nothing is opened yet."""
        self.docx_file = docx_file

    def get_xml(self, filename):
        """Parse the archive member *filename* and return an lxml ElementTree.

        Raises ``KeyError`` (from ``ZipFile.open``) if the member does not
        exist in the archive.
        """
        with ZipFile(self.docx_file) as docx:
            with docx.open(filename) as xml:
                # 1. parse into tree
                return ET.parse(xml)

    def set_xml(self, filename, tree, output_filename):
        """Write a copy of the source .docx with *filename* replaced by *tree*.

        The source archive is expanded into a temporary directory, *tree* is
        serialised over the extracted member, and every original member is
        zipped up again as *output_filename*. The temporary directory is
        removed even when one of those steps raises.
        """
        tmp_dir = mkdtemp()
        try:
            with ZipFile(self.docx_file) as docx:
                members = docx.namelist()
                docx.extractall(tmp_dir)

            # 3. write tree
            # Word parts conventionally start with a UTF-8, standalone XML
            # declaration; without these arguments lxml writes no declaration
            # and falls back to ASCII with character references.
            tree.write(
                path.join(tmp_dir, filename),
                encoding='UTF-8',
                xml_declaration=True,
                standalone=True,
                pretty_print=True,
            )

            # ZipFile defaults to ZIP_STORED (no compression); deflate keeps
            # the result comparable in size to a file saved by Word.
            with ZipFile(output_filename, 'w', ZIP_DEFLATED) as docx:
                for member in members:
                    docx.write(path.join(tmp_dir, member), member)
        finally:
            rmtree(tmp_dir)
Usage
input_docx = 'inputdoc.docx'
output_docx = 'outputdoc.docx'
filename = 'word/document.xml'

# Text to look for inside w:instrText and what to put in its place.
# 'abc' is the keyword from the sample in the question.
search_text = 'abc'
replacement = 'zzzz'

dh = DocxHelper(input_docx)
xml_tree = dh.get_xml(filename)

# 2. manipulate the DOM tree
ns = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
}

# $needle is an XPath variable bound through the keyword argument, so the
# search text is stated once and needs no quoting inside the expression.
matches = xml_tree.xpath(
    '//w:r/w:instrText[contains(., $needle)]',
    namespaces=ns,
    needle=search_text,
)
for elem in matches:
    elem.text = elem.text.replace(search_text, replacement)

dh.set_xml(filename, xml_tree, output_docx)
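The XPath //w:r/w:instrText[contains(., "<search text>")] selects every w:instrText element inside a w:r run whose text contains the search text (abc in your sample); adapt it accordingly. Note the use of the w: namespace prefix, which is bound to the WordprocessingML namespace URI through the namespaces argument.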