I’m following the guide below to search and replace a keyword in the document.xml of a DOCX file: https://virantha.com/2013/08/16/reading-and-writing-microsoft-word-docx-files-with-python/
I’m trying to find ‘abc’ within the w:instrText tag and replace it with ‘zzzz’
Before: <w:r> <w:instrText>abc\\data</w:instrText> </w:r> After: <w:r> <w:instrText>zzzz\\data</w:instrText> </w:r>
Please find my code below but I’m getting an error:
Traceback (most recent call last):
File "D:pythonsearch_docx_templatestest.py", line 68, in <module>
document_target._write_and_close_docx(adjust_xml_content, "hello.docx")
File "D:pythonsearch_docx_templatestest.py", line 46, in _write_and_close_docx
xmlstr = etree.ElementTree.tostring(xml_content, pretty_print=True)
AttributeError: 'cython_function_or_method' object has no attribute 'tostring'
import zipfile
from lxml import etree
import os
import tempfile
import shutil
import re
class DocsWriter:
    """Read ``word/document.xml`` from a .docx archive and write a modified copy.

    A .docx file is a zip archive. This helper reads the main document part
    out of it and can rebuild the archive with a replacement for that part.
    The archive opened in ``__init__`` stays open until ``close()`` is called
    (or the instance is used as a context manager).
    """

    # Archive member that holds the main document body.
    DOCUMENT_PART = 'word/document.xml'
    # Namespace URI of the ``w:`` elements inside the document part.
    WORD_SCHEMA = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

    def __init__(self, docx_file):
        """Remember the path of *docx_file* and open it as a zip archive."""
        self.docx_file = docx_file
        self.zipfile = zipfile.ZipFile(docx_file)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the zip archive opened in ``__init__``."""
        self.zipfile.close()

    def get_word_xml(self):
        """Return the raw bytes of ``word/document.xml``."""
        print(self.docx_file)
        # A fresh, context-managed handle so nothing is left open here.
        with zipfile.ZipFile(self.docx_file) as archive:
            return archive.read(self.DOCUMENT_PART)

    def _itertext(self, my_etree):
        """Yield ``(node, node.text)`` for every ``w:t`` element in *my_etree*."""
        # Passing the Element factory as the tag restricts iteration to
        # element nodes (comments and processing instructions are skipped).
        for node in my_etree.iter(tag=etree.Element):
            if self._check_element_is(node, 't'):
                yield (node, node.text)

    def _check_element_is(self, element, type_char):
        """Return True if *element* is the WordprocessingML tag *type_char*."""
        return element.tag == '{%s}%s' % (self.WORD_SCHEMA, type_char)

    @staticmethod
    def get_xml_tree(xml_string):
        """Parse *xml_string* and return the root element.

        Declared as a static method so it can be called both on the class
        and on an instance; the original definition had no ``self`` and
        failed when called on an instance.
        """
        return etree.fromstring(xml_string)

    @staticmethod
    def _to_xml_bytes(xml_content):
        """Return *xml_content* as bytes ready to be written to the archive.

        ``bytes`` are passed through unchanged, ``str`` is encoded as UTF-8
        (so it must have been decoded as UTF-8 by the caller), and anything
        else is treated as an lxml element/tree and serialized with an XML
        declaration.
        """
        if isinstance(xml_content, bytes):
            return xml_content
        if isinstance(xml_content, str):
            return xml_content.encode('utf-8')
        return etree.tostring(
            xml_content,
            pretty_print=True,
            xml_declaration=True,
            encoding='UTF-8',
        )

    def _write_and_close_docx(self, xml_content, output_filename):
        """Write a copy of the docx with ``word/document.xml`` replaced.

        The original archive is expanded into a temporary directory, the
        document part is overwritten with *xml_content*, and every member of
        the original archive is zipped into *output_filename*.

        *xml_content* may be an lxml element/tree, ``bytes`` or ``str``.
        The temporary directory is removed even if a step fails. Despite the
        name, the source archive itself is not closed here; use ``close()``.
        """
        # Serialize first so a bad payload fails before anything is created.
        payload = self._to_xml_bytes(xml_content)
        tmp_dir = tempfile.mkdtemp()
        try:
            self.zipfile.extractall(tmp_dir)
            # Binary mode: the payload is already encoded bytes.
            with open(os.path.join(tmp_dir, self.DOCUMENT_PART), 'wb') as f:
                f.write(payload)
            # Re-add every member under its original archive name.
            with zipfile.ZipFile(output_filename, "w") as docx:
                for member in self.zipfile.namelist():
                    docx.write(os.path.join(tmp_dir, member), member)
        finally:
            # Clean up the temp dir on success and on failure alike.
            shutil.rmtree(tmp_dir)
# Namespace URI of the w: elements in word/document.xml.
word_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

document_target = DocsWriter('inputdoc.docx')
extracted_xml = document_target.get_word_xml()

# Parse the raw bytes instead of decoding them by hand: the parser honours
# the encoding declared in the XML prolog, so non-ASCII text is not corrupted
# by guessing an encoding such as ISO-8859-1.
adjust_xml_content = etree.fromstring(extracted_xml)

# Replace 'abc' only inside the text of w:instrText elements. Working on the
# parsed tree cannot accidentally rewrite tag or attribute names the way a
# regex over the XML source can.
for instr_node in adjust_xml_content.iter('{%s}instrText' % word_ns):
    if instr_node.text and 'abc' in instr_node.text:
        instr_node.text = instr_node.text.replace('abc', 'zzzz')

document_target._write_and_close_docx(adjust_xml_content, "outputdoc.docx")
I need help converting it back to a docx file with the corrected document.xml.
Thanks
Advertisement
Answer
Your XML handling is dubious. You treat the XML as text and try to replace things in the XML source with regex. Never, never do that.
The way to make changes to XML is
- parse it into a DOM tree (`tree = ET.parse(filename)`)
- manipulate the DOM tree
  - find nodes, e.g. with XPath (`nodes = tree.xpath('...')`)
  - make changes to those nodes, e.g. by updating their text (`node.text = '...'`) – using regex at this point is fine
- write the DOM tree (`tree.write(filename)`)
This is better:
from os import path
from shutil import rmtree
from lxml import etree as ET
from zipfile import ZipFile
from tempfile import mkdtemp
class DocxHelper:
    """Read an XML part from a .docx archive and write a modified copy."""

    def __init__(self, docx_file):
        """Remember the path of the source .docx archive."""
        self.docx_file = docx_file

    def get_xml(self, filename):
        """Parse the archive member *filename* and return it as an element tree."""
        with ZipFile(self.docx_file) as docx:
            with docx.open(filename) as xml:
                # 1. parse into tree
                return ET.parse(xml)

    def set_xml(self, filename, tree, output_filename):
        """Write a copy of the archive with member *filename* replaced by *tree*.

        The source archive is expanded into a temporary directory, *tree* is
        serialized over the extracted *filename*, and every member of the
        source archive is zipped into *output_filename*. The temporary
        directory is removed even if a step fails.
        """
        tmp_dir = mkdtemp()
        try:
            with ZipFile(self.docx_file) as docx:
                member_names = docx.namelist()
                docx.extractall(tmp_dir)

            # 3. write tree
            # Ask for UTF-8 and an XML declaration explicitly; without an
            # encoding lxml serializes as ASCII and omits the declaration.
            tree.write(
                path.join(tmp_dir, filename),
                pretty_print=True,
                xml_declaration=True,
                encoding='UTF-8',
            )

            with ZipFile(output_filename, 'w') as docx:
                # A separate loop name keeps the *filename* parameter intact.
                for member in member_names:
                    docx.write(path.join(tmp_dir, member), member)
        finally:
            rmtree(tmp_dir)
Usage
input_docx = 'inputdoc.docx'
output_docx = 'outputdoc.docx'
filename = 'word/document.xml'

# Text to look for inside w:instrText and what to put in its place
# (the question's sample turns 'abc\\data' into 'zzzz\\data').
search_text = 'abc'
replacement = 'zzzz'

dh = DocxHelper(input_docx)
xml_tree = dh.get_xml(filename)

# 2. manipulate the DOM tree
ns = {
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
}

# The search string is passed as an XPath variable ($needle) so it never has
# to be quoted or escaped inside the expression itself.
matches = xml_tree.xpath(
    '//w:r/w:instrText[contains(., $needle)]',
    namespaces=ns,
    needle=search_text,
)
for elem in matches:
    # Guard against an element with no direct text node.
    if elem.text:
        elem.text = elem.text.replace(search_text, replacement)

dh.set_xml(filename, xml_tree, output_docx)
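The XPath //w:r/w:instrText[contains(., "...")] selects every w:instrText element inside a w:r run whose text contains the quoted search string; set that string to the text from your sample and adapt the expression as needed. Note the use of the w: namespace prefix, which must be mapped to the WordprocessingML namespace URI through the namespaces argument.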