Dec-08-2023, 07:31 PM
This myApp() function edits the word/document.xml file inside the .docx file (which is really just a zip archive) and writes a new .docx file with different column widths, which actually show up in LibreOffice!
Haven't tried to open the new file in Windows. I'm happy if it works in Libre Office!
Word must be storing the settings somewhere other than document.xml, because when I open the original file in Windows, without manipulating document.xml, it displays correctly. This may be a version problem.
Maybe someone here could help me tidy this up a bit?? I never worked with zipfile before!
Haven't tried to open the new file in Windows. I'm happy if it works in Libre Office!
Word must be storing the settings somewhere other than document.xml, because when I open the original file in Windows, without manipulating document.xml, it displays correctly. This may be a version problem.
Maybe someone here could help me tidy this up a bit?? I never worked with zipfile before!
import os
import re
from typing import Sequence
from zipfile import ZipFile

# Archive member that holds the main document body of a .docx file.
_DOCUMENT_XML = 'word/document.xml'

# Raw bytes patterns (ZipFile.read returns bytes). Groups 1 and 2 capture the
# text on either side of the number so that only the number is swapped out.
_TBL_WIDTH_RE = re.compile(rb'(<w:tblW w:w=")\d+(" w:type="dxa"/>)')
_GRID_COL_RE = re.compile(rb'(<w:gridCol w:w=")\d+("/>)')


def _resize_columns(docdata: bytes, newcolsizes: Sequence[int],
                    extra_width: int) -> bytes:
    """Return *docdata* with the table width and grid column widths rewritten.

    Every ``<w:tblW ... w:type="dxa"/>`` element gets the width
    ``sum(newcolsizes) + extra_width``.  The ``<w:gridCol>`` elements are
    assigned the values of *newcolsizes* in document order; any grid columns
    beyond ``len(newcolsizes)`` (for example those of a second table) are
    left unchanged instead of raising ``IndexError``.
    """
    table_width = str(sum(newcolsizes) + extra_width).encode()

    def _set_table_width(match):
        replacement = match.group(1) + table_width + match.group(2)
        print(match.group(0), '->', replacement)
        return replacement

    docdata = _TBL_WIDTH_RE.sub(_set_table_width, docdata)

    # re.sub calls the function once per match, left to right, and replaces
    # each match at its own position.  Substituting by matched *text* instead
    # hits the wrong element whenever an already-written new width equals a
    # later column's old width.
    remaining = iter(newcolsizes)

    def _set_col_width(match):
        size = next(remaining, None)
        if size is None:
            # More grid columns than sizes supplied: keep the original.
            return match.group(0)
        replacement = match.group(1) + str(size).encode() + match.group(2)
        print(match.group(0), '->', replacement)
        return replacement

    return _GRID_COL_RE.sub(_set_col_width, docdata)


def myApp(
    path2files: str = "/home/pedro/myPython/zipfile/",
    zfile: str = "example_copy.docx",
    outfile: str = "example_copy2.docx",
    newcolsizes: Sequence[int] = (500, 300, 600),
    extra_width: int = 2500,
) -> None:
    """Copy a .docx file, changing the table column widths on the way.

    A .docx file is a zip archive.  This reads ``word/document.xml`` from
    *zfile*, rewrites the table width and the grid column widths, and writes
    a new archive *outfile* that contains every member of the original with
    the modified ``word/document.xml`` in place of the old one.

    Called with no arguments it processes ``example_copy.docx`` in
    ``/home/pedro/myPython/zipfile/`` and writes ``example_copy2.docx`` next
    to it.  The working directory is not changed and nothing is extracted
    to disk.

    Args:
        path2files: Directory that *zfile* and *outfile* are relative to.
        zfile: Name of the source .docx file.
        outfile: Name of the .docx file to create (overwritten if present).
        newcolsizes: New widths for the ``<w:gridCol>`` elements, in order.
        extra_width: Amount added to ``sum(newcolsizes)`` to get the overall
            table width.  The default of 2500 was found by trial and error
            by the original author; it is not derived from the file format.

    Raises:
        ValueError: If *outfile* resolves to the same path as *zfile*.
        KeyError: If the archive has no ``word/document.xml`` member.
        FileNotFoundError: If the source file does not exist.

    Note:
        Widths with ``w:type="dxa"`` are measured in twentieths of a point
        according to the WordprocessingML specification.
    """
    source = os.path.join(path2files, zfile)
    target = os.path.join(path2files, outfile)
    if os.path.abspath(source) == os.path.abspath(target):
        # Opening the target in 'w' mode would truncate the source while it
        # is still being read.
        raise ValueError('outfile must differ from zfile')

    with ZipFile(source, 'r') as zin:
        # printing all the contents of the zip file
        zin.printdir()
        docdata = _resize_columns(
            zin.read(_DOCUMENT_XML), newcolsizes, extra_width)

        # Make a copy of the archive, swapping in the modified document.xml.
        # Passing the original ZipInfo keeps each member's name, timestamp
        # and compression method.
        with ZipFile(target, 'w') as zout:
            for item in zin.infolist():
                if item.filename == _DOCUMENT_XML:
                    zout.writestr(item, docdata)
                else:
                    zout.writestr(item, zin.read(item.filename))

# If there are several tables, this may get more complicated!