Jun-09-2023, 10:41 AM
(I updated the code with comments)
Hi everyone, I am creating an xml file with python using xml.etree.ElementTree. In input I have a docx, where the titles are formatted like "Header1" and the paragraphs under the title will be the text of the title (in short).
I need to format the xml properly. At the moment I have this code (I attach it)
One of the xml output I have is
https://1drv.ms/i/s!AohXx8uDzsTq8xsoHKat...T?e=omJVaQ
...here after "lines" elem. the text is not wrapping. Instead I want that "lines" is like a group of the string like is here
https://1drv.ms/i/s!AohXx8uDzsTq8xzcRknU...g?e=QvGaYD
I attached also a test file.
If you prefer, here is the code:
[attachment=2399]
Hi everyone, I am creating an xml file with python using xml.etree.ElementTree. In input I have a docx, where the titles are formatted like "Header1" and the paragraphs under the title will be the text of the title (in short).
I need to format the xml properly. At the moment I have this code (I attach it)
One of the xml output I have is
https://1drv.ms/i/s!AohXx8uDzsTq8xsoHKat...T?e=omJVaQ
...here after "lines" elem. the text is not wrapping. Instead I want that "lines" is like a group of the string like is here
https://1drv.ms/i/s!AohXx8uDzsTq8xzcRknU...g?e=QvGaYD
I attached also a test file.
If you prefer, here is the code:
import os import re import xml.etree.ElementTree as ET from docx import Document def generate_xml(docx_file, output_folder): # Read the docx file document = Document(docx_file) # Initialize variables verse_num = 1 verse_type = 'V' song_count = 0 lines_text = '' verse_order_set = set() verseOrder = None lyrics = None root = None lines = None # Loop through the paragraphs in the docx file for i, para in enumerate(document.paragraphs): # Check if the paragraph style is "Heading 1" if para.style.name == 'Heading 1': # Create the root "song" element and set its attributes root = ET.Element("song") root.set("xmlns", "http://openlyrics.info/namespace/2009/song") root.set("version", "0.8") root.set("createdIn", "OpenLP 2.4.3") root.set("modifiedIn", "FreeWorship 3.2301.280.0") root.set("modifiedDate", "2023-04-27T18:47:00") # Create the child elements of the root element properties = ET.SubElement(root, 'properties') titles = ET.SubElement(properties, 'titles') title = ET.SubElement(titles, 'title') title.text = para.text authors = ET.SubElement(properties, 'authors') author = ET.SubElement(authors, 'author') author.text="Author Unknown" verseOrder = ET.SubElement(properties, 'verseOrder') verseOrder.text = '' songbooks = ET.SubElement(properties, 'songbooks') songbook = ET.SubElement(songbooks, 'songbook') songbook.set("name","Superbook") songbook.set("entry","SuperBook") lyrics = ET.SubElement(root, 'lyrics') # Increment the song count and reset variables for the next song song_count += 1 verse_order_set.clear() verse_num = 1 bridge_num = 1 chorus_num = 1 else: # Check if the paragraph text is not empty if para.text.strip(): first_word = para.text.split()[0] # Check if the first word is a verse number if (first_word[-1] == '.' or first_word[-1] == ')') and first_word[:-1].isdigit(): # Add the lines text to the "lines" element if lines_text and lines is not None: lines.text = lines_text[:-5] lines_text = '' # Increment the verse number and set the verse type to "V" verse_num += 1 verse_type = 'V' # Remove the verse number from the paragraph text para.text = para.text[len(first_word):].strip() # Set the verse name verse_name = f'{verse_type}{verse_num}' # Check if the first word is "Coro" elif 'Coro' in first_word: # Add the lines text to the "lines" element if lines_text and lines is not None: lines.text = lines_text[:-5] lines_text = '' # Set the verse type to "C" verse_type = 'C' # Remove the first word from the paragraph text para.text = para.text[len(first_word):].strip() # Set the verse name verse_name = f'{verse_type}{chorus_num}' # Increment the chorus number chorus_num += 1 # Check if the first word is "Bridge:" elif first_word == 'Bridge:': # Add the lines text to the "lines" element if lines_text and lines is not None: lines.text = lines_text[:-5] lines_text = '' # Set the verse type to "B" verse_type = 'B' # Remove the first word from the paragraph text para.text = para.text[len(first_word):].strip() # Set the verse name verse_name = f'{verse_type}{bridge_num}' else: # Set the verse type to "V" verse_type = 'V' # Set the verse name verse_name = f'{verse_type}{verse_num}' # Check if the verse name is not in the verse order set if verse_name not in verse_order_set: # Add the verse name to the "verseOrder" element text if verseOrder is not None: if not verseOrder.text: verseOrder.text += f'{verse_name}' else: verseOrder.text += f'{verse_name}' # Add the verse name to the verse order set verse_order_set.add(verse_name) # Check if there is no lines text yet if not lines_text and lyrics is not None: # Create the "verse" and "lines" elements verse = ET.SubElement(lyrics, 'verse', attrib={'name': verse_name, 'lang': ''}) lines= ET.SubElement(verse, 'lines') # Initialize a list to store the text runs text_runs = [] # Loop through the runs in the paragraph for run in para.runs: # Check if the run font color is None or black if run.font.color.rgb is None or run.font.color.rgb == "000000": # Append the run text to the text runs list text_runs.append(run.text) else: # Split the run text into words and add brackets around each word words = run.text.split() bracketed_words = ['[' + word + ']' for word in words] # Join the bracketed words and append them to the text runs list text_runs.append(' '.join(bracketed_words)) # Join the text runs and add line breaks para_text = ''.join(text_runs).replace('\n','<br/>') +'<br/>' # Add the paragraph text to the lines text lines_text += para_text # Check if this is the last paragraph or if the next paragraph style is "Heading 1" if i == len(document.paragraphs) - 1 or document.paragraphs[i + 1].style.name == 'Heading 1': # Check if there is a root element if root is not None: # Add the lines text to the "lines" element if lines_text and lyrics is not None and lines is not None: lines.text = lines_text[:-5] lines_text = '' # Create an ElementTree object and write it to an XML file tree = ET.ElementTree(root) filename = re.sub(r'[^\w\s-]', '', title.text).strip().lower() # filename = re.sub(r'[-\s]+', '-', filename) output_file = os.path.join(output_folder, f'{filename}.xml') tree.write(output_file, encoding='utf-8', xml_declaration=True) # Call the generate_xml function with a docx file and an output folder as arguments generate_xml("C:/Users/Daniele/Downloads/test.docx","C:/Users/Daniele/Downloads/xml")[attachment=2395]
[attachment=2399]