![]() |
why doesn't it replace all html tags? - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: General Coding Help (https://python-forum.io/forum-8.html) +--- Thread: why doesn't it replace all html tags? (/thread-40279.html) |
why doesn't it replace all html tags? - Melcu54 - Jul-04-2023 I have these lines in an HTML file. Each of them must be translated into Russian if it contains at least 3 of the words found in the list. The problem is that only the last tag, <p class="text_obisnuit">, is translated. I don't know why they are not all translated, because they have the same content. <title>The Fatal Solidarity, Whose because Punishment here two one Flings Its Condemned whose love me four times seven The Fatal Solidarity</title> <meta name="description" content="Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven"> <p class="text_obisnuit2">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p> <p class="text_obisnuit">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p> And this is the Python code: import os import re from googletrans import Translator translator = Translator() # Folder path folder_path = r"c:\Folder3\2" # HTML tags to translate tags_to_translate = ['<title>.*?</title>', '<meta name="description" content=".*?">', '<p class="text_obisnuit2">.*?</p>', '<p class="text_obisnuit">.*?</p>'] # Words to check words_to_check = ['whose', 'the', 'you', 'which', 'view', 'because', 'here', 'have', 'this', 'two', 'one', 'three', 'four', 'five', 'six', 'seven', 'ten', 'had', 'then', 'see', 'saw', 'also', 'than', 'that', 'must', 'make', 'from'] # Initialize the amount of translated files amount = 0 # Iterate over all HTML files in the folder for filename in os.listdir(folder_path): if filename.endswith(".html"): with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file: html_content = file.read() # Iterate over all tags to translate for tag in tags_to_translate: matches = re.findall(tag, html_content, re.DOTALL) # Translate each match 
translated_matches = [] for match in matches: original_match = match # Keep a copy of the original match # Check if the match contains at least three of the specified words if sum(word in match for word in words_to_check) >= 3: # Split the match into sentences sentences = re.split(r'(?<=[.!?])\s+', match) # Initialize a string to hold the translated match translated_match = match # Translate each sentence that contains at least three of the specified words for sentence in sentences: if sum(word in sentence for word in words_to_check) >= 3: translation = translator.translate(sentence, dest='ru').text # Replace the original sentence with the translation in the translated match translated_match = translated_match.replace(sentence, translation) # Add the translated match to the list of translated matches translated_matches.append((original_match, translated_match)) # Replace each original match with the corresponding translated match in the HTML content for original_match, translated_match in translated_matches: html_content = html_content.replace(original_match, translated_match) # Write the translated HTML content back to the file with open(os.path.join(folder_path, filename), 'w', encoding='utf-8') as file: file.write(html_content) print(f'{filename} translated ({amount})') amount += 1 RE: why doesn't it replace all html tags? - deanhystad - Jul-05-2023 Divide and conquer. First verify that you are finding matches. 
import os import re folder_path = r"c:\Folder3\2" # HTML tags to translate tags_to_translate = ['<title>.*?</title>', '<meta name="description" content=".*?">', '<p class="text_obisnuit2">.*?</p>', '<p class="text_obisnuit">.*?</p>'] # Iterate over all HTML files in the folder for filename in os.listdir(folder_path): if filename.endswith(".html"): with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file: html_content = file.read() for tag in tags_to_translate: matches = re.findall(tag, html_content, re.DOTALL) for match in matches: print(match) Run this code. It should uncover some problems with how you process matches. Once you get it working correctly you can move on to step 2, translation. RE: why doesn't it replace all html tags? - Melcu54 - Jul-05-2023 I found the solution. Thanks, deanhystad! import os import re from googletrans import Translator folder_path = r"c:\Folder3\2" # HTML tags to translate tags_to_translate = [r'<title>(.*?)</title>', r'<meta\s*name="description"\s*content="(.*?)"\s*>', r'<p class="text_obisnuit2">(.*?)</p>', r'<p class="text_obisnuit">(.*?)</p>'] # Initialize the translator translator = Translator() # Initialize a counter for the translated files translated_files_count = 0 # Iterate over all HTML files in the folder for filename in os.listdir(folder_path): if filename.endswith(".html"): with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file: html_content = file.read() for tag in tags_to_translate: matches = re.findall(tag, html_content, re.DOTALL) for match in matches: # Translate the content translated_content = translator.translate(match, dest='ru').text # Replace the original content with the translated content html_content = html_content.replace(match, translated_content) # Write the translated HTML content back to the file new_filename = f'{filename.split(".")[0]}_ru.html' translated_folder_path = os.path.join(folder_path, 'translated') if not os.path.exists(translated_folder_path): 
os.mkdir(translated_folder_path) with open(os.path.join(translated_folder_path, new_filename), 'w', encoding='utf-8') as file: file.write(html_content) # Increment the counter and print the file name and number translated_files_count += 1 print(f"Fișierul tradus #{translated_files_count}: {new_filename}") RE: why doesn't it replace all html tags? - Melcu54 - Jul-05-2023 And here is the full code, which works fine now: import os import re from googletrans import Translator folder_path = r"c:\Folder3\2" # HTML tags to translate tags_to_translate = [r'<title>(.*?)</title>', r'<meta\s*name="description"\s*content="(.*?)"\s*>', r'<p class="text_obisnuit2">(.*?)</p>', r'<p class="text_obisnuit">(.*?)</p>'] # Words to check words_to_check = ['whose', 'the', 'you', 'which', 'view', 'because', 'here', 'have', 'this', 'two', 'one', 'three', 'four', 'five', 'six', 'seven', 'ten', 'had', 'then', 'see', 'saw', 'also', 'than', 'that', 'must', 'make', 'from'] # Initialize the translator translator = Translator() # Initialize a counter for the translated files translated_files_count = 0 # Iterate over all HTML files in the folder for filename in os.listdir(folder_path): if filename.endswith(".html"): with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file: html_content = file.read() for tag in tags_to_translate: matches = re.findall(tag, html_content, re.DOTALL) for match in matches: # Check if the content contains at least 3 words from words_to_check words_count = sum(word in match for word in words_to_check) if words_count >= 3: # Translate the content translated_content = translator.translate(match, dest='ru').text # Replace the original content with the translated content html_content = html_content.replace(match, translated_content) # Write the translated HTML content back to the file new_filename = f'{filename.split(".")[0]}_ru.html' translated_folder_path = os.path.join(folder_path, 'translated') if not os.path.exists(translated_folder_path): 
os.mkdir(translated_folder_path) with open(os.path.join(translated_folder_path, new_filename), 'w', encoding='utf-8') as file: file.write(html_content) # Increment the counter and print the file name and number translated_files_count += 1 print(f"Fișierul tradus #{translated_files_count}: {new_filename}") |