Jul-04-2023, 09:07 AM
I have these lines in a html file. each of them must be translated into Russian, if in contain more than 3 words found in the list. The problem is that only the last tag, <p class="text_obisnuit">, is translated. I don't know why they are not all translated, because they have the same content.
<title>The Fatal Solidarity, Whose because Punishment here two one Flings Its Condemned whose love me four times seven The Fatal Solidarity</title> <meta name="description" content="Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven"> <p class="text_obisnuit2">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p> <p class="text_obisnuit">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p>And thise is the Python code
import os import re from googletrans import Translator translator = Translator() # Folder path folder_path = r"c:\Folder3\2" # HTML tags to translate tags_to_translate = ['<title>.*?</title>', '<meta name="description" content=".*?">', '<p class="text_obisnuit2">.*?</p>', '<p class="text_obisnuit">.*?</p>'] # Words to check words_to_check = ['whose', 'the', 'you', 'which', 'view', 'because', 'here', 'have', 'this', 'two', 'one', 'three', 'four', 'five', 'six', 'seven', 'ten', 'had', 'then', 'see', 'saw', 'also', 'than', 'that', 'must', 'make', 'from'] # Initialize the amount of translated files amount = 0 # Iterate over all HTML files in the folder for filename in os.listdir(folder_path): if filename.endswith(".html"): with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file: html_content = file.read() # Iterate over all tags to translate for tag in tags_to_translate: matches = re.findall(tag, html_content, re.DOTALL) # Translate each match translated_matches = [] for match in matches: original_match = match # Keep a copy of the original match # Check if the match contains at least three of the specified words if sum(word in match for word in words_to_check) >= 3: # Split the match into sentences sentences = re.split(r'(?<=[.!?])\s+', match) # Initialize a string to hold the translated match translated_match = match # Translate each sentence that contains at least three of the specified words for sentence in sentences: if sum(word in sentence for word in words_to_check) >= 3: translation = translator.translate(sentence, dest='ru').text # Replace the original sentence with the translation in the translated match translated_match = translated_match.replace(sentence, translation) # Add the translated match to the list of translated matches translated_matches.append((original_match, translated_match)) # Replace each original match with the corresponding translated match in the HTML content for original_match, translated_match in translated_matches: html_content = html_content.replace(original_match, translated_match) # Write the translated HTML content back to the file with open(os.path.join(folder_path, filename), 'w', encoding='utf-8') as file: file.write(html_content) print(f'{filename} translated ({amount})') amount += 1