why doesn't it replace all html tags?

why doesn't it replace all html tags? - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: Python Coding (https://python-forum.io/forum-7.html)
+--- Forum: General Coding Help (https://python-forum.io/forum-8.html)
+--- Thread: why doesn't it replace all html tags? (/thread-40279.html)

why doesn't it replace all html tags? - Melcu54 - Jul-04-2023

I have these lines in an HTML file. Each of them must be translated into Russian if it contains more than 3 words found in the list. The problem is that only the last tag, <p class="text_obisnuit">, is translated. I don't know why they are not all translated, because they have the same content.

<title>The Fatal Solidarity, Whose because Punishment here two one Flings Its Condemned whose love me four times seven The Fatal Solidarity</title>
<meta name="description" content="Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven">
<p class="text_obisnuit2">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p>
<p class="text_obisnuit">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p>

And this is the Python code:

import os
import re
from googletrans import Translator

# Script as posted in the question: for every .html file in a folder, find
# selected HTML fragments, translate the ones that contain enough of the
# listed words into Russian, and write the result back to the same file.
translator = Translator()

# Folder path
folder_path = r"c:\Folder3\2"

# HTML tags to translate (regex patterns; they contain no capture group, so
# re.findall returns the whole matched fragment, markup included)
tags_to_translate = ['<title>.*?</title>', '<meta name="description" content=".*?">', '<p class="text_obisnuit2">.*?</p>', '<p class="text_obisnuit">.*?</p>']

# Words to check
words_to_check = ['whose', 'the', 'you', 'which', 'view', 'because', 'here', 'have', 'this', 'two', 'one', 'three', 'four', 'five', 'six', 'seven', 'ten', 'had', 'then', 'see', 'saw', 'also', 'than', 'that', 'must', 'make', 'from']


# Initialize the amount of translated files
amount = 0

# Iterate over all HTML files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()

            # Iterate over all tags to translate
            # NOTE: `matches` is rebound on every iteration and nothing else
            # happens inside this loop, so once it finishes only the results
            # for the last pattern in tags_to_translate are left.
            for tag in tags_to_translate:
                matches = re.findall(tag, html_content, re.DOTALL)

        # Translate each match
        # NOTE: this code sits after the tag loop (and after the `with`
        # block), so it only ever sees the matches of the last pattern.
        translated_matches = []
        for match in matches:
            original_match = match  # Keep a copy of the original match
            # Check if the match contains at least three of the specified words
            # (`word in match` is a case-sensitive substring test on the whole
            # fragment, markup included)
            if sum(word in match for word in words_to_check) >= 3:
                # Split the match into sentences (at whitespace that follows
                # '.', '!' or '?')
                sentences = re.split(r'(?<=[.!?])\s+', match)

                # Initialize a string to hold the translated match
                translated_match = match

                # Translate each sentence that contains at least three of the specified words
                for sentence in sentences:
                    if sum(word in sentence for word in words_to_check) >= 3:
                        translation = translator.translate(sentence, dest='ru').text

                        # Replace the original sentence with the translation in the translated match
                        translated_match = translated_match.replace(sentence, translation)

                # Add the translated match to the list of translated matches
                translated_matches.append((original_match, translated_match))

        # Replace each original match with the corresponding translated match in the HTML content
        for original_match, translated_match in translated_matches:
            html_content = html_content.replace(original_match, translated_match)

        # Write the translated HTML content back to the file
        with open(os.path.join(folder_path, filename), 'w', encoding='utf-8') as file:
            file.write(html_content)
# NOTE: the two statements below are at top level, outside the file loop, so
# they run once after the loop ends: `filename` is whatever entry was listed
# last (undefined if the folder is empty) and `amount` is printed before it
# is incremented.
print(f'{filename} translated ({amount})')
amount += 1

RE: why doesn't it replace all html tags? - deanhystad - Jul-05-2023

Divide and conquer. First verify that you are finding matches.

import os
import re
 
# Diagnostic script: same file/tag loops as the question's code, with the
# translation removed, so the matches that actually get processed are printed.
folder_path = r"c:\Folder3\2"
 
# HTML tags to translate
tags_to_translate = ['<title>.*?</title>', '<meta name="description" content=".*?">', '<p class="text_obisnuit2">.*?</p>', '<p class="text_obisnuit">.*?</p>']
 
# Iterate over all HTML files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()
 
            # `matches` is reassigned for every pattern; nothing is done with
            # it inside this loop.
            for tag in tags_to_translate:
                matches = re.findall(tag, html_content, re.DOTALL)
 
        # This loop runs after the tag loop has finished, so it prints only
        # the matches found for the last pattern in tags_to_translate.
        for match in matches:
            print(match)

Run this code. It should uncover some problems with how you process matches. Once you get it working correctly you can move onto step 2, translation.

RE: why doesn't it replace all html tags? - Melcu54 - Jul-05-2023

I found the solution. Thanks, deanhystad!

import os
import re
from googletrans import Translator

folder_path = r"c:\Folder3\2"

# Regex patterns for the HTML fragments to translate. Each pattern has exactly
# one capture group: the text that is sent to the translator. The markup
# around the group is left untouched.
tags_to_translate = [
    r'<title>(.*?)</title>',
    r'<meta\s*name="description"\s*content="(.*?)"\s*>',
    r'<p class="text_obisnuit2">(.*?)</p>',
    r'<p class="text_obisnuit">(.*?)</p>',
]


def translate_html(html_content, translate, patterns=None):
    """Return *html_content* with the captured text of each pattern translated.

    Args:
        html_content: The HTML document as a string.
        translate: Callable taking the original text and returning its
            translation.
        patterns: Iterable of regex patterns, each with one capture group
            marking the text to translate. Defaults to ``tags_to_translate``.

    Only the span of the capture group inside each match is replaced, so
    identical text elsewhere in the document is never rewritten by accident
    (which a plain ``str.replace`` on the whole document would do). Empty or
    whitespace-only captures are skipped: there is nothing to translate, and
    replacing an empty string would corrupt the document.
    """
    if patterns is None:
        patterns = tags_to_translate

    # Identical texts (e.g. the same sentence in <title> and <p>) are
    # translated only once per document.
    cache = {}

    def substitute(found):
        inner = found.group(1)
        if not inner.strip():
            return found.group(0)
        if inner not in cache:
            cache[inner] = translate(inner)
        # Rebuild the match with only the captured span swapped out.
        whole = found.group(0)
        offset = found.start()
        start, end = found.span(1)
        return whole[:start - offset] + cache[inner] + whole[end - offset:]

    for pattern in patterns:
        html_content = re.sub(pattern, substitute, html_content, flags=re.DOTALL)
    return html_content


def main():
    """Translate every .html file in ``folder_path`` into a 'translated' subfolder."""
    # Initialize the translator
    translator = Translator()

    def to_russian(text):
        return translator.translate(text, dest='ru').text

    translated_folder_path = os.path.join(folder_path, 'translated')

    # Counter for the translated files
    translated_files_count = 0

    # Iterate over all HTML files in the folder
    for filename in os.listdir(folder_path):
        if not filename.endswith(".html"):
            continue

        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()

        html_content = translate_html(html_content, to_russian)

        # splitext keeps dots inside the name ("a.b.html" -> "a.b_ru.html");
        # splitting on the first dot would make such names collide.
        new_filename = f'{os.path.splitext(filename)[0]}_ru.html'
        os.makedirs(translated_folder_path, exist_ok=True)
        with open(os.path.join(translated_folder_path, new_filename), 'w', encoding='utf-8') as file:
            file.write(html_content)

        # Increment the counter and print the file name and number
        translated_files_count += 1
        print(f"Fișierul tradus #{translated_files_count}: {new_filename}")


if __name__ == "__main__":
    main()

RE: why doesn't it replace all html tags? - Melcu54 - Jul-05-2023

and the full code here, works fine now:

import os
import re
from googletrans import Translator

folder_path = r"c:\Folder3\2"

# Regex patterns for the HTML fragments to translate. Each pattern has exactly
# one capture group: the text that is sent to the translator. The markup
# around the group is left untouched.
tags_to_translate = [
    r'<title>(.*?)</title>',
    r'<meta\s*name="description"\s*content="(.*?)"\s*>',
    r'<p class="text_obisnuit2">(.*?)</p>',
    r'<p class="text_obisnuit">(.*?)</p>',
]

# Words to check
words_to_check = ['whose', 'the', 'you', 'which', 'view', 'because', 'here', 'have', 'this', 'two', 'one', 'three', 'four', 'five', 'six', 'seven', 'ten', 'had', 'then', 'see', 'saw', 'also', 'than', 'that', 'must', 'make', 'from']


def count_listed_words(text, words=None):
    """Return how many distinct entries of *words* occur in *text*.

    Matching is on whole words and ignores case, so "The" counts for 'the'
    while "other" or "often" do not count for 'the' or 'ten' (a plain
    substring test would count them). *words* defaults to ``words_to_check``.
    """
    if words is None:
        words = words_to_check
    present = set(re.findall(r"\w+", text.casefold()))
    return len(present.intersection(word.casefold() for word in words))


def translate_html(html_content, translate, patterns=None, min_words=3):
    """Return *html_content* with qualifying captured texts translated.

    Args:
        html_content: The HTML document as a string.
        translate: Callable taking the original text and returning its
            translation.
        patterns: Iterable of regex patterns, each with one capture group
            marking the text to translate. Defaults to ``tags_to_translate``.
        min_words: Minimum number of distinct listed words (see
            ``count_listed_words``) a captured text must contain to be
            translated.

    Only the span of the capture group inside each match is replaced, so
    identical text elsewhere in the document is never rewritten by accident
    (which a plain ``str.replace`` on the whole document would do). Empty or
    whitespace-only captures are skipped.
    """
    if patterns is None:
        patterns = tags_to_translate

    # Identical texts (e.g. the same sentence in <title> and <p>) are
    # translated only once per document.
    cache = {}

    def substitute(found):
        inner = found.group(1)
        if not inner.strip() or count_listed_words(inner) < min_words:
            return found.group(0)
        if inner not in cache:
            cache[inner] = translate(inner)
        # Rebuild the match with only the captured span swapped out.
        whole = found.group(0)
        offset = found.start()
        start, end = found.span(1)
        return whole[:start - offset] + cache[inner] + whole[end - offset:]

    for pattern in patterns:
        html_content = re.sub(pattern, substitute, html_content, flags=re.DOTALL)
    return html_content


def main():
    """Translate every .html file in ``folder_path`` into a 'translated' subfolder."""
    # Initialize the translator
    translator = Translator()

    def to_russian(text):
        return translator.translate(text, dest='ru').text

    translated_folder_path = os.path.join(folder_path, 'translated')

    # Counter for the translated files
    translated_files_count = 0

    # Iterate over all HTML files in the folder
    for filename in os.listdir(folder_path):
        if not filename.endswith(".html"):
            continue

        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()

        html_content = translate_html(html_content, to_russian)

        # splitext keeps dots inside the name ("a.b.html" -> "a.b_ru.html");
        # splitting on the first dot would make such names collide.
        new_filename = f'{os.path.splitext(filename)[0]}_ru.html'
        os.makedirs(translated_folder_path, exist_ok=True)
        with open(os.path.join(translated_folder_path, new_filename), 'w', encoding='utf-8') as file:
            file.write(html_content)

        # Increment the counter and print the file name and number
        translated_files_count += 1
        print(f"Fișierul tradus #{translated_files_count}: {new_filename}")


if __name__ == "__main__":
    main()