why doesn't it replace all html tags?

Melcu54 · Jul-04-2023, 09:07 AM

I have these lines in an HTML file. Each of them must be translated into Russian if it contains more than 3 words found in the list. The problem is that only the last tag, <p class="text_obisnuit">, is translated. I don't know why they are not all translated, because they all have the same content.

<title>The Fatal Solidarity, Whose because Punishment here two one Flings Its Condemned whose love me four times seven The Fatal Solidarity</title>
<meta name="description" content="Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven">
<p class="text_obisnuit2">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p>
<p class="text_obisnuit">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p>

And this is the Python code:

import os
import re
from googletrans import Translator

translator = Translator()

# Folder path
folder_path = r"c:\Folder3\2"

# HTML tags to translate
# NOTE: these patterns contain no capturing groups, so re.findall() returns the
# whole matched element (markup included), not just the text inside it.
tags_to_translate = ['<title>.*?</title>', '<meta name="description" content=".*?">', '<p class="text_obisnuit2">.*?</p>', '<p class="text_obisnuit">.*?</p>']

# Words to check
words_to_check = ['whose', 'the', 'you', 'which', 'view', 'because', 'here', 'have', 'this', 'two', 'one', 'three', 'four', 'five', 'six', 'seven', 'ten', 'had', 'then', 'see', 'saw', 'also', 'than', 'that', 'must', 'make', 'from']


# Initialize the amount of translated files
amount = 0

# Iterate over all HTML files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()

            # Iterate over all tags to translate
            # NOTE: `matches` is rebound on every iteration and nothing inside
            # this loop consumes it, so once the loop ends it only holds the
            # results for the LAST pattern in tags_to_translate.
            for tag in tags_to_translate:
                matches = re.findall(tag, html_content, re.DOTALL)

        # Translate each match
        # NOTE: this section is dedented out of the `for tag` loop above, so it
        # runs once per file and only ever sees the last pattern's matches.
        # This is why only <p class="text_obisnuit"> gets translated.
        translated_matches = []
        for match in matches:
            original_match = match  # Keep a copy of the original match
            # Check if the match contains at least three of the specified words
            # (`word in match` is a case-sensitive substring test, and `match`
            # still includes the tag markup here)
            if sum(word in match for word in words_to_check) >= 3:
                # Split the match into sentences
                sentences = re.split(r'(?<=[.!?])\s+', match)

                # Initialize a string to hold the translated match
                translated_match = match

                # Translate each sentence that contains at least three of the specified words
                for sentence in sentences:
                    if sum(word in sentence for word in words_to_check) >= 3:
                        translation = translator.translate(sentence, dest='ru').text

                        # Replace the original sentence with the translation in the translated match
                        translated_match = translated_match.replace(sentence, translation)

                # Add the translated match to the list of translated matches
                translated_matches.append((original_match, translated_match))

        # Replace each original match with the corresponding translated match in the HTML content
        for original_match, translated_match in translated_matches:
            html_content = html_content.replace(original_match, translated_match)

        # Write the translated HTML content back to the file
        # (this overwrites the source file in place)
        with open(os.path.join(folder_path, filename), 'w', encoding='utf-8') as file:
            file.write(html_content)
# NOTE: the two statements below sit at module level, outside the `for filename`
# loop, so they run once after the loop ends (using the last `filename`) rather
# than once per translated file.
print(f'{filename} translated ({amount})')
amount += 1

**deanhystad** · Jul-05-2023, 02:27 AM

Divide and conquer. First verify that you are finding matches.

import os
import re
 
# Diagnostic script: prints what the regex search actually hands on to the
# processing step, without doing any translation.
folder_path = r"c:\Folder3\2"
 
# HTML tags to translate
tags_to_translate = ['<title>.*?</title>', '<meta name="description" content=".*?">', '<p class="text_obisnuit2">.*?</p>', '<p class="text_obisnuit">.*?</p>']
 
# Iterate over all HTML files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()
 
            # `matches` is rebound on every pass, so after this loop it holds
            # only the results for the last pattern in tags_to_translate.
            for tag in tags_to_translate:
                matches = re.findall(tag, html_content, re.DOTALL)
 
        # Same placement as in the original script: this loop is outside the
        # `for tag` loop, so only the last pattern's matches are printed.
        for match in matches:
            print(match)

Run this code. It should uncover some problems with how you process matches. Once you get it working correctly, you can move on to step 2, translation.

Melcu54 · (This post was last modified: Jul-05-2023, 04:42 AM by Melcu54.)

I found the solution. Thanks, deanhystad!

import os
import re
from googletrans import Translator

folder_path = r"c:\Folder3\2"

# HTML tags to translate. Every pattern has exactly one capturing group, which
# isolates the text to translate from the surrounding markup.
tags_to_translate = [r'<title>(.*?)</title>', r'<meta\s*name="description"\s*content="(.*?)"\s*>', r'<p class="text_obisnuit2">(.*?)</p>', r'<p class="text_obisnuit">(.*?)</p>']

# Initialize the translator
translator = Translator()

# Initialize a counter for the translated files
translated_files_count = 0

# Translated copies are written to a sub-folder next to the source files
translated_folder_path = os.path.join(folder_path, 'translated')


def _translate_captured_text(tag_match):
    """Return the matched element with its captured text translated to Russian.

    Used as the replacement callback for re.sub(), so only the text captured
    by group 1 of this particular match is rewritten; the surrounding markup
    and any identical text elsewhere in the document are left untouched.

    Args:
        tag_match: a match object produced by one of the patterns in
            tags_to_translate (group 1 is the text to translate).

    Returns:
        The full matched element as a string, with group 1 replaced by its
        translation. Elements whose text is empty or whitespace-only are
        returned unchanged and are not sent to the translator.
    """
    content = tag_match.group(1)
    if not content.strip():
        return tag_match.group(0)

    translated_content = translator.translate(content, dest='ru').text

    # Group positions are absolute in the document; shift them so they index
    # into the matched element itself.
    element = tag_match.group(0)
    offset = tag_match.start(0)
    return (element[:tag_match.start(1) - offset]
            + translated_content
            + element[tag_match.end(1) - offset:])


# Iterate over all HTML files in the folder
for filename in os.listdir(folder_path):
    if not filename.endswith(".html"):
        continue

    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        html_content = file.read()

    # The input file is closed before the (slow) translation calls start.
    for tag in tags_to_translate:
        html_content = re.sub(tag, _translate_captured_text, html_content, flags=re.DOTALL)

    # Write the translated HTML content to the "translated" sub-folder.
    # splitext() strips only the final extension, so "a.b.html" becomes
    # "a.b_ru.html" instead of being truncated to "a_ru.html".
    new_filename = f'{os.path.splitext(filename)[0]}_ru.html'
    os.makedirs(translated_folder_path, exist_ok=True)
    with open(os.path.join(translated_folder_path, new_filename), 'w', encoding='utf-8') as file:
        file.write(html_content)

    # Increment the counter and print the file name and number
    translated_files_count += 1
    print(f"Fișierul tradus #{translated_files_count}: {new_filename}")

Melcu54 · Jul-05-2023, 04:47 AM

And here is the full code, which works fine now:

import os
import re
from googletrans import Translator

folder_path = r"c:\Folder3\2"

# HTML tags to translate. Every pattern has exactly one capturing group, which
# isolates the text to translate from the surrounding markup.
tags_to_translate = [r'<title>(.*?)</title>', r'<meta\s*name="description"\s*content="(.*?)"\s*>', r'<p class="text_obisnuit2">(.*?)</p>', r'<p class="text_obisnuit">(.*?)</p>']

# Words to check
words_to_check = ['whose', 'the', 'you', 'which', 'view', 'because', 'here', 'have', 'this', 'two', 'one', 'three', 'four', 'five', 'six', 'seven', 'ten', 'had', 'then', 'see', 'saw', 'also', 'than', 'that', 'must', 'make', 'from']

# Initialize the translator
translator = Translator()

# Initialize a counter for the translated files
translated_files_count = 0

# Translated copies are written to a sub-folder next to the source files
translated_folder_path = os.path.join(folder_path, 'translated')


def _count_listed_words(text):
    """Return how many distinct entries of words_to_check occur in text.

    Matching is done on whole words and ignores case, so "The" counts as
    "the" while "other" or "often" do not count as "the" or "ten".
    """
    words_in_text = set(re.findall(r"[a-z]+", text.lower()))
    return sum(word in words_in_text for word in words_to_check)


def _translate_captured_text(tag_match):
    """Return the matched element, translated to Russian when it qualifies.

    Used as the replacement callback for re.sub(), so only the text captured
    by group 1 of this particular match is rewritten; the surrounding markup
    and any identical text elsewhere in the document are left untouched.

    Args:
        tag_match: a match object produced by one of the patterns in
            tags_to_translate (group 1 is the text to translate).

    Returns:
        The full matched element as a string. If group 1 contains at least 3
        distinct words from words_to_check it is replaced by its translation;
        otherwise the element is returned unchanged.
    """
    content = tag_match.group(1)
    if _count_listed_words(content) < 3:
        return tag_match.group(0)

    translated_content = translator.translate(content, dest='ru').text

    # Group positions are absolute in the document; shift them so they index
    # into the matched element itself.
    element = tag_match.group(0)
    offset = tag_match.start(0)
    return (element[:tag_match.start(1) - offset]
            + translated_content
            + element[tag_match.end(1) - offset:])


# Iterate over all HTML files in the folder
for filename in os.listdir(folder_path):
    if not filename.endswith(".html"):
        continue

    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
        html_content = file.read()

    # The input file is closed before the (slow) translation calls start.
    for tag in tags_to_translate:
        html_content = re.sub(tag, _translate_captured_text, html_content, flags=re.DOTALL)

    # Write the translated HTML content to the "translated" sub-folder.
    # splitext() strips only the final extension, so "a.b.html" becomes
    # "a.b_ru.html" instead of being truncated to "a_ru.html".
    new_filename = f'{os.path.splitext(filename)[0]}_ru.html'
    os.makedirs(translated_folder_path, exist_ok=True)
    with open(os.path.join(translated_folder_path, new_filename), 'w', encoding='utf-8') as file:
        file.write(html_content)

    # Increment the counter and print the file name and number
    translated_files_count += 1
    print(f"Fișierul tradus #{translated_files_count}: {new_filename}")

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Need to replace a string with a file (HTML file)	tester_V	1	775	Aug-30-2023, 03:42 AM Last Post: Larz60+
	Tkinterweb (Browser Module) Appending/Adding Additional HTML to a HTML Table Row	AaronCatolico1	0	938	Dec-25-2022, 06:28 PM Last Post: AaronCatolico1
	BeautifulSoup - I can't translate html tags that contain <a href=..</a> OR <em></em>	Melcu54	10	1,645	Oct-27-2022, 08:58 AM Last Post: wavic
	Get text from within h3 html tags	Pedroski55	8	4,320	Jan-05-2022, 06:50 AM Last Post: Larz60+
	reading html and edit chekcbox to html	jacklee26	5	3,086	Jul-01-2021, 10:31 AM Last Post: snippsat
	Parsing link from html tags with Python	Melcu54	0	1,619	Jun-14-2021, 09:25 AM Last Post: Melcu54
	Delimiters - How to skip some html tags from being translate	Melcu54	0	1,663	May-26-2021, 06:21 AM Last Post: Melcu54
	Including a Variable In the HTML Tags When Sending An Email	JoeDainton123	0	1,898	Aug-08-2020, 03:11 AM Last Post: JoeDainton123
	Loop through tags inside tags in Selenium/Python	xpack24	1	5,694	Oct-23-2019, 10:15 AM Last Post: Larz60+
	HTML to Python to Windows .bat and back to HTML	perfectservice33	0	1,952	Aug-22-2019, 06:31 AM Last Post: perfectservice33

why doesn't it replace all html tags?

User Panel Messages

Announcements