Python Forum
why doesn't it replace all html tags?
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
why doesn't it replace all html tags?
#1
I have these lines in an HTML file. Each of them must be translated into Russian if it contains more than 3 words found in the list. The problem is that only the last tag, <p class="text_obisnuit">, is translated. I don't know why they are not all translated, because they have the same content.

<title>The Fatal Solidarity, Whose because Punishment here two one Flings Its Condemned whose love me four times seven The Fatal Solidarity</title>
<meta name="description" content="Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven">
<p class="text_obisnuit2">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p>
<p class="text_obisnuit">Whose because Punishment here two one The Fatal Solidarity, Whose Punishment Flings Its Condemned whose love me four times seven</p>
And this is the Python code:

import os
import re
from googletrans import Translator

# Script purpose: for every .html file in folder_path, find the listed tags,
# translate to Russian the sentences that contain at least three of the words
# in words_to_check, and write the result back over the same file.
translator = Translator()

# Folder path
folder_path = r"c:\Folder3\2"

# HTML tags to translate.
# These patterns have no capture group, so re.findall returns the whole
# matched text, markup included.
tags_to_translate = ['<title>.*?</title>', '<meta name="description" content=".*?">', '<p class="text_obisnuit2">.*?</p>', '<p class="text_obisnuit">.*?</p>']

# Words to check
words_to_check = ['whose', 'the', 'you', 'which', 'view', 'because', 'here', 'have', 'this', 'two', 'one', 'three', 'four', 'five', 'six', 'seven', 'ten', 'had', 'then', 'see', 'saw', 'also', 'than', 'that', 'must', 'make', 'from']


# Initialize the amount of translated files
amount = 0

# Iterate over all HTML files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()

            # Iterate over all tags to translate
            # NOTE(review): `matches` is reassigned on every iteration and
            # nothing else happens inside this loop, so once the loop ends it
            # holds only the matches of the LAST pattern in tags_to_translate.
            for tag in tags_to_translate:
                matches = re.findall(tag, html_content, re.DOTALL)

        # Translate each match
        # NOTE(review): this section is indented at the level of the `with`
        # statement, i.e. outside the `for tag` loop above. It therefore runs
        # once per file, on the last pattern's matches only.
        translated_matches = []
        for match in matches:
            original_match = match  # Keep a copy of the original match
            # Check if the match contains at least three of the specified words
            # (`word in match` is a case-sensitive substring test on the whole
            # matched text, tags included).
            if sum(word in match for word in words_to_check) >= 3:
                # Split the match into sentences: break at whitespace that
                # follows '.', '!' or '?'.
                sentences = re.split(r'(?<=[.!?])\s+', match)

                # Initialize a string to hold the translated match
                translated_match = match

                # Translate each sentence that contains at least three of the specified words
                for sentence in sentences:
                    if sum(word in sentence for word in words_to_check) >= 3:
                        translation = translator.translate(sentence, dest='ru').text

                        # Replace the original sentence with the translation in the translated match
                        translated_match = translated_match.replace(sentence, translation)

                # Add the translated match to the list of translated matches
                translated_matches.append((original_match, translated_match))

        # Replace each original match with the corresponding translated match in the HTML content
        # (str.replace substitutes every occurrence of the text in the file).
        for original_match, translated_match in translated_matches:
            html_content = html_content.replace(original_match, translated_match)

        # Write the translated HTML content back to the file (overwrites the source file)
        with open(os.path.join(folder_path, filename), 'w', encoding='utf-8') as file:
            file.write(html_content)
# NOTE(review): the two statements below are at column 0, outside the
# `for filename` loop. They run once, after all files are processed, with
# `filename` bound to the last directory entry (NameError if the folder is
# empty), and `amount` is incremented a single time.
print(f'{filename} translated ({amount})')
amount += 1
Reply
#2
Divide and conquer. First verify that you are finding matches.
import os
import re
 
# Diagnostic script: print what re.findall returns for the tag patterns in
# every .html file of folder_path, without doing any translation.
folder_path = r"c:\Folder3\2"
 
# HTML tags to translate.
# The patterns contain no capture group, so each match is the full matched
# text including the tag markup.
tags_to_translate = ['<title>.*?</title>', '<meta name="description" content=".*?">', '<p class="text_obisnuit2">.*?</p>', '<p class="text_obisnuit">.*?</p>']
 
# Iterate over all HTML files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".html"):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()
 
            # `matches` is overwritten on each iteration; after the loop it
            # holds only the result for the last pattern.
            for tag in tags_to_translate:
                matches = re.findall(tag, html_content, re.DOTALL)
 
        # This loop sits outside the `for tag` loop, so only the last
        # pattern's matches are printed -- the behaviour this snippet is
        # meant to make visible.
        for match in matches:
            print(match)
Run this code. It should uncover some problems with how you process matches. Once you get it working correctly, you can move on to step 2, translation.
Reply
#3
I found the solution. Thanks, deanhystad!


import os
import re
from googletrans import Translator

folder_path = r"c:\Folder3\2"

# HTML tags to translate. Group 1 of each pattern captures the text that is
# sent to the translator; the surrounding markup is left untouched.
tags_to_translate = [r'<title>(.*?)</title>', r'<meta\s*name="description"\s*content="(.*?)"\s*>', r'<p class="text_obisnuit2">(.*?)</p>', r'<p class="text_obisnuit">(.*?)</p>']


def translate_html(html_content, translate, patterns=tags_to_translate):
    """Return *html_content* with the captured text of each pattern translated.

    Args:
        html_content: The HTML document as a string.
        translate: Callable taking the captured text and returning its
            translation.
        patterns: Regular expressions whose group 1 is the text to translate.

    Only the span captured by group 1 of each individual match is rewritten.
    A plain ``str.replace`` on the captured text would also rewrite identical
    text elsewhere in the document, and would insert the translation between
    every character when the captured text is empty.
    """
    def swap_captured_text(found):
        text = found.group(1)
        if not text.strip():
            # Nothing to translate; keep the tag exactly as it was.
            return found.group(0)
        whole = found.group(0)
        # Offsets of group 1 relative to the start of the whole match.
        start = found.start(1) - found.start()
        end = found.end(1) - found.start()
        return whole[:start] + translate(text) + whole[end:]

    for pattern in patterns:
        # A callable replacement is inserted literally, so backslashes in the
        # translated text are not interpreted as regex escapes.
        html_content = re.sub(pattern, swap_captured_text, html_content, flags=re.DOTALL)
    return html_content


def main():
    """Translate every .html file in folder_path into a 'translated' subfolder."""
    # Initialize the translator
    translator = Translator()

    # Initialize a counter for the translated files
    translated_files_count = 0
    translated_folder_path = os.path.join(folder_path, 'translated')

    # Iterate over all HTML files in the folder
    for filename in os.listdir(folder_path):
        if not filename.endswith(".html"):
            continue

        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()

        html_content = translate_html(
            html_content,
            lambda text: translator.translate(text, dest='ru').text,
        )

        # Write the translated HTML content to a new file. splitext keeps
        # dots inside the base name ("page.v2.html" -> "page.v2_ru.html").
        new_filename = f'{os.path.splitext(filename)[0]}_ru.html'
        os.makedirs(translated_folder_path, exist_ok=True)
        with open(os.path.join(translated_folder_path, new_filename), 'w', encoding='utf-8') as file:
            file.write(html_content)

        # Increment the counter and print the file name and number
        translated_files_count += 1
        print(f"Fișierul tradus #{translated_files_count}: {new_filename}")


if __name__ == "__main__":
    main()
Reply
#4
And here is the full code, which works fine now:

import os
import re
from googletrans import Translator

folder_path = r"c:\Folder3\2"

# HTML tags to translate. Group 1 of each pattern captures the text that is
# sent to the translator; the surrounding markup is left untouched.
tags_to_translate = [r'<title>(.*?)</title>', r'<meta\s*name="description"\s*content="(.*?)"\s*>', r'<p class="text_obisnuit2">(.*?)</p>', r'<p class="text_obisnuit">(.*?)</p>']

# Words to check
words_to_check = ['whose', 'the', 'you', 'which', 'view', 'because', 'here', 'have', 'this', 'two', 'one', 'three', 'four', 'five', 'six', 'seven', 'ten', 'had', 'then', 'see', 'saw', 'also', 'than', 'that', 'must', 'make', 'from']

# A text is translated only if it contains at least this many distinct words
# from words_to_check.
MIN_KEYWORD_HITS = 3


def count_keywords(text, keywords=words_to_check):
    """Return how many distinct *keywords* occur in *text* as whole words.

    The comparison ignores case. A plain ``word in text`` test is a
    case-sensitive substring test: it misses "The" and counts 'one' inside
    "money".
    """
    tokens = set(re.findall(r"\w+", text.lower()))
    return len(tokens.intersection(keyword.lower() for keyword in keywords))


def translate_html(html_content, translate, patterns=tags_to_translate,
                   keywords=words_to_check, min_hits=MIN_KEYWORD_HITS):
    """Return *html_content* with qualifying tag texts translated.

    Args:
        html_content: The HTML document as a string.
        translate: Callable taking the captured text and returning its
            translation.
        patterns: Regular expressions whose group 1 is the text to translate.
        keywords: Words counted by count_keywords.
        min_hits: Minimum number of distinct keywords a text must contain
            to be translated.

    Only the span captured by group 1 of each individual match is rewritten,
    so identical text elsewhere in the document is not touched.
    """
    def swap_captured_text(found):
        text = found.group(1)
        if count_keywords(text, keywords) < min_hits:
            # Too few keywords: keep the tag exactly as it was.
            return found.group(0)
        whole = found.group(0)
        # Offsets of group 1 relative to the start of the whole match.
        start = found.start(1) - found.start()
        end = found.end(1) - found.start()
        return whole[:start] + translate(text) + whole[end:]

    for pattern in patterns:
        # A callable replacement is inserted literally, so backslashes in the
        # translated text are not interpreted as regex escapes.
        html_content = re.sub(pattern, swap_captured_text, html_content, flags=re.DOTALL)
    return html_content


def main():
    """Translate every .html file in folder_path into a 'translated' subfolder."""
    # Initialize the translator
    translator = Translator()

    # Initialize a counter for the translated files
    translated_files_count = 0
    translated_folder_path = os.path.join(folder_path, 'translated')

    # Iterate over all HTML files in the folder
    for filename in os.listdir(folder_path):
        if not filename.endswith(".html"):
            continue

        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            html_content = file.read()

        html_content = translate_html(
            html_content,
            lambda text: translator.translate(text, dest='ru').text,
        )

        # Write the translated HTML content to a new file. splitext keeps
        # dots inside the base name ("page.v2.html" -> "page.v2_ru.html").
        new_filename = f'{os.path.splitext(filename)[0]}_ru.html'
        os.makedirs(translated_folder_path, exist_ok=True)
        with open(os.path.join(translated_folder_path, new_filename), 'w', encoding='utf-8') as file:
            file.write(html_content)

        # Increment the counter and print the file name and number
        translated_files_count += 1
        print(f"Fișierul tradus #{translated_files_count}: {new_filename}")


if __name__ == "__main__":
    main()
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  Need to replace a string with a file (HTML file) tester_V 1 775 Aug-30-2023, 03:42 AM
Last Post: Larz60+
  Tkinterweb (Browser Module) Appending/Adding Additional HTML to a HTML Table Row AaronCatolico1 0 938 Dec-25-2022, 06:28 PM
Last Post: AaronCatolico1
  BeautifulSoup - I can't translate html tags that contain <a href=..</a> OR <em></em> Melcu54 10 1,645 Oct-27-2022, 08:58 AM
Last Post: wavic
  Get text from within h3 html tags Pedroski55 8 4,320 Jan-05-2022, 06:50 AM
Last Post: Larz60+
  reading html and edit chekcbox to html jacklee26 5 3,086 Jul-01-2021, 10:31 AM
Last Post: snippsat
  Parsing link from html tags with Python Melcu54 0 1,619 Jun-14-2021, 09:25 AM
Last Post: Melcu54
  Delimiters - How to skip some html tags from being translate Melcu54 0 1,663 May-26-2021, 06:21 AM
Last Post: Melcu54
  Including a Variable In the HTML Tags When Sending An Email JoeDainton123 0 1,898 Aug-08-2020, 03:11 AM
Last Post: JoeDainton123
  Loop through tags inside tags in Selenium/Python xpack24 1 5,694 Oct-23-2019, 10:15 AM
Last Post: Larz60+
  HTML to Python to Windows .bat and back to HTML perfectservice33 0 1,952 Aug-22-2019, 06:31 AM
Last Post: perfectservice33

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020