Python: How to import data from txt, instead of running the data from the code?

Melcu54 · (This post was last modified: Dec-12-2024, 08:01 PM by Melcu54.)

Instead of hard-coding ro_tags and en_tags in the code, I want to change only how the data is loaded, so that both lists are read from this file: d:\3\PROBEMA\rezultate_RO+EN.txt

So the logic of the code stays the same and I want to obtain the same result, only with the data imported from the file. The .txt file contains the same ro_tags: and en_tags: sections as the code below:

import re
from typing import Any, Dict, List, Tuple

from bs4 import BeautifulSoup

class EnhancedTagAnalyzer:
    """Align numbered Romanian (RO) tag lines against English (EN) tag lines.

    The RO lines are renumbered sequentially on construction. ``analyze``
    then walks both lists in parallel; every RO line that does not match the
    EN line at the same index is moved to ``wrong_tags`` and the remaining
    RO lines are renumbered before the comparison continues.
    """

    # Compiled once: these patterns are applied to every line, repeatedly.
    _HTML_TAG = re.compile(r'<[^>]+>')
    _NUMBER_PREFIX = re.compile(r'^(\d+)\.')

    def __init__(self, ro_tags: List[str], en_tags: List[str]):
        """Store both tag lists; only the RO list is renumbered.

        Args:
            ro_tags: Romanian tag lines. A renumbered copy is stored, so the
                caller's list is left untouched.
            en_tags: English tag lines, stored as given (never modified here).
        """
        self.ro_tags = self.renumber_tags(ro_tags)
        self.en_tags = en_tags
        self.wrong_tags: List[str] = []

    def get_tag_type(self, line: str) -> str:
        """Determine tag type (A/B/C) from line.

        'A' when the line contains ``<span class="text_obisnuit2">``, 'B' when
        it contains ``class="text_obisnuit2"`` on any other element, else 'C'.
        """
        if '<span class="text_obisnuit2">' in line:
            return 'A'
        if 'class="text_obisnuit2"' in line:
            return 'B'
        return 'C'

    def count_words(self, text: str) -> int:
        """Count whitespace-separated words in text, excluding HTML tags."""
        # str.split() with no argument never yields empty strings, so no
        # further filtering is needed.
        return len(self._HTML_TAG.sub('', text).split())

    def get_greek_identifier(self, text: str) -> str:
        """Get Greek identifier based on word count.

        Returns 'α' for fewer than 7 words, 'β' for 7 to 14, 'γ' above 14.
        """
        word_count = self.count_words(text)
        if word_count < 7:
            return 'α'
        if word_count <= 14:
            return 'β'
        return 'γ'

    def renumber_tags(self, tags: List[str]) -> List[str]:
        """Return a new list whose leading ``<number>.`` prefixes run 1, 2, 3...

        Lines without such a prefix are copied unchanged.
        """
        return [
            self._NUMBER_PREFIX.sub(f'{i}.', tag)
            for i, tag in enumerate(tags, 1)
        ]

    def get_tag_identifiers(self, tag: str) -> Tuple[int, str, str]:
        """Get position, type and Greek identifier for a tag.

        The Greek identifier is computed from the whole line, so the
        ``<number>.<letter>`` prefix counts as one word.

        Raises:
            ValueError: If the line does not start with a ``<number>.`` prefix.
        """
        match = self._NUMBER_PREFIX.match(tag)
        if match is None:
            # Without this check the failure surfaces as an AttributeError on
            # None, which does not say which line is malformed.
            raise ValueError(f"tag has no leading '<number>.' prefix: {tag!r}")
        return (
            int(match.group(1)),
            self.get_tag_type(tag),
            self.get_greek_identifier(tag),
        )

    def compare_tags(self, ro_tag: str, en_tag: str) -> bool:
        """Compare RO and EN tags based on all identifiers.

        Two lines match when position, type and Greek identifier are equal
        and the Jaccard similarity of their lower-cased word sets (HTML tags
        removed) is greater than 0.3.

        Raises:
            ValueError: If either line lacks a ``<number>.`` prefix.
        """
        if self.get_tag_identifiers(ro_tag) != self.get_tag_identifiers(en_tag):
            return False

        ro_words = set(self._HTML_TAG.sub('', ro_tag).lower().split())
        en_words = set(self._HTML_TAG.sub('', en_tag).lower().split())
        # The union cannot be empty here: both lines passed the prefix check
        # above, so each contributes at least its "<number>." token.
        similarity = len(ro_words & en_words) / len(ro_words | en_words)
        return similarity > 0.3

    def _count_types(self, tags: List[str]) -> Dict[str, int]:
        """Tally how many lines of each type (A/B/C) appear in tags."""
        counts = {'A': 0, 'B': 0, 'C': 0}
        for tag in tags:
            counts[self.get_tag_type(tag)] += 1
        return counts

    def analyze(self) -> Dict[str, Any]:
        """Drop unmatched RO lines and summarise the result.

        Returns:
            A dict with per-type counts under ``'ro'``, ``'en'`` and
            ``'wrong'`` (each a ``{'A': n, 'B': n, 'C': n}`` dict) and the list
            of removed RO lines under ``'wrong_tags'``. That list is the
            analyzer's own ``wrong_tags`` attribute, not a copy.

        Raises:
            ValueError: If a compared line lacks a ``<number>.`` prefix.
        """
        pos = 0
        while pos < len(self.ro_tags):
            has_match = (
                pos < len(self.en_tags)
                and self.compare_tags(self.ro_tags[pos], self.en_tags[pos])
            )
            if has_match:
                pos += 1
                continue

            # No EN counterpart at this index: set the line aside and
            # renumber, so the line that slides into `pos` carries the number
            # the next comparison expects. `pos` is deliberately not advanced.
            self.wrong_tags.append(self.ro_tags.pop(pos))
            self.ro_tags = self.renumber_tags(self.ro_tags)

        return {
            'ro': self._count_types(self.ro_tags),
            'en': self._count_types(self.en_tags),
            'wrong': self._count_types(self.wrong_tags),
            'wrong_tags': self.wrong_tags
        }

def count_tags(file_path):
    """Return fixed, placeholder tag counts for an HTML file.

    This is a testing stub: the file is never opened. The result depends only
    on whether the lower-cased path contains the substring ``ro`` anywhere.

    Args:
        file_path (str): Path to the HTML file (used only for the substring
            check described above).

    Returns:
        dict: Counts keyed by tag type 'A', 'B' and 'C'.
    """
    looks_romanian = file_path.lower().find('ro') != -1
    b_count = 7 if looks_romanian else 4
    # Only the 'B' figure differs between the two canned results.
    return {'A': 2, 'B': b_count, 'C': 8}

# Sample input for EnhancedTagAnalyzer: one numbered tag line per entry,
# written as "<number>.<type letter> <html pattern or literal text>".
# Single-quoted literals keep the HTML attribute quotes free of escapes.
ro_tags = [
    '1.B <p class="text_obisnuit2"><em>(.*?)</em></p>',
    '2.C <p class="text_obisnuit">(.*?)</p>',
    '3.C <p class="text_obisnuit">(.*?)</p>',
    '4.C <p class="text_obisnuit">(.*?)</p>',
    '5.C <p class="text_obisnuit">GASCA ESTE ACASA</p>',
    '6.B <p class="text_obisnuit2">(.*?)</p>',
    '7.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '8.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '9.C <p class="text_obisnuit">(.*?)</p>',
    '10.C <p class="text_obisnuit">(.*?)</p>',
    '11.B <p class="text_obisnuit2">BABA OARBA</p>',
    '12.B <p class="text_obisnuit2">(.*?)</p>',
    '13.C <p class="text_obisnuit">(.*?)</p>',
    '14.C <p class="text_obisnuit">(.*?)</p>',
    '15.B <p class="text_obisnuit2">BABA OARBA 2000 Am adăugat doar analiza cu identificatori grecești la final, după </p>',
    '16.C <p class="text_obisnuit">(.*?)</p>',
    '17.B <p class="text_obisnuit2">(.*?)</p>',
    '18.B <p class="text_obisnuit2">COCO CHANNEL </p>',
]

# English counterpart of the list above (shorter: 14 entries against 18).
en_tags = [
    '1.B <p class="text_obisnuit2"><em>(.*?)</em></p>',
    '2.C <p class="text_obisnuit">(.*?)</p>',
    '3.C <p class="text_obisnuit">(.*?)</p>',
    '4.C <p class="text_obisnuit">(.*?)</p>',
    '5.B <p class="text_obisnuit2">(.*?)</p>',
    '6.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '7.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '8.C <p class="text_obisnuit">(.*?)</p>',
    '9.C <p class="text_obisnuit">(.*?)</p>',
    '10.B <p class="text_obisnuit2">(.*?)</p>',
    '11.C <p class="text_obisnuit">(.*?)</p>',
    '12.C <p class="text_obisnuit">(.*?)</p>',
    '13.C <p class="text_obisnuit">(.*?)</p>',
    '14.B <p class="text_obisnuit2">(.*?)</p>',
]

def main():
    """Print the tag-count comparison and the RO tags without an EN match.

    Builds an ``EnhancedTagAnalyzer`` from the module-level ``ro_tags`` and
    ``en_tags`` lists, runs it, and prints three sections to stdout: the
    per-type totals with their differences, the unmatched RO lines, and a
    compact ``number(type)(greek)`` identifier for each unmatched line.
    """
    # These totals are fixed figures; they are not computed from the tag
    # lists and count_tags() is not called here.
    ro_counts = {'A': 2, 'B': 7, 'C': 8}
    en_counts = {'A': 2, 'B': 4, 'C': 8}

    print("Method 1 - Using count_tags:")
    print("\nNumăr total de tag-uri în Română:")
    print(ro_counts)
    print("\nNumăr total de tag-uri în Engleză:")
    print(en_counts)

    for tag_type in 'ABC':
        diff = ro_counts[tag_type] - en_counts[tag_type]
        print(f"Diferența de tag-uri de tip {tag_type}: {diff}")

    # Run the analyzer to find the RO lines that have no EN counterpart.
    analyzer = EnhancedTagAnalyzer(ro_tags, en_tags)
    results = analyzer.analyze()

    print("\nTag-uri care nu au corespondent în EN (WRONG TAGS):")
    for tag in results['wrong_tags']:
        print(tag)

    # Method 3 - Greek identifier analysis
    print("\nMethod 3 - Greek identifier analysis:")
    for tag in results['wrong_tags']:
        # Reuse the analyzer's own classification instead of repeating it
        # here: the hand-written version could only report 'B' or 'C', so it
        # disagreed with get_tag_type() for type-A lines.
        num, tag_type, greek = analyzer.get_tag_identifiers(tag)
        print(f"{num}({tag_type})({greek})")


if __name__ == "__main__":
    main()

This should be the output:

Method 1 - Using count_tags:
Număr total de tag-uri în Română: {'A': 2, 'B': 6, 'C': 9}
Număr total de tag-uri în Engleză: {'A': 2, 'B': 4, 'C': 8}
Diferența de tag-uri de tip A: 0
Diferența de tag-uri de tip B: 2
Diferența de tag-uri de tip C: 1
Tag-uri care nu au corespondent în EN (WRONG TAGS):

    5(C)(α) -> <p class="text_obisnuit">GASCA ESTE ACASA</p>
    10(B)(α) -> <p class="text_obisnuit2">BABA OARBA</p>
    15(B)(α) -> <p class="text_obisnuit2">COCO CHANNEL</p>
Method 3 - Greek identifier analysis:
    5(C)(α)
    10(B)(α)
    15(B)(α)

**Gribouillis** · Dec-13-2024, 06:50 AM

Write a file ro_tags.txt containing the following

Output:
1.B <p class="text_obisnuit2"><em>(.*?)</em></p>
2.C <p class="text_obisnuit">(.*?)</p>
3.C <p class="text_obisnuit">(.*?)</p>
4.C <p class="text_obisnuit">(.*?)</p>
5.C <p class="text_obisnuit">GASCA ESTE ACASA</p>
6.B <p class="text_obisnuit2">(.*?)</p>
7.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>
8.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>
9.C <p class="text_obisnuit">(.*?)</p>
10.C <p class="text_obisnuit">(.*?)</p>
11.B <p class="text_obisnuit2">BABA OARBA</p>
12.B <p class="text_obisnuit2">(.*?)</p>
13.C <p class="text_obisnuit">(.*?)</p>
14.C <p class="text_obisnuit">(.*?)</p>
15.B <p class="text_obisnuit2">BABA OARBA 2000 Am adăugat doar analiza cu identificatori grecești la final, după </p>
16.C <p class="text_obisnuit">(.*?)</p>
17.B <p class="text_obisnuit2">(.*?)</p>
18.B <p class="text_obisnuit2">COCO CHANNEL </p>

Then in the code, use

from pathlib import Path

# Decode explicitly as UTF-8 (save ro_tags.txt in that encoding): the tag
# lines contain Romanian diacritics, and without an encoding argument
# read_text() falls back to the platform default, which may not be UTF-8.
ro_tags = Path("ro_tags.txt").read_text(encoding="utf-8").strip().splitlines()

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Error on import: SyntaxError: source code string cannot contain null bytes	kirkwilliams2049	8	21,256	Yesterday, 07:49 AM Last Post: Kiongi
	code not running even without errors	Azdaghost	2	483	Apr-25-2025, 07:35 PM Last Post: Azdaghost
	python code not running	Azdaghost	1	364	Apr-22-2025, 08:44 PM Last Post: deanhystad
	writing and running code in vscode without saving it	akbarza	5	2,833	Mar-03-2025, 08:14 PM Last Post: Gribouillis
	import data (.csv) into Jupyter notebook	oranstprotonme	2	1,365	Aug-14-2024, 07:08 PM Last Post: oranstprotonme
	Python script to extract data from API to database	melpys	0	1,104	Aug-12-2024, 05:53 PM Last Post: melpys
	To fetch and iterate data from CSV file using python	vyom1109	3	1,176	Aug-05-2024, 10:05 AM Last Post: Pedroski55
	Help with to check an Input list data with a data read from an external source	sacharyya	3	1,856	Mar-09-2024, 12:33 PM Last Post: Pedroski55
	problem in running a code	akbarza	7	2,567	Feb-14-2024, 02:57 PM Last Post: snippsat
	need help with data analysing with python and sqlite	Hardcool	2	1,165	Jan-30-2024, 06:49 AM Last Post: Athi

Python: How to import data from txt, instead of running the data from the code?

User Panel Messages

Announcements