Python Forum
Python: How to import data from txt, instead of running the data from the code?
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Python: How to import data from txt, instead of running the data from the code?
#1
Instead of hard-coding ro_tags and en_tags in the code, I want to change only how they are loaded, so that they are read from this file: d:\3\PROBEMA\rezultate_RO+EN.txt

So the logic of the code stays the same and I want to get the same result, just with the data imported from the file. The .txt file contains the same ro_tags: and en_tags: as in the code below:

import re
from typing import List, Dict, Tuple
from bs4 import BeautifulSoup

class EnhancedTagAnalyzer:
    """Pair RO tag lines with EN tag lines by position and collect mismatches.

    Every tag line is expected to begin with ``"<number>."`` (for example
    ``"5.C <p ...>"``).  RO lines are renumbered sequentially on construction,
    and again during :meth:`analyze` as rejected lines are removed, so that the
    remaining RO lines keep lining up with the EN lines.

    Attributes:
        ro_tags: RO tag lines, renumbered from 1.  Replaced by
            :meth:`analyze` with only the lines that matched.
        en_tags: EN tag lines, stored as given and never modified.
        wrong_tags: RO lines rejected by :meth:`analyze`, each carrying the
            number it had at the moment it was rejected.
    """

    # Compiled once and shared by all methods.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    _NUMBER_PREFIX_RE = re.compile(r'^\d+\.')
    _POSITION_RE = re.compile(r'(\d+)\.')
    _TYPES = ('A', 'B', 'C')

    def __init__(self, ro_tags: List[str], en_tags: List[str]):
        self.ro_tags = self.renumber_tags(ro_tags)
        self.en_tags = en_tags
        self.wrong_tags: List[str] = []

    def get_tag_type(self, line: str) -> str:
        """Classify a line as 'A', 'B' or 'C' from its HTML content.

        'A' if the line contains ``<span class="text_obisnuit2">``, otherwise
        'B' if it contains ``class="text_obisnuit2"``, otherwise 'C'.  The
        letter written after the number prefix (e.g. the ``A`` in ``"7.A"``)
        is not consulted.
        """
        if '<span class="text_obisnuit2">' in line:
            return 'A'
        if 'class="text_obisnuit2"' in line:
            return 'B'
        return 'C'

    def _strip_html(self, text: str) -> str:
        """Return ``text`` with every ``<...>`` span removed."""
        return self._HTML_TAG_RE.sub('', text)

    def count_words(self, text: str) -> int:
        """Count whitespace-separated words in ``text``, ignoring HTML tags.

        The leading ``"<number>.<letter>"`` prefix of a tag line, when
        present, counts as one word.
        """
        # str.split() with no argument never yields empty tokens.
        return len(self._strip_html(text).split())

    def get_greek_identifier(self, text: str) -> str:
        """Bucket ``text`` by word count: <7 -> 'α', 7..14 -> 'β', >14 -> 'γ'."""
        word_count = self.count_words(text)
        if word_count < 7:
            return 'α'
        if word_count <= 14:
            return 'β'
        return 'γ'

    def renumber_tags(self, tags: List[str]) -> List[str]:
        """Return a new list where each tag's leading number is its 1-based index.

        Tags that do not start with ``"<digits>."`` are copied unchanged.
        """
        return [
            self._NUMBER_PREFIX_RE.sub(f'{index}.', tag)
            for index, tag in enumerate(tags, 1)
        ]

    def get_tag_identifiers(self, tag: str) -> Tuple[int, str, str]:
        """Return ``(position, type, greek)`` for a tag line.

        Raises:
            ValueError: if ``tag`` does not start with ``"<digits>."``.
        """
        match = self._POSITION_RE.match(tag)
        if match is None:
            # Previously this surfaced as an AttributeError on None.
            raise ValueError(
                f"tag has no leading '<number>.' prefix: {tag!r}"
            )
        return (
            int(match.group(1)),
            self.get_tag_type(tag),
            self.get_greek_identifier(tag),
        )

    def compare_tags(self, ro_tag: str, en_tag: str) -> bool:
        """Return True when an RO line and an EN line are considered a match.

        A match requires equal position, equal type, equal Greek bucket, and
        a Jaccard similarity above 0.3 between the lower-cased word sets of
        the two lines (HTML tags removed).

        Raises:
            ValueError: if either line lacks the numeric prefix.
        """
        if self.get_tag_identifiers(ro_tag) != self.get_tag_identifiers(en_tag):
            return False

        ro_words = set(self._strip_html(ro_tag).lower().split())
        en_words = set(self._strip_html(en_tag).lower().split())
        # The union cannot be empty here: both lines passed the prefix check
        # above, so each contributes at least its "<number>." token.
        similarity = len(ro_words & en_words) / len(ro_words | en_words)
        return similarity > 0.3

    def _count_types(self, tags: List[str]) -> Dict[str, int]:
        """Tally how many of ``tags`` fall into each of the types A, B, C."""
        counts = {tag_type: 0 for tag_type in self._TYPES}
        for tag in tags:
            counts[self.get_tag_type(tag)] += 1
        return counts

    def analyze(self) -> Dict[str, object]:
        """Drop RO lines that do not match the EN line at the same position.

        RO lines are visited in order.  Each one is given the number of the
        next free slot and compared with the EN line in that slot; on a match
        it is kept, otherwise (or when the EN lines are exhausted) it is
        appended to ``self.wrong_tags`` under that number.  ``self.ro_tags``
        is replaced by the kept lines.

        Returns:
            A dict with per-type counts under ``'ro'``, ``'en'`` and
            ``'wrong'`` (each ``{'A': int, 'B': int, 'C': int}``) and the
            list of rejected lines under ``'wrong_tags'``.
        """
        kept: List[str] = []
        for tag in self.ro_tags:
            slot = len(kept)
            # Renumber only the candidate instead of the whole list after
            # every removal; the number it receives is the same either way.
            candidate = self._NUMBER_PREFIX_RE.sub(f'{slot + 1}.', tag)
            if slot < len(self.en_tags) and self.compare_tags(
                candidate, self.en_tags[slot]
            ):
                kept.append(candidate)
            else:
                self.wrong_tags.append(candidate)
        self.ro_tags = kept

        return {
            'ro': self._count_types(self.ro_tags),
            'en': self._count_types(self.en_tags),
            'wrong': self._count_types(self.wrong_tags),
            'wrong_tags': self.wrong_tags
        }

def count_tags(file_path):
    """Return canned A/B/C tag counts selected by a substring test on the path.

    This is a testing stub: the file at ``file_path`` is never opened or
    parsed.  The only thing inspected is whether the lower-cased path text
    contains ``"ro"`` anywhere (directory names included).

    Args:
        file_path (str): Path to the HTML file; used only for the substring
            test described above.

    Returns:
        dict: ``{'A': 2, 'B': 7, 'C': 8}`` when ``"ro"`` occurs in the
        lower-cased path, otherwise ``{'A': 2, 'B': 4, 'C': 8}``.
    """
    # Only the 'B' figure differs between the two canned results.
    path_mentions_ro = 'ro' in file_path.lower()
    b_total = 7 if path_mentions_ro else 4
    return {'A': 2, 'B': b_total, 'C': 8}

# Sample RO tag lines fed to EnhancedTagAnalyzer.
# Each entry is "<number>.<letter> <html pattern or literal text>".
# Single-quoted literals are used so the HTML attribute quotes need no escaping.
ro_tags = [
    '1.B <p class="text_obisnuit2"><em>(.*?)</em></p>',
    '2.C <p class="text_obisnuit">(.*?)</p>',
    '3.C <p class="text_obisnuit">(.*?)</p>',
    '4.C <p class="text_obisnuit">(.*?)</p>',
    '5.C <p class="text_obisnuit">GASCA ESTE ACASA</p>',
    '6.B <p class="text_obisnuit2">(.*?)</p>',
    '7.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '8.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '9.C <p class="text_obisnuit">(.*?)</p>',
    '10.C <p class="text_obisnuit">(.*?)</p>',
    '11.B <p class="text_obisnuit2">BABA OARBA</p>',
    '12.B <p class="text_obisnuit2">(.*?)</p>',
    '13.C <p class="text_obisnuit">(.*?)</p>',
    '14.C <p class="text_obisnuit">(.*?)</p>',
    '15.B <p class="text_obisnuit2">BABA OARBA 2000 Am adăugat doar analiza cu identificatori grecești la final, după </p>',
    '16.C <p class="text_obisnuit">(.*?)</p>',
    '17.B <p class="text_obisnuit2">(.*?)</p>',
    '18.B <p class="text_obisnuit2">COCO CHANNEL </p>',
]

# Sample EN tag lines, in the same "<number>.<letter> <html pattern>" layout
# as the RO sample data.
en_tags = [
    '1.B <p class="text_obisnuit2"><em>(.*?)</em></p>',
    '2.C <p class="text_obisnuit">(.*?)</p>',
    '3.C <p class="text_obisnuit">(.*?)</p>',
    '4.C <p class="text_obisnuit">(.*?)</p>',
    '5.B <p class="text_obisnuit2">(.*?)</p>',
    '6.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '7.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>',
    '8.C <p class="text_obisnuit">(.*?)</p>',
    '9.C <p class="text_obisnuit">(.*?)</p>',
    '10.B <p class="text_obisnuit2">(.*?)</p>',
    '11.C <p class="text_obisnuit">(.*?)</p>',
    '12.C <p class="text_obisnuit">(.*?)</p>',
    '13.C <p class="text_obisnuit">(.*?)</p>',
    '14.B <p class="text_obisnuit2">(.*?)</p>',
]

def main():
    """Print fixed tag totals, then the RO lines the analyzer rejects.

    Three sections are written to stdout:

    1. Hard-coded RO and EN totals per type and their per-type difference
       (the totals are literals here; no file is read and ``count_tags`` is
       not called).
    2. Every line reported under ``'wrong_tags'`` by
       ``EnhancedTagAnalyzer(ro_tags, en_tags).analyze()``.
    3. For each of those lines, a compact ``number(type)(greek)`` summary,
       where type is 'B' if the line contains ``text_obisnuit2`` and 'C'
       otherwise, and the Greek letter buckets the word count
       (<7 'α', 7..14 'β', >14 'γ').
    """
    totals_ro = {'A': 2, 'B': 7, 'C': 8}
    totals_en = {'A': 2, 'B': 4, 'C': 8}

    print("Method 1 - Using count_tags:")
    print("\nNumăr total de tag-uri în Română:")
    print(totals_ro)
    print("\nNumăr total de tag-uri în Engleză:")
    print(totals_en)

    for kind in ('A', 'B', 'C'):
        print(f"Diferența de tag-uri de tip {kind}: {totals_ro[kind] - totals_en[kind]}")

    # The analyzer is needed only for its list of rejected RO lines.
    rejected = EnhancedTagAnalyzer(ro_tags, en_tags).analyze()['wrong_tags']

    print("\nTag-uri care nu au corespondent în EN (WRONG TAGS):")
    for line in rejected:
        print(line)

    print("\nMethod 3 - Greek identifier analysis:")
    for line in rejected:
        # Word count of the line with HTML tags removed; the leading
        # "<number>.<letter>" token is counted as a word too.
        n_words = len(re.sub(r'<[^>]+>', '', line).split())
        if n_words >= 15:
            letter = 'γ'
        elif n_words >= 7:
            letter = 'β'
        else:
            letter = 'α'
        number = re.match(r'(\d+)\.', line).group(1)
        kind = 'B' if 'text_obisnuit2' in line else 'C'
        print(f"{number}({kind})({letter})")


if __name__ == "__main__":
    main()
This should be the output:

Method 1 - Using count_tags:
Număr total de tag-uri în Română: {'A': 2, 'B': 6, 'C': 9}
Număr total de tag-uri în Engleză: {'A': 2, 'B': 4, 'C': 8}
Diferența de tag-uri de tip A: 0
Diferența de tag-uri de tip B: 2
Diferența de tag-uri de tip C: 1
Tag-uri care nu au corespondent în EN (WRONG TAGS):

    5(C)(α) -> <p class="text_obisnuit">GASCA ESTE ACASA</p>
    10(B)(α) -> <p class="text_obisnuit2">BABA OARBA</p>
    15(B)(α) -> <p class="text_obisnuit2">COCO CHANNEL</p>
Method 3 - Greek identifier analysis:
    5(C)(α)
    10(B)(α)
    15(B)(α)
Reply
#2
Write a file ro_tags.txt containing the following
Output:
1.B <p class="text_obisnuit2"><em>(.*?)</em></p>
2.C <p class="text_obisnuit">(.*?)</p>
3.C <p class="text_obisnuit">(.*?)</p>
4.C <p class="text_obisnuit">(.*?)</p>
5.C <p class="text_obisnuit">GASCA ESTE ACASA</p>
6.B <p class="text_obisnuit2">(.*?)</p>
7.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>
8.A <p class="text_obisnuit">(.*?)</span>(.*?)</p>
9.C <p class="text_obisnuit">(.*?)</p>
10.C <p class="text_obisnuit">(.*?)</p>
11.B <p class="text_obisnuit2">BABA OARBA</p>
12.B <p class="text_obisnuit2">(.*?)</p>
13.C <p class="text_obisnuit">(.*?)</p>
14.C <p class="text_obisnuit">(.*?)</p>
15.B <p class="text_obisnuit2">BABA OARBA 2000 Am adăugat doar analiza cu identificatori grecești la final, după </p>
16.C <p class="text_obisnuit">(.*?)</p>
17.B <p class="text_obisnuit2">(.*?)</p>
18.B <p class="text_obisnuit2">COCO CHANNEL </p>
Then in the code, use
from pathlib import Path
# Load one tag per line from ro_tags.txt. The encoding is passed explicitly
# because the tags contain Romanian diacritics; without it read_text() uses
# the platform's locale encoding, which may not match the file.
# NOTE(review): assumes the file was saved as UTF-8 - confirm.
ro_tags = Path("ro_tags.txt").read_text(encoding="utf-8").strip().splitlines()
« We can solve any problem by introducing an extra level of indirection »
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  writing and running code in vscode without saving it akbarza 5 2,235 Mar-03-2025, 08:14 PM
Last Post: Gribouillis
  import data (.csv) into Jupyter notebook oranstprotonme 2 1,153 Aug-14-2024, 07:08 PM
Last Post: oranstprotonme
  Python script to extract data from API to database melpys 0 825 Aug-12-2024, 05:53 PM
Last Post: melpys
  To fetch and iterate data from CSV file using python vyom1109 3 972 Aug-05-2024, 10:05 AM
Last Post: Pedroski55
  Help with to check an Input list data with a data read from an external source sacharyya 3 1,630 Mar-09-2024, 12:33 PM
Last Post: Pedroski55
  problem in running a code akbarza 7 2,246 Feb-14-2024, 02:57 PM
Last Post: snippsat
  need help with data analysing with python and sqlite Hardcool 2 1,045 Jan-30-2024, 06:49 AM
Last Post: Athi
  the order of running code in a decorator function akbarza 2 1,290 Nov-10-2023, 08:09 AM
Last Post: akbarza
  Better python library to create ER Diagram by using pandas data frames as tables klllmmm 0 3,033 Oct-19-2023, 01:01 PM
Last Post: klllmmm
  Bulk loading of data using python shivamsvmsri 2 2,072 Sep-28-2023, 09:04 AM
Last Post: shivamsvmsri

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020