Instead of hard-coding ro_tags: and en_tags: in the code, I want to change only how they are loaded, so that they are read from the file d:\3\PROBEMA\rezultate_RO+EN.txt
So, the logic of the code stays the same and I want to obtain the same result; the only change is that the data is imported from the file. The .txt file contains the same ro_tags: and en_tags: as the code below:
So, the logic of the code stays the same and I want to obtain the same result; the only change is that the data is imported from the file. The .txt file contains the same ro_tags: and en_tags: as the code below:
import re
from typing import Dict, List, Tuple, Union

try:
    from bs4 import BeautifulSoup  # noqa: F401
except ImportError:
    # Nothing in this script calls BeautifulSoup, so a missing bs4 install
    # must not prevent the analysis from running.
    BeautifulSoup = None

# Patterns shared by the analyzer; compiled once instead of on every call.
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_LEADING_NUMBER_RE = re.compile(r'^\d+\.')
_POSITION_RE = re.compile(r'(\d+)\.')


class EnhancedTagAnalyzer:
    """Align numbered RO tag lines against EN tag lines and collect mismatches.

    Each tag is a string of the form ``"<number>.<letter> <html>"``. RO tags
    are renumbered sequentially on construction and again after every removal,
    so a tag stored in ``wrong_tags`` carries the position it occupied at the
    moment it was rejected, not its original number.
    """

    def __init__(self, ro_tags: List[str], en_tags: List[str]):
        """Store the tag lists; ``ro_tags`` is copied and renumbered from 1."""
        self.ro_tags = self.renumber_tags(ro_tags)
        self.en_tags = en_tags
        self.wrong_tags: List[str] = []

    def get_tag_type(self, line: str) -> str:
        """Determine tag type (A/B/C) from line.

        'A' when the line contains ``<span class="text_obisnuit2">``, 'B' when
        it contains ``class="text_obisnuit2"`` on any other element, else 'C'.
        The letter written after the number in the tag string is not consulted.
        """
        if '<span class="text_obisnuit2">' in line:
            return 'A'
        if 'class="text_obisnuit2"' in line:
            return 'B'
        return 'C'

    def count_words(self, text: str) -> int:
        """Count whitespace-separated words in text, excluding HTML tags."""
        # str.split() with no argument never yields empty tokens.
        return len(_HTML_TAG_RE.sub('', text).split())

    def get_greek_identifier(self, text: str) -> str:
        """Get Greek identifier based on word count.

        Fewer than 7 words gives 'α', 7 to 14 gives 'β', more than 14 gives 'γ'.
        """
        word_count = self.count_words(text)
        if word_count < 7:
            return 'α'
        if word_count <= 14:
            return 'β'
        return 'γ'

    def renumber_tags(self, tags: List[str]) -> List[str]:
        """Return a new list with each tag's leading number replaced by 1, 2, ...

        A tag without a leading ``<number>.`` prefix is returned unchanged.
        """
        return [
            _LEADING_NUMBER_RE.sub(f'{index}.', tag)
            for index, tag in enumerate(tags, 1)
        ]

    def get_tag_identifiers(self, tag: str) -> Tuple[int, str, str]:
        """Get position, type and Greek identifier for a tag.

        Raises:
            ValueError: If the tag does not start with a ``<number>.`` prefix.
        """
        match = _POSITION_RE.match(tag)
        if match is None:
            raise ValueError(f"Tag has no leading '<number>.' prefix: {tag!r}")
        return (
            int(match.group(1)),
            self.get_tag_type(tag),
            self.get_greek_identifier(tag),
        )

    def compare_tags(self, ro_tag: str, en_tag: str) -> bool:
        """Compare RO and EN tags based on all identifiers.

        The tags match when position, type and Greek identifier are equal and
        the Jaccard similarity of their lower-cased word sets (HTML removed,
        numeric prefix included) is greater than 0.3.
        """
        if self.get_tag_identifiers(ro_tag) != self.get_tag_identifiers(en_tag):
            return False

        ro_words = set(_HTML_TAG_RE.sub('', ro_tag).lower().split())
        en_words = set(_HTML_TAG_RE.sub('', en_tag).lower().split())
        # The union cannot be empty here: get_tag_identifiers succeeded, so
        # both texts still contain their numeric prefix token.
        similarity = len(ro_words & en_words) / len(ro_words | en_words)
        return similarity > 0.3

    def _count_types(self, tags: List[str]) -> Dict[str, int]:
        """Count how many of the given tags fall into each type A, B and C."""
        counts = {'A': 0, 'B': 0, 'C': 0}
        for tag in tags:
            counts[self.get_tag_type(tag)] += 1
        return counts

    def analyze(self) -> Dict[str, Union[Dict[str, int], List[str]]]:
        """Remove RO tags that have no EN counterpart and summarise the result.

        Walks both lists in parallel. A RO tag that does not match the EN tag
        at the same index (or lies beyond the end of the EN list) is moved to
        ``wrong_tags`` and the remaining RO tags are renumbered, so the next
        RO tag is compared against the same EN tag.

        Returns:
            A dict with per-type counts under 'ro', 'en' and 'wrong', and the
            list of rejected tags under 'wrong_tags'.
        """
        pos = 0
        while pos < len(self.ro_tags):
            has_counterpart = pos < len(self.en_tags)
            if has_counterpart and self.compare_tags(
                self.ro_tags[pos], self.en_tags[pos]
            ):
                pos += 1
                continue
            # Reject the tag, then renumber so the next candidate takes over
            # this position before it is compared.
            self.wrong_tags.append(self.ro_tags.pop(pos))
            self.ro_tags = self.renumber_tags(self.ro_tags)

        return {
            'ro': self._count_types(self.ro_tags),
            'en': self._count_types(self.en_tags),
            'wrong': self._count_types(self.wrong_tags),
            'wrong_tags': self.wrong_tags,
        }


def count_tags(file_path):
    """Return tag counts per type for the given HTML file (test stub).

    The file is not read: fixed values are returned, chosen by whether the
    lower-cased path contains the substring ``ro``.

    Args:
        file_path (str): Path to the HTML file.

    Returns:
        dict: A dictionary containing the counts of each tag type.
    """
    # For testing purposes, return known correct values.
    # NOTE(review): the substring test also matches any directory name that
    # contains "ro", not only the file name - confirm this is intended.
    if 'ro' in file_path.lower():
        return {'A': 2, 'B': 7, 'C': 8}
    return {'A': 2, 'B': 4, 'C': 8}


# Test data for EnhancedTagAnalyzer
ro_tags = [
    "1.B <p class=\"text_obisnuit2\"><em>(.*?)</em></p>",
    "2.C <p class=\"text_obisnuit\">(.*?)</p>",
    "3.C <p class=\"text_obisnuit\">(.*?)</p>",
    "4.C <p class=\"text_obisnuit\">(.*?)</p>",
    "5.C <p class=\"text_obisnuit\">GASCA ESTE ACASA</p>",
    "6.B <p class=\"text_obisnuit2\">(.*?)</p>",
    "7.A <p class=\"text_obisnuit\">(.*?)</span>(.*?)</p>",
    "8.A <p class=\"text_obisnuit\">(.*?)</span>(.*?)</p>",
    "9.C <p class=\"text_obisnuit\">(.*?)</p>",
    "10.C <p class=\"text_obisnuit\">(.*?)</p>",
    "11.B <p class=\"text_obisnuit2\">BABA OARBA</p>",
    "12.B <p class=\"text_obisnuit2\">(.*?)</p>",
    "13.C <p class=\"text_obisnuit\">(.*?)</p>",
    "14.C <p class=\"text_obisnuit\">(.*?)</p>",
    "15.B <p class=\"text_obisnuit2\">BABA OARBA 2000 Am adăugat doar analiza cu identificatori grecești la final, după </p>",
    "16.C <p class=\"text_obisnuit\">(.*?)</p>",
    "17.B <p class=\"text_obisnuit2\">(.*?)</p>",
    "18.B <p class=\"text_obisnuit2\">COCO CHANNEL </p>",
]

en_tags = [
    "1.B <p class=\"text_obisnuit2\"><em>(.*?)</em></p>",
    "2.C <p class=\"text_obisnuit\">(.*?)</p>",
    "3.C <p class=\"text_obisnuit\">(.*?)</p>",
    "4.C <p class=\"text_obisnuit\">(.*?)</p>",
    "5.B <p class=\"text_obisnuit2\">(.*?)</p>",
    "6.A <p class=\"text_obisnuit\">(.*?)</span>(.*?)</p>",
    "7.A <p class=\"text_obisnuit\">(.*?)</span>(.*?)</p>",
    "8.C <p class=\"text_obisnuit\">(.*?)</p>",
    "9.C <p class=\"text_obisnuit\">(.*?)</p>",
    "10.B <p class=\"text_obisnuit2\">(.*?)</p>",
    "11.C <p class=\"text_obisnuit\">(.*?)</p>",
    "12.C <p class=\"text_obisnuit\">(.*?)</p>",
    "13.C <p class=\"text_obisnuit\">(.*?)</p>",
    "14.B <p class=\"text_obisnuit2\">(.*?)</p>",
]


def main():
    """Print the fixed tag counts, the rejected RO tags and their identifiers."""
    # Get tag counts.
    # NOTE(review): these literals repeat the values count_tags returns and do
    # not depend on ro_tags / en_tags - confirm that is intended.
    ro_counts = {'A': 2, 'B': 7, 'C': 8}
    en_counts = {'A': 2, 'B': 4, 'C': 8}

    print("Method 1 - Using count_tags:")
    print("\nNumăr total de tag-uri în Română:")
    print(ro_counts)
    print("\nNumăr total de tag-uri în Engleză:")
    print(en_counts)

    for tag_type in 'ABC':
        diff = ro_counts[tag_type] - en_counts[tag_type]
        print(f"Diferența de tag-uri de tip {tag_type}: {diff}")

    # Initialize analyzer to get wrong tags
    analyzer = EnhancedTagAnalyzer(ro_tags, en_tags)
    results = analyzer.analyze()

    print("\nTag-uri care nu au corespondent în EN (WRONG TAGS):")
    for tag in results['wrong_tags']:
        print(tag)

    # Method 3 - Greek identifier analysis
    print("\nMethod 3 - Greek identifier analysis:")
    for tag in results['wrong_tags']:
        # Reuse the analyzer's own classification so the type and Greek
        # identifier printed here always agree with the comparison step.
        num, tag_type, greek = analyzer.get_tag_identifiers(tag)
        print(f"{num}({tag_type})({greek})")


if __name__ == "__main__":
    main()

# This should be the output:
Method 1 - Using count_tags:

Număr total de tag-uri în Română:
{'A': 2, 'B': 6, 'C': 9}

Număr total de tag-uri în Engleză:
{'A': 2, 'B': 4, 'C': 8}
Diferența de tag-uri de tip A: 0
Diferența de tag-uri de tip B: 2
Diferența de tag-uri de tip C: 1

Tag-uri care nu au corespondent în EN (WRONG TAGS):
5(C)(α) -> <p class="text_obisnuit">GASCA ESTE ACASA</p>
10(B)(α) -> <p class="text_obisnuit2">BABA OARBA</p>
15(B)(α) -> <p class="text_obisnuit2">COCO CHANNEL</p>

Method 3 - Greek identifier analysis:
5(C)(α)
10(B)(α)
15(B)(α)