Python Forum
Form that puts diacritics on the words in the text
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Form that puts diacritics on the words in the text
#1
I made a code that must add diacritics to the text in the form. Python.

I also have two .txt files, dictionary.txt contains a phrase (a set of words) with diacritics. And dictionary-2.txt contains the same phrase, but without diacritics.

In the form I added the same phrase from dictionar-2.txt (without diacritics) to which the "Diacritics" button will add diacritics to words.

This is how I thought about the problem. I don't know how well I thought.

The code must compare the words from the FORM with those words that have diacritics from the dictionary.txt The code must see the similarities between the words, for example between "proporțională" and "proportionala" and add the missing letter from the list of diacritics in the code.


import tkinter as tk
from tkinter import filedialog, messagebox, simpledialog

# Încărcarea bazei de date din fișierul .txt
with open('dictionar.txt', 'r', encoding='utf-8') as f:
    continut_dictionar = f.read().lower()

with open('dictionar-2.txt', 'r', encoding='utf-8') as f:
    continut_dictionar_fara_diacritice = f.read().lower()

diacritice = {
    'a': 'ă', 'A': 'Ă', 'i': 'î', 'I': 'Î', 's': 'ș', 'S': 'Ș', 't': 'ț', 'T': 'Ț'
}

SUFIXE = ["-mi", "-a", "-ti", "-au"]
CUVINTE_SKIP = []

def elimina_sufix(cuvant):
    for sufix in SUFIXE:
        if cuvant.endswith(sufix):
            return cuvant[:-len(sufix)]
    return cuvant

def elimina_operatori(cuvant):
    operatori = [":", '"', "'", ".", "”", "“", ",", ";", "?", "!"]
    while cuvant and cuvant[0] in operatori:
        cuvant = cuvant[1:]
    while cuvant and cuvant[-1] in operatori:
        cuvant = cuvant[:-1]
    return cuvant

def verifica_text(cuvinte_text):
    text_input.tag_remove("evidentiat", "1.0", tk.END)  # Ștergem evidențierea anterioară
    start_index = "1.0"
    cuvant_gasit = False

    for cuv in cuvinte_text:
        cuv_baza = elimina_sufix(cuv).lower()
        cuv_baza = elimina_operatori(cuv_baza)
        end_index = start_index + f"+{len(cuv)}c"
        if cuv_baza not in continut_dictionar and cuv_baza not in CUVINTE_SKIP:
            text_input.tag_add("evidentiat", start_index, end_index)
            text_input.tag_configure("evidentiat", background="yellow", font=("Arial", 12, "bold"))
            cuvant_gasit = True
            break  # Întrerupem bucla după primul cuvânt nerecunoscut
        start_index = end_index + "+1c"

    if not cuvant_gasit:
        messagebox.showinfo("Informare", "Totul este ok!")

def modifica_cuvant():
    global continut_dictionar  # Declarăm variabila ca globală pentru a o putea modifica
    start_index = text_input.tag_ranges("evidentiat")[0]
    end_index = text_input.tag_ranges("evidentiat")[1]
    cuvant = text_input.get(start_index, end_index)

    cuvant_modificat = simpledialog.askstring("Modificare cuvânt", f"Modificați cuvântul '{cuvant}':")
    if cuvant_modificat:
        cuvant_modificat = elimina_sufix(cuvant_modificat).lower()
        cuvant_modificat = elimina_operatori(cuvant_modificat)

        if cuvant_modificat:
            text_input.delete(start_index, end_index)  # Șterge cuvântul evidentiat
            text_input.insert(start_index, cuvant_modificat)  # Inserează cuvântul modificat
            cuvinte_text = text_input.get("1.0", tk.END).split()  # Actualizează lista de cuvinte
            cuvinte_text = [elimina_sufix(cuv.lower()) for cuv in cuvinte_text]
            cuvinte_text = [elimina_operatori(cuv) for cuv in cuvinte_text]
            continut_dictionar += f", {cuvant_modificat}"
            with open('dictionar.txt', 'a', encoding='utf-8') as f:
                f.write(f", {cuvant_modificat}")  # Adaugă cuvântul modificat în dictionar.txt
            verifica_text(cuvinte_text)
            print("Cuvantul modificat a fost adaugat in dictionar.txt")

def skip_cuvant():
    start_index = text_input.tag_ranges("evidentiat")[0]
    end_index = text_input.tag_ranges("evidentiat")[1]
    cuvant = text_input.get(start_index, end_index)
    CUVINTE_SKIP.append(cuvant.lower())
    with open('dictionar.txt', 'a', encoding='utf-8') as f:
        f.write(f", {cuvant}")  # Adaugă cuvântul skip în dictionar.txt
    verifica_text(text_input.get("1.0", tk.END).split())
    print("Cuvantul a fost adaugat in dictionar.txt")

def adauga_diacritice():
    text = text_input.get("1.0", tk.END)
    cuvinte_text = text.split()

    cuvinte_cu_diacritice = []
    for cuv in cuvinte_text:
        cuv_fara_diacritice = cuv
        for diacritic_fara, diacritic_cu in diacritice.items():
            cuv_fara_diacritice = cuv_fara_diacritice.replace(diacritic_cu, diacritic_fara)
        cuv_cu_diacritice = cuv_fara_diacritice
        for diacritic_fara, diacritic_cu in diacritice.items():
            cuv_cu_diacritice = cuv_cu_diacritice.replace(diacritic_fara, diacritic_cu)
        cuvinte_cu_diacritice.append(cuv_cu_diacritice)

    text_cu_diacritice = " ".join(cuvinte_cu_diacritice)
    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text_cu_diacritice)
    print("Diacritice au fost adaugate")

def main():
    global text_input
    root = tk.Tk()
    root.title("Adăugare Diacritice")

    text_input = tk.Text(root, height=20, width=50)
    text_input.pack(pady=20)

    btn_verifica = tk.Button(root, text="Verificare", command=lambda: verifica_text(text_input.get("1.0", tk.END).split()))
    btn_verifica.pack(side=tk.LEFT, padx=10)

    btn_modifica = tk.Button(root, text="Modificare", command=modifica_cuvant)
    btn_modifica.pack(side=tk.LEFT, padx=10)

    btn_skip = tk.Button(root, text="Skip", command=skip_cuvant)
    btn_skip.pack(side=tk.LEFT, padx=10)

    btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
    btn_diacritice.pack(side=tk.LEFT, padx=10)

    root.mainloop()

if __name__ == "__main__":
    main()
This is the sentence from dictionar.txt (With diacritics)

Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.

This is the sentence from dictionar-2.txt (Without diacritics)

Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.

So, when I put in the FORM words such as: ,

nu sufleteasca cu Compatibilitatea direct valoarea intensitatii proportionala sentimentelor este

and when I press "Diacritice" button, those words must become:

nu sufletească cu Compatibilitatea direct valoarea intensităţii proporţională sentimentelor este

Right now, the cod puts random the diacritics, totaly wrong.

maybe I didn't know how to think the problem through?
Reply
#2
Why not post a few lines of each .txt file, then people can experiment, try to find the best way to do this.

Can't be too difficult!
Reply
#3
did you read carefully what I wrote in the post?

This is the sentence from dictionar.txt (With diacritics)

Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.

This is the sentence from dictionar-2.txt (Without diacritics)

Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.

In the FORM I put random words, chosen from the same sentence on dictionar-2.txt

nu sufleteasca cu Compatibilitatea direct valoarea intensitatii proportionala sentimentelor este
Reply
#4
Maybe I don't understand the assignment, but I think this is incorrect:

Quote:The code must compare the words from the FORM with those words that have diacritics from the dictionary.txt The code must see the similarities between the words, for example between "proporțională" and "proportionala" and add the missing letter from the list of diacritics in the code.

I think the code should compare words from the FORM with words that do not have diacritics from the dictionary-2.txt. If a match is found, the word in the form should be replaced with the corresponding word that has diacritics from dictinary.txt.

What you want to do is make a plain-text->diacritics dictionary. You need to parse both files to find words, then make a dictionary using the words you found. Pedroski55's interest in seeing what the files look like probably has to do with seeing what separators appear between words. Your example sentences show spaces, periods and commas. Are there any question marks. Is this a complete list of punctuation?
operatori = [":", '"', "'", ".", "”", "“", ",", ";", "?", "!"]
I think making the translation dictionary would look something like this:
import re


def get_words(filename):
    """Read words from file.  Remove punctuation and convert to lowercase.  Return list of words."""
    punctuation = re.compile("[:'\.”“;\?!]")
    with open(filename, 'r', encoding='utf-8') as f:
        return [re.sub(punctuation, " ", word).lower() for word in f.read().split()]


# Make dictionary.  key = plain text word, value = same word with dicritics
plain_2_diacritic = dict(zip(get_words("dictionary-2.txt"), get_words("dictionary.txt")))
Reply
#5
yes, the code should compare words from the FORM with words that do not have diacritics from the dictionary-2.txt. If a match is found, the word in the form should be replaced with the corresponding word that has diacritics from dictinary.txt.

I change the code. The Print is ok, but in the form nothing changes. Why ?


import re
import tkinter as tk
from tkinter import filedialog, messagebox, simpledialog

root = None

# Încărcarea bazei de date din fișierul .txt
with open('dictionar.txt', 'r', encoding='utf-8') as f:
    continut_dictionar = f.read().lower()

with open('dictionar-2.txt', 'r', encoding='utf-8') as f:
    continut_dictionar_fara_diacritice = f.read().lower()

diacritice = {
    'a': 'ă', 'A': 'Ă', 'i': 'î', 'I': 'Î', 's': 'ș', 'S': 'Ș', 't': 'ț', 'T': 'Ț'
}

SUFIXE = ["-mi", "-a", "-ti", "-au"]
CUVINTE_SKIP = []

def get_words(filename):
    """Read words from file.  Remove punctuation and convert to lowercase.  Return list of words."""
    punctuation = re.compile("[:'\.”“;\?!]")
    with open(filename, 'r', encoding='utf-8') as f:
        return [re.sub(punctuation, " ", word).lower() for word in f.read().split()]
def elimina_sufix(cuvant):
    for sufix in SUFIXE:
        if cuvant.endswith(sufix):
            return cuvant[:-len(sufix)]
    return cuvant

def elimina_operatori(cuvant):
    operatori = [":", '"', "'", ".", "”", "“", ",", ";", "?", "!"]
    while cuvant and cuvant[0] in operatori:
        cuvant = cuvant[1:]
    while cuvant and cuvant[-1] in operatori:
        cuvant = cuvant[:-1]
    return cuvant

def verifica_text(cuvinte_text):
    text_input.tag_remove("evidentiat", "1.0", tk.END)  # Ștergem evidențierea anterioară
    start_index = "1.0"
    cuvant_gasit = False

    for cuv in cuvinte_text:
        cuv_baza = elimina_sufix(cuv).lower()
        cuv_baza = elimina_operatori(cuv_baza)
        end_index = start_index + f"+{len(cuv)}c"
        if cuv_baza not in continut_dictionar and cuv_baza not in CUVINTE_SKIP:
            text_input.tag_add("evidentiat", start_index, end_index)
            text_input.tag_configure("evidentiat", background="yellow", font=("Arial", 12, "bold"))
            cuvant_gasit = True
            break  # Întrerupem bucla după primul cuvânt nerecunoscut
        start_index = end_index + "+1c"

    if not cuvant_gasit:
        messagebox.showinfo("Informare", "Totul este ok!")

def modifica_cuvant():
    global continut_dictionar  # Declarăm variabila ca globală pentru a o putea modifica
    start_index = text_input.tag_ranges("evidentiat")[0]
    end_index = text_input.tag_ranges("evidentiat")[1]
    cuvant = text_input.get(start_index, end_index)

    cuvant_modificat = simpledialog.askstring("Modificare cuvânt", f"Modificați cuvântul '{cuvant}':")
    if cuvant_modificat:
        cuvant_modificat = elimina_sufix(cuvant_modificat).lower()
        cuvant_modificat = elimina_operatori(cuvant_modificat)

        if cuvant_modificat:
            text_input.delete(start_index, end_index)  # Șterge cuvântul evidentiat
            text_input.insert(start_index, cuvant_modificat)  # Inserează cuvântul modificat
            cuvinte_text = text_input.get("1.0", tk.END).split()  # Actualizează lista de cuvinte
            cuvinte_text = [elimina_sufix(cuv.lower()) for cuv in cuvinte_text]
            cuvinte_text = [elimina_operatori(cuv) for cuv in cuvinte_text]
            continut_dictionar += f", {cuvant_modificat}"
            with open('dictionar.txt', 'a', encoding='utf-8') as f:
                f.write(f", {cuvant_modificat}")  # Adaugă cuvântul modificat în dictionar.txt
            verifica_text(cuvinte_text)
            print("Cuvantul modificat a fost adaugat in dictionar.txt")

def skip_cuvant():
    start_index = text_input.tag_ranges("evidentiat")[0]
    end_index = text_input.tag_ranges("evidentiat")[1]
    cuvant = text_input.get(start_index, end_index)
    CUVINTE_SKIP.append(cuvant.lower())
    with open('dictionar.txt', 'a', encoding='utf-8') as f:
        f.write(f", {cuvant}")  # Adaugă cuvântul skip în dictionar.txt
    verifica_text(text_input.get("1.0", tk.END).split())
    print("Cuvantul a fost adaugat in dictionar.txt")


def adauga_diacritice():
    text = text_input.get("1.0", tk.END)
    cuvinte_text = text.split()

    cuvinte_cu_diacritice = []
    for cuv in cuvinte_text:
        cuv_fara_diacritice = cuv
        if cuv_fara_diacritice in plain_2_diacritic:
            cuv_cu_diacritice = plain_2_diacritic[cuv_fara_diacritice]
            cuvinte_cu_diacritice.append(cuv_cu_diacritice)
        else:
            cuvinte_cu_diacritice.append(cuv_fara_diacritice)

    text_cu_diacritice = " ".join(cuvinte_cu_diacritice)

    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text_cu_diacritice)

    # Actualizează interfața grafică
    global root
    root.update()

    print("Diacritice au fost adăugate")

def main():
    global text_input
    text_input = tk.Text(root, height=20, width=50)
    text_input.pack(pady=20)

    btn_verifica = tk.Button(root, text="Verificare", command=lambda: verifica_text(text_input.get("1.0", tk.END).split()))
    btn_verifica.pack(side=tk.LEFT, padx=10)

    btn_modifica = tk.Button(root, text="Modificare", command=modifica_cuvant)
    btn_modifica.pack(side=tk.LEFT, padx=10)

    btn_skip = tk.Button(root, text="Skip", command=skip_cuvant)
    btn_skip.pack(side=tk.LEFT, padx=10)

    btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
    btn_diacritice.pack(side=tk.LEFT, padx=10)

# Eliminăm definiția globală din main() și o mutăm aici
root = tk.Tk()
root.title("Adăugare Diacritice")

if __name__ == "__main__":
    # Create the plain to diacritic translation dictionary
    plain_2_diacritic = dict(zip(get_words("dictionar-2.txt"), get_words("dictionar.txt")))
    print("Plain to Diacritic Dictionary:")
    print(plain_2_diacritic)

    main()  # Apelăm funcția main() pentru a începe interfața grafică
    root.mainloop()  # Rulăm bucla principală a interfeței grafice
Reply
#6
It works for me. In the code below I extracted the adauga_diacritice() function and supporting players so I could focus on one thing at a time. Press the button and the test text (gibberish to test case and punctuation) is translated.
import tkinter as tk
import re


def get_words(text, lower=False):
    """Return words from file.  Optionally set to lowercase."""
    words = re.split(r"[:;,\.\"'”“\?! \n]+", text)
    if lower:
        return [word.lower() for word in words]
    return words


def adauga_diacritice():
    text = text_input.get("1.0", tk.END)
    # If word in dictionary, replace, else keep word.
    words = [plain_2_diacritic.get(word, word) for word in get_words(text, lower=True)]
    text = " ".join(words)
    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text)


def make_diacritic_word_dictionary():
    """Create dictionary to convert plain text words to words with diacritics."""
    d = get_words(
        "Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.",
        lower=True,
    )
    d2 = get_words(
        "Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.",
        lower=True,
    )
    # Only add words containing diacritics.
    return {w2: w for w, w2 in zip(d, d2) if w != w2}


plain_2_diacritic = make_diacritic_word_dictionary()

root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.insert(
    1.0, "Sufleteasca nu este direct proportionala, cu valoarea intensitatii."
)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop()
There are some obvious problems though. All punctuation gets removed from the text, and all words are lower case. More care must be taken when modifying the original text. Only words with diacritics should be modified.
import tkinter as tk
import re


def get_words(text, lower=False):
    """Return words from file.  Optionally set to lowercase."""
    words = re.split(r"[:;,\.\"'”“\?! \n]+", text)
    if lower:
        return [word.lower() for word in words]
    return words


def replace(text, old, new):
    """In text replace old with new.  Adjust case in new to match case in old."""
    # Magic happens here


def adauga_diacritice():
    text = text_input.get("1.0", tk.END)
    words = get_words(text)
    for old in words:
        new= plain_2_diacritic.get(old.lower(), None)
        if replacement is not None:
            text = replace(text, old, new)
    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text)


def make_diacritic_word_dictionary():
    """Create dictionary to convert plain text words to words with diacritics."""
    d = get_words(
        "Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.",
        lower=True,
    )
    d2 = get_words(
        "Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.",
        lower=True,
    )
    # Only add words containing diacritics.
    return {w2: w for w, w2 in zip(d, d2) if w != w2}


plain_2_diacritic = make_diacritic_word_dictionary()

root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.insert(
    1.0, "Sufleteasca nu este direct proportionala, cu valoarea intensitatii."
)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop()
Oh, and I almost forgot, there was an error in get_words). Punctuation should be replaced by an empty string, not a blank.
Reply
#7
thanks, deanhystad. You are really great !

I will save here another version, just a little bit different.

import tkinter as tk
import re

def adauga_diacritice():
    text = text_input.get("1.0", tk.END)
    words = []

    for word in get_words(text):
        if word.lower() in plain_2_diacritic:
            diacritic_word = plain_2_diacritic[word.lower()]
            if word.istitle():  # Preserve title case
                words.append(diacritic_word.capitalize())
            elif word.isupper():  # Preserve uppercase
                words.append(diacritic_word.upper())
            else:
                words.append(diacritic_word)
        else:
            words.append(word)

    text = " ".join(words)
    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text)

def make_diacritic_word_dictionary():
    d = get_words(
        "Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor."
    )
    d2 = get_words(
        "Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor."
    )
    return {w2: w for w, w2 in zip(d, d2) if w != w2}

def get_words(text):
    return re.findall(r'\b\w+\b', text)

plain_2_diacritic = make_diacritic_word_dictionary()

root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.insert(
    1.0, "Sufleteasca nu este direct proportionala, cu valoarea intensitatii."
)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop()
OR THIS:


import tkinter as tk
import re

punctuation = re.compile("[:;,\.\"'”“\?!]")

def get_words(text, lower=False):
    words = [re.sub(punctuation, "", word) for word in text.split()]
    if lower:
        return [word.lower() for word in words]
    return words

def replace(text, old, new):
    return re.sub(r'\b' + re.escape(old) + r'\b', new, text, flags=re.IGNORECASE)

def adauga_diacritice():
    text = text_input.get("1.0", tk.END)
    words = get_words(text)
    for word in words:
        replacement = plain_2_diacritic.get(word.lower(), None)
        if replacement is not None:
            text = replace(text, word, replacement)
    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text)

def make_diacritic_word_dictionary():
    d = get_words(
        "Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.",
        lower=True,
    )
    d2 = get_words(
        "Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.",
        lower=True,
    )
    return {w2: w for w, w2 in zip(d, d2) if w != w2}

plain_2_diacritic = make_diacritic_word_dictionary()

root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.insert(
    1.0, "Sufleteasca nu este direct proportionala, cu valoarea intensitatii."
)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop()
Reply
#8
Version 5 (Leep UpperCase, remove default text, retrieve data from dictionar.txt and dictionar-2.txt)

import tkinter as tk
import re

punctuation = re.compile("[:;,\.\"'”“\?!]")

def get_words(text, lower=False):
    words = [re.sub(punctuation, "", word) for word in text.split()]
    if lower:
        return [word.lower() for word in words]
    return words

def replace(text, old, new):
    # Replace old with new in text, keeping the case of old intact
    # Check if old starts with an uppercase letter
    if old[0].isupper():
        new = new.capitalize()
    return re.sub(r'\b' + re.escape(old) + r'\b', new, text, flags=re.IGNORECASE)

def adauga_diacritice():
    text = text_input.get("1.0", tk.END)
    words = get_words(text)
    for word in words:
        replacement = plain_2_diacritic.get(word.lower(), None)
        if replacement is not None:
            text = replace(text, word, replacement)
    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text)

def make_diacritic_word_dictionary():
    with open("dictionar.txt", "r", encoding="utf-8") as f1, open("dictionar-2.txt", "r", encoding="utf-8") as f2:
        d = get_words(f1.read(), lower=True)
        d2 = get_words(f2.read(), lower=True)
    return {w2: w for w, w2 in zip(d, d2) if w != w2}

plain_2_diacritic = make_diacritic_word_dictionary()

root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop()
Reply
#9
This doesn't maintain the case from the original form text.
def replace(text, old, new):
    return re.sub(r'\b' + re.escape(old) + r'\b', new, text, flags=re.IGNORECASE)
This does a better job, but I don't like it much.
    for word in get_words(text):
        if word.lower() in plain_2_diacritic:
            diacritic_word = plain_2_diacritic[word.lower()]
            if word.istitle():  # Preserve title case
                words.append(diacritic_word.capitalize())
            elif word.isupper():  # Preserve uppercase
                words.append(diacritic_word.upper())
            else:
                words.append(diacritic_word)
        else:
            words.append(word)
It is too complicated, and too special case. For example, iit will fail for words that have an uppercase letter other than the start.

You have two words. "word" has the desired case from the form, but it doesn't have diacritics. diacritic_word has the diacritics, but is all lowercase. Is there some way to copy the case from word to diacritic_word? Think about it. Think about a very generic approach with no specific cases.

And this throws away all the punctuation.
text = " ".join(words)
Can you think of a way where you can replace one word in a str with a different word, without changing anything else in the str?
Melcu54 likes this post
Reply
#10
Final Version (I add 3 new buttons: Verificare, Modificare and SKIP)

Verificare (check each word if it exists in dictionar.txt)
Modificare (edit the word and add it to dictionar.txt, and without diacritics into dictionar-2.txt)
Skip (If the word exists in dictionar.txt, it goes over it, if it doesn't, it is automatically added to the dictionar.txt, and without diacritics into dictionar-2.txt )


import tkinter as tk
import re
from tkinter import messagebox, simpledialog
from unidecode import unidecode  # Importați unidecode

punctuation = re.compile("[:;,\.\"'”“\?!]")

def get_words(text, lower=False):
    words = [re.sub(punctuation, "", word) for word in text.split()]
    if lower:
        return [word.lower() for word in words]
    return words

def replace(text, old, new):
    if old[0].isupper():
        new = new.capitalize()
    return re.sub(r'\b' + re.escape(old) + r'\b', new, text, flags=re.IGNORECASE)

def adauga_diacritice():
    text = text_input.get("1.0", tk.END)
    words = get_words(text)
    for word in words:
        replacement = plain_2_diacritic.get(word.lower(), None)
        if replacement is not None:
            text = replace(text, word, replacement)
    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text)

def elimina_sufix(cuvant):
    for sufix in SUFIXE:
        if cuvant.endswith(sufix):
            return cuvant[:-len(sufix)]
    return cuvant

def elimina_operatori(cuvant):
    operatori = [":", '"', "'", ".", "”", "“", ",", ";", "?", "!"]
    while cuvant and cuvant[0] in operatori:
        cuvant = cuvant[1:]
    while cuvant and cuvant[-1] in operatori:
        cuvant = cuvant[:-1]
    return cuvant

def verifica_text(cuvinte_text):
    text_input.tag_remove("evidentiat", "1.0", tk.END)
    start_index = "1.0"
    cuvant_gasit = False

    for cuv in cuvinte_text:
        cuv_baza = elimina_sufix(cuv).lower()
        cuv_baza = elimina_operatori(cuv_baza)
        end_index = start_index + f"+{len(cuv)}c"
        if cuv_baza not in continut_dictionar and cuv_baza not in CUVINTE_SKIP:
            text_input.tag_add("evidentiat", start_index, end_index)
            text_input.tag_configure("evidentiat", background="yellow", font=("Arial", 12, "bold"))
            cuvant_gasit = True
            break
        start_index = end_index + "+1c"

    if not cuvant_gasit:
        messagebox.showinfo("Informare", "Totul este ok!")

def modifica_cuvant():
    global continut_dictionar
    start_index = text_input.tag_ranges("evidentiat")[0]
    end_index = text_input.tag_ranges("evidentiat")[1]
    cuvant = text_input.get(start_index, end_index)

    cuvant_modificat = simpledialog.askstring("Modificare cuvânt", f"Modificați cuvântul '{cuvant}':")
    if cuvant_modificat:
        cuvant_modificat = elimina_sufix(cuvant_modificat).lower()
        cuvant_modificat = elimina_operatori(cuvant_modificat)

        if cuvant_modificat:
            text_input.delete(start_index, end_index)
            text_input.insert(start_index, cuvant_modificat)
            cuvinte_text = text_input.get("1.0", tk.END).split()
            cuvinte_text = [elimina_sufix(cuv.lower()) for cuv in cuvinte_text]
            cuvinte_text = [elimina_operatori(cuv) for cuv in cuvinte_text]
            cuvant_baza = elimina_sufix(cuvant).lower()
            cuvant_baza = elimina_operatori(cuvant_baza)
            continut_dictionar += f", {cuvant_modificat}"
            with open('dictionar.txt', 'a', encoding='utf-8') as f:
                f.write(f", {cuvant_modificat}")
            if cuvant_baza in CUVINTE_SKIP:
                CUVINTE_SKIP.remove(cuvant_baza)
            verifica_text(cuvinte_text)

            # Adăugăm cuvântul modificat în dictionar-2.txt fără diacritice
            cuvant_fara_diacritice = unidecode(cuvant_modificat)
            with open('dictionar-2.txt', 'a', encoding='utf-8') as f:
                f.write(f", {cuvant_fara_diacritice}")

def skip_cuvant():
    tag_ranges = text_input.tag_ranges("evidentiat")
    if tag_ranges:
        start_index = tag_ranges[0]
        end_index = tag_ranges[1]
        cuvant = text_input.get(start_index, end_index)
        cuvant_baza = elimina_sufix(cuvant).lower()
        cuvant_baza = elimina_operatori(cuvant_baza)
        if cuvant_baza not in continut_dictionar and cuvant_baza not in CUVINTE_SKIP:
            CUVINTE_SKIP.append(cuvant_baza)
            with open('dictionar.txt', 'a', encoding='utf-8') as f:
                f.write(f", {cuvant}")

            # Adăugăm cuvântul skip în dictionar-2.txt fără diacritice
            cuvant_fara_diacritice = unidecode(cuvant)
            with open('dictionar-2.txt', 'a', encoding='utf-8') as f:
                f.write(f", {cuvant_fara_diacritice}")

        verifica_text(text_input.get("1.0", tk.END).split())
    else:
        messagebox.showinfo("Eroare", "Nu există cuvinte evidențiate pentru a fi ignorate.")


def make_diacritic_word_dictionary():
    with open("dictionar.txt", "r", encoding="utf-8") as f1, open("dictionar-2.txt", "r", encoding="utf-8") as f2:
        d = get_words(f1.read(), lower=True)
        d2 = get_words(f2.read(), lower=True)
    return {w2: w for w, w2 in zip(d, d2) if w != w2}

plain_2_diacritic = make_diacritic_word_dictionary()

SUFIXE = ["-mi", "-a", "-ti", "-au"]
CUVINTE_SKIP = []

with open('dictionar.txt', 'r', encoding='utf-8') as f:
    continut_dictionar = f.read().lower()

root = tk.Tk()
root.title("Adăugare Diacritice")

text_input = tk.Text(root, height=20, width=50)
text_input.pack(pady=20)

btn_verifica = tk.Button(root, text="Verificare", command=lambda: verifica_text(text_input.get("1.0", tk.END).split()))
btn_verifica.pack(side=tk.LEFT, padx=10)

btn_modifica = tk.Button(root, text="Modificare", command=modifica_cuvant)
btn_modifica.pack(side=tk.LEFT, padx=10)

btn_skip = tk.Button(root, text="Skip", command=skip_cuvant)
btn_skip.pack(side=tk.LEFT, padx=10)

btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)

root.mainloop()
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  Extract a string between 2 words from a text file OscarBoots 2 1,886 Nov-02-2021, 08:50 AM
Last Post: ibreeden
  Generate a string of words for multiple lists of words in txt files in order. AnicraftPlayz 2 2,835 Aug-11-2021, 03:45 PM
Last Post: jamesaarr
  Open and read multiple text files and match words kozaizsvemira 3 6,775 Jul-07-2021, 11:27 AM
Last Post: Larz60+
  Counting the most relevant words in a text file caiomartins 2 2,502 Sep-21-2020, 08:39 AM
Last Post: caiomartins
  Web Form to Python Script to Text File to zip file to web wfsteadman 1 2,153 Aug-09-2020, 02:12 PM
Last Post: snippsat
  Check text contains words similar to themes/topics (thesaurus) Bec 1 32,244 Jul-28-2020, 04:17 PM
Last Post: Larz60+
  Need Help Typing Text into Tough Form [xpath / selenium] digitalmatic7 0 1,767 Jun-05-2019, 06:46 AM
Last Post: digitalmatic7
  Creating Dictionary form LOG /text file DG1234 7 5,497 Feb-13-2019, 08:08 PM
Last Post: DG1234
  Counting words in text dan789 4 2,690 Nov-11-2018, 07:37 PM
Last Post: dan789
  Compare all words in input() to all words in file Trianne 1 2,779 Oct-05-2018, 06:27 PM
Last Post: ichabod801

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020