Posts: 61
Threads: 21
Joined: May 2021
Aug-17-2023, 04:11 PM
(This post was last modified: Aug-17-2023, 04:12 PM by Melcu54.)
I made a code that must add diacritics to the text in the form. Python.
I also have two .txt files, dictionary.txt contains a phrase (a set of words) with diacritics. And dictionary-2.txt contains the same phrase, but without diacritics.
In the form I added the same phrase from dictionar-2.txt (without diacritics) to which the "Diacritics" button will add diacritics to words.
This is how I thought about the problem. I don't know how well I thought.
The code must compare the words from the FORM with those words that have diacritics from the dictionary.txt The code must see the similarities between the words, for example between "proporțională" and "proportionala" and add the missing letter from the list of diacritics in the code.
import tkinter as tk
from tkinter import filedialog, messagebox, simpledialog
# Încărcarea bazei de date din fișierul .txt
with open('dictionar.txt', 'r', encoding='utf-8') as f:
continut_dictionar = f.read().lower()
with open('dictionar-2.txt', 'r', encoding='utf-8') as f:
continut_dictionar_fara_diacritice = f.read().lower()
diacritice = {
'a': 'ă', 'A': 'Ă', 'i': 'î', 'I': 'Î', 's': 'ș', 'S': 'Ș', 't': 'ț', 'T': 'Ț'
}
SUFIXE = ["-mi", "-a", "-ti", "-au"]
CUVINTE_SKIP = []
def elimina_sufix(cuvant):
for sufix in SUFIXE:
if cuvant.endswith(sufix):
return cuvant[:-len(sufix)]
return cuvant
def elimina_operatori(cuvant):
operatori = [":", '"', "'", ".", "”", "“", ",", ";", "?", "!"]
while cuvant and cuvant[0] in operatori:
cuvant = cuvant[1:]
while cuvant and cuvant[-1] in operatori:
cuvant = cuvant[:-1]
return cuvant
def verifica_text(cuvinte_text):
text_input.tag_remove("evidentiat", "1.0", tk.END) # Ștergem evidențierea anterioară
start_index = "1.0"
cuvant_gasit = False
for cuv in cuvinte_text:
cuv_baza = elimina_sufix(cuv).lower()
cuv_baza = elimina_operatori(cuv_baza)
end_index = start_index + f"+{len(cuv)}c"
if cuv_baza not in continut_dictionar and cuv_baza not in CUVINTE_SKIP:
text_input.tag_add("evidentiat", start_index, end_index)
text_input.tag_configure("evidentiat", background="yellow", font=("Arial", 12, "bold"))
cuvant_gasit = True
break # Întrerupem bucla după primul cuvânt nerecunoscut
start_index = end_index + "+1c"
if not cuvant_gasit:
messagebox.showinfo("Informare", "Totul este ok!")
def modifica_cuvant():
global continut_dictionar # Declarăm variabila ca globală pentru a o putea modifica
start_index = text_input.tag_ranges("evidentiat")[0]
end_index = text_input.tag_ranges("evidentiat")[1]
cuvant = text_input.get(start_index, end_index)
cuvant_modificat = simpledialog.askstring("Modificare cuvânt", f"Modificați cuvântul '{cuvant}':")
if cuvant_modificat:
cuvant_modificat = elimina_sufix(cuvant_modificat).lower()
cuvant_modificat = elimina_operatori(cuvant_modificat)
if cuvant_modificat:
text_input.delete(start_index, end_index) # Șterge cuvântul evidentiat
text_input.insert(start_index, cuvant_modificat) # Inserează cuvântul modificat
cuvinte_text = text_input.get("1.0", tk.END).split() # Actualizează lista de cuvinte
cuvinte_text = [elimina_sufix(cuv.lower()) for cuv in cuvinte_text]
cuvinte_text = [elimina_operatori(cuv) for cuv in cuvinte_text]
continut_dictionar += f", {cuvant_modificat}"
with open('dictionar.txt', 'a', encoding='utf-8') as f:
f.write(f", {cuvant_modificat}") # Adaugă cuvântul modificat în dictionar.txt
verifica_text(cuvinte_text)
print("Cuvantul modificat a fost adaugat in dictionar.txt")
def skip_cuvant():
start_index = text_input.tag_ranges("evidentiat")[0]
end_index = text_input.tag_ranges("evidentiat")[1]
cuvant = text_input.get(start_index, end_index)
CUVINTE_SKIP.append(cuvant.lower())
with open('dictionar.txt', 'a', encoding='utf-8') as f:
f.write(f", {cuvant}") # Adaugă cuvântul skip în dictionar.txt
verifica_text(text_input.get("1.0", tk.END).split())
print("Cuvantul a fost adaugat in dictionar.txt")
def adauga_diacritice():
text = text_input.get("1.0", tk.END)
cuvinte_text = text.split()
cuvinte_cu_diacritice = []
for cuv in cuvinte_text:
cuv_fara_diacritice = cuv
for diacritic_fara, diacritic_cu in diacritice.items():
cuv_fara_diacritice = cuv_fara_diacritice.replace(diacritic_cu, diacritic_fara)
cuv_cu_diacritice = cuv_fara_diacritice
for diacritic_fara, diacritic_cu in diacritice.items():
cuv_cu_diacritice = cuv_cu_diacritice.replace(diacritic_fara, diacritic_cu)
cuvinte_cu_diacritice.append(cuv_cu_diacritice)
text_cu_diacritice = " ".join(cuvinte_cu_diacritice)
text_input.delete("1.0", tk.END)
text_input.insert("1.0", text_cu_diacritice)
print("Diacritice au fost adaugate")
def main():
global text_input
root = tk.Tk()
root.title("Adăugare Diacritice")
text_input = tk.Text(root, height=20, width=50)
text_input.pack(pady=20)
btn_verifica = tk.Button(root, text="Verificare", command=lambda: verifica_text(text_input.get("1.0", tk.END).split()))
btn_verifica.pack(side=tk.LEFT, padx=10)
btn_modifica = tk.Button(root, text="Modificare", command=modifica_cuvant)
btn_modifica.pack(side=tk.LEFT, padx=10)
btn_skip = tk.Button(root, text="Skip", command=skip_cuvant)
btn_skip.pack(side=tk.LEFT, padx=10)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop()
if __name__ == "__main__":
main() This is the sentence from dictionar.txt (With diacritics)
Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.
This is the sentence from dictionar-2.txt (Without diacritics)
Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.
So, when I put in the FORM words such as: ,
nu sufleteasca cu Compatibilitatea direct valoarea intensitatii proportionala sentimentelor este
and when I press "Diacritice" button, those words must become:
nu sufletească cu Compatibilitatea direct valoarea intensităţii proporţională sentimentelor este
Right now, the cod puts random the diacritics, totaly wrong.
maybe I didn't know how to think the problem through?
Posts: 851
Threads: 134
Joined: Jul 2017
Why not post a few lines of each .txt file, then people can experiment, try to find the best way to do this.
Can't be too difficult!
Posts: 61
Threads: 21
Joined: May 2021
Aug-18-2023, 10:21 AM
(This post was last modified: Aug-18-2023, 10:22 AM by Melcu54.)
did you read carefully what I wrote in the post?
This is the sentence from dictionar.txt (With diacritics)
Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.
This is the sentence from dictionar-2.txt (Without diacritics)
Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.
In the FORM I put random words, chosen from the same sentence on dictionar-2.txt
nu sufleteasca cu Compatibilitatea direct valoarea intensitatii proportionala sentimentelor este
Posts: 6,252
Threads: 16
Joined: Feb 2020
Maybe I don't understand the assignment, but I think this is incorrect:
Quote:The code must compare the words from the FORM with those words that have diacritics from the dictionary.txt The code must see the similarities between the words, for example between "proporțională" and "proportionala" and add the missing letter from the list of diacritics in the code.
I think the code should compare words from the FORM with words that do not have diacritics from the dictionary-2.txt. If a match is found, the word in the form should be replaced with the corresponding word that has diacritics from dictinary.txt.
What you want to do is make a plain-text->diacritics dictionary. You need to parse both files to find words, then make a dictionary using the words you found. Pedroski55's interest in seeing what the files look like probably has to do with seeing what separators appear between words. Your example sentences show spaces, periods and commas. Are there any question marks. Is this a complete list of punctuation?
operatori = [":", '"', "'", ".", "”", "“", ",", ";", "?", "!"] I think making the translation dictionary would look something like this:
import re
def get_words(filename):
"""Read words from file. Remove punctuation and convert to lowercase. Return list of words."""
punctuation = re.compile("[:'\.”“;\?!]")
with open(filename, 'r', encoding='utf-8') as f:
return [re.sub(punctuation, " ", word).lower() for word in f.read().split()]
# Make dictionary. key = plain text word, value = same word with dicritics
plain_2_diacritic = dict(zip(get_words("dictionary-2.txt"), get_words("dictionary.txt")))
Posts: 61
Threads: 21
Joined: May 2021
Aug-18-2023, 04:33 PM
(This post was last modified: Aug-18-2023, 04:33 PM by Melcu54.)
yes, the code should compare words from the FORM with words that do not have diacritics from the dictionary-2.txt. If a match is found, the word in the form should be replaced with the corresponding word that has diacritics from dictinary.txt.
I change the code. The Print is ok, but in the form nothing changes. Why ?
import re
import tkinter as tk
from tkinter import filedialog, messagebox, simpledialog
root = None
# Încărcarea bazei de date din fișierul .txt
with open('dictionar.txt', 'r', encoding='utf-8') as f:
continut_dictionar = f.read().lower()
with open('dictionar-2.txt', 'r', encoding='utf-8') as f:
continut_dictionar_fara_diacritice = f.read().lower()
diacritice = {
'a': 'ă', 'A': 'Ă', 'i': 'î', 'I': 'Î', 's': 'ș', 'S': 'Ș', 't': 'ț', 'T': 'Ț'
}
SUFIXE = ["-mi", "-a", "-ti", "-au"]
CUVINTE_SKIP = []
def get_words(filename):
"""Read words from file. Remove punctuation and convert to lowercase. Return list of words."""
punctuation = re.compile("[:'\.”“;\?!]")
with open(filename, 'r', encoding='utf-8') as f:
return [re.sub(punctuation, " ", word).lower() for word in f.read().split()]
def elimina_sufix(cuvant):
for sufix in SUFIXE:
if cuvant.endswith(sufix):
return cuvant[:-len(sufix)]
return cuvant
def elimina_operatori(cuvant):
operatori = [":", '"', "'", ".", "”", "“", ",", ";", "?", "!"]
while cuvant and cuvant[0] in operatori:
cuvant = cuvant[1:]
while cuvant and cuvant[-1] in operatori:
cuvant = cuvant[:-1]
return cuvant
def verifica_text(cuvinte_text):
text_input.tag_remove("evidentiat", "1.0", tk.END) # Ștergem evidențierea anterioară
start_index = "1.0"
cuvant_gasit = False
for cuv in cuvinte_text:
cuv_baza = elimina_sufix(cuv).lower()
cuv_baza = elimina_operatori(cuv_baza)
end_index = start_index + f"+{len(cuv)}c"
if cuv_baza not in continut_dictionar and cuv_baza not in CUVINTE_SKIP:
text_input.tag_add("evidentiat", start_index, end_index)
text_input.tag_configure("evidentiat", background="yellow", font=("Arial", 12, "bold"))
cuvant_gasit = True
break # Întrerupem bucla după primul cuvânt nerecunoscut
start_index = end_index + "+1c"
if not cuvant_gasit:
messagebox.showinfo("Informare", "Totul este ok!")
def modifica_cuvant():
global continut_dictionar # Declarăm variabila ca globală pentru a o putea modifica
start_index = text_input.tag_ranges("evidentiat")[0]
end_index = text_input.tag_ranges("evidentiat")[1]
cuvant = text_input.get(start_index, end_index)
cuvant_modificat = simpledialog.askstring("Modificare cuvânt", f"Modificați cuvântul '{cuvant}':")
if cuvant_modificat:
cuvant_modificat = elimina_sufix(cuvant_modificat).lower()
cuvant_modificat = elimina_operatori(cuvant_modificat)
if cuvant_modificat:
text_input.delete(start_index, end_index) # Șterge cuvântul evidentiat
text_input.insert(start_index, cuvant_modificat) # Inserează cuvântul modificat
cuvinte_text = text_input.get("1.0", tk.END).split() # Actualizează lista de cuvinte
cuvinte_text = [elimina_sufix(cuv.lower()) for cuv in cuvinte_text]
cuvinte_text = [elimina_operatori(cuv) for cuv in cuvinte_text]
continut_dictionar += f", {cuvant_modificat}"
with open('dictionar.txt', 'a', encoding='utf-8') as f:
f.write(f", {cuvant_modificat}") # Adaugă cuvântul modificat în dictionar.txt
verifica_text(cuvinte_text)
print("Cuvantul modificat a fost adaugat in dictionar.txt")
def skip_cuvant():
start_index = text_input.tag_ranges("evidentiat")[0]
end_index = text_input.tag_ranges("evidentiat")[1]
cuvant = text_input.get(start_index, end_index)
CUVINTE_SKIP.append(cuvant.lower())
with open('dictionar.txt', 'a', encoding='utf-8') as f:
f.write(f", {cuvant}") # Adaugă cuvântul skip în dictionar.txt
verifica_text(text_input.get("1.0", tk.END).split())
print("Cuvantul a fost adaugat in dictionar.txt")
def adauga_diacritice():
text = text_input.get("1.0", tk.END)
cuvinte_text = text.split()
cuvinte_cu_diacritice = []
for cuv in cuvinte_text:
cuv_fara_diacritice = cuv
if cuv_fara_diacritice in plain_2_diacritic:
cuv_cu_diacritice = plain_2_diacritic[cuv_fara_diacritice]
cuvinte_cu_diacritice.append(cuv_cu_diacritice)
else:
cuvinte_cu_diacritice.append(cuv_fara_diacritice)
text_cu_diacritice = " ".join(cuvinte_cu_diacritice)
text_input.delete("1.0", tk.END)
text_input.insert("1.0", text_cu_diacritice)
# Actualizează interfața grafică
global root
root.update()
print("Diacritice au fost adăugate")
def main():
global text_input
text_input = tk.Text(root, height=20, width=50)
text_input.pack(pady=20)
btn_verifica = tk.Button(root, text="Verificare", command=lambda: verifica_text(text_input.get("1.0", tk.END).split()))
btn_verifica.pack(side=tk.LEFT, padx=10)
btn_modifica = tk.Button(root, text="Modificare", command=modifica_cuvant)
btn_modifica.pack(side=tk.LEFT, padx=10)
btn_skip = tk.Button(root, text="Skip", command=skip_cuvant)
btn_skip.pack(side=tk.LEFT, padx=10)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
# Eliminăm definiția globală din main() și o mutăm aici
root = tk.Tk()
root.title("Adăugare Diacritice")
if __name__ == "__main__":
# Create the plain to diacritic translation dictionary
plain_2_diacritic = dict(zip(get_words("dictionar-2.txt"), get_words("dictionar.txt")))
print("Plain to Diacritic Dictionary:")
print(plain_2_diacritic)
main() # Apelăm funcția main() pentru a începe interfața grafică
root.mainloop() # Rulăm bucla principală a interfeței grafice
Posts: 6,252
Threads: 16
Joined: Feb 2020
Aug-18-2023, 07:40 PM
(This post was last modified: Aug-18-2023, 08:09 PM by deanhystad.)
It works for me. In the code below I extracted the adauga_diacritice() function and supporting players so I could focus on one thing at a time. Press the button and the test text (gibberish to test case and punctuation) is translated.
import tkinter as tk
import re
def get_words(text, lower=False):
"""Return words from file. Optionally set to lowercase."""
words = re.split(r"[:;,\.\"'”“\?! \n]+", text)
if lower:
return [word.lower() for word in words]
return words
def adauga_diacritice():
text = text_input.get("1.0", tk.END)
# If word in dictionary, replace, else keep word.
words = [plain_2_diacritic.get(word, word) for word in get_words(text, lower=True)]
text = " ".join(words)
text_input.delete("1.0", tk.END)
text_input.insert("1.0", text)
def make_diacritic_word_dictionary():
"""Create dictionary to convert plain text words to words with diacritics."""
d = get_words(
"Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.",
lower=True,
)
d2 = get_words(
"Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.",
lower=True,
)
# Only add words containing diacritics.
return {w2: w for w, w2 in zip(d, d2) if w != w2}
plain_2_diacritic = make_diacritic_word_dictionary()
root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.insert(
1.0, "Sufleteasca nu este direct proportionala, cu valoarea intensitatii."
)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop() There are some obvious problems though. All punctuation gets removed from the text, and all words are lower case. More care must be taken when modifying the original text. Only words with diacritics should be modified.
import tkinter as tk
import re
def get_words(text, lower=False):
"""Return words from file. Optionally set to lowercase."""
words = re.split(r"[:;,\.\"'”“\?! \n]+", text)
if lower:
return [word.lower() for word in words]
return words
def replace(text, old, new):
"""In text replace old with new. Adjust case in new to match case in old."""
# Magic happens here
def adauga_diacritice():
text = text_input.get("1.0", tk.END)
words = get_words(text)
for old in words:
new= plain_2_diacritic.get(old.lower(), None)
if replacement is not None:
text = replace(text, old, new)
text_input.delete("1.0", tk.END)
text_input.insert("1.0", text)
def make_diacritic_word_dictionary():
"""Create dictionary to convert plain text words to words with diacritics."""
d = get_words(
"Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.",
lower=True,
)
d2 = get_words(
"Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.",
lower=True,
)
# Only add words containing diacritics.
return {w2: w for w, w2 in zip(d, d2) if w != w2}
plain_2_diacritic = make_diacritic_word_dictionary()
root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.insert(
1.0, "Sufleteasca nu este direct proportionala, cu valoarea intensitatii."
)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop() Oh, and I almost forgot, there was an error in get_words). Punctuation should be replaced by an empty string, not a blank.
Posts: 61
Threads: 21
Joined: May 2021
Aug-18-2023, 07:50 PM
(This post was last modified: Aug-18-2023, 07:50 PM by Melcu54.)
thanks, deanhystad. You are really great !
I will save here another version, just a little bit different.
import tkinter as tk
import re
def adauga_diacritice():
text = text_input.get("1.0", tk.END)
words = []
for word in get_words(text):
if word.lower() in plain_2_diacritic:
diacritic_word = plain_2_diacritic[word.lower()]
if word.istitle(): # Preserve title case
words.append(diacritic_word.capitalize())
elif word.isupper(): # Preserve uppercase
words.append(diacritic_word.upper())
else:
words.append(diacritic_word)
else:
words.append(word)
text = " ".join(words)
text_input.delete("1.0", tk.END)
text_input.insert("1.0", text)
def make_diacritic_word_dictionary():
d = get_words(
"Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor."
)
d2 = get_words(
"Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor."
)
return {w2: w for w, w2 in zip(d, d2) if w != w2}
def get_words(text):
return re.findall(r'\b\w+\b', text)
plain_2_diacritic = make_diacritic_word_dictionary()
root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.insert(
1.0, "Sufleteasca nu este direct proportionala, cu valoarea intensitatii."
)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop() OR THIS:
import tkinter as tk
import re
punctuation = re.compile("[:;,\.\"'”“\?!]")
def get_words(text, lower=False):
words = [re.sub(punctuation, "", word) for word in text.split()]
if lower:
return [word.lower() for word in words]
return words
def replace(text, old, new):
return re.sub(r'\b' + re.escape(old) + r'\b', new, text, flags=re.IGNORECASE)
def adauga_diacritice():
text = text_input.get("1.0", tk.END)
words = get_words(text)
for word in words:
replacement = plain_2_diacritic.get(word.lower(), None)
if replacement is not None:
text = replace(text, word, replacement)
text_input.delete("1.0", tk.END)
text_input.insert("1.0", text)
def make_diacritic_word_dictionary():
d = get_words(
"Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.",
lower=True,
)
d2 = get_words(
"Compatibilitatea sufleteasca nu este direct proportionala cu valoarea intensitatii sentimentelor.",
lower=True,
)
return {w2: w for w, w2 in zip(d, d2) if w != w2}
plain_2_diacritic = make_diacritic_word_dictionary()
root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.insert(
1.0, "Sufleteasca nu este direct proportionala, cu valoarea intensitatii."
)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop()
Posts: 61
Threads: 21
Joined: May 2021
Aug-18-2023, 07:58 PM
(This post was last modified: Aug-18-2023, 08:13 PM by Melcu54.)
Version 5 (Leep UpperCase, remove default text, retrieve data from dictionar.txt and dictionar-2.txt)
import tkinter as tk
import re
punctuation = re.compile("[:;,\.\"'”“\?!]")
def get_words(text, lower=False):
words = [re.sub(punctuation, "", word) for word in text.split()]
if lower:
return [word.lower() for word in words]
return words
def replace(text, old, new):
# Replace old with new in text, keeping the case of old intact
# Check if old starts with an uppercase letter
if old[0].isupper():
new = new.capitalize()
return re.sub(r'\b' + re.escape(old) + r'\b', new, text, flags=re.IGNORECASE)
def adauga_diacritice():
text = text_input.get("1.0", tk.END)
words = get_words(text)
for word in words:
replacement = plain_2_diacritic.get(word.lower(), None)
if replacement is not None:
text = replace(text, word, replacement)
text_input.delete("1.0", tk.END)
text_input.insert("1.0", text)
def make_diacritic_word_dictionary():
with open("dictionar.txt", "r", encoding="utf-8") as f1, open("dictionar-2.txt", "r", encoding="utf-8") as f2:
d = get_words(f1.read(), lower=True)
d2 = get_words(f2.read(), lower=True)
return {w2: w for w, w2 in zip(d, d2) if w != w2}
plain_2_diacritic = make_diacritic_word_dictionary()
root = tk.Tk()
text_input = tk.Text(root, height=20, width=50)
text_input.pack(pady=20)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop()
Posts: 6,252
Threads: 16
Joined: Feb 2020
Aug-18-2023, 08:14 PM
(This post was last modified: Aug-18-2023, 08:14 PM by deanhystad.)
This doesn't maintain the case from the original form text.
def replace(text, old, new):
return re.sub(r'\b' + re.escape(old) + r'\b', new, text, flags=re.IGNORECASE) This does a better job, but I don't like it much.
for word in get_words(text):
if word.lower() in plain_2_diacritic:
diacritic_word = plain_2_diacritic[word.lower()]
if word.istitle(): # Preserve title case
words.append(diacritic_word.capitalize())
elif word.isupper(): # Preserve uppercase
words.append(diacritic_word.upper())
else:
words.append(diacritic_word)
else:
words.append(word) It is too complicated, and too special case. For example, iit will fail for words that have an uppercase letter other than the start.
You have two words. "word" has the desired case from the form, but it doesn't have diacritics. diacritic_word has the diacritics, but is all lowercase. Is there some way to copy the case from word to diacritic_word? Think about it. Think about a very generic approach with no specific cases.
And this throws away all the punctuation.
text = " ".join(words) Can you think of a way where you can replace one word in a str with a different word, without changing anything else in the str?
Posts: 61
Threads: 21
Joined: May 2021
Aug-18-2023, 09:03 PM
(This post was last modified: Aug-18-2023, 09:03 PM by Melcu54.)
Final Version (I add 3 new buttons: Verificare, Modificare and SKIP)
Verificare (check each word if it exists in dictionar.txt)
Modificare (edit the word and add it to dictionar.txt, and without diacritics into dictionar-2.txt)
Skip (If the word exists in dictionar.txt, it goes over it, if it doesn't, it is automatically added to the dictionar.txt, and without diacritics into dictionar-2.txt )
import tkinter as tk
import re
from tkinter import messagebox, simpledialog
from unidecode import unidecode # Importați unidecode
punctuation = re.compile("[:;,\.\"'”“\?!]")
def get_words(text, lower=False):
words = [re.sub(punctuation, "", word) for word in text.split()]
if lower:
return [word.lower() for word in words]
return words
def replace(text, old, new):
if old[0].isupper():
new = new.capitalize()
return re.sub(r'\b' + re.escape(old) + r'\b', new, text, flags=re.IGNORECASE)
def adauga_diacritice():
text = text_input.get("1.0", tk.END)
words = get_words(text)
for word in words:
replacement = plain_2_diacritic.get(word.lower(), None)
if replacement is not None:
text = replace(text, word, replacement)
text_input.delete("1.0", tk.END)
text_input.insert("1.0", text)
def elimina_sufix(cuvant):
for sufix in SUFIXE:
if cuvant.endswith(sufix):
return cuvant[:-len(sufix)]
return cuvant
def elimina_operatori(cuvant):
operatori = [":", '"', "'", ".", "”", "“", ",", ";", "?", "!"]
while cuvant and cuvant[0] in operatori:
cuvant = cuvant[1:]
while cuvant and cuvant[-1] in operatori:
cuvant = cuvant[:-1]
return cuvant
def verifica_text(cuvinte_text):
text_input.tag_remove("evidentiat", "1.0", tk.END)
start_index = "1.0"
cuvant_gasit = False
for cuv in cuvinte_text:
cuv_baza = elimina_sufix(cuv).lower()
cuv_baza = elimina_operatori(cuv_baza)
end_index = start_index + f"+{len(cuv)}c"
if cuv_baza not in continut_dictionar and cuv_baza not in CUVINTE_SKIP:
text_input.tag_add("evidentiat", start_index, end_index)
text_input.tag_configure("evidentiat", background="yellow", font=("Arial", 12, "bold"))
cuvant_gasit = True
break
start_index = end_index + "+1c"
if not cuvant_gasit:
messagebox.showinfo("Informare", "Totul este ok!")
def modifica_cuvant():
global continut_dictionar
start_index = text_input.tag_ranges("evidentiat")[0]
end_index = text_input.tag_ranges("evidentiat")[1]
cuvant = text_input.get(start_index, end_index)
cuvant_modificat = simpledialog.askstring("Modificare cuvânt", f"Modificați cuvântul '{cuvant}':")
if cuvant_modificat:
cuvant_modificat = elimina_sufix(cuvant_modificat).lower()
cuvant_modificat = elimina_operatori(cuvant_modificat)
if cuvant_modificat:
text_input.delete(start_index, end_index)
text_input.insert(start_index, cuvant_modificat)
cuvinte_text = text_input.get("1.0", tk.END).split()
cuvinte_text = [elimina_sufix(cuv.lower()) for cuv in cuvinte_text]
cuvinte_text = [elimina_operatori(cuv) for cuv in cuvinte_text]
cuvant_baza = elimina_sufix(cuvant).lower()
cuvant_baza = elimina_operatori(cuvant_baza)
continut_dictionar += f", {cuvant_modificat}"
with open('dictionar.txt', 'a', encoding='utf-8') as f:
f.write(f", {cuvant_modificat}")
if cuvant_baza in CUVINTE_SKIP:
CUVINTE_SKIP.remove(cuvant_baza)
verifica_text(cuvinte_text)
# Adăugăm cuvântul modificat în dictionar-2.txt fără diacritice
cuvant_fara_diacritice = unidecode(cuvant_modificat)
with open('dictionar-2.txt', 'a', encoding='utf-8') as f:
f.write(f", {cuvant_fara_diacritice}")
def skip_cuvant():
tag_ranges = text_input.tag_ranges("evidentiat")
if tag_ranges:
start_index = tag_ranges[0]
end_index = tag_ranges[1]
cuvant = text_input.get(start_index, end_index)
cuvant_baza = elimina_sufix(cuvant).lower()
cuvant_baza = elimina_operatori(cuvant_baza)
if cuvant_baza not in continut_dictionar and cuvant_baza not in CUVINTE_SKIP:
CUVINTE_SKIP.append(cuvant_baza)
with open('dictionar.txt', 'a', encoding='utf-8') as f:
f.write(f", {cuvant}")
# Adăugăm cuvântul skip în dictionar-2.txt fără diacritice
cuvant_fara_diacritice = unidecode(cuvant)
with open('dictionar-2.txt', 'a', encoding='utf-8') as f:
f.write(f", {cuvant_fara_diacritice}")
verifica_text(text_input.get("1.0", tk.END).split())
else:
messagebox.showinfo("Eroare", "Nu există cuvinte evidențiate pentru a fi ignorate.")
def make_diacritic_word_dictionary():
with open("dictionar.txt", "r", encoding="utf-8") as f1, open("dictionar-2.txt", "r", encoding="utf-8") as f2:
d = get_words(f1.read(), lower=True)
d2 = get_words(f2.read(), lower=True)
return {w2: w for w, w2 in zip(d, d2) if w != w2}
plain_2_diacritic = make_diacritic_word_dictionary()
SUFIXE = ["-mi", "-a", "-ti", "-au"]
CUVINTE_SKIP = []
with open('dictionar.txt', 'r', encoding='utf-8') as f:
continut_dictionar = f.read().lower()
root = tk.Tk()
root.title("Adăugare Diacritice")
text_input = tk.Text(root, height=20, width=50)
text_input.pack(pady=20)
btn_verifica = tk.Button(root, text="Verificare", command=lambda: verifica_text(text_input.get("1.0", tk.END).split()))
btn_verifica.pack(side=tk.LEFT, padx=10)
btn_modifica = tk.Button(root, text="Modificare", command=modifica_cuvant)
btn_modifica.pack(side=tk.LEFT, padx=10)
btn_skip = tk.Button(root, text="Skip", command=skip_cuvant)
btn_skip.pack(side=tk.LEFT, padx=10)
btn_diacritice = tk.Button(root, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)
root.mainloop()
|