Aug-18-2023, 11:36 PM
As a thought experiment, assume the user types "ProportionalA" in the form. Your program finds "proporțională" in the dictionary. Can you write code that produces "ProporționalĂ"?
Form that puts diacritics on the words in the text
|
Aug-18-2023, 11:36 PM
As a thought experiment, assume the user types "ProportionalA" in the form. Your program finds "proporțională" in the dictionary. Can you write code that produces "ProporționalĂ"?
FINAL ver 2.0
"""Tkinter form that restores Romanian diacritics and spell-checks the text.

Two parallel word lists drive the program: ``dictionar.txt`` holds the words
written with diacritics and ``dictionar-2.txt`` holds the same words, in the
same order, written without them.
"""
import pprint
import re
import tkinter as tk
from tkinter import messagebox, simpledialog

from unidecode import unidecode

# Punctuation removed from tokens before they are looked up.
# Raw string: "\." and "\?" are invalid escapes in a normal string literal.
punctuation = re.compile(r"""[:;,."'”“?!]""")


def get_words(text, lower=False):
    """Split *text* on whitespace and strip punctuation from every token.

    Args:
        text: The text to tokenize.
        lower: When true, the returned words are lower-cased.

    Returns:
        A list of words. A token made only of punctuation becomes ``""``.
    """
    words = [punctuation.sub("", word) for word in text.split()]
    if lower:
        return [word.lower() for word in words]
    return words


def _transfer_case(model, word):
    """Return *word* with the upper-case positions of *model* applied to it."""
    chars = [w.upper() if m.isupper() else w for m, w in zip(model, word)]
    # zip() stops at the shorter string; keep whatever is left of *word*.
    return "".join(chars) + word[len(model):]


def replace(text, old, new):
    """Replace whole-word occurrences of *old* in *text* with *new*.

    Matching ignores case. Every occurrence keeps its own letter case, so
    "ProportionalA" becomes "ProporționalĂ" while a lower-case occurrence of
    the same word stays lower-case.

    Args:
        text: The text to search.
        old: The word to look for (matched between word boundaries).
        new: The replacement word.

    Returns:
        The text with all occurrences replaced.
    """
    pattern = r"\b" + re.escape(old) + r"\b"
    return re.sub(
        pattern,
        lambda match: _transfer_case(match.group(0), new),
        text,
        flags=re.IGNORECASE,
    )


def adauga_diacritice():
    """Put diacritics on every word of the text widget that has a mapping."""
    # "end-1c" leaves out the newline Tk always keeps at the end of a Text
    # widget; re-inserting it would add one blank line per button press.
    text = text_input.get("1.0", "end-1c")
    diacritic_added = False  # Tracks whether anything was replaced.
    for word in get_words(text):
        if not word:
            # Token was pure punctuation; nothing to look up.
            continue
        replacement = plain_2_diacritic.get(word.lower())
        if replacement is not None:
            text = replace(text, word, replacement)
            diacritic_added = True
    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text)
    if not diacritic_added:
        # Tell the user that no word needed diacritics.
        messagebox.showinfo("Informare", "Nu s-au găsit diacritice de adăugat")


def elimina_sufix(cuvant):
    """Return *cuvant* without the first suffix from ``SUFIXE`` it ends with."""
    for sufix in SUFIXE:
        if cuvant.endswith(sufix):
            return cuvant[:-len(sufix)]
    return cuvant


def elimina_operatori(cuvant):
    """Return *cuvant* with leading and trailing punctuation removed."""
    return cuvant.strip(":\"'.”“,;?!")


def _este_cunoscut(cuvant_baza):
    """Return True when *cuvant_baza* is a word of the loaded dictionary.

    The lookup is done on whole words. Testing against the raw file contents
    would be a substring test, so any fragment of a dictionary word would be
    accepted as correct.
    """
    return cuvant_baza in CUVINTE_DICTIONAR


def _adauga_in_dictionare(cuvant):
    """Append *cuvant* to both word lists so that they stay aligned."""
    with open('dictionar.txt', 'a', encoding='utf-8') as f:
        f.write(f", {cuvant}")
    # The second list stores the same word without diacritics.
    cuvant_fara_diacritice = unidecode(cuvant)
    with open('dictionar-2.txt', 'a', encoding='utf-8') as f:
        f.write(f", {cuvant_fara_diacritice}")


def verifica_text(cuvinte_text):
    """Highlight the first word of the widget that is not in the dictionary.

    Shows an information box when every word is known or skipped.

    Args:
        cuvinte_text: Unused; the text is read from the widget. Kept so that
            existing callers keep working.
    """
    text_input.tag_remove("evidentiat", "1.0", tk.END)
    text_input.tag_configure("evidentiat", background="yellow", font=("Arial", 12, "bold"))
    continut = text_input.get("1.0", tk.END)
    for cuv in re.findall(r'\b\w+\b', continut):
        cuv_baza = elimina_sufix(cuv).lower()
        if not cuv_baza or _este_cunoscut(cuv_baza) or cuv_baza in CUVINTE_SKIP:
            continue
        cuv_baza = elimina_operatori(cuv_baza)
        print(f"Verificare cuvânt: {cuv_baza}")  # Debug trace.
        start_indices = [
            match.start()
            for match in re.finditer(r'\b' + re.escape(cuv) + r'\b', continut)
        ]
        pprint.pprint(start_indices)  # Debug trace.
        if start_indices:
            start = start_indices[0]
            # Offsets are counted from the start of the text, so express them
            # relative to "1.0"; "1.<offset>" is only right on the first line.
            text_input.tag_add(
                "evidentiat",
                f"1.0+{start}c",
                f"1.0+{start + len(cuv)}c",
            )
            return  # Only the first unknown word is highlighted.
    messagebox.showinfo("Informare", "Totul este ok!")


def modifica_cuvant():
    """Ask for a correction of the highlighted word and store it.

    The corrected word replaces the highlighted one in the text, is appended
    to both dictionary files, and the text is checked again.
    """
    global continut_dictionar
    tag_ranges = text_input.tag_ranges("evidentiat")
    if not tag_ranges:
        # Nothing is highlighted yet (same guard as in skip_cuvant).
        messagebox.showinfo("Eroare", "Nu există cuvinte evidențiate pentru a fi modificate.")
        return
    start_index, end_index = tag_ranges[0], tag_ranges[1]
    cuvant = text_input.get(start_index, end_index)
    cuvant_modificat = simpledialog.askstring(
        "Modificare cuvânt", f"Modificați cuvântul '{cuvant}':"
    )
    if not cuvant_modificat:
        return  # Dialog cancelled or left empty.
    cuvant_modificat = elimina_operatori(elimina_sufix(cuvant_modificat).lower())
    if not cuvant_modificat:
        return  # Nothing left after normalisation.

    text_input.delete(start_index, end_index)
    text_input.insert(start_index, cuvant_modificat)

    continut_dictionar += f", {cuvant_modificat}"
    CUVINTE_DICTIONAR.add(cuvant_modificat)
    # Write both files before re-checking so they cannot get out of step.
    _adauga_in_dictionare(cuvant_modificat)

    cuvant_baza = elimina_operatori(elimina_sufix(cuvant).lower())
    if cuvant_baza in CUVINTE_SKIP:
        CUVINTE_SKIP.remove(cuvant_baza)
    verifica_text(text_input.get("1.0", tk.END).split())


def skip_cuvant():
    """Accept the highlighted word as correct and continue checking."""
    tag_ranges = text_input.tag_ranges("evidentiat")
    if not tag_ranges:
        messagebox.showinfo("Eroare", "Nu există cuvinte evidențiate pentru a fi ignorate.")
        return
    cuvant = text_input.get(tag_ranges[0], tag_ranges[1])
    cuvant_baza = elimina_operatori(elimina_sufix(cuvant).lower())
    if not _este_cunoscut(cuvant_baza) and cuvant_baza not in CUVINTE_SKIP:
        CUVINTE_SKIP.append(cuvant_baza)
        _adauga_in_dictionare(cuvant)
    verifica_text(text_input.get("1.0", tk.END).split())


def make_diacritic_word_dictionary():
    """Build the plain-word -> diacritic-word mapping from the two word lists.

    The files are paired by position; pairs whose two words are identical are
    left out.
    """
    with open("dictionar.txt", "r", encoding="utf-8") as f1, open("dictionar-2.txt", "r", encoding="utf-8") as f2:
        d = get_words(f1.read(), lower=True)
        d2 = get_words(f2.read(), lower=True)
    return {w2: w for w, w2 in zip(d, d2) if w != w2}


plain_2_diacritic = make_diacritic_word_dictionary()

SUFIXE = ["-mi", "-a", "-ti", "-au"]
CUVINTE_SKIP = []

with open('dictionar.txt', 'r', encoding='utf-8') as f:
    continut_dictionar = f.read().lower()

# Whole-word view of the dictionary, used for membership tests.
CUVINTE_DICTIONAR = set(get_words(continut_dictionar))

root = tk.Tk()
root.title("Adăugare Diacritice")
root.geometry("600x400")  # Window size.

frame = tk.Frame(root, bg="lightgray")  # Background colour.
frame.pack(pady=20)

text_input = tk.Text(frame, height=15, width=50)
text_input.pack(pady=10)

btn_verifica = tk.Button(frame, text="Verificare", command=lambda: verifica_text(text_input.get("1.0", tk.END).split()))
btn_verifica.pack(side=tk.LEFT, padx=10)

btn_modifica = tk.Button(frame, text="Modificare", command=modifica_cuvant)
btn_modifica.pack(side=tk.LEFT, padx=10)

btn_skip = tk.Button(frame, text="Skip", command=skip_cuvant)
btn_skip.pack(side=tk.LEFT, padx=10)

btn_diacritice = tk.Button(frame, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)

root.mainloop()
Aug-19-2023, 09:01 PM
(This post was last modified: Aug-19-2023, 09:49 PM by deanhystad.)
def replace(text, old, new):
    """Replace *old* in *text* with *new*, copying the letter case of *old*.

    Every character of *new* is upper-cased when the character at the same
    position in *old* is upper-case, so "ProportionalA" with the dictionary
    word "proporțională" gives "ProporționalĂ".

    Args:
        text: The text to search.
        old: The word as typed by the user.
        new: The dictionary word that should take its place.

    Returns:
        The text with every occurrence of *old* replaced.
    """
    if not old:
        # str.replace("", x) would insert x between all characters.
        return text
    chars = [n.upper() if o.isupper() else n for o, n in zip(old, new)]
    # zip() stops at the shorter word; keep the rest of *new* unchanged.
    return text.replace(old, "".join(chars) + new[len(old):])
Aug-22-2023, 07:07 AM
Tricky thing to do this!
I tried this approach: 1. Find the words with composite letters. 2. Extract the composite letters. 3. Find the simple letter corresponding to the composite letter. 4. Put the simple letter into the word with composite letters; if the result is the same as the input without composite letters, you can replace one word with the other. Diacritics are simple marks which alter the sound of a letter. You can find words with composite letters, that is, letters with a simple letter + a diacritic, like this:
import unicodedata # diacritics are small marks added to simple letters to change the pronunciation # first find letters with diacritics # composite characters will not return '' for unicodedata.decomposition(w) # so we can find all words with composite characters mystring = 'Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.' mylist = mystring.split() funny_letter_words = [] def find_CompositeChars(aword): for w in aword: if not unicodedata.decomposition(w) == '': return True # find the words with composite characters # save them in a list for word in mylist: if find_CompositeChars(word): funny_letter_words.append(word) # returns ['sufletească', 'proporțională', 'intensităţii'] print(funny_letter_words)
Given only 1 line of text, I can't test this on a text file, but you can compare words with simple letters and composite letters like this: First, get the composite letters.
# start with 2 words which are the same length # the words may be completely different # get the difference and the position of the different letter # in this case w3 and w4 are the same except for the composite letters # the position of the composite letter may help to speed this up, not sure yet w3 = 'proporțională' w4 = 'proportionala' y = [(i, w3[i]) for i in range(len(w3)) if w3[i] != w4[i]] print(y)
Second, get the simple letter from the composite letter:
def getSimpleLetter(ch): comp = unicodedata.decomposition(ch) # returns '0074 0326' for 'ț' # comp_list can be more than 2 long for weird Greek stuff comp_list = comp.split() # returns ['0074', '0326'] # comp_list[0] is the number of the simple letter without the diacritic hex_int = int(comp_list[0], 16) # returns 116 for 't' simple_letter = chr(hex_int) return simple_letter # get the simple letter version(s) of the composite letter simple_letters = [] for i in range(len(y)): pos = y[i][0] letter = y[i][1] simple_letters.append(getSimpleLetter(letter))
Lastly, replace the composite letters with the corresponding simple letter. If the result is the same as the word we are testing, then the 2 words are the same and we can put the word with composite letters in place of the word with simple letters. Of course, you need to do this in a loop for each word with composite letters and each line of text for a long text file!
# put the simple letters in the original word, # see if the resulting word is the same as a word with no composite letters def putSimpleLetter(aword, new_letter, old_letter ): w3_new = aword.replace(old_letter, new_letter) return w3_new new_word = w3 for i in range(len(simple_letters)): new_word = putSimpleLetter(new_word, simple_letters[i], y[i][1]) if new_word == w4: print('You can replace', w4, 'with', w3)
I read about someone who wants to do exactly this with a hundred-year-old Romanian book! One problem is that a word with only simple letters and a word with a composite letter may be very similar.
An example he gave of that is: Quote: peste = over. Find a way around that! The best solution is to write the words correctly to begin with! |
|