Form that puts diacritics on the words in the text

**deanhystad** · Aug-18-2023, 11:36 PM

As a thought experiment, assume the user types "ProportionalA" in the form. Your program finds "proporțională" in the dictionary. Can you write code that produces "ProporționalĂ"?

Melcu54 · (This post was last modified: Aug-19-2023, 08:14 AM by Melcu54.)

FINAL ver 2.0

import tkinter as tk
import re
from tkinter import messagebox, simpledialog
from unidecode import unidecode  # Importați unidecode

# Punctuation characters that get_words() strips from every token.
# "." and "?" are literal inside a character class, so they are written
# without a backslash; only the double quote needs a string-level escape.
punctuation = re.compile("[:;,.\"'”“?!]")

def get_words(text, lower=False):
    """Split *text* on whitespace and strip punctuation from each token.

    Args:
        text: The string to tokenise.
        lower: When true, every returned token is lower-cased.

    Returns:
        A list with one entry per whitespace-separated token, in order.
        A token made up only of punctuation yields an empty string.
    """
    cleaned = []
    for token in text.split():
        stripped = punctuation.sub("", token)
        cleaned.append(stripped.lower() if lower else stripped)
    return cleaned

def replace(text, old, new):
    """Replace whole-word occurrences of *old* in *text* with *new*.

    Matching is case-insensitive. The casing of every occurrence is carried
    over to its replacement, so "Tara tara" becomes "Țara țara" instead of
    forcing a single capitalisation on all matches.

    Args:
        text: The text to search.
        old: The word to look for (treated literally, not as a regex).
        new: The replacement word, normally lower-case.

    Returns:
        The text with every whole-word match replaced. An empty *old*
        leaves the text untouched.
    """
    if not old:
        return text

    pattern = re.compile(r'\b' + re.escape(old) + r'\b', flags=re.IGNORECASE)

    def _match_case(match):
        found = match.group(0)
        if len(found) == len(new):
            # Same length: copy the case letter by letter
            # ("ProportionalA" -> "ProporționalĂ").
            return "".join(n.upper() if o.isupper() else n for o, n in zip(found, new))
        # Lengths differ: the only case information that maps reliably is
        # whether the word starts with a capital.
        return new.capitalize() if found[0].isupper() else new

    # A callable replacement is inserted verbatim, so a backslash in `new`
    # is never interpreted as a regex group reference.
    return pattern.sub(_match_case, text)

def adauga_diacritice():
    """Add diacritics to every known word in the text widget.

    Each distinct word of the widget's text is looked up (lower-cased) in
    ``plain_2_diacritic``; when an entry exists, all occurrences are
    replaced through ``replace()``. The widget content is then rewritten.
    If nothing was replaced, an information dialog is shown.
    """
    # "end-1c" leaves out the newline Tk keeps at the end of a Text widget;
    # reading up to tk.END and re-inserting would add a blank line per call.
    text = text_input.get("1.0", "end-1c")
    diacritic_added = False  # becomes True once at least one word was replaced

    # replace() already rewrites every occurrence of a word, so each distinct
    # word only needs to be handled once; dict.fromkeys keeps first-seen order.
    for word in dict.fromkeys(get_words(text)):
        if not word:
            # A token that was pure punctuation; there is nothing to look up.
            continue
        replacement = plain_2_diacritic.get(word.lower())
        if replacement is None:
            continue
        text = replace(text, word, replacement)
        diacritic_added = True

    text_input.delete("1.0", tk.END)
    text_input.insert("1.0", text)

    if not diacritic_added:  # nothing was replaced, tell the user
        messagebox.showinfo("Informare", "Nu s-au găsit diacritice de adăugat")


def elimina_sufix(cuvant):
    """Strip the first suffix from ``SUFIXE`` that *cuvant* ends with.

    Suffixes are tried in list order and at most one is removed. The word is
    returned unchanged when none of them matches.
    """
    matched = next((sufix for sufix in SUFIXE if cuvant.endswith(sufix)), None)
    if matched is None:
        return cuvant
    return cuvant[:-len(matched)]

def elimina_operatori(cuvant):
    """Strip leading and trailing punctuation from *cuvant*.

    Only the ends of the word are cleaned; punctuation inside the word is
    kept. Falsy input (for example an empty string) is returned as is.
    """
    operatori = ":\"'.”“,;?!"
    if not cuvant:
        return cuvant
    # str.strip removes any run of the listed characters from both ends in a
    # single pass, instead of re-slicing the string once per character.
    return cuvant.strip(operatori)

import pprint

def verifica_text(cuvinte_text):
    """Highlight the first word in the text widget that is not recognised.

    A word is recognised when its base form (suffix removed, lower-cased) is
    found in ``continut_dictionar`` or in ``CUVINTE_SKIP``. The first word
    that is not recognised gets the "evidentiat" tag; if every word is
    recognised, an information dialog is shown.

    Args:
        cuvinte_text: Unused. The text is always read from ``text_input``;
            the parameter is kept so existing callers keep working.
    """
    text_input.tag_remove("evidentiat", "1.0", tk.END)
    # The tag's look never changes, so configure it once rather than per match.
    text_input.tag_configure("evidentiat", background="yellow", font=("Arial", 12, "bold"))

    continut = text_input.get("1.0", tk.END)

    for match in re.finditer(r'\b\w+\b', continut):
        cuv_baza = elimina_sufix(match.group(0)).lower()

        # NOTE(review): continut_dictionar is one big string, so this is a
        # substring test - a word also passes when it merely appears inside a
        # longer dictionary entry. Confirm whether that is intended.
        if not cuv_baza or cuv_baza in continut_dictionar or cuv_baza in CUVINTE_SKIP:
            continue

        # Offsets are counted from the start of the whole text, so express
        # them as "1.0 + N chars". An index of the form "1.N" is only valid
        # on the first line and would mis-place highlights further down.
        text_input.tag_add("evidentiat", f"1.0+{match.start()}c", f"1.0+{match.end()}c")
        return  # only the first unknown word is highlighted

    messagebox.showinfo("Informare", "Totul este ok!")



def modifica_cuvant():
    """Let the user correct the highlighted word and learn the correction.

    The user is asked for a replacement for the word tagged "evidentiat".
    The normalised replacement (suffix removed, lower-cased, punctuation
    stripped) is put in the text, appended to the in-memory dictionary and
    to ``dictionar.txt``, and its diacritic-free form is appended to
    ``dictionar-2.txt``. The text is then checked again.

    Nothing is changed when no word is highlighted, when the dialog is
    cancelled, or when the normalised replacement is empty.
    """
    global continut_dictionar

    tag_ranges = text_input.tag_ranges("evidentiat")
    if not tag_ranges:
        # Without this guard, indexing the empty tuple raises IndexError.
        messagebox.showinfo("Eroare", "Nu există cuvinte evidențiate pentru a fi modificate.")
        return
    start_index, end_index = tag_ranges[0], tag_ranges[1]
    cuvant = text_input.get(start_index, end_index)

    cuvant_modificat = simpledialog.askstring("Modificare cuvânt", f"Modificați cuvântul '{cuvant}':")
    if not cuvant_modificat:
        return

    # NOTE(review): the normalised form is also what gets written back into
    # the text, so capitalisation and any suffix the user typed are dropped.
    cuvant_modificat = elimina_sufix(cuvant_modificat).lower()
    cuvant_modificat = elimina_operatori(cuvant_modificat)
    if not cuvant_modificat:
        return

    text_input.delete(start_index, end_index)
    text_input.insert(start_index, cuvant_modificat)

    # The two files are read back in lock-step (entry N of one pairs with
    # entry N of the other), so append to both before anything else can fail.
    continut_dictionar += f", {cuvant_modificat}"
    with open('dictionar.txt', 'a', encoding='utf-8') as f:
        f.write(f", {cuvant_modificat}")
    with open('dictionar-2.txt', 'a', encoding='utf-8') as f:
        f.write(f", {unidecode(cuvant_modificat)}")

    # The original word no longer needs to be on the skip list.
    cuvant_baza = elimina_operatori(elimina_sufix(cuvant).lower())
    if cuvant_baza in CUVINTE_SKIP:
        CUVINTE_SKIP.remove(cuvant_baza)

    verifica_text(text_input.get("1.0", tk.END).split())

def skip_cuvant():
    """Accept the highlighted word as it is and move on to the next one.

    When the base form of the highlighted word is in neither
    ``continut_dictionar`` nor ``CUVINTE_SKIP``, it is added to
    ``CUVINTE_SKIP``, the word is appended to ``dictionar.txt`` and its
    diacritic-free form to ``dictionar-2.txt``. The text is then re-checked.
    If no word is highlighted, a dialog is shown instead.
    """
    highlighted = text_input.tag_ranges("evidentiat")
    if not highlighted:
        messagebox.showinfo("Eroare", "Nu există cuvinte evidențiate pentru a fi ignorate.")
        return

    cuvant = text_input.get(highlighted[0], highlighted[1])
    cuvant_baza = elimina_operatori(elimina_sufix(cuvant).lower())

    already_known = cuvant_baza in continut_dictionar or cuvant_baza in CUVINTE_SKIP
    if not already_known:
        CUVINTE_SKIP.append(cuvant_baza)
        with open('dictionar.txt', 'a', encoding='utf-8') as dict_file:
            dict_file.write(f", {cuvant}")

        # Mirror the entry, without diacritics, in the second dictionary file.
        plain_word = unidecode(cuvant)
        with open('dictionar-2.txt', 'a', encoding='utf-8') as plain_file:
            plain_file.write(f", {plain_word}")

    verifica_text(text_input.get("1.0", tk.END).split())


def make_diacritic_word_dictionary():
    """Build the lookup table from plain words to their accented spelling.

    ``dictionar.txt`` and ``dictionar-2.txt`` are read as two parallel word
    lists and paired up by position. Pairs whose two words are identical are
    left out.

    Returns:
        A dict mapping each lower-cased word of ``dictionar-2.txt`` to the
        lower-cased word at the same position in ``dictionar.txt``.
    """
    with open("dictionar.txt", "r", encoding="utf-8") as accented_file, \
            open("dictionar-2.txt", "r", encoding="utf-8") as plain_file:
        accented_words = get_words(accented_file.read(), lower=True)
        plain_words = get_words(plain_file.read(), lower=True)

    mapping = {}
    # zip() stops at the shorter list, so unpaired trailing words are ignored.
    for accented, plain in zip(accented_words, plain_words):
        if accented != plain:
            mapping[plain] = accented
    return mapping

# Lookup table: word without diacritics -> word with diacritics.
plain_2_diacritic = make_diacritic_word_dictionary()

# Suffixes that elimina_sufix() strips before a word is looked up.
SUFIXE = ["-mi", "-a", "-ti", "-au"]
# Base forms the user chose to skip; filled by skip_cuvant().
CUVINTE_SKIP = []

# Whole dictionary file as one lower-cased string; the checks below test
# words against it with the `in` operator.
with open('dictionar.txt', 'r', encoding='utf-8') as f:
    continut_dictionar = f.read().lower()

# Main window.
root = tk.Tk()
root.title("Adăugare Diacritice")
root.geometry("600x400")  # Set the window size

frame = tk.Frame(root, bg="lightgray")  # Set the background colour
frame.pack(pady=20)

# Editable text area that all the button handlers read from and write to.
text_input = tk.Text(frame, height=15, width=50)
text_input.pack(pady=10)

# "Verificare": highlight the first word that is not recognised.
btn_verifica = tk.Button(frame, text="Verificare", command=lambda: verifica_text(text_input.get("1.0", tk.END).split()))
btn_verifica.pack(side=tk.LEFT, padx=10)

# "Modificare": replace the highlighted word and add it to the dictionary files.
btn_modifica = tk.Button(frame, text="Modificare", command=modifica_cuvant)
btn_modifica.pack(side=tk.LEFT, padx=10)

# "Skip": accept the highlighted word as it is.
btn_skip = tk.Button(frame, text="Skip", command=skip_cuvant)
btn_skip.pack(side=tk.LEFT, padx=10)

# "Diacritice": add diacritics to every word found in plain_2_diacritic.
btn_diacritice = tk.Button(frame, text="Diacritice", command=adauga_diacritice)
btn_diacritice.pack(side=tk.LEFT, padx=10)

# Hand control to Tk; this call blocks until the window is closed.
root.mainloop()

**deanhystad** · (This post was last modified: Aug-19-2023, 09:49 PM by deanhystad.)

def replace(text, old, new):
    """Replace *old* in *text* with *new*, copying the case of *old*.

    Each letter of *new* is upper-cased when the letter at the same position
    in *old* is upper-case, so replace(t, "ProportionalA", "proporțională")
    inserts "ProporționalĂ".

    Args:
        text: The text to search.
        old: The exact (case-sensitive) substring to replace.
        new: The replacement, normally lower-case.

    Returns:
        The text with every occurrence of *old* replaced. An empty *old*
        leaves the text untouched.
    """
    if not old:
        # str.replace("", x) would insert x between every character.
        return text
    chars = [n.upper() if o.isupper() else n for o, n in zip(old, new)]
    # zip() stops at the shorter word; keep the rest of `new` when it is longer.
    cased = "".join(chars) + new[len(old):]
    return text.replace(old, cased)

Pedroski55 · Aug-22-2023, 07:07 AM

Tricky thing to do this!

I tried this approach:

1. find the words with composite letters
2. Extract the composite letters
3. Find the simple letter corresponding to the composite letter
4. Put the simple letter into the word with composite letters. If the result is the same as the input word (the one without composite letters), you can replace one word with the other.

Diacritics are simple marks which alter the sound of a letter.

You can find words with composite letters, that is letters with a simple letter + a diacritic, like this:

import unicodedata

# Diacritics are small marks added to simple letters to change the pronunciation.
# Step 1: find the letters that carry a diacritic.
# unicodedata.decomposition(ch) returns '' for a simple letter and a
# non-empty string for a composite one, which is enough to pick out every
# word that contains a composite character.
mystring = 'Compatibilitatea sufletească nu este direct proporțională cu valoarea intensităţii sentimentelor.'
# Whitespace-separated tokens; punctuation stays attached to its word.
mylist = mystring.split()
# Filled further down with the words that contain a composite character.
funny_letter_words = []

def find_CompositeChars(aword):
    """Return True if *aword* contains at least one composite character.

    A character counts as composite when unicodedata.decomposition() gives a
    non-empty result for it. Returns False otherwise, including for an
    empty string.
    """
    return any(unicodedata.decomposition(ch) for ch in aword)
        
# Collect, in order, every word that contains a composite character.
# For the sample sentence: ['sufletească', 'proporțională', 'intensităţii']
funny_letter_words.extend(word for word in mylist if find_CompositeChars(word))

print(funny_letter_words)

Given only 1 line of text, I can't test this on a text file, but you can compare words with simple letters and composite letters like this: First get the composite letters.

# Compare two words letter by letter and record every position where they
# differ, together with the letter found in w3 at that position.
# Here w3 and w4 are the same word except for the composite letters.
# The words should have the same length; zip() stops at the shorter one
# instead of raising IndexError when they do not.
# The positions may help to speed this up later, not sure yet.
w3 = 'proporțională'
w4 = 'proportionala'
y = [(i, a) for i, (a, b) in enumerate(zip(w3, w4)) if a != b]
print(y)

Second, get the simple letter from the composite letter:

def getSimpleLetter(ch):
    """Return the base letter of *ch* with its diacritics removed.

    The Unicode decomposition is followed until a character without one is
    reached, so 'ț' gives 't' and 'ǘ' (u + diaeresis + acute) gives 'u'.
    A character that has no decomposition is returned unchanged.

    Args:
        ch: A single character.

    Returns:
        The single base character.
    """
    simple_letter = ch
    while True:
        comp = unicodedata.decomposition(simple_letter)  # '0074 0326' for 'ț'
        if not comp:
            # Already a simple letter: nothing (more) to strip.
            return simple_letter
        # Some decompositions start with a tag such as '<compat>'; it is not
        # a code point, so leave it out before converting.
        code_points = [part for part in comp.split() if not part.startswith('<')]
        if not code_points:
            return simple_letter
        # The first code point is the base letter; the rest are the marks.
        base = chr(int(code_points[0], 16))
        if base == simple_letter:
            return simple_letter
        simple_letter = base

# Base letter for every composite letter recorded in y, in the same order.
# Only the letter of each (position, letter) pair is needed here.
simple_letters = [getSimpleLetter(letter) for _, letter in y]

Lastly, replace the composite letters with the corresponding simple letter. If the result is the same as the word we are testing,
then the 2 words are the same and we can put the word with composite letters in place of the word with simple letters.

Of course, you need to do this in a loop for each word with composite letters and each line of text for a long text file!

# put the simple letters in the original word,
# see if the resulting word is the same as a word with no composite letters

def putSimpleLetter(aword, new_letter, old_letter):
    """Return *aword* with every *old_letter* replaced by *new_letter*."""
    return aword.replace(old_letter, new_letter)

# Swap each composite letter in w3 for its simple letter. If the result
# equals w4, the two words differ only by their diacritics.
new_word = w3
for (_, composite_letter), simple_letter in zip(y, simple_letters):
    new_word = putSimpleLetter(new_word, simple_letter, composite_letter)

if new_word == w4:
    print('You can replace', w4, 'with', w3)

I read about someone who wants to do exactly this with a hundred-year-old Romanian book! One problem is that a word with only simple letters and a word with a composite letter may be very similar. An example he gave of that is:

Quote: peste = over
pește = fish

Find a way around that! The best solution is write the words correctly to begin with!

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Extract a string between 2 words from a text file	OscarBoots	2	1,885	Nov-02-2021, 08:50 AM Last Post: ibreeden
	Generate a string of words for multiple lists of words in txt files in order.	AnicraftPlayz	2	2,823	Aug-11-2021, 03:45 PM Last Post: jamesaarr
	Open and read multiple text files and match words	kozaizsvemira	3	6,771	Jul-07-2021, 11:27 AM Last Post: Larz60+
	Counting the most relevant words in a text file	caiomartins	2	2,500	Sep-21-2020, 08:39 AM Last Post: caiomartins
	Web Form to Python Script to Text File to zip file to web	wfsteadman	1	2,149	Aug-09-2020, 02:12 PM Last Post: snippsat
	Check text contains words similar to themes/topics (thesaurus)	Bec	1	32,119	Jul-28-2020, 04:17 PM Last Post: Larz60+
	Need Help Typing Text into Tough Form [xpath / selenium]	digitalmatic7	0	1,767	Jun-05-2019, 06:46 AM Last Post: digitalmatic7
	Creating Dictionary form LOG /text file	DG1234	7	5,492	Feb-13-2019, 08:08 PM Last Post: DG1234
	Counting words in text	dan789	4	2,689	Nov-11-2018, 07:37 PM Last Post: dan789
	Compare all words in input() to all words in file	Trianne	1	2,776	Oct-05-2018, 06:27 PM Last Post: ichabod801

Form that puts diacritics on the words in the text

User Panel Messages

Announcements