Spelling correction

jareyesluengo · Apr-07-2020, 02:44 PM

Hi everyone;

I have this code, made for many parts of code, for preprocess and detected unique word on whatsapp group conversation:

import re
import pandas   as pd
import numpy    as np
import requests

import matplotlib.pyplot    as plt
import matplotlib.image     as mpimg
import matplotlib.patches   as patches

from PIL           import Image
from collections   import Counter

from matplotlib.offsetbox      import AnchoredText
from mpl_toolkits.axes_grid1   import make_axes_locatable

# PROCESING DATA

def import_data(file, path = ''):
    """ Import whatsapp data and transform it to a dataframe
    
    Parameters:
    -----------
    file : str
        Name of file including its extension.
    path : str, default ''
        Path to file without the file name. 
        Keep it empty if the file is in the 
        working directory.
        
    Returns:
    --------
    df : dataframe
        Dataframe of all messages
    
    """
   
    with open(path + file, encoding = 'utf-8') as outfile:
        raw_text = outfile.readlines()
        messages = {}

        for message in raw_text: 

            # Some messages are not sent by the user, 
            # but are simply comments and therefore need to be removed
            try:
                name = message.split(' - ')[1].split(':')[0]
            except:
                continue

            # Add name to dictionary if it exists
            if name in messages:
                messages[name].append(message)
            else:
                messages[name] = [message]

    # Convert dictionary to dataframe
    df = pd.DataFrame(columns=['Message_Raw', 'User'])

    for name in messages.keys():
        df = df.append(pd.DataFrame({'Message_Raw': messages[name], 'User': name}))

    df.reset_index(inplace=True)

    return df

def clean_message(row):
    """ 
    Try to extract name, if not possible then 
    somebody didn't write a message but changed
    the avatar of the group. 
        
    """
    
    name = row.User + ': '
    
    try:
        return row.Message_Raw.split(name)[1][:-1]
    except:
        return row.Message_Raw
    
def remove_inactive_users(df, min_messages=10):
    """ Removes inactive users or users that have 
    posted very few messages. 
    
    Parameters:
    -----------
    df : pandas dataframe
        Dataframe of all messages 
    min_messages: int, default 10
        Number of minimum messages that a user must have
        
    Returns:
    --------
    df : pandas dataframe
        Dataframe of all messages
        
    """
    # Remove users that have not posted more than min_messages
    to_keep = df.groupby('User').count().reset_index()
    to_keep = to_keep.loc[to_keep['Message_Raw'] >= min_messages, 'User'].values
    df = df[df.User.isin(to_keep)]
    return df

def preprocess_data(df, min_messages=10):
    """ Preprocesses the data by executing the following steps:
    
    * Import data
    * Create column with only message, not date/name etc.
    * Create column with only text message, no smileys etc.
    * Remove inactive users
    * Remove indices of images
    Parameters:
    -----------
    df : pandas dataframe
        Raw data in pandas dataframe format  
    min_messages : int, default 10
        Number of minimum messages each user needs
        to have posted else they are removed. 
        
    Returns:
    --------
    df : pandas dataframe
        Dataframe of all messages
        
    """
    
    # Create column with only message, not date/name etc.
    df['Message_Clean'] = df.apply(lambda row: clean_message(row), axis = 1)

    # Create column with only text message, no smileys etc.
    df['Message_Only_Text'] = df.apply(lambda row: re.sub(r'[^a-zA-Z ñáéíóúÑÁÉÍÓÚ]+', '', 
                                                          row.Message_Clean.lower()), 
                                       axis = 1)
    
    # Remove inactive users
    df = remove_inactive_users(df, min_messages)

    # Remove indices of images
    indices_to_remove = list(df.loc[df.Message_Clean.str.contains('|'.join(['<', '>'])),
                                    'Message_Clean'].index)
    df = df.drop(indices_to_remove)
    
    # Extract Time
    df['Date'] = df.apply(lambda row: row['Message_Raw'].split(' - ')[0], axis = 1)
    
    if '/' in str(df.iloc[df.index[0]].Date):
        df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%y %H:%M")
    else:
        if ',' in str(df.iloc[df.index[0]].Date):
            df['Date'] = pd.to_datetime(df['Date'], format="%d-%m-%y, %H:%M")
        else:
            df['Date'] = pd.to_datetime(df['Date'], format="%d-%m-%y %H:%M")
    
    # Extact Day of the Week
    df['Hour'] = df.apply(lambda row: row.Date.hour, axis = 1)
    df['Day_of_Week'] = df.apply(lambda row: row.Date.dayofweek, axis = 1)
    
    # Sort values by date to keep order
    df.sort_values('Date', inplace=True)
    
    return df

df = import_data('Chat de WhatsApp con Operaciones GCHC.txt')
df = preprocess_data(df)

# PROCESING COUNT WORD PER USER

def count_words_per_user(df, sentence_column = "Message_Only_Text", user_column = "User"):
    """ Creates a count vector for each user in which
        the occurence of each word is count over all 
        documents for that user. 
    Parameters:
    -----------
    df : pandas dataframe
        Dataframe of all messages
    sentence_column : string, default 'Message_Only_Text'
        Name of the column of which you want to 
        create a word count
    user_column : string, default 'User'
        Name of the column that specifies the user
        
    Returns:
    --------
    df : pandas dataframe
        Dataframe counts per word per user
    
    """
    # Creating a dataframe with all words
    counts = list(Counter(" ".join(list(df[sentence_column])).split(" ")).items())
    counts = [word[0] for word in counts]
    counts = pd.DataFrame(counts, columns = ['Word'])
    counts = counts.drop(0)

    # Adding counts of each user to the dataframe
    for user in df.User.unique():
        count_temp = list(Counter(" ".join(list(df.loc[df[user_column] == user, 
                                                       'Message_Only_Text'])).split(" ")).items())
        counts[user] = 0
        for word, count in count_temp:
            counts.loc[counts['Word'] == word, user] = count
            
    counts = counts[counts.Word.str.len() > 1]
            
    return counts

def remove_stopwords(df, file, path='', column = "Word"):
    """ Remove stopwords from a dataframe choosing
    a specific column in which to remove those words
    
    Parameters:
    -----------
    df : pandas dataframe
        Dataframe of counts per word per user
    file : string
        Name of file that contains the stopwords
    path : string, default ''
        Path of the file that contains the stopwords
    column : string, default 'Word'
        Column to clean
    Returns:
    --------
    df : pandas dataframe
        Dataframe of counts per word per user
        excluding the stopwords
    
    """
    
    # Remove stopwords
    with open(path + file) as stopwords:
        stopwords = stopwords.readlines()
        stopwords = [word[:-1] for word in stopwords]

    df = df[~df[column].isin(stopwords)]
    
    return df
           
def get_unique_words(counts, df_raw, version):
    """ Get a list of unique words 
    
    The dataframe needs be structured as follows:
    First column is called "Word" and contains a certain word
    Any following columns are named as the users and contain the
    count of each word. 
    
    |   |    Word   | Tim | Nadia | 
    | 1 | pride     | 0   | 1     |
    | 2 | groceries | 2   | 9     |
    etc. 
    
    Formulas:
    t_user = Number of times word t said by user
    t_all = Number of times word t said by all users
    sum_messages = Number of all messages
    messages_user = Number of messages user has send
    sum_words = Number of all words
    words_user = Number of words user has send
    
    Version A
    TF_IDF = ((t_user+1)^2 / t_all) * (sum_messages / messages_user)
    Version B
    TF_IDF = ((t_user+1)^2 / t_all) * (sum_words / words_user)
    Version C
    TF_IDF = (t_user + 1) / (words_user + 1) * log(sum_messages / t_all)
    
    Parameters:
    -----------
    counts : pandas dataframe
        Dataframe of counts per word per user
    df_raw : pandas dataframe
        Dataframe of raw messages
    version : string
        Which formula to use (A, B, C)
    Returns:
    --------
    df_words : pandas dataframe
        Dataframe tf_idf scores per word per user and unique value
    
    """
    
    df_words = counts.copy()
    
    # Number of messages by i 
    nr_messages = {user: len(df_raw[df_raw.User == user]) for user in df_words.columns[1:]}
    nr_users = len(nr_messages.keys())
    nr_words = {user: np.sum(df_words[user]) for user in df_words.columns[1:]}
    total = sum(nr_messages.values())

    # Calculate TF_IDF based on the version
    for user in nr_messages.keys():
        df_words[user+"_TF_IDF"] = df_words.apply(lambda row: tf_idf(row, user, 
                                                                    nr_users, nr_words,
                                                                    nr_messages, version=version), 
                                              axis = 1)

    # TF_IDF divided by each other so we can see the relative importance
    for user in nr_messages.keys():
        df_words[user+"_Unique"] = df_words.apply(lambda row: word_uniqueness(row, 
                                                                              nr_users,
                                                                              user),
                                                  axis = 1)
        
    return df_words

def tf_idf(row, user, nr_users, nr_words, nr_messages, version):
    """ Used as a lambda function inside get_unique_words() to 
        get the tf_idf scores based on one of three formulas
    
    Formulas:
    t_user = Number of times word t said by user
    t_all = Number of times word t said by all users
    sum_messages = Number of all messages
    messages_user = Number of messages user has send
    sum_words = Number of all words
    words_user = Number of words user has send
    
    Version A
    TF_IDF = ((t_user+1)^2 / t_all) * (sum_messages / messages_user)
    Version B
    TF_IDF = ((t_user+1)^2 / t_all) * (sum_words / words_user)
    Version C
    TF_IDF = (t_user + 1) / (words_user + 1) * log(sum_messages / t_all)
    
    """
    
    # TF_IDF = (t_user^2 / t_all) * (sum of messages / messages by user)
    if version == "A":
        t_user = row[user]
        t_all =  np.sum(row.iloc[1:nr_users+1])
        sum_messages = sum(nr_messages.values())
        messages_user = nr_messages[user]
        
        tf_idf = (np.square(t_user + 1) / (t_all)) * (sum_messages / messages_user)
        
        return tf_idf
    
    # TF_IDF = (t_user^2 / t_all) * (sum of words / words by user)
    elif version == "B":
        t_user = row[user]
        t_all =  np.sum(row.iloc[1:nr_users+1])
        sum_words = sum(nr_words.values())
        words_user = nr_words[user]
        
        tf_idf = (np.square(t_user + 1) / (t_all)) * (sum_words / words_user)
        
        return tf_idf
    
    # TF_IDF = (t_user / words_user) * log(sum of messages / t_all)
    elif version == "C":
        t_user = row[user]
        words_user = nr_words[user]

        sum_messages = sum(nr_messages.values())
        t_all =  np.sum(row.iloc[1:nr_users+1])
        
        tf_idf = (t_user + 1 / words_user + 1) * np.log(sum_messages / t_all)
        
        return tf_idf
    
def word_uniqueness(row, nr_users, user):
    """ Used as a lambda function in function get_unique_words()
    
    Formula:
    
    word_uniqueness = tf_idf_user / (tf_idf_all - tf_idf_user)
    
    """
    
    tf_idf_user = row[user+"_TF_IDF"]
    tf_idf_all = np.sum(row.iloc[nr_users+1: 2*nr_users+1])
    
    with np.errstate(divide='ignore'):
        unique_value_user = np.divide(tf_idf_user, 
                                      (tf_idf_all - tf_idf_user))
    
    return unique_value_user

def plot_unique_words(df_unique, user, image_path=None, image_url=None, save_name=None, save_path="",
                      title=" ", title_color="white", title_background="black", font=None, 
                     width=None, height=None):
    """
    
    Parameters:
    -----------
    df_unique : dataframe
        Dataframe containing a column "Word" and a column
        user+"_Unique" that describes how unique a word is
        by simply giving a floating value
    user : string 
        The name of the user which is the user in the column user+"_Unique"
    image_path : string with // to the path 
        Path to the picture you want to use
    image_url : string 
        Url to the image you want to use
    save_name : string
        If you want to save the name then simply set a name without extension
    save_path : string 
        Where you want to store the image
    title : string
        Title of the plot
    title_color : string
        Color of the title
    title_background : string
        Color of the background box of the title
    font : string
        Family font to use (make sure to check if you have it installed)
    width : integer or float
        Width of the plot (will also resize the image)
    height : integer or float
        Height of the plot (will also resize the image)
    """

    # Set font to be used
    if font:
        font = {'fontname':font}
    else:
        font = {'fontname':'Comic Sans MS'}

    # Background image to be used, black if nothing selected
    if image_path:
        img=mpimg.imread(image_path)
        img = Image.open(image_path)
    elif image_url:
        img = Image.open(requests.get(url, stream=True).raw)
    else:
        img = np.zeros([100,100,3],dtype=np.uint8)
        img.fill(0) 
    
    if width and height:
        img = img.resize((width, height))
    else:
        # Get size of image
        width = img.shape[1]
        height = img.shape[0]

    # Prepare data for plotting
    # to_plot = get_unique_words(counts, df_raw, version = 'C')
    to_plot = df_unique.sort_values(by=user+'_Unique', ascending=True)
    to_plot = to_plot.tail(10)[['Word', user+'_Unique']].copy()
    
    # Create left part of graph ('top') and right part which overlays
    # the image ('bottom')
    to_plot['top'] = (to_plot[user+'_Unique'] * (width*0.99) ) / max(to_plot[user+'_Unique']) 
    to_plot['bottom'] = width - to_plot['top'] 

    # Create the steps of the bars based on the height of the image
    steps = height/len(to_plot)
    y_pos = [(height/len(to_plot)/2) + (i * steps) for i in range(0, len(to_plot))]

    # Plot figure
    fig, ax = plt.subplots()

    # First plot the image
    plt.imshow(img, extent=[0, width*0.99, 0, height], zorder=1)

    # Then plot the right part which covers up the right part of the picture
    ax.barh(y_pos, to_plot['bottom'], left=to_plot['top'],height=steps, color='w',align='center',
            alpha=1,lw=2, edgecolor='w', zorder=2)

    # Finally plot the bar which is fully transparent aside from its edges
    ax.barh(y_pos, to_plot['top'], height=steps, fc=(1, 0, 0, 0.0), align='center',lw=2,
            edgecolor='white',zorder=3)

    # Remove ticks
    ax.yaxis.set_ticks_position('none') 
    ax.xaxis.set_ticks_position('none') 

    # Set labels and location y-axis
    ax.set_yticks(y_pos)
    ax.set_yticklabels(list(to_plot['Word'].values), fontsize=18,**font)
    ax.set_ylim(top=height)

    # Make them with to remove any image line that may be left
    ax.spines['top'].set_color('white') 
    ax.spines['right'].set_color('white')

    # Remove the left and bottom axis
    ax.spines['left'].set_visible(False)
    ax.spines['bottom'].set_visible(False)

    # Add a small patch that removes some of the extra background at the top
    ax.add_patch(patches.Rectangle((0,height),width, 20,facecolor='white',linewidth = 0, zorder=3))

    # Add left and bottom lines
    plt.axvline(0, color='black', ymax=1, lw=5, zorder=4)
    plt.axvline(width, color='white', ymax=1, lw=5, zorder=5)
    plt.axhline(0, color='black', xmax=1, lw=5, zorder=6)
    plt.axhline(height, color=title_background, xmax=1, lw=3, zorder=7)

    # Create Title Box
    # This might be a temporary solution as 
    # makes_axes_locatable might lose its functionality
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("top", size="9%", pad=None)
    cax.get_xaxis().set_visible(False)
    cax.get_yaxis().set_visible(False)
    at = AnchoredText(title, loc=10, pad=0,
                      prop=dict(backgroundcolor=title_background,
                                size=23, color=title_color, **font))
    cax.add_artist(at)
    cax.set_facecolor(title_background)   
    cax.spines['left'].set_visible(False)
    cax.spines['bottom'].set_visible(False)
    cax.spines['right'].set_visible(False)
    cax.spines['top'].set_visible(False)
                   
    fig.set_size_inches(10, 10)
    if save_name:
        plt.savefig(save_path+save_name+'.png', dpi = 300)
        
def print_users(df):
    print("#" * (len('Users')+8))
    print("##  " + 'Users' + "  ##" )
    print("#" * (len('Users')+8))
    print()
    
    for user in df.User.unique():
        print(user)

Now i want to replace specific word with writte on .csv file, i try with this code:

def check(df, file='dictionary.csv', column = "Word"):
    
    dictionary = pd.read_csv(file)
    read_words = pd.DataFrame(dictionary, columns=['Word_Comp'])
    replace_word = pd.DataFrame(dictionary, columns=['Replace'])
        
    if df[column] in read_words():
        df.replace(to_replace=column, value=replace_word)
    else:
        return

but dosent work. Please Help

Dictionary on .CSV

Word_Comp Replace
yronadura tronadura
vulzanizado vulcanizado
vomo como
viernescuántas viernes
via vía
venian venían
vel velocidad
vdfcon vdf
varuadores variadores
vamps vamos
vamiones camiones

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Input error correction.	shakoun	4	1,648	Jun-07-2024, 12:59 PM Last Post: menator01
	output correction using print() function	afefDXCTN	3	13,529	Sep-18-2021, 06:57 PM Last Post: Sky_Mx
	Spelling program.	GalaxyCoyote	2	2,998	May-11-2019, 03:13 PM Last Post: GalaxyCoyote

Spelling correction

User Panel Messages

Announcements