Apr-07-2020, 02:44 PM
Hi everyone;
I have this code, made for many parts of code, for preprocess and detected unique word on whatsapp group conversation:
Dictionary on .CSV
Word_Comp Replace
yronadura tronadura
vulzanizado vulcanizado
vomo como
viernescuántas viernes
via vía
venian venían
vel velocidad
vdfcon vdf
varuadores variadores
vamps vamos
vamiones camiones
I have this code, made for many parts of code, for preprocess and detected unique word on whatsapp group conversation:
import re import pandas as pd import numpy as np import requests import matplotlib.pyplot as plt import matplotlib.image as mpimg import matplotlib.patches as patches from PIL import Image from collections import Counter from matplotlib.offsetbox import AnchoredText from mpl_toolkits.axes_grid1 import make_axes_locatable # PROCESING DATA def import_data(file, path = ''): """ Import whatsapp data and transform it to a dataframe Parameters: ----------- file : str Name of file including its extension. path : str, default '' Path to file without the file name. Keep it empty if the file is in the working directory. Returns: -------- df : dataframe Dataframe of all messages """ with open(path + file, encoding = 'utf-8') as outfile: raw_text = outfile.readlines() messages = {} for message in raw_text: # Some messages are not sent by the user, # but are simply comments and therefore need to be removed try: name = message.split(' - ')[1].split(':')[0] except: continue # Add name to dictionary if it exists if name in messages: messages[name].append(message) else: messages[name] = [message] # Convert dictionary to dataframe df = pd.DataFrame(columns=['Message_Raw', 'User']) for name in messages.keys(): df = df.append(pd.DataFrame({'Message_Raw': messages[name], 'User': name})) df.reset_index(inplace=True) return df def clean_message(row): """ Try to extract name, if not possible then somebody didn't write a message but changed the avatar of the group. """ name = row.User + ': ' try: return row.Message_Raw.split(name)[1][:-1] except: return row.Message_Raw def remove_inactive_users(df, min_messages=10): """ Removes inactive users or users that have posted very few messages. Parameters: ----------- df : pandas dataframe Dataframe of all messages min_messages: int, default 10 Number of minimum messages that a user must have Returns: -------- df : pandas dataframe Dataframe of all messages """ # Remove users that have not posted more than min_messages to_keep = df.groupby('User').count().reset_index() to_keep = to_keep.loc[to_keep['Message_Raw'] >= min_messages, 'User'].values df = df[df.User.isin(to_keep)] return df def preprocess_data(df, min_messages=10): """ Preprocesses the data by executing the following steps: * Import data * Create column with only message, not date/name etc. * Create column with only text message, no smileys etc. * Remove inactive users * Remove indices of images Parameters: ----------- df : pandas dataframe Raw data in pandas dataframe format min_messages : int, default 10 Number of minimum messages each user needs to have posted else they are removed. Returns: -------- df : pandas dataframe Dataframe of all messages """ # Create column with only message, not date/name etc. df['Message_Clean'] = df.apply(lambda row: clean_message(row), axis = 1) # Create column with only text message, no smileys etc. df['Message_Only_Text'] = df.apply(lambda row: re.sub(r'[^a-zA-Z ñáéíóúÑÁÉÍÓÚ]+', '', row.Message_Clean.lower()), axis = 1) # Remove inactive users df = remove_inactive_users(df, min_messages) # Remove indices of images indices_to_remove = list(df.loc[df.Message_Clean.str.contains('|'.join(['<', '>'])), 'Message_Clean'].index) df = df.drop(indices_to_remove) # Extract Time df['Date'] = df.apply(lambda row: row['Message_Raw'].split(' - ')[0], axis = 1) if '/' in str(df.iloc[df.index[0]].Date): df['Date'] = pd.to_datetime(df['Date'], format="%d/%m/%y %H:%M") else: if ',' in str(df.iloc[df.index[0]].Date): df['Date'] = pd.to_datetime(df['Date'], format="%d-%m-%y, %H:%M") else: df['Date'] = pd.to_datetime(df['Date'], format="%d-%m-%y %H:%M") # Extact Day of the Week df['Hour'] = df.apply(lambda row: row.Date.hour, axis = 1) df['Day_of_Week'] = df.apply(lambda row: row.Date.dayofweek, axis = 1) # Sort values by date to keep order df.sort_values('Date', inplace=True) return df df = import_data('Chat de WhatsApp con Operaciones GCHC.txt') df = preprocess_data(df) # PROCESING COUNT WORD PER USER def count_words_per_user(df, sentence_column = "Message_Only_Text", user_column = "User"): """ Creates a count vector for each user in which the occurence of each word is count over all documents for that user. Parameters: ----------- df : pandas dataframe Dataframe of all messages sentence_column : string, default 'Message_Only_Text' Name of the column of which you want to create a word count user_column : string, default 'User' Name of the column that specifies the user Returns: -------- df : pandas dataframe Dataframe counts per word per user """ # Creating a dataframe with all words counts = list(Counter(" ".join(list(df[sentence_column])).split(" ")).items()) counts = [word[0] for word in counts] counts = pd.DataFrame(counts, columns = ['Word']) counts = counts.drop(0) # Adding counts of each user to the dataframe for user in df.User.unique(): count_temp = list(Counter(" ".join(list(df.loc[df[user_column] == user, 'Message_Only_Text'])).split(" ")).items()) counts[user] = 0 for word, count in count_temp: counts.loc[counts['Word'] == word, user] = count counts = counts[counts.Word.str.len() > 1] return counts def remove_stopwords(df, file, path='', column = "Word"): """ Remove stopwords from a dataframe choosing a specific column in which to remove those words Parameters: ----------- df : pandas dataframe Dataframe of counts per word per user file : string Name of file that contains the stopwords path : string, default '' Path of the file that contains the stopwords column : string, default 'Word' Column to clean Returns: -------- df : pandas dataframe Dataframe of counts per word per user excluding the stopwords """ # Remove stopwords with open(path + file) as stopwords: stopwords = stopwords.readlines() stopwords = [word[:-1] for word in stopwords] df = df[~df[column].isin(stopwords)] return df def get_unique_words(counts, df_raw, version): """ Get a list of unique words The dataframe needs be structured as follows: First column is called "Word" and contains a certain word Any following columns are named as the users and contain the count of each word. | | Word | Tim | Nadia | | 1 | pride | 0 | 1 | | 2 | groceries | 2 | 9 | etc. Formulas: t_user = Number of times word t said by user t_all = Number of times word t said by all users sum_messages = Number of all messages messages_user = Number of messages user has send sum_words = Number of all words words_user = Number of words user has send Version A TF_IDF = ((t_user+1)^2 / t_all) * (sum_messages / messages_user) Version B TF_IDF = ((t_user+1)^2 / t_all) * (sum_words / words_user) Version C TF_IDF = (t_user + 1) / (words_user + 1) * log(sum_messages / t_all) Parameters: ----------- counts : pandas dataframe Dataframe of counts per word per user df_raw : pandas dataframe Dataframe of raw messages version : string Which formula to use (A, B, C) Returns: -------- df_words : pandas dataframe Dataframe tf_idf scores per word per user and unique value """ df_words = counts.copy() # Number of messages by i nr_messages = {user: len(df_raw[df_raw.User == user]) for user in df_words.columns[1:]} nr_users = len(nr_messages.keys()) nr_words = {user: np.sum(df_words[user]) for user in df_words.columns[1:]} total = sum(nr_messages.values()) # Calculate TF_IDF based on the version for user in nr_messages.keys(): df_words[user+"_TF_IDF"] = df_words.apply(lambda row: tf_idf(row, user, nr_users, nr_words, nr_messages, version=version), axis = 1) # TF_IDF divided by each other so we can see the relative importance for user in nr_messages.keys(): df_words[user+"_Unique"] = df_words.apply(lambda row: word_uniqueness(row, nr_users, user), axis = 1) return df_words def tf_idf(row, user, nr_users, nr_words, nr_messages, version): """ Used as a lambda function inside get_unique_words() to get the tf_idf scores based on one of three formulas Formulas: t_user = Number of times word t said by user t_all = Number of times word t said by all users sum_messages = Number of all messages messages_user = Number of messages user has send sum_words = Number of all words words_user = Number of words user has send Version A TF_IDF = ((t_user+1)^2 / t_all) * (sum_messages / messages_user) Version B TF_IDF = ((t_user+1)^2 / t_all) * (sum_words / words_user) Version C TF_IDF = (t_user + 1) / (words_user + 1) * log(sum_messages / t_all) """ # TF_IDF = (t_user^2 / t_all) * (sum of messages / messages by user) if version == "A": t_user = row[user] t_all = np.sum(row.iloc[1:nr_users+1]) sum_messages = sum(nr_messages.values()) messages_user = nr_messages[user] tf_idf = (np.square(t_user + 1) / (t_all)) * (sum_messages / messages_user) return tf_idf # TF_IDF = (t_user^2 / t_all) * (sum of words / words by user) elif version == "B": t_user = row[user] t_all = np.sum(row.iloc[1:nr_users+1]) sum_words = sum(nr_words.values()) words_user = nr_words[user] tf_idf = (np.square(t_user + 1) / (t_all)) * (sum_words / words_user) return tf_idf # TF_IDF = (t_user / words_user) * log(sum of messages / t_all) elif version == "C": t_user = row[user] words_user = nr_words[user] sum_messages = sum(nr_messages.values()) t_all = np.sum(row.iloc[1:nr_users+1]) tf_idf = (t_user + 1 / words_user + 1) * np.log(sum_messages / t_all) return tf_idf def word_uniqueness(row, nr_users, user): """ Used as a lambda function in function get_unique_words() Formula: word_uniqueness = tf_idf_user / (tf_idf_all - tf_idf_user) """ tf_idf_user = row[user+"_TF_IDF"] tf_idf_all = np.sum(row.iloc[nr_users+1: 2*nr_users+1]) with np.errstate(divide='ignore'): unique_value_user = np.divide(tf_idf_user, (tf_idf_all - tf_idf_user)) return unique_value_user def plot_unique_words(df_unique, user, image_path=None, image_url=None, save_name=None, save_path="", title=" ", title_color="white", title_background="black", font=None, width=None, height=None): """ Parameters: ----------- df_unique : dataframe Dataframe containing a column "Word" and a column user+"_Unique" that describes how unique a word is by simply giving a floating value user : string The name of the user which is the user in the column user+"_Unique" image_path : string with // to the path Path to the picture you want to use image_url : string Url to the image you want to use save_name : string If you want to save the name then simply set a name without extension save_path : string Where you want to store the image title : string Title of the plot title_color : string Color of the title title_background : string Color of the background box of the title font : string Family font to use (make sure to check if you have it installed) width : integer or float Width of the plot (will also resize the image) height : integer or float Height of the plot (will also resize the image) """ # Set font to be used if font: font = {'fontname':font} else: font = {'fontname':'Comic Sans MS'} # Background image to be used, black if nothing selected if image_path: img=mpimg.imread(image_path) img = Image.open(image_path) elif image_url: img = Image.open(requests.get(url, stream=True).raw) else: img = np.zeros([100,100,3],dtype=np.uint8) img.fill(0) if width and height: img = img.resize((width, height)) else: # Get size of image width = img.shape[1] height = img.shape[0] # Prepare data for plotting # to_plot = get_unique_words(counts, df_raw, version = 'C') to_plot = df_unique.sort_values(by=user+'_Unique', ascending=True) to_plot = to_plot.tail(10)[['Word', user+'_Unique']].copy() # Create left part of graph ('top') and right part which overlays # the image ('bottom') to_plot['top'] = (to_plot[user+'_Unique'] * (width*0.99) ) / max(to_plot[user+'_Unique']) to_plot['bottom'] = width - to_plot['top'] # Create the steps of the bars based on the height of the image steps = height/len(to_plot) y_pos = [(height/len(to_plot)/2) + (i * steps) for i in range(0, len(to_plot))] # Plot figure fig, ax = plt.subplots() # First plot the image plt.imshow(img, extent=[0, width*0.99, 0, height], zorder=1) # Then plot the right part which covers up the right part of the picture ax.barh(y_pos, to_plot['bottom'], left=to_plot['top'],height=steps, color='w',align='center', alpha=1,lw=2, edgecolor='w', zorder=2) # Finally plot the bar which is fully transparent aside from its edges ax.barh(y_pos, to_plot['top'], height=steps, fc=(1, 0, 0, 0.0), align='center',lw=2, edgecolor='white',zorder=3) # Remove ticks ax.yaxis.set_ticks_position('none') ax.xaxis.set_ticks_position('none') # Set labels and location y-axis ax.set_yticks(y_pos) ax.set_yticklabels(list(to_plot['Word'].values), fontsize=18,**font) ax.set_ylim(top=height) # Make them with to remove any image line that may be left ax.spines['top'].set_color('white') ax.spines['right'].set_color('white') # Remove the left and bottom axis ax.spines['left'].set_visible(False) ax.spines['bottom'].set_visible(False) # Add a small patch that removes some of the extra background at the top ax.add_patch(patches.Rectangle((0,height),width, 20,facecolor='white',linewidth = 0, zorder=3)) # Add left and bottom lines plt.axvline(0, color='black', ymax=1, lw=5, zorder=4) plt.axvline(width, color='white', ymax=1, lw=5, zorder=5) plt.axhline(0, color='black', xmax=1, lw=5, zorder=6) plt.axhline(height, color=title_background, xmax=1, lw=3, zorder=7) # Create Title Box # This might be a temporary solution as # makes_axes_locatable might lose its functionality divider = make_axes_locatable(ax) cax = divider.append_axes("top", size="9%", pad=None) cax.get_xaxis().set_visible(False) cax.get_yaxis().set_visible(False) at = AnchoredText(title, loc=10, pad=0, prop=dict(backgroundcolor=title_background, size=23, color=title_color, **font)) cax.add_artist(at) cax.set_facecolor(title_background) cax.spines['left'].set_visible(False) cax.spines['bottom'].set_visible(False) cax.spines['right'].set_visible(False) cax.spines['top'].set_visible(False) fig.set_size_inches(10, 10) if save_name: plt.savefig(save_path+save_name+'.png', dpi = 300) def print_users(df): print("#" * (len('Users')+8)) print("## " + 'Users' + " ##" ) print("#" * (len('Users')+8)) print() for user in df.User.unique(): print(user)Now i want to replace specific word with writte on .csv file, i try with this code:
def check(df, file='dictionary.csv', column = "Word"): dictionary = pd.read_csv(file) read_words = pd.DataFrame(dictionary, columns=['Word_Comp']) replace_word = pd.DataFrame(dictionary, columns=['Replace']) if df[column] in read_words(): df.replace(to_replace=column, value=replace_word) else: returnbut dosent work. Please Help
Dictionary on .CSV
Word_Comp Replace
yronadura tronadura
vulzanizado vulcanizado
vomo como
viernescuántas viernes
via vía
venian venían
vel velocidad
vdfcon vdf
varuadores variadores
vamps vamos
vamiones camiones