Mar-23-2020, 08:42 PM
I'm analyzing the speeches of several US presidents and I would like to do some sentiment analysis on them. However, the pre-processing part gets stuck on [*] in Google Colab and never finishes pre-processing the speeches. Everything ran fine with no errors before I added the call text_process(df['Text']) near the top. I added that line because I noticed the text was not actually being pre-processed, but now the cell is stuck on [*] in Google Colab...
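To narrow it down, here is a minimal sketch (my own addition, assuming the code below has been run up to the definition of text_process) that times the function on a single speech instead of the whole column; if even one speech takes long, the function itself is the bottleneck:

import time

start = time.time()
tokens = text_process(df['Text'][0])  # a single speech, not the whole Series
print(len(tokens), "tokens in", time.time() - start, "seconds")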
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from PIL import Image
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from google.colab import files

# NLTK data needs to be downloaded before text_process() is first used
nltk.download('wordnet')
nltk.download('stopwords')

uploaded = files.upload()
df = pd.read_csv('train.csv')
df.head(10)

lemmatiser = WordNetLemmatizer()

# Defining a module for Text Processing
def text_process(tex):
    # 1. Removal of Punctuation Marks
    nopunct = [char for char in tex if char not in string.punctuation]
    nopunct = ''.join(nopunct)
    # 2. Lemmatisation
    a = ''
    i = 0
    for i in range(len(nopunct.split())):
        b = lemmatiser.lemmatize(nopunct.split()[i], pos="v")
        a = a + b + ' '
    tex.strip("[]")  # note: strip() returns a new string; its result is discarded here
    # res = str(test_list)[1:-1]
    # 3. Removal of Stopwords
    return [word for word in a.split() if word.lower() not in stopwords.words('english')]

##################### STUCK HERE ############################
text_process(df['Text'])

y = df['Author']
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)
X = df['Text']

wordcloud1 = WordCloud().generate(X[0])     # for Andrew Jackson
wordcloud2 = WordCloud().generate(X[26])    # for Barack Obama
wordcloud3 = WordCloud().generate(X[75])    # for Bill Clinton
wordcloud4 = WordCloud().generate(X[114])   # for Donald Trump
wordcloud5 = WordCloud().generate(X[136])   # for Franklin D. Roosevelt
wordcloud6 = WordCloud().generate(X[185])   # for George H. W. Bush
wordcloud7 = WordCloud().generate(X[208])   # for George W. Bush
wordcloud8 = WordCloud().generate(X[247])   # for George Washington
wordcloud9 = WordCloud().generate(X[268])   # for Richard M. Nixon
wordcloud10 = WordCloud().generate(X[291])  # for Ronald Reagan
wordcloud11 = WordCloud().generate(X[350])  # for Thomas Jefferson

#print(X[0])
print(df['Author'][0])
plt.imshow(wordcloud1, interpolation='bilinear')
plt.show()
#print(X[1])
print(df['Author'][26])
plt.imshow(wordcloud2, interpolation='bilinear')
plt.show()
#print(X[3])
print(df['Author'][75])
plt.imshow(wordcloud3, interpolation='bilinear')
plt.show()
print(df['Author'][114])
plt.imshow(wordcloud4, interpolation='bilinear')
plt.show()
print(df['Author'][136])
plt.imshow(wordcloud5, interpolation='bilinear')
plt.show()
print(df['Author'][185])
plt.imshow(wordcloud6, interpolation='bilinear')
plt.show()
print(df['Author'][208])
plt.imshow(wordcloud7, interpolation='bilinear')
plt.show()
print(df['Author'][247])
plt.imshow(wordcloud8, interpolation='bilinear')
plt.show()
print(df['Author'][268])
plt.imshow(wordcloud9, interpolation='bilinear')
plt.show()
print(df['Author'][291])
plt.imshow(wordcloud10, interpolation='bilinear')
plt.show()
print(df['Author'][350])
plt.imshow(wordcloud11, interpolation='bilinear')
plt.show()

# 80-20 splitting the dataset (80%->Training and 20%->Validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# defining the bag-of-words transformer on the text-processed corpus,
# i.e., text_process() declared above is executed...
bow_transformer = CountVectorizer(analyzer=text_process).fit(X_train)
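Once the fit above completes, a quick sanity check on the fitted transformer (a sketch I added, not part of my original run) would be:

print(len(bow_transformer.vocabulary_))        # number of distinct tokens kept
print(bow_transformer.transform(X_train[:1]))  # sparse word counts for one speech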
# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_train = bow_transformer.transform(X_train)  # ONLY TRAINING DATA
# transforming into Bag-of-Words and hence textual data to numeric..
text_bow_test = bow_transformer.transform(X_test)    # TEST DATA

# instantiating the model with Multinomial Naive Bayes..
model = MultinomialNB()
# training the model...
model = model.fit(text_bow_train, y_train)

# Training Accuracy
print(model.score(text_bow_train, y_train))
# Test Accuracy
print(model.score(text_bow_test, y_test))

# Getting the predictions of the Test Set...
predictions = model.predict(text_bow_test)
# Getting the Precision, Recall, F1-Score
print(classification_report(y_test, predictions))

# Defining a module for Confusion Matrix...
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    # print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(20, 10))
# tick labels below are the row indices of each president's first speech
plot_confusion_matrix(cm, classes=[0, 26, 75, 114, 136, 185, 208, 247, 268, 291, 350],
                      normalize=True, title='Confusion Matrix')
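Side note (my own sketch, not in my original run): since y was label-encoded, the president names are recoverable via labelencoder.classes_, which would give more readable tick labels than the row indices above:

plot_confusion_matrix(cm, classes=labelencoder.classes_, normalize=True, title='Confusion Matrix')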
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    return ("{:-<40} {}".format(sentence, str(score)))

df.loc[df['Author'].str.contains('George Washington'), 'Year'] = 1789
df.loc[df['Author'].str.contains('Thomas Jefferson'), 'Year'] = 1801
df.loc[df['Author'].str.contains('Andrew Jackson'), 'Year'] = 1829
df.loc[df['Author'].str.contains('Franklin D. Roosevelt'), 'Year'] = 1933
df.loc[df['Author'].str.contains('Richard M. Nixon'), 'Year'] = 1969
df.loc[df['Author'].str.contains('Ronald Reagan'), 'Year'] = 1981
df.loc[df['Author'].str.contains('George H. W. Bush'), 'Year'] = 1989
df.loc[df['Author'].str.contains('Bill Clinton'), 'Year'] = 1993
df.loc[df['Author'].str.contains('George W. Bush'), 'Year'] = 2001
df.loc[df['Author'].str.contains('Barack Obama'), 'Year'] = 2009
df.loc[df['Author'].str.contains('Donald Trump'), 'Year'] = 2017
df.Year = df.Year.fillna(1969)

d = pd.Series(df.Author.values, index=df.Year).to_dict()
for key in sorted(d.keys()):
    print("%s: %s" % (key, d[key]))

dfAndrewJackson = df[['Text']].copy()
# Delete rows at index 26 to 375 (keep only Andrew Jackson's speeches)
dfAndrewJackson_clean = dfAndrewJackson.drop(dfAndrewJackson.index[26:375])

years = ['1834/04/15', '1834/04/21', '1834/12/01', '1833/12/03', '1832/12/04',
         '1836/12/05', '1830/12/06', '1831/12/06', '1832/12/06', '1835/12/07',
         '1829/12/08', '1832/12/10', '1833/12/12', '1836/12/21', '1832/02/15',
         '1831/02/22', '1833/01/16', '1832/07/10', '1829/03/04', '1833/03/04',
         '1837/03/04', '1830/05/06', '1829/05/11', '1830/05/27', '1830/10/05',
         '1833/09/18']
print(len(years))
dfAndrewJackson_clean['Speech_Date'] = years
print(dfAndrewJackson_clean.shape)
print(dfAndrewJackson_clean)

dict_AndrewJackson = pd.Series(dfAndrewJackson_clean.Text.values,
                               index=dfAndrewJackson_clean.Speech_Date).to_dict()
for key in sorted(dict_AndrewJackson.keys()):
    print("%s: %s" % (key, dict_AndrewJackson[key]))

l = []
for key in sorted(dict_AndrewJackson.keys()):
    l.append(sentiment_analyzer_scores(dict_AndrewJackson[key]))
print(l)
print(len(l))
for i in range(len(l)):
    print(l[i][-62:])

sentiments = pd.DataFrame(columns=['Neg', 'Neu', 'Pos'])
sentiments['Neg'] = [0.032, 0.033, 0.064, 0.011, 0.05, 0.014, 0.054, 0.049, 0.055, 0.021, 0.048, 0.066, 0.045,
                     0.094, 0.06, 0.046, 0.051, 0.055, 0.008, 0.063, 0.043, 0.07, 0.059, 0.071, 0.066, 0.092]
sentiments['Neu'] = [0.778, 0.842, 0.776, 0.873, 0.806, 0.853, 0.77, 0.864, 0.772, 0.83, 0.821, 0.767, 0.845,
                     0.792, 0.831, 0.768, 0.857, 0.782, 0.92, 0.834, 0.867, 0.785, 0.789, 0.805, 0.813, 0.711]
sentiments['Pos'] = [0.191, 0.126, 0.159, 0.117, 0.144, 0.133, 0.176, 0.087, 0.173, 0.149, 0.131, 0.167, 0.11,
                     0.114, 0.109, 0.186, 0.092, 0.163, 0.072, 0.103, 0.09, 0.145, 0.151, 0.124, 0.121, 0.198]
sentiments

What could be the issue? Any suggestions are appreciated. Thanks. You can find my 'train.csv' file here: https://drive.google.com/file/d/1a5fbORQ...sp=sharing