Python Forum
Text pre-processing issue
#1
I'm analyzing the speeches of several US presidents and I would like to do some sentiment analysis on them. Everything below executed without errors at first, but I noticed the text was never actually being pre-processed, so I added the call

text_process(df['Text'])

near the top. Now that cell is stuck on [*] in Google Colab and the speeches still don't get pre-processed...
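For context, what I expected that call to achieve is cleaning each speech separately. A rough sketch of what I had in mind (assuming each row of df['Text'] holds one speech as a string):

# Rough sketch of what I expected the call to do (not my actual code):
# clean every speech separately, one row of df['Text'] at a time
processed = df['Text'].apply(text_process)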

import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from PIL import Image
from wordcloud import WordCloud
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from google.colab import files
uploaded = files.upload()

df = pd.read_csv('train.csv')
df.head(10)



lemmatiser = WordNetLemmatizer()

# Defining a module for Text Processing
def text_process(tex):
    # 1. Removal of Punctuation Marks
    nopunct = [char for char in tex if char not in string.punctuation]
    nopunct = ''.join(nopunct)
    # 2. Lemmatisation
    a = ''
    i = 0
    for i in range(len(nopunct.split())):
        b = lemmatiser.lemmatize(nopunct.split()[i], pos="v")
        a = a + b + ' '
    tex.strip("[]")  # res = str(test_list)[1:-1]
    # 3. Removal of Stopwords
    return [word for word in a.split() if word.lower() not
            in stopwords.words('english')]

##################### STUCK HERE ############################
text_process(df['Text'])
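On a single string the function itself seems to behave; a quick check I can run on a made-up sentence (not part of the dataset) is:

# Sanity check on one made-up sentence: should print a list of
# lemmatised tokens with punctuation and stopwords removed
print(text_process("The President addressed the Congress today."))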



y = df['Author']
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)

X = df['Text']

# one speech per president: Andrew Jackson, Barack Obama, Bill Clinton,
# Donald Trump, Franklin D. Roosevelt, George H. W. Bush, George W. Bush,
# George Washington, Richard M. Nixon, Ronald Reagan, Thomas Jefferson
for idx in [0, 26, 75, 114, 136, 185, 208, 247, 268, 291, 350]:
    wordcloud = WordCloud().generate(X[idx])
    #print(X[idx])
    print(df['Author'][idx])
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()



import nltk
nltk.download('wordnet')
nltk.download('stopwords')



# 80-20 splitting the dataset (80%->Training and 20%->Validation)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=42)

# defining the bag-of-words transformer on the text-processed corpus,
# i.e. text_process() declared above is executed by the vectorizer
bow_transformer = CountVectorizer(analyzer=text_process).fit(X_train)

# transforming into Bag-of-Words and hence textual data to numeric
text_bow_train = bow_transformer.transform(X_train)  # ONLY TRAINING DATA
text_bow_test = bow_transformer.transform(X_test)    # TEST DATA
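As far as I understand, CountVectorizer calls the analyzer once per document, so text_process receives a single speech string at a time here. A toy check with made-up sentences:

# Toy check with made-up sentences (not from the dataset): the analyzer gets
# one document string per call and returns that document's tokens
toy_docs = ["I love this great country.", "The country needs peace and jobs."]
toy_bow = CountVectorizer(analyzer=text_process).fit(toy_docs)
print(sorted(toy_bow.vocabulary_))  # cleaned, lemmatised vocabulary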



# instantiating the model with Multinomial Naive Bayes
model = MultinomialNB()
# training the model
model = model.fit(text_bow_train, y_train)

# Training Accuracy
model.score(text_bow_train, y_train)

# Test Accuracy
model.score(text_bow_test, y_test)



# Getting the predictions on the Test Set
predictions = model.predict(text_bow_test)
# Getting the Precision, Recall, F1-Score
print(classification_report(y_test, predictions))

# Defining a module for Confusion Matrix
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    # print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

cm = confusion_matrix(y_test, predictions)
plt.figure(figsize=(20,10))
plot_confusion_matrix(cm, classes=[0,26,75,114,136,185,208,247,268,291,350], normalize=True, title='Confusion Matrix')



from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    return ("{:-<40} {}".format(sentence, str(score)))



df.loc[df['Author'].str.contains('George Washington'), 'Year'] = 1789
df.loc[df['Author'].str.contains('Thomas Jefferson'), 'Year'] = 1801
df.loc[df['Author'].str.contains('Andrew Jackson'), 'Year'] = 1829
df.loc[df['Author'].str.contains('Franklin D. Roosevelt'), 'Year'] = 1933
df.loc[df['Author'].str.contains('Richard M. Nixon'), 'Year'] = 1969
df.loc[df['Author'].str.contains('Ronald Reagan'), 'Year'] = 1981
df.loc[df['Author'].str.contains('George H. W. Bush'), 'Year'] = 1989
df.loc[df['Author'].str.contains('Bill Clinton'), 'Year'] = 1993
df.loc[df['Author'].str.contains('George W. Bush'), 'Year'] = 2001
df.loc[df['Author'].str.contains('Barack Obama'), 'Year'] = 2009
df.loc[df['Author'].str.contains('Donald Trump'), 'Year'] = 2017

df.Year = df.Year.fillna(1969)
d = pd.Series(df.Author.values, index=df.Year).to_dict()

for key in sorted(d.keys()):
    print("%s: %s" % (key, d[key]))



dfAndrewJackson = df[['Text']].copy()
# Delete rows at index 26 to 375
dfAndrewJackson_clean = dfAndrewJackson.drop(dfAndrewJackson.index[26:375])
years = ['1834/04/15', '1834/04/21', '1834/12/01', '1833/12/03', '1832/12/04',
         '1836/12/05', '1830/12/06', '1831/12/06', '1832/12/06', '1835/12/07',
         '1829/12/08', '1832/12/10', '1833/12/12', '1836/12/21', '1832/02/15',
         '1831/02/22', '1833/01/16', '1832/07/10', '1829/03/04', '1833/03/04',
         '1837/03/04', '1830/05/06', '1829/05/11', '1830/05/27', '1830/10/05',
         '1833/09/18']
print(len(years))
dfAndrewJackson_clean['Speech_Date'] = years
print(dfAndrewJackson_clean.shape)
print(dfAndrewJackson_clean)

dict_AndrewJackson = pd.Series(dfAndrewJackson_clean.Text.values, index=dfAndrewJackson_clean.Speech_Date).to_dict()

for key in sorted(dict_AndrewJackson.keys()):
    print("%s: %s" % (key, dict_AndrewJackson[key]))



l = []
for key in sorted(dict_AndrewJackson.keys()):
    l.append(sentiment_analyzer_scores(dict_AndrewJackson[key]))
print(l)

print(len(l))

for i in range(len(l)):
    print(l[i][-62:])



sentiments = pd.DataFrame(columns=['Neg', 'Neu', 'Pos'])
sentiments['Neg'] = [0.032, 0.033, 0.064, 0.011, 0.05, 0.014, 0.054, 0.049,
                     0.055, 0.021, 0.048, 0.066, 0.045, 0.094, 0.06, 0.046,
                     0.051, 0.055, 0.008, 0.063, 0.043, 0.07, 0.059, 0.071,
                     0.066, 0.092]

sentiments['Neu'] = [0.778, 0.842, 0.776, 0.873, 0.806, 0.853, 0.77, 0.864,
                     0.772, 0.83, 0.821, 0.767, 0.845, 0.792, 0.831, 0.768,
                     0.857, 0.782, 0.92, 0.834, 0.867, 0.785, 0.789, 0.805,
                     0.813, 0.711]

sentiments['Pos'] = [0.191, 0.126, 0.159, 0.117, 0.144, 0.133, 0.176, 0.087,
                     0.173, 0.149, 0.131, 0.167, 0.11, 0.114, 0.109, 0.186,
                     0.092, 0.163, 0.072, 0.103, 0.09, 0.145, 0.151, 0.124,
                     0.121, 0.198]

sentiments
What could be the issue? Any suggestions are appreciated. Thx. You can find my 'train.csv' file here: https://drive.google.com/file/d/1a5fbORQ...sp=sharing