# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# making corpus of words from comments
import re
import pickle
from nltk.stem.porter import PorterStemmer
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from nltk import word_tokenize

# Load the labelled comments; cp437 matches how the CSV was exported.
# Columns: raw comment text, sentiment label, plus a trailing junk column.
dataset = pd.read_csv(r'D:\thesis material\DataSet.csv', encoding='cp437',
                      names=['comment', 'sentiment', 'nan'])
print(dataset.head())

# Count examples per class to eyeball class balance.
Pos = dataset[dataset['sentiment'] == 'Positive'].shape[0]
Neg = dataset[dataset['sentiment'] == 'Negative'].shape[0]
Neu = dataset[dataset['sentiment'] == 'Neutral'].shape[0]

# bar plot of the 3 classes (fixed the "Positve" typo in the legend)
plt.bar(10, Pos, 3, label="Positive")
plt.bar(15, Neg, 3, label="Negative")
plt.bar(20, Neu, 3, label="Neutral")
plt.legend()
plt.ylabel('Number of examples')
plt.title('Proportion of examples')
plt.show()

# Step 4: y is the categorical target (Positive / Negative / Neutral).
y = dataset['sentiment']

# Roman-Urdu stopwords. A frozenset gives O(1) membership tests in clean()
# and silently drops the duplicates the original list carried
# ('lye', 'waisay', 'rha', ...).
stopwords = frozenset([
    'ai', 'ayi', 'hy', 'hai', 'main', 'ki', 'tha', 'koi', 'ko', 'sy', 'woh',
    'bhi', 'aur', 'wo', 'yeh', 'rha', 'hota', 'ho', 'ga', 'ka', 'le', 'lye',
    'kr', 'kar', 'liye', 'hotay', 'waisay', 'gya', 'gaya', 'kch', 'ab', 'thy',
    'thay', 'houn', 'hain', 'han', 'to', 'is', 'hi', 'jo', 'kya', 'thi', 'se',
    'pe', 'phr', 'wala', 'us', 'na', 'ny', 'hun', 'raha', 'ja', 'rahay',
    'abi', 'uski', 'ne', 'haan', 'acha', 'nai', 'sent', 'photo', 'you',
    'kafi', 'gai', 'rhy', 'kuch', 'jata', 'aye', 'ya', 'dono', 'hoa', 'aese',
    'de', 'wohi', 'jati', 'jb', 'krta', 'lg', 'rahi', 'hui', 'karna', 'krna',
    'gi', 'hova', 'yehi', 'jana', 'jye', 'chal', 'mil', 'tu', 'hum', 'par',
    'hay', 'kis', 'sb', 'gy', 'dain', 'krny', 'tou',
])


def clean(x):
    """Normalise one comment: keep letters only, lowercase, tokenize,
    drop Roman-Urdu stopwords, and rejoin into a single string."""
    letters_only = re.sub('[^a-zA-Z]', ' ', str(x))
    tokens = word_tokenize(letters_only.lower())
    kept = [word for word in tokens if word not in stopwords]
    return ' '.join(kept)


dataset['comment'] = dataset['comment'].apply(clean)
x = dataset['comment']

# Step 6: Split data set into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

# Step 6: convert the raw documents to a TF-IDF feature matrix.
# NOTE: the vectorizer is fit on the training fold only; the test fold is
# only transformed, so no information leaks from test to train.
vectorizer = TfidfVectorizer()
x_train_vector = vectorizer.fit_transform(x_train)
x_test_vector = vectorizer.transform(x_test)

# Step 8: Creating classifier and fitting data in classifier
classifier = SVC(kernel='linear', C=1.0, degree=3, random_state=0)
classifier.fit(x_train_vector, y_train)

# Step 9: Pickling the Model.
# To reuse, we can dump the model and load it wherever needed.
# The vocabulary (vectorizer) is also needed to vectorize new documents
# when predicting the label.
# Persist the fitted artifacts. Using `with` guarantees the file handles
# are closed (the original leaked them via inline open()).
# pickling the vectorizer (needed to vectorize new documents at predict time)
with open('vectorizer.sav', 'wb') as f:
    pickle.dump(vectorizer, f)
# pickling the model
with open('classifier.sav', 'wb') as f:
    pickle.dump(classifier, f)

# Step 9: Perform Prediction
y_pred = classifier.predict(x_test_vector)

# Step 10: Create Confusion Matrix
ConfusionMatrix = confusion_matrix(y_test, y_pred)

# Step 11: Evaluation — accuracy as a formatted percentage string.
Accuracy = format(classifier.score(x_test_vector, y_test) * 100, '.2f') + ' %'
with open('AccuracyPercentage', 'wb') as file:
    pickle.dump(Accuracy, file)
print('Learning end')


def _plot_confusion_matrix(cm, labels):
    """Render a labelled heat-map for a confusion matrix (shared by both
    classifiers; the original duplicated this plotting code verbatim)."""
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title('Confusion matrix of the classifier \n')
    fig.colorbar(cax)
    ax.set_xticklabels([''] + labels)
    ax.set_yticklabels([''] + labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()


labels = ['Positive', 'Neutral', 'Negative']
_plot_confusion_matrix(ConfusionMatrix, labels)

# classifier1: Logistic Regression baseline.
# BUG FIX: the original called fit/predict on the raw text series
# (x_train / x_test), which raises
#     ValueError: could not convert string to float
# — the estimator needs the numeric TF-IDF vectors, not strings.
classifier = LogisticRegression(random_state=0, solver='liblinear',
                                multi_class='ovr')
classifier.fit(x_train_vector, y_train)
y_pred = classifier.predict(x_test_vector)
print(y_pred)

# confusion matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

# accuracy of LogisticRegression
print('Accuracy is {} '.format(accuracy_score(y_test, y_pred)))

labels = ['Positive', 'Neutral', 'Negative']
_plot_confusion_matrix(cm, labels)
Error:Error: return array(a, dtype, copy=False, order=order)
ValueError: could not convert string to float:
buran write Mar-01-2021, 06:58 AM:
Please use proper tags when posting code, tracebacks, output, etc. This time I have added the tags for you.
See BBcode help for more info.
also, please, post the entire traceback that you get. We need to see the whole thing. Do not just give us the last line.
Take the time to read What to include in a post.
Please use proper tags when posting code, tracebacks, output, etc. This time I have added the tags for you.
See BBcode help for more info.
Also, please post the entire traceback that you get. We need to see the whole thing — do not just give us the last line.
Take the time to read What to include in a post.