Hey guys — regarding the error here:
Error:

Traceback (most recent call last):
  File "/Users/jordanXXX/Documents/NLP/scikitlearn", line 56, in <module>
    GaussianNB_classifier.train(training_set)
  File "/Library/Python/2.7/site-packages/nltk/classify/scikitlearn.py", line 117, in train
    self._clf.fit(X, y)
  File "/usr/local/lib/python2.7/site-packages/sklearn/naive_bayes.py", line 182, in fit
    X, y = check_X_y(X, y)
  File "/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py", line 521, in check_X_y
    ensure_min_features, warn_on_dtype, estimator)
  File "/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py", line 380, in check_array
    force_all_finite)
  File "/usr/local/lib/python2.7/site-packages/sklearn/utils/validation.py", line 243, in _ensure_sparse_format
    raise TypeError('A sparse matrix was passed, but dense '
TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
I figured out that GaussianNB requires dense input arrays, so it is incompatible with the sparse feature matrices produced here — hence the error. I could work around this (e.g. by converting with X.toarray()), but for the intended text-classification use it doesn't seem necessary, since an estimator like LinearSVC handles sparse data directly.
thanks again for all your help.
However, I'm still running into very long processing times (5+ minutes, closer to 10+ for longer scripts) when running the following code.
Any suggestions? I haven't been able to find much of a solution online. I know you mentioned checking my memory for swapping (or something like that), but I'm not sure how to check for that or whether it's actually the problem.
My CPU usage in Activity Monitor holds steady at about 98% while these processes run, and I don't know whether that's normal or what I can do to speed things up.
Below is the code I'm using; it seems to run fine, apart from the extreme processing times:
import nltk
import random
from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over several NLTK-style classifiers.

    Each wrapped classifier must expose ``classify(features)``.
    """

    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        """Return the label chosen by the majority of the wrapped classifiers."""
        votes = [c.classify(features) for c in self._classifiers]
        return mode(votes)

    def confidence(self, features):
        """Return the fraction (0.0-1.0) of classifiers agreeing with the winner.

        BUG FIX: the original used ``choice_votes / len(votes)``, which is
        *integer* division on Python 2 and therefore always yields 0 or 1 —
        exactly the 0/100 confidence values seen in the posted output.
        Float division restores the intended fractional confidence.
        """
        votes = [c.classify(features) for c in self._classifiers]
        choice_votes = votes.count(mode(votes))
        return choice_votes / float(len(votes))
# Build (token-list, label) pairs for every review in the corpus, then
# shuffle so the train/test split isn't all-neg followed by all-pos.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Frequency distribution over every (lower-cased) token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# BUG FIX: ``list(all_words.keys())[:3000]`` grabs 3000 words in arbitrary
# dict order, NOT the 3000 most frequent words. ``most_common(3000)``
# returns (word, count) pairs sorted by frequency, which is the intent.
word_features = [w for (w, _count) in all_words.most_common(3000)]
def find_features(document, feature_words=None):
    """Build a boolean bag-of-words feature dict for *document*.

    Parameters
    ----------
    document : iterable of str
        Tokens of the document being featurized.
    feature_words : iterable of str, optional
        Vocabulary whose presence is tested. Defaults to the module-level
        ``word_features`` list, preserving the original call signature.

    Returns
    -------
    dict
        Maps each vocabulary word to True/False presence in *document*.
    """
    if feature_words is None:
        feature_words = word_features  # module-level top-3000 vocabulary
    words = set(document)  # O(1) membership tests instead of O(len(document))
    return {w: (w in words) for w in feature_words}
# print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# 2000 reviews total: first 1900 for training, the held-out 100 for testing.
training_set = featuresets[:1900]
# BUG FIX: the original read ``featuresets[:1900:]`` — that is another copy
# of the TRAINING slice, so every classifier was evaluated on data it had
# already seen (hence the inflated accuracies, e.g. LinearSVC at 99.5%).
# Evaluate on the unseen tail instead.
testing_set = featuresets[1900:]

# classifier = nltk.NaiveBayesClassifier.train(training_set)

# Load the previously pickled Naive Bayes classifier; the context manager
# guarantees the file handle is closed even if unpickling raises.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

print("Original Naive Bayes Algo accuracy percent:",
      (nltk.classify.accuracy(classifier, testing_set)) * 100)
classifier.show_most_informative_features(15)

# save_classifier = open("naivebayes.pickle", "wb")
# pickle.dump(classifier, save_classifier)
# save_classifier.close()
# Train one SklearnClassifier per estimator and report held-out accuracy.
# The data-driven loop replaces seven near-identical copy/paste stanzas.
sklearn_estimators = [
    ("MNB_classifier", MultinomialNB()),
    # GaussianNB requires dense arrays, so it fails on the sparse matrices
    # SklearnClassifier produces — intentionally skipped.
    ("BernoulliNB_classifier", BernoulliNB()),
    ("LogisticRegression_classifier", LogisticRegression()),
    ("SGDClassifier_classifier", SGDClassifier()),
    # SVC() with its default RBF kernel is very slow on 3000-dim features;
    # intentionally skipped — LinearSVC covers the linear case.
    ("LinearSVC_classifier", LinearSVC()),
    ("NuSVC_classifier", NuSVC()),
]

trained = []
for clf_name, estimator in sklearn_estimators:
    wrapped = SklearnClassifier(estimator)
    wrapped.train(training_set)
    print("%s accuracy percent:" % clf_name,
          nltk.classify.accuracy(wrapped, testing_set) * 100)
    trained.append(wrapped)

# Preserve the original module-level names so any downstream code that
# referenced them individually keeps working.
(MNB_classifier, BernoulliNB_classifier, LogisticRegression_classifier,
 SGDClassifier_classifier, LinearSVC_classifier, NuSVC_classifier) = trained

voted_classifier = VoteClassifier(classifier, *trained)
print("voted_classifier accuracy percent:",
      nltk.classify.accuracy(voted_classifier, testing_set) * 100)

# Show label + confidence for the first six held-out reviews.
for feats, _label in testing_set[:6]:
    print("Classication:", voted_classifier.classify(feats),
          "Confidence %:", voted_classifier.confidence(feats))
EDIT: this script took 39 minutes to process
Output:
('Original Naive Bayes Algo accuracy percent:', 87.31578947368422)
Most Informative Features
insulting = True neg : pos = 11.0 : 1.0
sans = True neg : pos = 9.0 : 1.0
refreshingly = True pos : neg = 8.4 : 1.0
wasting = True neg : pos = 8.3 : 1.0
mediocrity = True neg : pos = 7.7 : 1.0
dismissed = True pos : neg = 7.0 : 1.0
customs = True pos : neg = 6.3 : 1.0
fabric = True pos : neg = 6.3 : 1.0
overwhelmed = True pos : neg = 6.3 : 1.0
bruckheimer = True neg : pos = 6.3 : 1.0
wires = True neg : pos = 6.3 : 1.0
uplifting = True pos : neg = 6.2 : 1.0
ugh = True neg : pos = 5.8 : 1.0
stinks = True neg : pos = 5.8 : 1.0
lang = True pos : neg = 5.7 : 1.0
('MNB_classifier accuracy percent:', 89.21052631578948)
('BernoulliNB_classifier accuracy percent:', 86.42105263157895)
('LogisticRegression_classifier accuracy percent:', 94.47368421052632)
('SGDClassifier_classifier accuracy percent:', 85.73684210526315)
('LinearSVC_classifier accuracy percent:', 99.52631578947368)
('NuSVC_classifier accuracy percent:', 91.52631578947368)
('voted_classifier accuracy percent:', 93.36842105263158)
('Classication:', u'pos', 'Confidence %:', 100)
('Classication:', u'pos', 'Confidence %:', 0)
('Classication:', u'neg', 'Confidence %:', 0)
('Classication:', u'neg', 'Confidence %:', 100)
('Classication:', u'neg', 'Confidence %:', 100)
('Classication:', u'neg', 'Confidence %:', 100)