May-08-2019, 09:01 AM
Hi guys, I have a Twitter dataset that I want to train and test with Naive Bayes (NB) and SVM. After cleaning and vectorizing, I am stuck on the following:
1. splitting of data into 80/20
2. fitting this into classifier.
Your guidance will be highly appreciated.
ValueError
1. splitting of data into 80/20
2. fitting this into classifier.
Your guidance will be highly appreciated.
# tokenize helper function def text_process(raw_text): # Check punctuation nopunc = [char for char in list(raw_text) if char not in string.punctuation] # Join the characters again to form the string. nopunc = ''.join(nopunc) # remove any stopwords return [word for word in nopunc.lower().split() if word.lower() not in stopwords.words('english')] def remove_words(word_list): remove = ['he','and','...','“','”','’','…','and’'] return [w for w in word_list if w not in remove] # tokenize message column and create a column for tokens df_marathon = df_marathon.copy() df_marathon['tokens'] = df_marathon['Text'].apply(text_process) # step 1 df_marathon['tokenized_tweet'] = df_marathon['tokens'].apply(remove_words) # ste 2 df_marathon.head(10) # vectorize bow_transformer = CountVectorizer(analyzer=text_process).fit(df_marathon['tokenized_tweet']) # print total number of vocab words print(len(bow_transformer.vocabulary_)) # example of vectorized text sample_tweet = df_marathon['tokenized_tweet'][16] print(sample_tweet) print('\n') # vector representation bow_sample = bow_transformer.transform([sample_tweet]) print(bow_sample) print('\n') # transform the entire DataFrame of messages X = bow_transformer.transform(df_marathon['Text']) # check out the bag-of-words counts for the entire corpus as a large sparse matrix print('Shape of Sparse Matrix: ', X.shape) print('Amount of Non-Zero occurences: ', X.nnz) #convert values obtained using the bag of words model into TFIDF values tfidfconverter = TfidfTransformer() X = tfidfconverter.fit_transform(X) X.toarray()Error on splitting:
ValueError
Error:Traceback (most recent call last)
~\Anaconda3\envs\Ipython.display\lib\site-packages\scipy\sparse\csr.py in asindices(x)
243 if idx_dtype != x.dtype:
--> 244 x = x.astype(idx_dtype)
245 except:
ValueError: invalid literal for int() with base 10: 'tokenized_tweet'
During handling of the above exception, another exception occurred:
IndexError Traceback (most recent call last)
<ipython-input-37-da320e3c2eee> in <module>
1 # split data into rain and test, we are creating train set 80% and test set 20%
----> 2 Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(X['tokenized_tweet'],ML_Corpus['label'],test_size=0.2)
~\Anaconda3\envs\Ipython.display\lib\site-packages\scipy\sparse\csr.py in __getitem__(self, key)
333 row, col = self._index_to_arrays(row, col)
334
--> 335 row = asindices(row)
336 col = asindices(col)
337 if row.shape != col.shape:
~\Anaconda3\envs\Ipython.display\lib\site-packages\scipy\sparse\csr.py in asindices(x)
244 x = x.astype(idx_dtype)
245 except:
--> 246 raise IndexError('invalid index')
247 else:
248 return x
IndexError: invalid index
Error on fitting classifier:
ValueError Traceback (most recent call last)
<ipython-input-38-6fc83fb90fa2> in <module>
----> 1 classifier = MultinomialNB().fit(X, df_marathon)
~\Anaconda3\envs\Ipython.display\lib\site-packages\sklearn\naive_bayes.py in fit(self, X, y, sample_weight)
583 self : object
584 """
--> 585 X, y = check_X_y(X, y, 'csr')
586 _, n_features = X.shape
587
~\Anaconda3\envs\Ipython.display\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
759 dtype=None)
760 else:
--> 761 y = column_or_1d(y, warn=True)
762 _assert_all_finite(y)
763 if y_numeric and y.dtype.kind == 'O':
~\Anaconda3\envs\Ipython.display\lib\site-packages\sklearn\utils\validation.py in column_or_1d(y, warn)
795 return np.ravel(y)
796
--> 797 raise ValueError("bad input shape {0}".format(shape))
798
799
ValueError: bad input shape (200, 3)