Python Forum

#per SGD classifier i dati devono essere numerically encoded, not dict
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
valid_score = clf.score(X_valid, y_valid)

I get the following error:

Error:TypeError                                 Traceback (most recent call last)
<ipython-input-10-a2f8bfb8f242> in <module>
      2 from sklearn.linear_model import SGDClassifier
      3 clf = SGDClassifier()
----> 4 clf.fit(X_train, y_train)
      5 train_score = clf.score(X_train, y_train)
      6 valid_score = clf.score(X_valid, y_valid)

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py in fit(self, X, y, coef_init, intercept_init, sample_weight)
    726                          loss=self.loss, learning_rate=self.learning_rate,
    727                          coef_init=coef_init, intercept_init=intercept_init,
--> 728                          sample_weight=sample_weight)
    729 
    730 

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
    539         X, y = self._validate_data(X, y, accept_sparse='csr',
    540                                    dtype=np.float64, order="C",
--> 541                                    accept_large_sparse=False)
    542 
    543         # labels can be encoded as float, int, or string literals

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    430                 y = check_array(y, **check_y_params)
    431             else:
--> 432                 X, y = check_X_y(X, y, **check_params)
    433             out = X, y
    434 

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
    800                     ensure_min_samples=ensure_min_samples,
    801                     ensure_min_features=ensure_min_features,
--> 802                     estimator=estimator)
    803     if multi_output:
    804         y = check_array(y, accept_sparse='csr', force_all_finite=True,

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    596                     array = array.astype(dtype, casting="unsafe", copy=False)
    597                 else:
--> 598                     array = np.asarray(array, order=order, dtype=dtype)
    599             except ComplexWarning:
    600                 raise ValueError("Complex data not supported\n"

~\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

TypeError: float() argument must be a string or a number, not 'dict'

I believe that this is an error from using the wrong version of python. I use python 3.83 on Windows 10. I am ot sure how to fix it.

Any help appreciated. Thanks in advance

Respectfully,

ErnestTBass

Error message says that float() argument must be string or a number and this is valid to all Python versions:

From float() documentation:

Quote:Return a floating point number constructed from a number or string x.

It can be easily demonstrated in Python code:

>>> float('42')
42.0
>>> float(42)
42.0
>>> float({42:0})
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: float() argument must be a string or a number, not 'dict'

Okay, I am sure that you are right. In my case in seems that X_train and y_train (or both) are numbers. How did they become dict?

That error just makes no sense. It seems that somehow they went from float or int to dict. But where? Where did this happen?

I believe that casting both of them to floats should work should it not? It seems this is the solution.

Any help appreciated. Thanks in advance.

Respectfully,

ErnestTBass

No, you can't cast a dict to a float.

Right before your call of the function, add some info to see what the variables are. Find out if the problem is in your code.

x_train = 42
y_train = {"a": 1}
print(f"x_train is of type {type(x_train)} and y_train is of type {type(y_train)}")

What type are you supposed to pass? I thought fit and score expected 2D arrays.

I put the print statement that you gave me directly in the code.

per SGD classifier i dati devono essere numerically encoded, not dict
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
print(f"X_train is of type {type(X_train)} and y_train is of type {type(y_train)}")
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
valid_score = clf.score(X_valid, y_valid)

It produced an error as you can see:

Error:TypeError                                 Traceback (most recent call last)
<ipython-input-30-eba0396feef7> in <module>
      3 clf = SGDClassifier()
      4 print(f"X_train is of type {type(X_train)} and y_train is of type {type(y_train)}")
----> 5 clf.fit(X_train, y_train)
      6 train_score = clf.score(X_train, y_train)
      7 valid_score = clf.score(X_valid, y_valid)

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py in fit(self, X, y, coef_init, intercept_init, sample_weight)
    726                          loss=self.loss, learning_rate=self.learning_rate,
    727                          coef_init=coef_init, intercept_init=intercept_init,
--> 728                          sample_weight=sample_weight)
    729 
    730 

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\linear_model\_stochastic_gradient.py in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
    539         X, y = self._validate_data(X, y, accept_sparse='csr',
    540                                    dtype=np.float64, order="C",
--> 541                                    accept_large_sparse=False)
    542 
    543         # labels can be encoded as float, int, or string literals

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    430                 y = check_array(y, **check_y_params)
    431             else:
--> 432                 X, y = check_X_y(X, y, **check_params)
    433             out = X, y
    434 

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
    800                     ensure_min_samples=ensure_min_samples,
    801                     ensure_min_features=ensure_min_features,
--> 802                     estimator=estimator)
    803     if multi_output:
    804         y = check_array(y, accept_sparse='csr', force_all_finite=True,

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

~\AppData\Local\Programs\Python\Python37\lib\site-packages\sklearn\utils\validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    596                     array = array.astype(dtype, casting="unsafe", copy=False)
    597                 else:
--> 598                     array = np.asarray(array, order=order, dtype=dtype)
    599             except ComplexWarning:
    600                 raise ValueError("Complex data not supported\n"

~\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\core\_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

TypeError: float() argument must be a string or a number, not 'dict'

It said both X_train and y_train are lists. That is exactly what I want them to be. Still
the error insists that at least one of them is dict. I assume this means dictionary.

If both are lists what happened to make at least one a dict?

I can post all the code that precedes the error.

I am really confused and have no idea how it came to call one of them a dict.

Any help appreciated, Thanks in advance.

Respectfully,

ErnestTBass

I can post a screenshot of the printout of the line.

print(f"X_train is of type {type(X_train)} and y_train is of type {type(y_train)}")

I am just not sure how to do it.

Okay to put things in context, I am posting the code for the program,

You can see where it fails.

#!/usr/bin/env python
# coding: utf-8

# https://opendatascience.com/intro-to-natural-language-processing/

# In[ ]:


#!pip install nltk
import nltk
#per risolvere un bug, altrimenti da errore
nltk.download('punkt')

#tokenizer
def format_sentence(sent):
  return({word: True for word in nltk.word_tokenize(sent)})


# #Tweets

# In[ ]:


print(nltk.word_tokenize("The cat is very cute"))

##X_train, y_train, X_test, y_test


# In[ ]:


#   X + y
#se chiamiamo a al di fuori di questo slot non funziona
total = open('pos_tweets.txt')
X_pos = list()
y_pos = list()
#word tokenization
for sentence in total:
  #print(sentence)
  X_pos.append([format_sentence(sentence)])
  y_pos.append(0)
  #saves the sentence in format: [{tokenized sentence}, 'pos]
#X_pos


# In[ ]:


#   X + y
#se chiamiamo a al di fuori di questo slot non funziona
total = open('pos_tweets.txt')
X_neg = list()
y_neg = list()
#word tokenization
for sentence in total:
  #print(sentence)
  X_neg.append([format_sentence(sentence)])
  y_neg.append(1)
  #saves the sentence in format: [{tokenized sentence}, 'pos]
#X_neg


# In[ ]:


X_pos[0]


# In[ ]:


X = X_pos + X_neg
y = y_pos + y_neg
print(len(X), len(y))


# In[ ]:


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(len(X_train), len(X_test), len(y_train), len(y_test))


# In[ ]:


#we can use Embedding layers
#we can use a ML algorithm that takes X_train, y_train, X_test, y_test


# In[ ]:


#per SGD classifier i dati devono essere numerically encoded, not dict
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
print(f"X_train is of type {type(X_train)} and y_train is of type {type(y_train)}")
clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
valid_score = clf.score(X_valid, y_valid)


# ##Xy_train, Xy_test

# In[ ]:


#   X + y
#se chiamiamo a al di fuori di questo slot non funziona
total = open('/content/drive/My Drive/Colab Notebooks/Projects/20200602_Twitter_Sentiment_Analysis/pos_tweets.txt')
Xy_pos = list()
#word tokenization
for sentence in total:
  #print(sentence)
  Xy_pos.append([format_sentence(sentence), 'pos'])
  #saves the sentence in format: [{tokenized sentence}, 'pos]
#Xy_pos


# In[ ]:


#   X + y
#se chiamiamo a al di fuori di questo slot non funziona
total = open('/content/drive/My Drive/Colab Notebooks/Projects/20200602_Twitter_Sentiment_Analysis/neg_tweets.txt')
Xy_neg = list()
#word tokenization
for sentence in total:
  #print(sentence)
  Xy_neg.append([format_sentence(sentence), 'neg'])
  #saves the sentence in format: [{tokenized sentence}, 'pos]
#Xy_neg


# In[ ]:


len(Xy_neg)


# In[ ]:


Xy_pos[0]


# In[ ]:


def split(pos, neg, ratio):
  train = pos[:int((1-ratio)*len(pos))] + neg[:int((1-ratio)*len(neg))]
  test = pos[int((ratio)*len(pos)):] + neg[int((ratio)*len(neg)):]
  return train, test

Xy_train, Xy_test = split(Xy_pos, Xy_neg, 0.1)


# In[ ]:


from nltk.classify import NaiveBayesClassifier

#encoded thorugh dictionaries
classifier = NaiveBayesClassifier.train(Xy_train)
classifier.show_most_informative_features()


# In[ ]:


example2 = "beautiful"
print(classifier.classify(format_sentence(example2)))


# In[ ]:


from nltk.classify.util import accuracy
print(accuracy(classifier, Xy_test))


# ##Movies

# In[ ]:


import pandas as pd
total = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Projects/20200602_Twitter_Sentiment_Analysis/movie_review.csv')
total


# In[ ]:


total_positive = total.copy()
total_positive.columns
total_positive = total_positive.loc[total_positive['tag'] == 'pos']
#total_positive = total_positive.pop('text')
total_positive = total_positive.drop(['fold_id', 'cv_tag', 'html_id', 'sent_id'], axis=1)
total_positive


# In[ ]:


total_negative = total.copy()
total_negative.columns
total_negative = total_negative.loc[total_negative['tag'] == 'neg']
#total_negative = total_negative.pop('text')
total_negative = total_negative.drop(['fold_id', 'cv_tag', 'html_id', 'sent_id'], axis=1)
total_negative


# In[ ]:


format_sentence('how are you')


# In[ ]:


#   tokenizer
#input: series, ?lists?
def create_dict(total_positive, total_negative):
  
  positive_reviews = list()
  #word tokenization
  for sentence in list(total_positive.values):
    positive_reviews.append([format_sentence(sentence[0]), 'pos'])
    #saves the sentence in format: [{tokenized sentence}, 'pos]
  
  negative_reviews = list()
  #word tokenization
  for sentence in list(total_negative.values):
    #print(sentence)
    negative_reviews.append([format_sentence(sentence[0]), 'neg'])
    #saves the sentence in format: [{tokenized sentence}, 'pos]
  
  return positive_reviews, negative_reviews

positive_reviews, negative_reviews = create_dict(total_positive, total_negative)


# In[ ]:


X = pd.concat([total_positive, total_negative], axis=0)
X.columns = ['text', 'sentiment']
X


# In[ ]:


import seaborn as sns
sns.countplot(x='sentiment', data=X)


# In[ ]:


y = pd.DataFrame(X.pop('sentiment'))
y


# In[ ]:


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


# In[ ]:


X_train['text'][0]


# In[ ]:


##del?
def preprocess_text(sen):
  # Removing html tags
  sentence = remove_tags(sen)

  # Remove punctuations and numbers
  sentence = re.sub('[^a-zA-Z]', ' ', sentence)

  # Single character removal
  sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

  # Removing multiple spaces
  sentence = re.sub(r'\s+', ' ', sentence)

  return sentence

TAG_RE = re.compile(r'<[^>]+>')
#replaces anything between <> with an empty space
def remove_tags(text):
    return TAG_RE.sub('', text)


# In[ ]:


from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)


# In[ ]:


tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts('come stai')
tokenizer.texts_to_sequences('come stai')


# In[ ]:


#del?
X = []
sentences = list(movie_reviews['review'])
for sen in sentences:
  X.append(preprocess_text(sen))


# In[ ]:


print(len(positive_reviews))
print(len(negative_reviews))


# In[ ]:


train = positive_reviews[:int((.9)*len(positive_reviews))] + negative_reviews[:int((.9)*len(negative_reviews))]
test = positive_reviews[int((.1)*len(positive_reviews)):] + negative_reviews[int((.1)*len(negative_reviews)):]
print(len(train), len(test))


# In[ ]:


print(train[0])


# In[ ]:


from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(train)
classifier.show_most_informative_features()


# In[ ]:


example2 = "mulan"
print(classifier.classify(format_sentence(example2)))


# In[ ]:


from nltk.classify.util import accuracy
print(accuracy(classifier, test))


# In[ ]:


get_ipython().system('python -V')


# In[ ]:

I cannot understand where it gets the error both X_train and y__rain are lists as the
code says so where does it get dict from?

Any help appreciated. Thanks in advance.

Respectfully,

ErnestTBass

ErnestTBass

perfringo

ErnestTBass

bowlofred

deanhystad

ErnestTBass

ErnestTBass