May-24-2019, 02:33 AM
Please help, I am confused about how to add precision, recall, and F1-score to this model.
I need to solve this for my final project, based on this reference: https://github.com/amaiya/keras-nbsvm/bl...bsvm.ipynb
Any assistance would be very helpful.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
from keras.layers.core import Activation
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, dot
from keras import backend as K
from keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.metrics import precision_recall_fscore_support
PATH_TO_IMDB = r'./data/aclImdb'
def load_imdb_data(datadir):
    # read in training and test corpora
    categories = ['pos', 'neg']
    train_b = load_files(datadir+'/train', shuffle=True, categories=categories)
    test_b = load_files(datadir+'/test', shuffle=True, categories=categories)
    train_b.data = [x.decode('utf-8') for x in train_b.data]
    test_b.data = [x.decode('utf-8') for x in test_b.data]

    # vectorize as binary unigram/bigram/trigram features
    veczr = CountVectorizer(ngram_range=(1,3), binary=True,
                            token_pattern=r'\w+',
                            max_features=800000)
    dtm_train = veczr.fit_transform(train_b.data)
    dtm_test = veczr.transform(test_b.data)
    y_train = train_b.target
    y_test = test_b.target
    print("document-term matrix shape (training): (%s, %s)" % (dtm_train.shape))
    print("document-term matrix shape (test): (%s, %s)" % (dtm_test.shape))
    num_words = len(veczr.vocabulary_) + 1  # add 1 for 0 padding
    print('vocab size:%s' % (num_words))
    return (dtm_train, dtm_test), (y_train, y_test), num_words
(dtm_train, dtm_test), (y_train, y_test), num_words = load_imdb_data(PATH_TO_IMDB)
def dtm2wid(dtm, maxlen=2000):
    x = []
    nwds = []
    for idx, row in enumerate(dtm):
        seq = []
        # shift word IDs up by 1 so that 0 is reserved for padding
        indices = (row.indices + 1).astype(np.int64)
        data = (row.data).astype(np.int64)
        count_dict = dict(zip(indices, data))
        # repeat each word ID by its count to rebuild a bag-of-words sequence
        for k, v in count_dict.items():
            seq.extend([k]*v)
        num_words = len(seq)
        nwds.append(num_words)
        # pad up to maxlen
        if num_words < maxlen:
            seq = np.pad(seq, (maxlen - num_words, 0), mode='constant')
        # truncate down to maxlen
        else:
            seq = seq[-maxlen:]
        x.append(seq)
    nwds = np.array(nwds)
    print('sequence stats: avg:%s, max:%s, min:%s' % (nwds.mean(), nwds.max(), nwds.min()))
    return np.array(x)
maxlen = 2000
x_train = dtm2wid(dtm_train, maxlen=maxlen)
x_test = dtm2wid(dtm_test, maxlen=maxlen)
def pr(dtm, y, y_i):
    p = dtm[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)
nbratios = np.log(pr(dtm_train, y_train, 1)/pr(dtm_train, y_train, 0))
nbratios = np.squeeze(np.asarray(nbratios))
def get_model(num_words, maxlen, nbratios=None):
    embedding_matrix = np.zeros((num_words, 1))
    for i in range(1, num_words):  # skip 0, the padding value
        if nbratios is not None:
            # if log-count ratios are supplied, then it's NBSVM
            embedding_matrix[i] = nbratios[i-1]
        else:
            # if log-count ratios are not supplied, this reduces to a logistic regression
            embedding_matrix[i] = 1

    # set up the model
    inp = Input(shape=(maxlen,))
    r = Embedding(num_words, 1, input_length=maxlen,
                  weights=[embedding_matrix], trainable=False)(inp)
    x = Embedding(num_words, 1, input_length=maxlen,
                  embeddings_initializer='glorot_normal')(inp)
    x = dot([r, x], axes=1)
    x = Flatten()(x)
    x = Activation('sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    return model
model = get_model(num_words, maxlen, nbratios=nbratios)
model.fit(x_train, y_train,
          batch_size=32,
          epochs=5,
          validation_data=(x_test, y_test))
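
For the precision/recall/F1 part, here is the kind of thing I have in mind, using the precision_recall_fscore_support import from above. This is only a rough sketch on my side: I am assuming the sigmoid output can simply be thresholded at 0.5 to get 0/1 predictions, and that load_files assigns label 1 to the 'pos' category.

# predict probabilities on the test set and threshold at 0.5
y_prob = model.predict(x_test)
y_pred = (y_prob > 0.5).astype(np.int64).ravel()

# average='binary' reports the metrics for the positive class (label 1)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred,
                                                           average='binary')
print('precision: %.4f, recall: %.4f, f1: %.4f' % (precision, recall, f1))

If this is not the right way to compute these metrics for this model, please correct me.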