Python Forum
problem to add precision, recall and f1-score
#1
Please help, I am confused about how to add precision, recall, and f1-score.
I need to solve this for my final project; the reference code is this notebook: https://github.com/amaiya/keras-nbsvm/bl...bsvm.ipynb

Any assistance would be very helpful.

%reload_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
from keras.layers.core import Activation
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, dot
from keras import backend as K
from keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.metrics import precision_recall_fscore_support

PATH_TO_IMDB = r'./data/aclImdb'

def load_imdb_data(datadir):
    # read in training and test corpora
    categories = ['pos', 'neg']
    train_b = load_files(datadir+'/train', shuffle=True, categories=categories)
    test_b = load_files(datadir+'/test', shuffle=True, categories=categories)
    train_b.data = [x.decode('utf-8') for x in train_b.data]
    test_b.data = [x.decode('utf-8') for x in test_b.data]
    veczr = CountVectorizer(ngram_range=(1,3), binary=True,
                            token_pattern=r'\w+',
                            max_features=800000)
    dtm_train = veczr.fit_transform(train_b.data)
    dtm_test = veczr.transform(test_b.data)
    y_train = train_b.target
    y_test = test_b.target
    print("document-term matrix shape (training): (%s, %s)" % (dtm_train.shape))
    print("document-term matrix shape (test): (%s, %s)" % (dtm_test.shape))
    num_words = len(veczr.vocabulary_) + 1 # add 1 for 0 padding
    print('vocab size:%s' % (num_words))

    return (dtm_train, dtm_test), (y_train, y_test), num_words

(dtm_train, dtm_test), (y_train, y_test), num_words = load_imdb_data(PATH_TO_IMDB)

def dtm2wid(dtm, maxlen=2000):
    # convert each document-term matrix row into a padded sequence of word IDs
    x = []
    nwds = []
    for idx, row in enumerate(dtm):
        seq = []
        indices = (row.indices + 1).astype(np.int64)
        data = (row.data).astype(np.int64)
        count_dict = dict(zip(indices, data))
        for k,v in count_dict.items():
            seq.extend([k]*v)
        num_words = len(seq)
        nwds.append(num_words)
        # pad up to maxlen
        if num_words < maxlen:
            seq = np.pad(seq, (maxlen - num_words, 0), mode='constant')
        # truncate down to maxlen
        else:
            seq = seq[-maxlen:]
        x.append(seq)
    nwds = np.array(nwds)
    print('sequence stats: avg:%s, max:%s, min:%s' % (nwds.mean(), nwds.max(), nwds.min()))
    return np.array(x)

maxlen = 2000
x_train = dtm2wid(dtm_train, maxlen=maxlen)
x_test = dtm2wid(dtm_test, maxlen=maxlen)

# Naive Bayes log-count ratios for each feature
def pr(dtm, y, y_i):
    p = dtm[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

nbratios = np.log(pr(dtm_train, y_train, 1)/pr(dtm_train, y_train, 0))
nbratios = np.squeeze(np.asarray(nbratios))

def get_model(num_words, maxlen, nbratios=None):
    embedding_matrix = np.zeros((num_words, 1))
    for i in range(1, num_words): # skip 0, the padding value
        if nbratios is not None:
            # if log-count ratios are supplied, then it's NBSVM
            embedding_matrix[i] = nbratios[i-1]
        else:
            # if log-count ratios are not supplied, this reduces to a logistic regression
            embedding_matrix[i] = 1

    # set up the model
    inp = Input(shape=(maxlen,))
    r = Embedding(num_words, 1, input_length=maxlen, weights=[embedding_matrix], trainable=False)(inp)
    x = Embedding(num_words, 1, input_length=maxlen, embeddings_initializer='glorot_normal')(inp)
    x = dot([r,x], axes=1)
    x = Flatten()(x)
    x = Activation('sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    return model

model = get_model(num_words, maxlen, nbratios=nbratios)
model.fit(x_train, y_train,
          batch_size=32,
          epochs=5,
          validation_data=(x_test, y_test))
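
One common way to add these as metrics that Keras reports during training is to define custom batch-wise metric functions and pass them to model.compile(). This is only a sketch, not part of the referenced notebook; it uses just the keras.backend module already imported as K above, and note that precision, recall, and F1 computed per batch only approximate the full-dataset values:

def precision_m(y_true, y_pred):
    # true positives / predicted positives, on one batch
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    pred_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_pos / (pred_pos + K.epsilon())

def recall_m(y_true, y_pred):
    # true positives / actual positives, on one batch
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_pos / (actual_pos + K.epsilon())

def f1_m(y_true, y_pred):
    # harmonic mean of batch precision and recall
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * p * r / (p + r + K.epsilon())

# then compile with the extra metrics inside get_model():
# model.compile(loss='binary_crossentropy',
#               optimizer=Adam(lr=0.001),
#               metrics=['accuracy', precision_m, recall_m, f1_m])

Keras then reports precision_m, recall_m, and f1_m next to accuracy for every epoch, on both the training and validation data.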
#2
Helping is difficult, because your code has lost its indentation.
#3
Thanks for replying. Excuse me, what does "indents" mean?

I can't attach a file here. I re-posted the code with the indents set, but it comes out flat to the left again.
Reply
#4
Use code tags; then indentation gets preserved in the post.
Look at BBCode for how to use them.
#5
How do I add recall, precision, and F1 score to the code below? I'm not familiar with Python.

%reload_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
from keras.layers.core import Activation
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, dot
from keras import backend as K
from keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.metrics import precision_recall_fscore_support

PATH_TO_IMDB = r'./data/aclImdb'

def load_imdb_data(datadir):
    # read in training and test corpora
    categories = ['pos', 'neg']
    train_b = load_files(datadir+'/train', shuffle=True, categories=categories)
    test_b = load_files(datadir+'/test', shuffle=True, categories=categories)
    train_b.data = [x.decode('utf-8') for x in train_b.data]
    test_b.data =  [x.decode('utf-8') for x in test_b.data]
    veczr =  CountVectorizer(ngram_range=(1,3), binary=True, 
                             token_pattern=r'\w+', 
                             max_features=800000)
    dtm_train = veczr.fit_transform(train_b.data)
    dtm_test = veczr.transform(test_b.data)
    y_train = train_b.target
    y_test = test_b.target
    print("document-term matrix shape (training): (%s, %s)" % (dtm_train.shape))
    print("document-term matrix shape (test): (%s, %s)" % (dtm_train.shape))
    num_words = len([v for k,v in veczr.vocabulary_.items()]) + 1 # add 1 for 0 padding
    print('vocab size:%s' % (num_words))
  
    return (dtm_train, dtm_test), (y_train, y_test), num_words

(dtm_train, dtm_test), (y_train, y_test), num_words = load_imdb_data(PATH_TO_IMDB)

def dtm2wid(dtm, maxlen=2000):
    x = []
    nwds = []
    for idx, row in enumerate(dtm):
        seq = []
        indices = (row.indices + 1).astype(np.int64)
        data = (row.data).astype(np.int64)
        count_dict = dict(zip(indices, data))
        for k,v in count_dict.items():
            seq.extend([k]*v)
        num_words = len(seq)
        nwds.append(num_words)
        # pad up to maxlen
        if num_words < maxlen: 
            seq = np.pad(seq, (maxlen - num_words, 0), mode='constant')
        # truncate down to maxlen
        else:                  
            seq = seq[-maxlen:]
        x.append(seq)
    nwds = np.array(nwds)
    print('sequence stats: avg:%s, max:%s, min:%s' % (nwds.mean(), nwds.max(), nwds.min()) )
    return np.array(x)

maxlen = 2000
x_train = dtm2wid(dtm_train, maxlen=maxlen)
x_test = dtm2wid(dtm_test, maxlen=maxlen)  

def pr(dtm, y, y_i):
    p = dtm[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)
nbratios = np.log(pr(dtm_train, y_train, 1)/pr(dtm_train, y_train, 0))
nbratios = np.squeeze(np.asarray(nbratios))

def get_model(num_words, maxlen, nbratios=None):
    embedding_matrix = np.zeros((num_words, 1))
    for i in range(1, num_words): # skip 0, the padding value
        if nbratios is not None:
            # if log-count ratios are supplied, then it's NBSVM
            embedding_matrix[i] = nbratios[i-1]
        else:
            # if log-count ratios are not supplied, this reduces to a logistic regression
            embedding_matrix[i] = 1

    # set up the model
    inp = Input(shape=(maxlen,))
    r = Embedding(num_words, 1, input_length=maxlen, weights=[embedding_matrix], trainable=False)(inp)
    x = Embedding(num_words, 1, input_length=maxlen, embeddings_initializer='glorot_normal')(inp)
    x = dot([r,x], axes=1)
    x = Flatten()(x)
    x = Activation('sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    return model

model = get_model(num_words, maxlen, nbratios=nbratios)
model.fit(x_train, y_train,
          batch_size=32,
          epochs=5,
          validation_data=(x_test, y_test))

I'm still confused about how to compute precision, recall, and f1-score. There is a reference that may help at this URL: https://scikit-learn.org/stable/modules/...pport.html
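
Following that page, a minimal sketch of the usual post-training evaluation, assuming the model, x_test, and y_test from the code above and a 0.5 threshold on the sigmoid output:

# predict probabilities on the test set, then threshold at 0.5 to get class labels
y_prob = model.predict(x_test)
y_pred = (y_prob > 0.5).astype(int).ravel()

# average='binary' reports the metrics for the positive ('pos') class
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print('precision: %.4f  recall: %.4f  f1: %.4f' % (precision, recall, f1))

sklearn.metrics.classification_report(y_test, y_pred) prints the same kind of numbers per class, which can be easier to read.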
#6
Sorry, I cannot help; I can't understand your code in a reasonable amount of time.
#7
Can anyone help me find someone to assist me with this code? It's in Python, actually. Thanks.
#8
Maybe you can simplify your code.
Then we can help you better.