Python Forum
Problem adding precision, recall and F1-score
#1
Please help, I am confused about how to add precision, recall, and F1-score.
I need this for my final project; the code follows this reference: https://github.com/amaiya/keras-nbsvm/bl...bsvm.ipynb

Any assistance would be very helpful.

%reload_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
from keras.layers.core import Activation
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, dot
from keras import backend as K
from keras.optimizers import Adam
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import load_files
from sklearn.metrics import precision_recall_fscore_support

PATH_TO_IMDB = r'./data/aclImdb'

def load_imdb_data(datadir):
    # read in training and test corpora
    categories = ['pos', 'neg']
    train_b = load_files(datadir+'/train', shuffle=True, categories=categories)
    test_b = load_files(datadir+'/test', shuffle=True, categories=categories)
    train_b.data = [x.decode('utf-8') for x in train_b.data]
    test_b.data = [x.decode('utf-8') for x in test_b.data]
    veczr = CountVectorizer(ngram_range=(1,3), binary=True,
                            token_pattern=r'\w+',
                            max_features=800000)
    dtm_train = veczr.fit_transform(train_b.data)
    dtm_test = veczr.transform(test_b.data)
    y_train = train_b.target
    y_test = test_b.target
    print("document-term matrix shape (training): (%s, %s)" % (dtm_train.shape))
    print("document-term matrix shape (test): (%s, %s)" % (dtm_test.shape))
    num_words = len(veczr.vocabulary_) + 1  # add 1 for 0 padding
    print('vocab size:%s' % (num_words))

    return (dtm_train, dtm_test), (y_train, y_test), num_words

(dtm_train, dtm_test), (y_train, y_test), num_words = load_imdb_data(PATH_TO_IMDB)

def dtm2wid(dtm, maxlen=2000):
    # turn each document-term row into a sequence of word IDs,
    # repeating each ID by its count, then pad/truncate to maxlen
    x = []
    nwds = []
    for idx, row in enumerate(dtm):
        seq = []
        indices = (row.indices + 1).astype(np.int64)  # shift by 1; ID 0 is reserved for padding
        data = (row.data).astype(np.int64)
        count_dict = dict(zip(indices, data))
        for k, v in count_dict.items():
            seq.extend([k]*v)
        num_words = len(seq)
        nwds.append(num_words)
        # pad up to maxlen
        if num_words < maxlen:
            seq = np.pad(seq, (maxlen - num_words, 0), mode='constant')
        # truncate down to maxlen
        else:
            seq = seq[-maxlen:]
        x.append(seq)
    nwds = np.array(nwds)
    print('sequence stats: avg:%s, max:%s, min:%s' % (nwds.mean(), nwds.max(), nwds.min()))
    return np.array(x)

maxlen = 2000
x_train = dtm2wid(dtm_train, maxlen=maxlen)
x_test = dtm2wid(dtm_test, maxlen=maxlen)

def pr(dtm, y, y_i):
    # smoothed fraction of class-y_i documents containing each term
    p = dtm[y==y_i].sum(0)
    return (p+1) / ((y==y_i).sum()+1)

# naive Bayes log-count ratios: log( P(term|pos) / P(term|neg) )
nbratios = np.log(pr(dtm_train, y_train, 1)/pr(dtm_train, y_train, 0))
nbratios = np.squeeze(np.asarray(nbratios))

def get_model(num_words, maxlen, nbratios=None):
    embedding_matrix = np.zeros((num_words, 1))
    for i in range(1, num_words):  # skip 0, the padding value
        if nbratios is not None:
            # if log-count ratios are supplied, then it's NBSVM
            embedding_matrix[i] = nbratios[i-1]
        else:
            # if log-count ratios are not supplied, this reduces to a logistic regression
            embedding_matrix[i] = 1

    # set up the model
    inp = Input(shape=(maxlen,))
    r = Embedding(num_words, 1, input_length=maxlen, weights=[embedding_matrix], trainable=False)(inp)
    x = Embedding(num_words, 1, input_length=maxlen, embeddings_initializer='glorot_normal')(inp)
    x = dot([r,x], axes=1)
    x = Flatten()(x)
    x = Activation('sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer=Adam(lr=0.001),
                  metrics=['accuracy'])
    return model

model = get_model(num_words, maxlen, nbratios=nbratios)
model.fit(x_train, y_train,
          batch_size=32,
          epochs=5,
          validation_data=(x_test, y_test))
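
For the metrics themselves, here is a minimal sketch (this is not the notebook's own code; it assumes the model, x_test, and y_test produced above): threshold the sigmoid outputs at 0.5 to get hard 0/1 labels, then pass them to precision_recall_fscore_support, which the code already imports but never uses. With average='binary', class 1 ('pos', since load_files assigns labels alphabetically) is treated as the positive class.

# a sketch: evaluate on the full test set after training
y_prob = model.predict(x_test)                    # sigmoid outputs, shape (n_samples, 1)
y_pred = (y_prob > 0.5).astype(np.int64).ravel()  # hard 0/1 predictions

precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred,
                                                           average='binary')
print('precision: %.4f  recall: %.4f  f1: %.4f' % (precision, recall, f1))

If you want a per-class table instead, sklearn.metrics.classification_report(y_test, y_pred) prints the same numbers ready-formatted.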
#2
Helping is difficult because your code has lost its indentation.
#3
Thanks for replying. Excuse me, what do you mean by "indents"?

I can't attach a file here. I set the indentation and reposted the same code as above, but it all flattened to the left again.
#4
Use code tags; then indentation is preserved in the post.
Look at BBCode to see how to use them.
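
For example, assuming this forum supports the usual BBCode code tags (the exact tag name may differ), wrapping the code like this keeps the indentation:

[python]
def load_imdb_data(datadir):
    categories = ['pos', 'neg']
    ...
[/python]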
#5
How do I add recall, precision, and F1-score to the code above? I'm not familiar with Python.

I'm still confused about how to compute precision, recall, and F1-score. There is a reference that might help at this URL: https://scikit-learn.org/stable/modules/...pport.html
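
Beyond the scikit-learn route sketched after post #1, here is a sketch of an alternative, assuming the keras.backend module already imported as K in the code above (the helper names precision_m, recall_m, and f1_m are illustrative, not from the notebook): define batch-wise precision, recall, and F1 and pass them to model.compile as extra metrics. Keras 2.0 removed its built-in precision/recall metrics precisely because per-batch values only approximate the global ones, so the scikit-learn numbers computed over the whole test set remain the ones to report.

def precision_m(y_true, y_pred):
    # true positives / predicted positives, computed per batch
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    pred_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_pos / (pred_pos + K.epsilon())

def recall_m(y_true, y_pred):
    # true positives / actual positives, computed per batch
    true_pos = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_pos / (actual_pos + K.epsilon())

def f1_m(y_true, y_pred):
    # harmonic mean of the two batch-wise values
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    return 2 * p * r / (p + r + K.epsilon())

# then compile with the extra metrics, e.g. inside get_model():
# model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001),
#               metrics=['accuracy', precision_m, recall_m, f1_m])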
#6
Sorry, I cannot help. I don't understand your code in a reasonable amount of time.
#7
Can anyone help me find someone to assist me with this code? It's in Python, actually. Thanks.
#8
Maybe you can simplify your code. Then we can help you better.

