How to use a tfrecord file for training an autoencoder

JohnMarie · Feb-17-2019, 04:05 PM

I don't know how to get the number of features from a tfrecord file to make them as input to a stacked autoencoder.

I used the follwing function for stacked autoencoder:

from __future__ import print_function
import keras
import numpy
from keras.models import Sequential
from keras.layers.core import *
from sklearn.model_selection import train_test_split
from app_flag import FLAGS

class StackedAutoencoder(object):
    """
    Implementation of stacked autoencoder multi-class classifier using the Keras Python package.
    This classifier is used to classify cells to cell cycle phases S, G1 or G2M.
    """
    def __init__(self, features, labels, num_labels):
        self.features = features
        self.labels = labels
        self.auto_encoder = None
        self.encoding_dim = num_labels
        

        # fix random seed for reproducibility
        self.seed = 7
        numpy.random.seed(7)

    def create_autoencoder(self):
        """
        Build the stacked auto-encoder using multiple hidden layers.
        The stacked auto-encoder is then trained and weights are freezed afterwards.
        A softmax classification layer is that appended to the last layer, replacing the input
        re-constructed layer of the auto-encoder.
        :return: Compiled classification neural network model.
        """
        self.auto_encoder = Sequential()
        self.auto_encoder.add(Dense(3000, activation='relu', input_dim=self.features.shape[1]))
        self.auto_encoder.add(Dense(1000, activation='relu'))
        self.auto_encoder.add(Dense(30, activation='relu'))

        self.auto_encoder.add(Dense(3000, activation='relu'))
        self.auto_encoder.add(Dense(self.features.shape[1], activation='sigmoid'))

        self.auto_encoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        self.auto_encoder.fit(self.features, self.features,
                              epochs=10,
                              batch_size=5,
                              shuffle=True,
                              validation_split=0.33,
                              validation_data=None)

        self.auto_encoder.layers.pop()
        self.auto_encoder.add(Dense(self.encoding_dim, activation='softmax'))
        self.auto_encoder.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        print(self.auto_encoder.summary())

        # Freeze all weights after training the stacked auto-encoder and all the classification layer
        for i in range(0, len(self.auto_encoder.layers)-1):
            self.auto_encoder.layers[i].trainable = False

        return self.auto_encoder

    def evaluate_autoencoder(self):
        """
        Fit the trained neural network and validate it using splitting the dataset to training and testing sets.
        :return: Accuracy score of the classification.
        """
        self.auto_encoder.fit(self.features, self.labels,
                              epochs=10,
                              batch_size=5,
                              shuffle=True)

        X_train, X_test, Y_train, Y_test = train_test_split(self.features, self.labels, test_size=0.33, random_state=self.seed)
        #predictions = self.auto_encoder.predict_classes(X_test)
        #print(predictions)
        #print(self.label_encoder.inverse_transform(predictions))
        score = self.auto_encoder.evaluate(X_test, Y_test, batch_size=5, verbose=1)
        return score

When I run the code I get an error in the line :

self.auto_encoder.add(Dense(3000, activation='relu', input_dim=self.features.shape[1]))

indicating :TypeError: float() argument must be a string or a number.

So, how can I use the Tfrecord file to get the input dimentionality (the number of features)

Axel_Erfurt · Feb-17-2019, 04:15 PM

try

add(Dense(3000.0, ...

or

add(Dense('3000', ...

JohnMarie · Feb-17-2019, 05:19 PM

Thanks for your response. I try the two suggestions but I get the same error, I think that the problem is from input_dim=self.features.shape[1] given that I can't get the number of features from the tfrecord file

**scidam** · Feb-17-2019, 11:16 PM

What is underlying data used to run the model? Is it a data frame or numpy array?
May be self.features includes non-numerical, e.g. np.object cells?

JohnMarie · Feb-18-2019, 01:22 PM

I used a tfrecord file that containes only binary values. it is a numpy array.

**scidam** · Feb-21-2019, 04:08 AM

I ran the following code

from __future__ import print_function
import keras
import numpy
from keras.models import Sequential
from keras.layers.core import *
from sklearn.model_selection import train_test_split

 
class StackedAutoencoder(object):
    """
    Implementation of stacked autoencoder multi-class classifier using the Keras Python package.
    This classifier is used to classify cells to cell cycle phases S, G1 or G2M.
    """
    def __init__(self, features, labels, num_labels):
        self.features = features
        self.labels = labels
        self.auto_encoder = None
        self.encoding_dim = num_labels
         
 
        # fix random seed for reproducibility
        self.seed = 7
        numpy.random.seed(7)
 
    def create_autoencoder(self):
        """
        Build the stacked auto-encoder using multiple hidden layers.
        The stacked auto-encoder is then trained and weights are freezed afterwards.
        A softmax classification layer is that appended to the last layer, replacing the input
        re-constructed layer of the auto-encoder.
        :return: Compiled classification neural network model.
        """
        self.auto_encoder = Sequential()
        self.auto_encoder.add(Dense(3000, activation='relu', input_dim=self.features.shape[1]))
        self.auto_encoder.add(Dense(1000, activation='relu'))
        self.auto_encoder.add(Dense(30, activation='relu'))
 
        self.auto_encoder.add(Dense(3000, activation='relu'))
        self.auto_encoder.add(Dense(self.features.shape[1], activation='sigmoid'))
 
        self.auto_encoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        self.auto_encoder.fit(self.features, self.features,
                              epochs=10,
                              batch_size=5,
                              shuffle=True,
                              validation_split=0.33,
                              validation_data=None)
 
        self.auto_encoder.layers.pop()
        self.auto_encoder.add(Dense(self.encoding_dim, activation='softmax'))
        
        for i in range(0, len(self.auto_encoder.layers)-1):
            self.auto_encoder.layers[i].trainable = False
        
        self.auto_encoder.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        print(self.auto_encoder.summary())
 
 
        return self.auto_encoder
 
    def evaluate_autoencoder(self):
        """
        Fit the trained neural network and validate it using splitting the dataset to training and testing sets.
        :return: Accuracy score of the classification.
        """
        self.auto_encoder.fit(self.features, self.labels,
                              epochs=10,
                              batch_size=5,
                              shuffle=True)
        print("Fitted")
        X_train, X_test, Y_train, Y_test = train_test_split(self.features, self.labels, test_size=0.33, random_state=self.seed)
        #predictions = self.auto_encoder.predict_classes(X_test)
        #print(predictions)
        #print(self.label_encoder.inverse_transform(predictions))
        score = self.auto_encoder.evaluate(X_test, Y_test, batch_size=5, verbose=1)
        return score
    

sa = StackedAutoencoder(np.random.rand(100,10).astype(bool), np.random.randint(2, 5, size=100), 3)
sa.create_autoencoder()
sa.evaluate_autoencoder()

and got the output:

Output:Using TensorFlow backend.
Train on 67 samples, validate on 33 samples
Epoch 1/10
67/67 [==============================] - 2s 31ms/step - loss: 0.2513 - acc: 0.9254 - val_loss: 1.1921e-07 - val_acc: 1.0000
Epoch 2/10
67/67 [==============================] - 2s 24ms/step - loss: 1.1921e-07 - acc: 1.0000 - val_loss: 1.1921e-07 - val_acc: 1.0000
Epoch 3/10
67/67 [==============================] - 2s 25ms/step - loss: 1.1921e-07 - acc: 1.0000 - val_loss: 1.1921e-07 - val_acc: 1.0000
Epoch 4/10
67/67 [==============================] - 2s 24ms/step - loss: 1.1921e-07 - acc: 1.0000 - val_loss: 1.1921e-07 - val_acc: 1.0000
Epoch 5/10
67/67 [==============================] - 2s 24ms/step - loss: 1.1921e-07 - acc: 1.0000 - val_loss: 1.1921e-07 - val_acc: 1.0000
Epoch 6/10
67/67 [==============================] - 2s 23ms/step - loss: 1.1921e-07 - acc: 1.0000 - val_loss: 1.1921e-07 - val_acc: 1.0000
Epoch 7/10
67/67 [==============================] - 2s 24ms/step - loss: 1.1921e-07 - acc: 1.0000 - val_loss: 1.1921e-07 - val_acc: 1.0000
Epoch 8/10
67/67 [==============================] - 2s 25ms/step - loss: 1.1921e-07 - acc: 1.0000 - val_loss: 1.1921e-07 - val_acc: 1.0000
Epoch 9/10
67/67 [==============================] - 2s 24ms/step - loss: 1.1921e-07 - acc: 1.0000 - val_loss: 1.1921e-07 - val_acc: 1.0000
Epoch 10/10
67/67 [==============================] - 2s 23ms/step - loss: 1.1921e-07 - acc: 1.0000 - val_loss: 1.1921e-07 - val_acc: 1.0000
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 3000)              33000     
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              3001000   
_________________________________________________________________
dense_3 (Dense)              (None, 30)                30030     
_________________________________________________________________
dense_4 (Dense)              (None, 3000)              93000     
_________________________________________________________________
dense_5 (Dense)              (None, 10)                30010     
_________________________________________________________________
dense_6 (Dense)              (None, 3)                 33        
=================================================================
Total params: 3,187,073
Trainable params: 33
Non-trainable params: 3,187,040
_________________________________________________________________
None

and error:

Output:
ValueError: Error when checking target: expected dense_6 to have shape (3,) but got array with shape (1,)

So, it is almost working fine... No TypeError rised.

JohnMarie · (This post was last modified: Feb-22-2019, 06:35 PM by JohnMarie.)

I used a tfrecord file to train the stacked autoencoder. My function is:

import tensorflow as tf
import numpy as np
import readers
import pre_precessing
from app_flag import FLAGS
from StackedAutoencoder import StackedAutoencoder


def write_and_encode(data_list, tfrecord_filename):
    writer = tf.python_io.TFRecordWriter(tfrecord_filename)
    for label, data_matrix in data_list:
        example = tf.train.Example(features=tf.train.Features(
            feature={
                "label": tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
                "data_raw": tf.train.Feature(bytes_list=tf.train.BytesList(value=[data_matrix.tostring()]))
            }
        ))
        writer.write(example.SerializeToString())

    writer.close()


def read_and_decode(tfrecord_filename):
    reader = tf.TFRecordReader()
    filename_queue = tf.train.string_input_producer([tfrecord_filename],)
    _, serialized_example = reader.read(filename_queue)
    feature = tf.parse_single_example(serialized_example,
                                      features={
                                          "label": tf.FixedLenFeature([], tf.int64),
                                          "data_raw": tf.FixedLenFeature([], tf.string)
                                      })
    data = tf.decode_raw(feature["data_raw"], tf.float64)
    data = tf.reshape(data, [FLAGS.image_rows, FLAGS.image_cols])
    return data, feature["label"]



def train_input_fn():

    tfrecord_file = "../resources/train_tfrecord"  
    dataset = tf.data.TFRecordDataset(tfrecord_file)
    dataset = dataset.map(parser)

    train_dataset = dataset.repeat(FLAGS.num_epochs).batch(FLAGS.batch_size)
    train_iterator = train_dataset.make_one_shot_iterator()

    features, labels = train_iterator.get_next()

    return features, labels


def parser(record_line):

    features = {
        "label": tf.FixedLenFeature([], tf.int64),
        "data_raw": tf.FixedLenFeature([], tf.string)
    }
    parsed = tf.parse_single_example(record_line, features=features)
    label = tf.cast(parsed["label"], tf.int32) - 1  
    data = tf.decode_raw(parsed["data_raw"], tf.float64)
    data = tf.reshape(data, [FLAGS.image_rows, FLAGS.image_cols])
    data = tf.cast(data, tf.float32)
    return data, label




def write_user_instances_to_tfrecord():
    
    users = ["0"+str(i) for i in range(1, 10)]
    users.extend([str(i) for i in range(10, 17)])
    users.extend(["32", "40", "41", "42", "43", "49", "50", "51"])

   
    instances = []
    for user in users:
        train_data = readers.read_user_files(user)
        for label, instance in train_data.items():
            instances.append((label, instance))

 
    formalized_instances = pre_precessing.extend_to_maxsize(instances)

   
    train_instances = formalized_instances[:100]
    write_and_encode(train_instances, "../resources/train_tfrecord")

 

def main():
    build_stacked_ae("../resources/train_tfrecord")

def build_stacked_ae(path):
    """
    Build the stacked auto-encoder neural network, and evaluate its performance
    """
    ############### Stacked Auto-Encoders ##############
    features,labels=train_input_fn()
    ae = StackedAutoencoder(features,labels,5)
    ae.create_autoencoder()
    result = ae.evaluate_autoencoder()
    return result[1] * 100
    print("Accuracy: %.2f%%" % (result[1] * 100))

if __name__ == "__main__":
    main()

I can't fix the error any help please?

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Sample training small model	AndrzejB	3	2,001	Mar-22-2023, 07:37 PM Last Post: jefsummers
	Is it normal so much time training for Training Custom Object Detector??	hobbyist	2	3,704	May-31-2022, 08:55 AM Last Post: aserikova
	Using Autoencoder for Data Augmentation of numerical Dataset in Python	Marvin93	2	4,502	Jul-10-2020, 07:18 PM Last Post: Marvin93
	How to save predictions made by an autoencoder	Glasgow1988	0	2,064	Jul-03-2020, 12:43 PM Last Post: Glasgow1988
	stacked autoencoder training	JohnMarie	0	3,123	Feb-24-2019, 12:23 AM Last Post: JohnMarie

How to use a tfrecord file for training an autoencoder

User Panel Messages

Announcements