vendredi 16 février 2018

Tensorflow wrong bias initialization

I'm initializing my two sets of weights/biases using the same function:

1st set:

# Attention parameters: weight matrix and bias vector, both initialised with
# tf.random_uniform using minval=-0.1 and maxval=0.1.
W_omega = tf.Variable(tf.random_uniform([hidden_size, attention_size], -0.1, 0.1), name='W_omega')
b_omega = tf.Variable(tf.random_uniform([attention_size], -0.1, 0.1), name='b_omega')

2nd set:

# Post-attention layer parameters, initialised exactly like the first set
# (tf.random_uniform with minval=-0.1 and maxval=0.1).
W = tf.Variable(tf.random_uniform([input_dim, output_dim], -0.1, 0.1), name='W_post_attn')  
b = tf.Variable(tf.random_uniform([output_dim], -0.1, 0.1), name='b_post_attn')

But the histogram in TensorBoard shows that the 2nd set of biases is not uniformly distributed (it looks like a bimodal distribution with peaks around +/-0.06, see top image below).

Any idea what might be causing this?

enter image description here

Adding dummy code that uses MNIST data and runs in a Jupyter notebook. Note that my original code is a binary classifier, while MNIST has 10 classes. It seems that the number of peaks in the output bias histogram (at the final layer) is correlated with the number of output classes (see figure below).

from __future__ import division, print_function, unicode_literals
from functools import partial

import numpy as np
import os

import tensorflow as tf

def reset_graph(seed=42):
    """Start from a fresh default graph with reproducible random seeds.

    Args:
        seed: Value used for both the NumPy global seed and the
            TensorFlow graph-level seed.
    """
    # NumPy seeding is independent of the TensorFlow graph state.
    np.random.seed(seed)
    # The graph-level seed must be set after the reset so that it applies
    # to the new default graph.
    tf.reset_default_graph()
    tf.set_random_seed(seed)

# Directory prefix under which the final checkpoint is saved at the end of
# the script.
path = '/your_folder/'

# Load the MNIST data sets from the given directory.
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/mnist/tmp/data/")

# Clear the default graph and fix the random seeds before building the model.
reset_graph()

def variable_summaries(var, name):
    """Register TensorBoard summaries describing a tensor.

    Adds scalar summaries for the mean, max and min of ``var`` plus a
    histogram of its values, all grouped under the name scope ``name``.

    Args:
        var: Tensor (or variable) to summarise.
        name: Name scope under which the summary ops are created.
    """
    # (summary tag, reduction) pairs, in the order the ops are created.
    scalar_stats = (
        ('mean', tf.reduce_mean),
        ('max', tf.reduce_max),
        ('min', tf.reduce_min),
    )
    with tf.name_scope(name):
        for tag, reduce_fn in scalar_stats:
            tf.summary.scalar(tag, reduce_fn(var))
        tf.summary.histogram('histogram', var)

# Network dimensions.
n_inputs = 28*28  # MNIST
n_hidden1 = 200   # passed to attention() as attention_size
n_outputs = 10    # number of logits produced by the output layer

# Step size for the gradient-descent optimizer.
learning_rate = 0.01

# Training schedule: number of passes over the training set and mini-batch size.
n_epochs = 50
batch_size = 50

# Input placeholders: X holds float32 rows of n_inputs features, y holds
# int64 labels.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# NOTE: `(None)` is just `None` (not a 1-tuple), so no static shape is
# declared for y.
y = tf.placeholder(tf.int64, shape=(None), name="y")

def attention(inputs, attention_size, name):
    """Scale ``inputs`` by softmax attention weights.

    Three trainable variables (``W_omega``, ``b_omega``, ``u_omega``) are
    created with ``tf.random_uniform(..., -0.1, 0.1)``. The inputs are
    projected through ``tanh(inputs . W_omega + b_omega)``, the projection
    is contracted with ``u_omega``, the result is passed through a softmax,
    and ``inputs`` is multiplied by those weights (broadcast over a new
    trailing axis).

    Args:
        inputs: Tensor whose second dimension is statically known; it is
            contracted with ``W_omega`` along axis 1.
        attention_size: Size of the intermediate attention projection.
        name: Outer name scope for all ops created here.

    Returns:
        ``inputs`` multiplied element-wise by the expanded attention weights
        (no reduction is applied).
    """
    n_features = int(inputs.get_shape()[1])

    def _uniform_variable(shape, var_name):
        # Same initialiser for every attention parameter.
        initial_value = tf.random_uniform(shape, -0.1, 0.1)
        return tf.Variable(initial_value, name=var_name)

    with tf.name_scope(name):
        # Trainable parameters (creation order is kept: W, b, u).
        with tf.name_scope('Attention_variables'):
            proj_weights = _uniform_variable([n_features, attention_size], 'W_omega')
            proj_bias = _uniform_variable([attention_size], 'b_omega')
            context_vector = _uniform_variable([attention_size], 'u_omega')

            # Only W_omega and b_omega are logged to TensorBoard.
            variable_summaries(proj_weights, 'W_omega')
            variable_summaries(proj_bias, 'b_omega')

        with tf.name_scope('Attention_u_it'):
            projected = tf.tensordot(inputs, proj_weights, axes=[[1], [0]])
            hidden_repr = tf.tanh(projected + proj_bias, name='u_it')

        with tf.name_scope('Attention_alpha_it'):
            scores = tf.tensordot(hidden_repr, context_vector,
                                  axes=[[1], [0]], name='u_it_u_w')
            weights = tf.nn.softmax(scores, name='alphas')

        with tf.name_scope('Attention_output'):
            # The weighted inputs are returned as-is, without summing over
            # axis 1.
            expanded_weights = tf.expand_dims(weights, -1)
            weighted_inputs = inputs * expanded_weights
    return weighted_inputs


def neuron_layer(X, n_neurons, name, activation=None):
    """Build a fully connected layer ``X . W + b`` with optional activation.

    ``W`` and ``b`` are created with ``tf.random_uniform(..., -0.1, 0.1)``
    and both are logged to TensorBoard through ``variable_summaries``.

    Args:
        X: Input tensor whose second dimension is statically known.
        n_neurons: Number of output units.
        name: Name scope for the layer's variables and ops.
        activation: Optional callable applied to the affine output; when
            ``None`` the affine output is returned unchanged.

    Returns:
        The layer output tensor.
    """
    with tf.name_scope(name):
        fan_in = int(X.get_shape()[1])
        weights = tf.Variable(
            tf.random_uniform([fan_in, n_neurons], -0.1, 0.1), name='W')
        biases = tf.Variable(
            tf.random_uniform([n_neurons], -0.1, 0.1), name='b')

        variable_summaries(weights, 'W')
        variable_summaries(biases, 'b')

        pre_activation = tf.matmul(X, weights) + biases
        return pre_activation if activation is None else activation(pre_activation)


# Model: the attention block applied to the raw inputs, followed by a linear
# output layer producing n_outputs logits (no activation is passed).
with tf.name_scope("dnn"):
    hidden1 = attention(X, n_hidden1, name="hidden1_attn")
    logits = neuron_layer(hidden1, n_outputs, name="outputs")

# Loss: mean sparse softmax cross-entropy between the integer labels and the logits.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,
                                                              logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

# Optimisation: plain gradient descent minimising the loss.
with tf.name_scope("train"):
    opt = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = opt.minimize(loss)

# Evaluation: a sample counts as correct when its label is the top-1 logit;
# accuracy is the mean of those indicators.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# Ops for initialising all variables and for checkpointing them.
init = tf.global_variables_initializer()
saver = tf.train.Saver()

# TensorBoard log location: a UTC-timestamped directory per run under root_logdir.
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "/tensorboard_files/"
logdir = "{}/run-{}/".format(root_logdir, now)

# Merge all the summaries and write them (plus the default graph) to the
# 'train' sub-directory of logdir.
merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(logdir+'/train', tf.get_default_graph())


# Train the network, log summaries/accuracy every other epoch, then save a
# checkpoint of the final variables.
with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for _ in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            # Run one gradient-descent step on this mini-batch. Without this
            # call the variables keep their initial values for the whole run
            # and the logged histograms never change.
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})

        if epoch % 2 == 0:
            # Evaluate on the last mini-batch of the epoch; accuracy and the
            # merged TensorBoard summaries are fetched in a single run call.
            acc_train, summary = sess.run(
                [accuracy, merged], feed_dict={X: X_batch, y: y_batch})
            train_writer.add_summary(summary, epoch)

            acc_val = accuracy.eval(feed_dict={X: mnist.validation.images,
                                               y: mnist.validation.labels})
            print(epoch, "Train accuracy:", acc_train, "Val accuracy:", acc_val)

    # Push buffered summary events to disk so TensorBoard sees the full run.
    train_writer.flush()
    save_path = saver.save(sess, path+"my_model_final.ckpt")

enter image description here




Aucun commentaire:

Enregistrer un commentaire