% Toy demo about neural nets showing: 1) the input/output mapping
% as a function of the number of layers, 2) the input/output mapping
% as a function of the number of hidden units and 3) a simple example
% of training for regression.
%
% Usage: from Matlab command line type:
%        demo_nnet
%
% Marc'Aurelio Ranzato
% 27 May 2012
% ranzato@google.com


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DEMO #1
% Plots the input/output mapping of randomly initialised networks with a
% scalar input and a scalar output, for 1, 2 and 3 hidden layers of 100
% units each. Every layer (including the output) goes through tanhAct,
% which is defined outside this file. One curve per depth on figure 1.
fprintf('This demo shows input output mapping of')
fprintf(' neural nets with one input and one output.\n')
fprintf('We will vary the number of hidden layers.\n')
fprintf('We keep the number of hidden units to 100.\n')
clear
randn('seed',7)
depth_choices = [1 2 3];       % number of hidden layers to try
units_per_layer = 100;
x_grid = -20:.01:20;           % 1 dimensional input
curve_styles = {'-b','-.r','--g',':k'};

figure(1); clf;
for kk = 1 : numel(depth_choices)
  % Layer sizes: [input, hidden ..., output].
  dims = [1, repmat(units_per_layer, 1, depth_choices(kk)), 1];
  n_weight_layers = numel(dims) - 1;

  fprintf('Generating parameters at random\n')
  for ll = 1 : n_weight_layers
    W{ll} = .2*randn(dims(ll+1), dims(ll));
    b{ll} = zeros(dims(ll+1), 1); % with 0 biases, output is symmetric
  end

  fprintf('FPROP: computing the ouput values\n')
  % Push the whole input grid through the net, one layer at a time.
  act = x_grid;
  for ll = 1 : n_weight_layers
    act = tanhAct(bsxfun(@plus, W{ll}*act, b{ll}));
  end

  figure(1);
  hold on
  plot(x_grid, act, curve_styles{kk}, 'LineWidth', 4)
  hold off
end
lgd = legend('1 hidden layer','2 hidden layers','3 hidden layers', ...
             'Location','SouthEast');
set(lgd,'FontSize',14,'FontWeight','bold')
grid on
xlabel('input','FontSize',16)
ylabel('output','FontSize',16)
saveas(1,'input_output_varying_num_layers.png')


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DEMO #2
% Plots the input/output mapping of randomly initialised networks with a
% scalar input and a scalar output and 3 hidden layers, for 10, 100 and
% 1000 units per hidden layer. Biases are drawn at random here. Every
% layer goes through tanhAct, which is defined outside this file.
fprintf('This demo shows input output mapping of')
fprintf(' neural nets with one input and one output.\n')
fprintf('We will vary the number of hidden units.\n')
fprintf('We keep the number of hidden layers to 3.\n')
clear
randn('seed',8)
n_hidden_layers = 3;
width_choices = [10 100 1000]; % hidden units per layer to try
x_grid = -20:.01:20;           % 1 dimensional input
curve_styles = {'-b','-.r','--g',':k'};

figure(2); clf;
for kk = 1 : numel(width_choices)
  % Layer sizes: [input, hidden ..., output].
  dims = [1, repmat(width_choices(kk), 1, n_hidden_layers), 1];
  n_weight_layers = numel(dims) - 1;

  fprintf('Generating parameters at random\n')
  fprintf('We also set biases at random making the mapping')
  fprintf(' almost certainly not (anti-)symmetric\n')
  % Draw order (weights, then biases, layer by layer) determines the
  % random values, so it is kept as is.
  for ll = 1 : n_weight_layers
    W{ll} = .2*randn(dims(ll+1), dims(ll));
    b{ll} = .1*randn(dims(ll+1), 1);
  end

  fprintf('FPROP: computing the ouput values\n')
  % Push the whole input grid through the net, one layer at a time.
  act = x_grid;
  for ll = 1 : n_weight_layers
    act = tanhAct(bsxfun(@plus, W{ll}*act, b{ll}));
  end

  figure(2);
  hold on
  plot(x_grid, act, curve_styles{kk}, 'LineWidth', 4)
  hold off
end
lgd = legend('10 hiddens','100 hiddens','1000 hiddens', ...
             'Location','NorthWest');
set(lgd,'FontSize',14,'FontWeight','bold')
grid on
xlabel('input','FontSize',16)
ylabel('output','FontSize',16)
saveas(2,'input_output_varying_num_hiddens.png')


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% DEMO #3
% Trains a network with tanhAct hidden layers and a linear output unit to
% regress a piece of a cosine, using mini-batch stochastic gradient
% descent on the squared error 0.5*(prediction - target)^2.
% Figure 3 shows the target, the prediction before training, after the
% first epoch and after the last epoch.
% tanhAct is defined outside this file; it is called with two outputs
% below, and the second one is used as the activation derivative in
% backprop (presumably d tanh / d pre-activation -- confirm in tanhAct).
% Requires number_of_hidden_layers >= 1.
fprintf('This demo shows a simple example of training')
fprintf(' a neural net to perform a  desired input/output mapping')
fprintf(', a task known as regression.\n')
fprintf('The output is a piece of cosine function.\n')
fprintf('The optimizer is stochastic gradient descent.\n')
clear
randn('seed',123)
number_of_hidden_layers = 3;
number_of_hidden_units = 1000;
input = -5:.01:5; % 1 dimensional input
target = cos(input + pi/3); % desired output
fprintf('Shuffling data\n')
% NOTE(review): only randn is seeded above; randperm presumably draws from
% the uniform generator, so the shuffle may differ between runs -- confirm.
pp = randperm(length(input));
input = input(pp); % Shuffle data.
target = target(pp);
fprintf('Using stochastic gradient descent as optimizer.\n')
learning_rate = 0.001;
number_of_epochs = 100;
mini_batch_size = 100;
% Samples that do not fill a whole mini-batch are never used for training.
num_batches = floor(size(input,2)/mini_batch_size);
fprintf('Number of sweeps over the whole data %d\n', number_of_epochs)
fprintf('Size of mini-batch %d\n', mini_batch_size)
figure(3); clf;
plot_style = {'+b','or','dg','^k'};
plot(input, target, plot_style{end}, 'LineWidth',4)
fprintf('Using %d hidden layer neural net with %d hidden units.\n', ...
        number_of_hidden_layers, number_of_hidden_units)
% Layer sizes: [input, hidden ..., output].
size_of_layers = [1, ...
                  number_of_hidden_units * ones(1,number_of_hidden_layers), ...
                  1];
% Number of weight layers; also the index of the (linear) output layer.
% Using it explicitly avoids relying on the value a for-loop variable
% happens to hold after the loop ends.
num_layers = length(size_of_layers) - 1;
fprintf('Generating parameters at random\n')
W = cell(1, num_layers);
b = cell(1, num_layers);
Wgrad = cell(1, num_layers);
bgrad = cell(1, num_layers);
h = cell(1, num_layers);
dh = cell(1, num_layers - 1);
derivative = cell(1, num_layers);
for ll = 1 : num_layers
  W{ll} = .2*randn(size_of_layers(ll+1), size_of_layers(ll));
  b{ll} = zeros(size_of_layers(ll+1),1);
  Wgrad{ll} = zeros(size(W{ll})); % Initialize gradient slots.
  bgrad{ll} = zeros(size(b{ll}));
end

% Plot initial prediction.
h{1} = tanhAct(bsxfun(@plus, W{1}*input, b{1}));
for ll = 2 : num_layers - 1
  h{ll} = tanhAct(bsxfun(@plus, W{ll}*h{ll-1}, b{ll}));
end
h{num_layers} = bsxfun(@plus, W{num_layers}*h{num_layers-1}, ...
                       b{num_layers}); % prediction
hold on
plot(input, h{num_layers}, plot_style{1},'LineWidth',4);
hold off
% Error against the target (not the input), with the same 0.5 factor as
% the per-epoch "Average error" so the two numbers are comparable.
fprintf('Initial error %g\n', ...
        0.5 * sum(sum( (h{num_layers}-target).^2 )) / length(input));

fprintf('Start training!\n')
timing = zeros(1, number_of_epochs); % seconds spent in each epoch
errors = zeros(1, number_of_epochs); % summed training error of each epoch
for ee = 1 : number_of_epochs
    fprintf('Epoch %d: ', ee)
    epoch_error = 0; % named so it does not shadow the built-in error()
    tic;
    for bb = 1 : num_batches
        % Get current mini-batch.
        batch_idx = 1 + mini_batch_size * (bb - 1) : mini_batch_size * bb;
        batch_in = input(:, batch_idx);
        desired = target(:, batch_idx);

        % FPROP, keeping the activation derivatives for backprop.
        [h{1}, dh{1}] = tanhAct(bsxfun(@plus, W{1}*batch_in, b{1}));
        for ll = 2 : num_layers - 1
            [h{ll}, dh{ll}] = ...
                tanhAct(bsxfun(@plus, W{ll}*h{ll-1}, b{ll}));
        end
        h{num_layers} = bsxfun(@plus, W{num_layers}*h{num_layers-1}, ...
                               b{num_layers}); % prediction

        % COMPUTE ERROR
        difference = h{num_layers} - desired;
        epoch_error = epoch_error + 0.5 * sum(sum(difference.^2));
        % Gradient of the error w.r.t. the linear output.
        derivative{num_layers} = difference;

        % BACKPROP
        for ll = num_layers : -1 : 2
            % Compute derivative w.r.t. parameters
            Wgrad{ll} = derivative{ll} * h{ll-1}';
            bgrad{ll} = sum(derivative{ll},2);
            % Compute derivative w.r.t. input of this layer
            derivative{ll-1} = (W{ll}' * derivative{ll}) .* dh{ll-1};
        end
        Wgrad{1} = derivative{1} * batch_in';
        bgrad{1} = sum(derivative{1},2);

        % Update parameters; gradients are sums over the mini-batch, so
        % the step is divided by the mini-batch size.
        for ll = 1 : num_layers
            W{ll} = W{ll} - (learning_rate / mini_batch_size) * Wgrad{ll};
            b{ll} = b{ll} - (learning_rate / mini_batch_size) * bgrad{ll};
        end
    end
    timing(ee) = toc;
    fprintf('Average error %g\n', ...
            epoch_error / (mini_batch_size * num_batches))
    errors(ee) = epoch_error;

    % Plot predictions at first and last epoch.
    if (ee == 1) || (ee == number_of_epochs)
        h{1} = tanhAct(bsxfun(@plus, W{1}*input, b{1}));
        for ll = 2 : num_layers - 1
            h{ll} = tanhAct(bsxfun(@plus, W{ll}*h{ll-1}, b{ll}));
        end
        h{num_layers} = bsxfun(@plus, W{num_layers}*h{num_layers-1}, ...
                               b{num_layers}); % prediction
        hold on
        if (ee == 1)
            plot(input, h{num_layers}, plot_style{2},'LineWidth',4);
        else
            plot(input, h{num_layers}, plot_style{3},'LineWidth',4);
        end
        hold off
    end
end
hndl = legend('Target','Before training', 'After 1 epoch', ...
              'At the end of training', 'Location', 'NorthWest');
set(hndl, 'FontSize', 14, 'FontWeight', 'bold')
grid on
xlabel('input','FontSize',16)
ylabel('output','FontSize',16)
saveas(3,'demo_regression.png')
