【DeepLearning】Exercise:Sparse Autoencoder

浏览数：113 / 时间：2015年06月09日

习题的链接：http://deeplearning.stanford.edu/wiki/index.php/Exercise:Sparse_Autoencoder

我的实现：

sampleIMAGES.m

function patches = sampleIMAGES()
% sampleIMAGES
% Returns 10000 patches for training

load IMAGES;    % load images from disk 

patchsize = 8;  % we‘ll use 8x8 patches 
numpatches = 10000;

% Initialize patches with zeros.  Your code will fill in this matrix--one
% column per patch, 10000 columns. 
patches = zeros(patchsize*patchsize, numpatches);

%% ---------- YOUR CODE HERE --------------------------------------
%  Instructions: Fill in the variable called "patches" using data 
%  from IMAGES.  
%  
%  IMAGES is a 3D array containing 10 images
%  For instance, IMAGES(:,:,6) is a 512x512 array containing the 6th image,
%  and you can type "imagesc(IMAGES(:,:,6)), colormap gray;" to visualize
%  it. (The contrast on these images look a bit off because they have
%  been preprocessed using using "whitening."  See the lecture notes for
%  more details.) As a second example, IMAGES(21:30,21:30,1) is an image
%  patch corresponding to the pixels in the block (21,21) to (30,30) of
%  Image 1

for i=1:numpatches
    %randomly pick one of the 10 images
    ind = round(rand(1)*9)+1;   %[0~9]+1=[1,10]
    %randomly sample an 8×8 image patch
    startx = round(rand(1)*504)+1;   %[0~504]+1=[1,505]
    starty = round(rand(1)*504)+1;
    tempPatch = IMAGES(startx:startx+patchsize-1,starty:starty+patchsize-1,ind);
    %convert the image patch into a 64-dimensional vector
    tempVec = reshape(tempPatch, patchsize*patchsize, 1);
    patches(:,i) = tempVec;
end


%% ---------------------------------------------------------------
% For the autoencoder to work well we need to normalize the data
% Specifically, since the output of the network is bounded between [0,1]
% (due to the sigmoid activation function), we have to make sure 
% the range of pixel values is also bounded between [0,1]
patches = normalizeData(patches);

end


%% ---------------------------------------------------------------
function patches = normalizeData(patches)

% Squash data to [0.1, 0.9] since we use sigmoid as the activation
% function in the output layer

% Remove DC (mean of images). 
patches = bsxfun(@minus, patches, mean(patches));

% Truncate to +/-3 standard deviations and scale to -1 to 1
pstd = 3 * std(patches(:));
patches = max(min(patches, pstd), -pstd) / pstd;

% Rescale from [-1,1] to [0.1,0.9]
patches = (patches + 1) * 0.4 + 0.1;

end

computeNumericalGradient.m

function numgrad = computeNumericalGradient(J, theta)
% numgrad = computeNumericalGradient(J, theta)
% theta: a vector of parameters
% J: a function that outputs a real-number. Calling y = J(theta) will return the
% function value at theta. 
  
% Initialize numgrad with zeros
numgrad = zeros(size(theta));

%% ---------- YOUR CODE HERE --------------------------------------
% Instructions: 
% Implement numerical gradient checking, and return the result in numgrad.  
% (See Section 2.3 of the lecture notes.)
% You should write code so that numgrad(i) is (the numerical approximation to) the 
% partial derivative of J with respect to the i-th input argument, evaluated at theta.  
% I.e., numgrad(i) should be the (approximately) the partial derivative of J with 
% respect to theta(i).
%                
% Hint: You will probably want to compute the elements of numgrad one at a time. 

% The function aims to check whether W1grad&W2grad&b1grad&b2grad in
% sparseAutoencoderCost have been computed correctly.

%theta(i) denotes i_th parameter
length = size(theta,1);
EPSILON = 1e-4;
E = eye(length);
for i=1:length
    numgrad(i)=(J(theta + EPSILON*E(:,i)) - J(theta - EPSILON*E(:,i)))/(2*EPSILON);
end


%% ---------------------------------------------------------------
end

sparseAutoencoderCost.m

function [cost,grad] = sparseAutoencoderCost(theta, visibleSize, hiddenSize, ...
                                             lambda, sparsityParam, beta, data)

% visibleSize: the number of input units (probably 64) 
% hiddenSize: the number of hidden units (probably 25) 
% lambda: weight decay parameter
% sparsityParam: The desired average activation for the hidden units (denoted in the lecture
%                           notes by the greek alphabet rho, which looks like a lower-case "p").
% beta: weight of sparsity penalty term
% data: Our 64x10000 matrix containing the training data.  So, data(:,i) is the i-th training example. 
  
% The input theta is a vector (because minFunc expects the parameters to be a vector). 
% We first convert theta to the (W1, W2, b1, b2) matrix/vector format, so that this 
% follows the notation convention of the lecture notes. 

% W1(i,j) denotes the weight from j_th node in input layer to i_th node
% in hidden layer. Thus it is a hiddenSize*visibleSize matrix
W1 = reshape(theta(1:hiddenSize*visibleSize), hiddenSize, visibleSize);  
% W2(i,j) denotes the weight from j_th node in hidden layer to i_th node
% in output layer. Thus it is a visibleSize*hiddenSize matrix
W2 = reshape(theta(hiddenSize*visibleSize+1:2*hiddenSize*visibleSize), visibleSize, hiddenSize);
% b1(i) denotes the i_th bias in input layer to i_th node in hidden layer.
% Thus it is a hiddenSize*1 vector
b1 = theta(2*hiddenSize*visibleSize+1:2*hiddenSize*visibleSize+hiddenSize);
% b2(i) denotes the i_th bias in hidden layer to i_th node in output layer.
% Thus it is a visibleSize*1 vector
b2 = theta(2*hiddenSize*visibleSize+hiddenSize+1:end);

%% ---------- YOUR CODE HERE --------------------------------------
%  Instructions: Compute the cost/optimization objective J_sparse(W,b) for the Sparse Autoencoder,
%                and the corresponding gradients W1grad, W2grad, b1grad, b2grad.
%
% W1grad, W2grad, b1grad and b2grad should be computed using backpropagation.
% Note that W1grad has the same dimensions as W1, b1grad has the same dimensions
% as b1, etc.  Your code should set W1grad to be the partial derivative of J_sparse(W,b) with
% respect to W1.  I.e., W1grad(i,j) should be the partial derivative of J_sparse(W,b) 
% with respect to the input parameter W1(i,j).  Thus, W1grad should be equal to the term 
% [(1/m) \Delta W^{(1)} + \lambda W^{(1)}] in the last block of pseudo-code in Section 2.2 
% of the lecture notes (and similarly for W2grad, b1grad, b2grad).
% 
% Stated differently, if we were using batch gradient descent to optimize the parameters,
% the gradient descent update to W1 would be W1 := W1 - alpha * W1grad, and similarly for W2, b1, b2. 
% 

% 1. Set \Delta W^{(1)}, \Delta b^{(1)} to 0 for all layer l
% Cost and gradient variables (your code needs to compute these values). 
% Here, we initialize them to zeros. 
cost = 0;
rho = zeros(hiddenSize,1);
W1grad = zeros(size(W1)); 
W2grad = zeros(size(W2));
b1grad = zeros(size(b1)); 
b2grad = zeros(size(b2));

% 2.For i=1 to m
m = size(data,2);

%{
For Big-data, which we cannot save activation information for all nodes
% Compute rho before backpropagation
for i=1:m
    x = data(:,i);
    z2 = W1 * x + b1;
    a2 = sigmoid(z2);    
    % Accumulate rho
    rho = rho + a2;
end
rho = rho ./ m;
KLterm = beta*(-sparsityParam ./ rho + (1-sparsityParam) ./ (1-rho));

for i=1:m
    x = data(:,i);
    % 2a. Use backpropagation to compute diff(J_sparse(W,b;x,y), W^{(1)})
    % and diff(J_sparse(W,b;x,y), b^{(1)})
    
    % 2a.1. Perform a feedforward pass, computing the activations for
    % hidden layer and output layer.
    z2 = W1 * x + b1;
    a2 = sigmoid(z2);
    z3 = W2 * a2 + b2;
    a3 = sigmoid(z3);

    
    % Accumulate the cost
    cost = cost + 1/2 * ((x-a3)‘*(x-a3));
    
    % 2a.2. For the output layer, set delta3
    % delta3 is a visibleSize*1 vector
    delta3 = -(x-a3) .* sigmoidDiff(z3);
    
    % 2a.3. For the hidden layer, set delta2
    % delta2 is a hiddenSize*1 vector
    delta2 = (W2‘*delta3 + KLterm) .* sigmoidDiff(z2);
    
    % 2a.4. Compute the desired partial derivatives
    JW1diff = delta2 * x‘;
    Jb1diff = delta2;
    JW2diff = delta3 * a2‘;
    Jb2diff = delta3;

    % 2b. Update \Delta W^{(1)}
    W1grad = W1grad + JW1diff;
    W2grad = W2grad + JW2diff;

    % 2c. Update \Delta b^{(1)}
    b1grad = b1grad + Jb1diff;
    b2grad = b2grad + Jb2diff;
end

% Compute KL penalty term
KLpen = beta * sum(sparsityParam*log(sparsityParam ./ rho) + (1-sparsityParam)*log((1-sparsityParam) ./ (1-rho)));

% Compute weight decay term
tempW1 = W1 .* W1;
tempW2 = W2 .* W2;
WD = (lambda/2)*(sum(sum(tempW1))+sum(sum(tempW2)));

cost = cost ./ m + WD + KLpen;
W1grad = W1grad ./ m + lambda .* W1;
W2grad = W2grad ./ m + lambda .* W2;
b1grad = b1grad ./ m;
b2grad = b2grad ./ m;
%}

% For small data, save activation information during computing rho
z2_all = zeros(hiddenSize, m);
a2_all = zeros(hiddenSize, m);
z3_all = zeros(visibleSize, m);
a3_all = zeros(visibleSize, m);

for i=1:m
    x = data(:,i);
    z2_all(:,i) = W1 * x + b1;
    a2_all(:,i) = sigmoid(z2_all(:,i));    
    z3_all(:,i) = W2 * a2_all(:,i) + b2;
    a3_all(:,i) = sigmoid(z3_all(:,i));
    % Accumulate rho
    rho = rho + a2_all(:,i);
end
rho = rho ./ m;
KLterm = beta*(-sparsityParam ./ rho + (1-sparsityParam) ./ (1-rho));

for i=1:m
    x = data(:,i);
    % 2a. Use backpropagation to compute diff(J_sparse(W,b;x,y), W^{(1)})
    % and diff(J_sparse(W,b;x,y), b^{(1)})
    
    % 2a.1. Perform a feedforward pass, computing the activations for
    % hidden layer and output layer.
    z2 = z2_all(:,i);
    a2 = a2_all(:,i);
    z3 = z3_all(:,i);
    a3 = a3_all(:,i);

    
    % Accumulate the cost
    cost = cost + 1/2 * ((x-a3)‘*(x-a3));
    
    % 2a.2. For the output layer, set delta3
    % delta3 is a visibleSize*1 vector
    delta3 = -(x-a3) .* sigmoidDiff(z3);
    
    % 2a.3. For the hidden layer, set delta2
    % delta2 is a hiddenSize*1 vector
    delta2 = (W2‘*delta3 + KLterm) .* sigmoidDiff(z2);
    
    % 2a.4. Compute the desired partial derivatives
    JW1diff = delta2 * x‘;
    Jb1diff = delta2;
    JW2diff = delta3 * a2‘;
    Jb2diff = delta3;

    % 2b. Update \Delta W^{(1)}
    W1grad = W1grad + JW1diff;
    W2grad = W2grad + JW2diff;

    % 2c. Update \Delta b^{(1)}
    b1grad = b1grad + Jb1diff;
    b2grad = b2grad + Jb2diff;
end

% Compute KL penalty term
KLpen = beta * sum(sparsityParam*log(sparsityParam ./ rho) + (1-sparsityParam)*log((1-sparsityParam) ./ (1-rho)));

% Compute weight decay term
tempW1 = W1 .* W1;
tempW2 = W2 .* W2;
WD = (lambda/2)*(sum(sum(tempW1))+sum(sum(tempW2)));

cost = cost ./ m + WD + KLpen;
W1grad = W1grad ./ m + lambda .* W1;
W2grad = W2grad ./ m + lambda .* W2;
b1grad = b1grad ./ m;
b2grad = b2grad ./ m;

%-------------------------------------------------------------------
% 3.Update the parametersAfter computing the cost and gradient, we will 
% convert the gradients back to a vector format (suitable for minFunc).  
% Specifically, we will unroll your gradient matrices into a vector.

grad = [W1grad(:) ; W2grad(:) ; b1grad(:) ; b2grad(:)];

end

%-------------------------------------------------------------------
% Here‘s an implementation of the sigmoid function, which you may find useful
% in your computation of the costs and the gradients.  This inputs a (row or
% column) vector (say (z1, z2, z3)) and returns (f(z1), f(z2), f(z3)). 

function sigm = sigmoid(x)
    sigm = 1 ./ (1 + exp(-x));
end

% define the differential of sigmoid
function sigmDiff = sigmoidDiff(x)
    sigmDiff = sigmoid(x) .* (1-sigmoid(x));
end