This post implements a deep neural network from the ground up using only the numpy library. The underlying math follows Andrew Ng's Deep Learning course.
A suggestion before diving in:
To make the overall structure easier to grasp, the inputs and outputs of the main helper functions are listed below, so you can quickly see how they interact.
With the big picture in mind, you can then dig into the concrete implementation of each function.
parameters = initialize_parameters_deep(layer_dims)
# forward propagation
Z, cache = linear_forward(A, W, b)
A, cache = linear_activation_forward(A_prev, W, b, activation)
AL, caches = L_model_forward(X, parameters)
# cost function
cost = compute_cost(AL, Y)
# backward propagation
dA_prev, dW, db = linear_activation_backward(dA, cache, activation)
grads = L_model_backward(AL, Y, caches)
parameters = update_parameters(parameters, grads, learning_rate)
# compute the sigmoid and ReLU activations, and the corresponding dZ
A, cache = sigmoid(Z)
A, cache = relu(Z)
dZ = relu_backward(dA, cache)
dZ = sigmoid_backward(dA, cache)
import numpy as np
$n^{[l]}$ denotes the number of units in layer $l$.
For example, if the input $X$ has shape (12288, 209) ($m = 209$ examples), then $W^{[1]}$ has shape $(n^{[1]}, 12288)$, $b^{[1]}$ has shape $(n^{[1]}, 1)$, and $Z^{[1]} = W^{[1]}X + b^{[1]}$ has shape $(n^{[1]}, 209)$.
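To make this shape convention concrete, here is a minimal sketch; the hidden size $n^{[1]} = 20$ is an assumed value used only for illustration:

# illustrative shape check for layer 1 with n[1] = 20 (an assumed value)
X = np.random.randn(12288, 209)         # input: (n[0], m) = (12288, 209)
W1 = np.random.randn(20, 12288) * 0.01  # W[1]: (n[1], n[0])
b1 = np.zeros((20, 1))                  # b[1]: (n[1], 1)
Z1 = np.dot(W1, X) + b1                 # Z[1] = W[1] X + b[1]: (n[1], m)
print(Z1.shape)                         # (20, 209)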
Initialization of an L-layer Neural Network
def initialize_parameters_deep(layers_dims):
    """
    input:
        layers_dims -- python list containing the number of units in each layer.
            e.g. layers_dims=[2,3,2]: the input layer has 2 units, one hidden layer has 3 units, and the output layer has 2 units
    output/return:
        parameters -- python dictionary containing the initialized parameters:
            Wl : ['W' + str(l)]
            bl : ['b' + str(l)]
    """
    np.random.seed(3)
    parameters = {}  # declare the dict first, then add keys to it in the for loop
    L = len(layers_dims)  # the number of entries in layers_dims is the number of layers (input layer included)
    for l in range(1, L):
        parameters["W" + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) / np.sqrt(layers_dims[l-1])  # layers_dims[l]: units in layer l
        parameters["b" + str(l)] = np.zeros((layers_dims[l], 1))
        # check the shapes of the parameters
        assert(parameters["W" + str(l)].shape == (layers_dims[l], layers_dims[l-1]))
        assert(parameters["b" + str(l)].shape == (layers_dims[l], 1))
    return parameters
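A quick sanity check of the shapes produced by initialize_parameters_deep; the layer sizes [5, 4, 3] are only an example:

parameters = initialize_parameters_deep([5, 4, 3])
for key, value in parameters.items():
    print(key, value.shape)
# W1 (4, 5)
# b1 (4, 1)
# W2 (3, 4)
# b2 (3, 1)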
The linear forward function (vectorized over all the examples) computes the following equation:
$Z^{[l]} = W^{[l]}A^{[l-1]} + b^{[l]}$
where $A^{[0]} = X$.
def linear_forward(A, W, b):
    """
    input:
        A -- activations from the previous layer (or input data X): (size of previous layer, number of examples)
        W -- weight matrix of shape (size of current layer, size of previous layer)
        b -- bias vector of shape (size of current layer, 1)
    output/return:
        Z -- the input of the activation function (pre-activation parameter)
        cache -- python tuple containing A, W, b, stored for computing the backward pass
    """
    Z = np.dot(W, A) + b  # broadcasting rule
    assert(Z.shape == (W.shape[0], A.shape[1]))
    cache = (A, W, b)
    return Z, cache
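A minimal usage sketch with random values; the sizes are arbitrary, chosen only to show the shapes:

np.random.seed(1)
A = np.random.randn(3, 2)   # activations of a 3-unit layer, 2 examples
W = np.random.randn(1, 3)   # the current layer has 1 unit
b = np.random.randn(1, 1)
Z, cache = linear_forward(A, W, b)
print(Z.shape)              # (1, 2)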
Two activation functions are used in the network:
Sigmoid: returns the activation value "A" and a "cache" storing the variable "Z" (used as input to the corresponding backward function). Usage: A, activation_cache = sigmoid(Z)
ReLU: returns the activation value "A" and a "cache" storing the variable "Z" (used as input to the corresponding backward function). Usage: A, activation_cache = relu(Z)
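A tiny numeric example of the two activations (both functions are defined at the end of this post; the values of Z are arbitrary):

Z = np.array([[-1.0, 0.0, 2.0]])
A_sig, activation_cache = sigmoid(Z)  # ≈ [[0.269, 0.5, 0.881]]; the cache stores Z
A_rel, activation_cache = relu(Z)     # [[0., 0., 2.]]; negatives are clipped to 0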
The two steps (linear, activation) are combined into one function (LINEAR->ACTIVATION). The mathematical relation is: $A^{[l]} = g(Z^{[l]}) = g(W^{[l]} A^{[l-1]} + b^{[l]})$,
where the activation function $g$ can be sigmoid() (the activation of layer L) or relu() (the activations of the first L-1 layers).
def linear_activation_forward(A_prev, W, b, activation):
    """
    input:
        A_prev -- activation value from the previous layer (or input data X): (size of previous layer, number of examples)
        W -- weight matrix: (size of current layer, size of previous layer)
        b -- bias vector: (size of current layer, 1)
        activation -- "sigmoid" or "relu"
    output/return:
        A -- the output of the activation function (post-activation value)
        cache -- python tuple containing "linear_cache" (A_prev, W, b) and "activation_cache" (Z),
            stored for computing the backward pass
    """
    # linear_cache contains (A_prev, W, b)
    # activation_cache contains (Z)
    if activation == "sigmoid":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)
    assert(A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)  # cache: tuple
    return A, cache
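A minimal usage sketch (random shapes chosen for illustration), showing that the returned cache bundles the linear and activation caches:

np.random.seed(2)
A_prev = np.random.randn(3, 2)
W = np.random.randn(1, 3)
b = np.random.randn(1, 1)
A, cache = linear_activation_forward(A_prev, W, b, "sigmoid")
linear_cache, activation_cache = cache
print(A.shape)                 # (1, 2)
print(len(linear_cache))       # 3 -> (A_prev, W, b)
print(activation_cache.shape)  # (1, 2) -> Z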
The activation function of the first $L-1$ layers is ReLU, and the activation function of the $L$-th layer is sigmoid.
The variable AL denotes $A^{[L]} = \sigma(Z^{[L]}) = \sigma(W^{[L]} A^{[L-1]} + b^{[L]})$, also called Yhat, i.e. $\hat{Y}$.
def L_model_forward(X, parameters):
    """
    input:
        X -- input data, shape (input size, number of examples)
        parameters -- python dictionary, output of initialize_parameters_deep (W, b)
    output:
        AL -- the last post-activation value
        caches -- list of caches
            each cache contains the linear_cache and activation_cache from relu or sigmoid
            relu (there are L-1 of them, indexed from 0 to L-2), sigmoid (just one, indexed L-1)
    """
    caches = []
    A = X
    L = len(parameters) // 2  # integer (floor) division; each layer has a W and a b
    # the first L-1 layers use the activation function "relu"
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, parameters["W" + str(l)], parameters["b" + str(l)], "relu")
        caches.append(cache)
    # the L-th layer uses the activation function "sigmoid"
    AL, cache = linear_activation_forward(A, parameters["W" + str(L)], parameters["b" + str(L)], "sigmoid")
    caches.append(cache)
    assert(AL.shape == (1, X.shape[1]))
    # AL is the final activation value
    # caches (list), i.e. [cache0, cache1, cache2, ...]; each cache (tuple) contains the linear cache (A, W, b) and the activation cache (Z)
    return AL, caches
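A small end-to-end forward pass; the layer sizes [4, 3, 1] and the 5 examples are illustrative assumptions:

np.random.seed(1)
X = np.random.randn(4, 5)                          # 5 examples with 4 features
parameters = initialize_parameters_deep([4, 3, 1])
AL, caches = L_model_forward(X, parameters)
print(AL.shape)     # (1, 5)
print(len(caches))  # 2 -> one cache per layer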
To compute the gradients in the backward pass, we first compute the cross-entropy cost $J$,
using the following formula: $-\frac{1}{m} \sum\limits_{i=1}^{m} \left( y^{(i)} \log\left(a^{[L](i)}\right) + (1-y^{(i)}) \log\left(1-a^{[L](i)}\right) \right)$
def compute_cost(AL, Y):
    """
    input:
        AL -- probability vector of label predictions, shape (1, number of examples)
        Y -- true "label" vector
    output/return:
        cost -- cross-entropy cost
    """
    m = Y.shape[1]
    cost = (-1/m) * np.sum(np.multiply(Y, np.log(AL)) + np.multiply((1-Y), np.log(1-AL)))
    # operations between matrices produce a matrix; convert the result to a scalar
    cost = np.squeeze(cost)  # to make sure the cost's shape is what we expect (e.g. this turns [[17]] into 17)
    assert(cost.shape == ())
    return cost
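A quick numeric check; the predictions and labels are made up for illustration:

Y  = np.array([[1, 1, 0]])
AL = np.array([[0.8, 0.9, 0.4]])
print(compute_cost(AL, Y))  # ≈ 0.2798 = -(log(0.8) + log(0.9) + log(0.6)) / 3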
The linear part of layer $l$ is: $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$, followed by the activation.
Once $dZ^{[l]} = \frac{\partial \mathcal{L}}{\partial Z^{[l]}}$ has been computed, we can obtain $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$.
The three outputs $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$ are computed using the input $dZ^{[l]}$. Here are the formulas you need:
$dW^{[l]} = \frac{\partial \mathcal{L}}{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T}$
$db^{[l]} = \frac{\partial \mathcal{L}}{\partial b^{[l]}} = \frac{1}{m} \sum_{i=1}^{m} dZ^{[l](i)}$
$dA^{[l-1]} = \frac{\partial \mathcal{L}}{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]}$
Use the 3 formulas above to implement linear_backward().
def linear_backward(dZ, cache):
    """
    input:
        dZ -- gradient of the cost with respect to the linear output (of current layer l)
        cache -- tuple of values (A_prev, W, b), coming from the forward propagation in the current layer
    output/return:
        dA_prev -- gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
        dW -- gradient of the cost with respect to W (current layer l), same shape as W
        db -- gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = (1/m) * np.dot(dZ, A_prev.T)
    db = (1/m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    assert(dW.shape == W.shape)
    assert(db.shape == b.shape)
    assert(dA_prev.shape == A_prev.shape)
    return dA_prev, dW, db
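A minimal shape check with random values; the sizes are chosen only for illustration:

np.random.seed(1)
dZ = np.random.randn(3, 4)       # current layer: 3 units, 4 examples
A_prev = np.random.randn(5, 4)   # previous layer: 5 units
W = np.random.randn(3, 5)
b = np.random.randn(3, 1)
dA_prev, dW, db = linear_backward(dZ, (A_prev, W, b))
print(dA_prev.shape, dW.shape, db.shape)  # (5, 4) (3, 5) (3, 1)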
Next, combine the two steps into one function.
dZ is computed from dA and Z (stored in activation_cache):
sigmoid_backward: dZ = sigmoid_backward(dA, activation_cache)
relu_backward: dZ = relu_backward(dA, activation_cache)
With $g(.)$ denoting the activation function, $dZ^{[l]} = dA^{[l]} * g'(Z^{[l]})$.
def linear_activation_backward(dA, cache, activation):
    """
    input:
        dA -- post-activation gradient for the current layer l
        cache -- tuple of values (linear_cache, activation_cache), stored for computing the backward pass
        activation -- "sigmoid" or "relu"
    output/return:
        dA_prev -- gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
        dW -- gradient of the cost with respect to W (current layer l), same shape as W
        db -- gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db
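A minimal round trip through one layer, reusing the cache produced by the forward step (shapes chosen for illustration):

np.random.seed(2)
A_prev = np.random.randn(3, 2)
W = np.random.randn(1, 3)
b = np.random.randn(1, 1)
A, cache = linear_activation_forward(A_prev, W, b, "sigmoid")
dA = np.random.randn(1, 2)       # a made-up upstream gradient
dA_prev, dW, db = linear_activation_backward(dA, cache, "sigmoid")
print(dA_prev.shape, dW.shape, db.shape)  # (3, 2) (1, 3) (1, 1)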
Now we can back-propagate through the whole network. Every forward iteration stores (X, W, b, and Z) in the cache; the backward pass needs these values to compute the gradients, iterating from layer L back toward layer 1.
We already know the output: $A^{[L]} = \sigma(Z^{[L]})$.
We need to compute dAL $= \frac{\partial \mathcal{L}}{\partial A^{[L]}}$.
The derivation gives the following expression:
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))  # derivative of cost with respect to AL
Finally, a for loop iterates over all the layers and stores dW, db, dA in the grads dictionary.
def L_model_backward(AL, Y, caches):
    """
    input:
        AL -- output of the forward propagation (L_model_forward)
        Y -- true "label" vector
        caches -- list of caches containing:
            the linear_cache and activation_cache from relu or sigmoid
            relu (there are L-1 of them, indexed from 0 to L-2), sigmoid (just one, indexed L-1)
    output/return:
        grads -- a dictionary with the gradients
            grads["dA" + str(l)] = ...
            grads["dW" + str(l)] = ...
            grads["db" + str(l)] = ...
    """
    grads = {}
    L = len(caches)  # number of layers
    Y = Y.reshape(AL.shape)  # make the shapes agree (just a safeguard against errors)
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    # first compute the gradients of layer L (sigmoid)
    current_cache = caches[L-1]  # layer L has index L-1
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, "sigmoid")
    # then compute the gradients of the first L-1 layers (relu), iterating backwards
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads["dA" + str(l+1)], current_cache, "relu")
    return grads
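A quick check of the gradient dictionary on the small network from the forward-pass example; layer sizes and labels are illustrative:

np.random.seed(1)
X = np.random.randn(4, 5)
Y = np.array([[1, 0, 1, 1, 0]])
parameters = initialize_parameters_deep([4, 3, 1])
AL, caches = L_model_forward(X, parameters)
grads = L_model_backward(AL, Y, caches)
print(sorted(grads.keys()))  # ['dA0', 'dA1', 'dW1', 'dW2', 'db1', 'db2']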
Using gradient descent:
$W^{[l]} = W^{[l]} - \alpha \, dW^{[l]}$
$b^{[l]} = b^{[l]} - \alpha \, db^{[l]}$
where $\alpha$ is the learning rate. After computing the updated parameters, store them in the parameters dictionary.
def update_parameters(parameters, grads, learning_rate):
    """
    input:
        parameters -- python dictionary containing the parameters
        grads -- python dictionary containing the gradients, output of L_model_backward
        learning_rate -- the learning rate alpha
    output/return:
        parameters -- python dictionary containing the updated parameters
            parameters["W" + str(l)] = ...
            parameters["b" + str(l)] = ...
    """
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l+1)] -= learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] -= learning_rate * grads["db" + str(l+1)]
    return parameters
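To see how all the functions above fit together, here is a minimal training loop. It is only a sketch: the function name L_layer_model and the hyper-parameters (learning_rate, num_iterations, print interval) are illustrative choices, not part of the code above.

def L_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=2500):
    parameters = initialize_parameters_deep(layers_dims)  # initialization
    for i in range(num_iterations):
        AL, caches = L_model_forward(X, parameters)       # forward propagation
        cost = compute_cost(AL, Y)                        # cross-entropy cost
        grads = L_model_backward(AL, Y, caches)           # backward propagation
        parameters = update_parameters(parameters, grads, learning_rate)
        if i % 100 == 0:
            print("cost after iteration %d: %f" % (i, cost))
    return parameters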
The helper functions used above (sigmoid, relu, relu_backward, sigmoid_backward) are defined below.
def sigmoid(Z):
    # element-wise sigmoid activation; the cache stores Z for the backward pass
    A = 1/(1+np.exp(-Z))
    cache = Z
    return A, cache

def relu(Z):
    # element-wise ReLU activation; the cache stores Z for the backward pass
    A = np.maximum(0, Z)
    assert(A.shape == Z.shape)
    cache = Z
    return A, cache
def relu_backward(dA, cache):
    """
    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- where we store 'Z'
    Returns:
    dZ -- gradient of the cost with respect to Z
    """
    Z = cache
    dZ = np.array(dA, copy=True)  # just converting dz to a correct object.
    # When z <= 0, you should set dz to 0 as well.
    dZ[Z <= 0] = 0
    assert (dZ.shape == Z.shape)
    return dZ
def sigmoid_backward(dA, cache):
    # dZ = dA * sigmoid'(Z), where sigmoid'(Z) = s * (1 - s)
    Z = cache
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    assert (dZ.shape == Z.shape)
    return dZ
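A quick numeric check of the two backward helpers; the values are chosen only for illustration:

Z = np.array([[-1.0, 2.0]])
dA = np.array([[1.0, 1.0]])
print(relu_backward(dA, Z))     # [[0. 1.]] -> gradient is 0 where Z <= 0
print(sigmoid_backward(dA, Z))  # ≈ [[0.1966 0.1050]] -> s*(1-s) with s = sigmoid(Z)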