A 5-Layer Neural Network

Because it is very similar to the 2-layer neural network, I will only post the source code rather than walk through the details; the key points are written into the code comments.
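
At a high level, the network is [LINEAR->RELU] * 3 -> LINEAR -> SIGMOID, trained with plain batch gradient descent. As a quick roadmap, here is a sketch of the loop that L_layer_model below implements, using the same function names as the full listing:

    parameters = initialize_parameters_deep(layers_dims)                  # scaled random weights, zero biases
    for i in range(num_iterations):
        AL, caches = L_model_forward(X, parameters)                       # forward pass, caches kept for backprop
        cost = compute_cost(AL, Y)                                        # cross-entropy cost
        grads = L_model_backward(AL, Y, caches)                           # backward pass
        parameters = update_parameters(parameters, grads, learning_rate)  # one gradient-descent step

The full source follows.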

    import numpy as np
    import h5py
    import matplotlib
    matplotlib.use('TkAgg')
    import matplotlib.pyplot as plt
    from deeplearning_ai_week4.testCases_v2 import *
    import deeplearning_ai_week4.dnn_app_utils_v2 as sup

    def initialize_parameters(n_x, n_h, n_y):
        W1 = np.random.randn(n_h, n_x) * 0.01
        b1 = np.zeros((n_h, 1))
        W2 = np.random.randn(n_y, n_h) * 0.01
        b2 = np.zeros((n_y, 1))

        assert (W1.shape == (n_h, n_x))
        assert (b1.shape == (n_h, 1))
        assert (W2.shape == (n_y, n_h))
        assert (b2.shape == (n_y, 1))

        parameters = {"W1": W1,
                      "b1": b1,
                      "W2": W2,
                      "b2": b2}

        return parameters

    def initialize_parameters_deep(layer_dims):
        np.random.seed(1)
        parameters = {}

        L = len(layer_dims)  # number of layers in the network, including the input layer

        for l in range(1, L):
            """
            at very begin, I'm using np.random.randn(layer_dims[l], layer_dims[l - 1]) * 0.01
            instead of np.random.randn(layer_dims[l], layer_dims[l - 1]) / np.sqrt(layer_dims[l-1])
            but the results are very different
            so you can see that initializing parameters play a very important role in neural network
            """
            parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l - 1]) / np.sqrt(layer_dims[l-1])
            parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))

            assert (parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1]))
            assert (parameters['b' + str(l)].shape == (layer_dims[l], 1))

        return parameters
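
    # Why the /sqrt(layer_dims[l-1]) scaling matters: each entry of Z = W A + b is a sum of
    # n = layer_dims[l-1] terms w * a, so its spread grows roughly like sqrt(n); dividing the
    # weights by sqrt(n) keeps the scale of Z about the same at every layer (Xavier-style
    # initialization). With a flat * 0.01 the scale of Z instead depends on the layer width,
    # which is one plausible explanation for the very different results noted in the docstring above.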

    def linear_forward(A, W, b):
        Z = np.dot(W, A) + b
        assert (Z.shape == (W.shape[0], A.shape[1]))
        cache = (A, W, b)

        return Z, cache
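
    # Shapes: W is (n_l, n_{l-1}), A is (n_{l-1}, m) and b is (n_l, 1), so Z = W A + b
    # is (n_l, m), with b broadcast across the m examples (this is what the assert checks).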

    def linear_activation_forward(A_prev, W, b, activation):
        if activation == "sigmoid":
            Z, linear_cache = linear_forward(A_prev, W, b)
            A, activation_cache = sup.sigmoid(Z)
        elif activation == "relu":
            Z, linear_cache = linear_forward(A_prev, W, b)
            A, activation_cache = sup.relu(Z)

        assert (A.shape == (W.shape[0], A_prev.shape[1]))
        cache = (linear_cache, activation_cache)

        return A, cache

    def L_model_forward(X, parameters):
        """
        Arguments:
        X -- data, numpy array of shape (input size, number of examples)
        parameters -- output of initialize_parameters_deep()
        """
        caches = []
        A = X
        L = len(parameters) // 2  # number of layers with weight parameters (input layer excluded)

        for l in range(1, L):
            A_prev = A
            A, cache = linear_activation_forward(A_prev, parameters['W' + str(l)],
                                                 parameters['b' + str(l)], "relu")
            caches.append(cache)

        AL, cache = linear_activation_forward(A, parameters['W' + str(L)],
                                              parameters['b' + str(L)], "sigmoid")
        caches.append(cache)

        assert (AL.shape == (1, X.shape[1]))

        return AL, caches
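
    # Forward architecture: [LINEAR->RELU] * (L-1) for the hidden layers, then LINEAR->SIGMOID
    # for the output layer; one cache per layer is collected so the backward pass can reuse
    # the intermediate values.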

    def compute_cost(AL, Y):
        m = Y.shape[1]

        cost = np.sum(np.multiply(Y, np.log(AL)) + np.multiply((1 - Y), np.log(1 - AL))) / (-m)

        cost = np.squeeze(cost)
        assert cost.shape == ()

        return cost
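
    # The cost implemented above is the cross-entropy
    #   J = -(1/m) * sum_i [ y_i * log(a_i) + (1 - y_i) * log(1 - a_i) ]
    # where a_i is the sigmoid output AL for example i.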

    def linear_backward(dZ, cache):
        A_prev, W, b = cache
        m = A_prev.shape[1]

        dW = np.dot(dZ, A_prev.T) / m
        db = np.sum(dZ, axis=1, keepdims=True) / m
        dA_prev = np.dot(W.T, dZ)

        assert (dA_prev.shape == A_prev.shape)
        assert (dW.shape == W.shape)
        assert (db.shape == b.shape)

        return dA_prev, dW, db
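
    # Backprop formulas implemented above, for Z = W A_prev + b at this layer:
    #   dW = (1/m) dZ A_prev^T,  db = (1/m) * sum of dZ over the examples,  dA_prev = W^T dZ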

    def linear_activation_backward(dA, cache, activation):
        linear_cache, activation_cache = cache

        if activation == "relu":
            dZ = sup.relu_backward(dA, activation_cache)
        elif activation == "sigmoid":
            dZ = sup.sigmoid_backward(dA, activation_cache)

        dA_prev, dW, db = linear_backward(dZ, linear_cache)

        return dA_prev, dW, db

    def L_model_backward(AL, Y, caches):
        """
        Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group

        :param AL: probability vector, output of the forward propagation (L_model_forward())
        :param Y: true "label" vector (containing 0 if not-cat, 1 if cat)
        :param caches: list of caches containing:
                        every cache of linear_activation_forward() with "relu"
                            (it's caches[l], for l in range(L - 1), i.e. l = 0..L-2)
                        the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])
        :return: A dictionary with gradients
                    grads["dA" + str(l)] = ...
                    grads["dW" + str(l)] = ...
                    grads["db" + str(l)] = ...
        """
        grads = {}
        L = len(caches)  # the number of layers
        m = AL.shape[1]  # the number of examples
        Y = Y.reshape(AL.shape)

        # Initializing the back propagation
        dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
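        # Derivation: per example the loss is L(a, y) = -(y*log(a) + (1-y)*log(1-a)),
        # so dL/da = -(y/a - (1-y)/(1-a)); the 1/m factor is applied later in linear_backward.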

        # Lth layer (SIGMOID -> LINEAR) gradients.
        # Inputs: "AL, Y, caches". Outputs: "grads["dAL"], grads["dWL"], grads["dbL"]
        current_cache = caches[L - 1]
        grads["dA" + str(L)], grads["dW" + str(L)], grads["db" + str(L)] = \
            linear_activation_backward(dAL, current_cache, "sigmoid")

        # other layers
        # range(L - 1): 0, 1, 2, ..., L-2 (NOT INCLUDING L - 1)
        for l in reversed(range(L - 1)):
            current_cache = caches[l]
            dA_prev_temp, dW_temp, db_temp = linear_activation_backward(
                grads["dA" + str(l + 2)], current_cache, "relu")
            # grads["dA" + str(a)] refers to the output of the backward propagation
            # for example: for lth layer, the input is dA(l+1) and the output is dA(l)
            grads["dA" + str(l + 1)] = dA_prev_temp
            grads["dW" + str(l + 1)] = dW_temp
            grads["db" + str(l + 1)] = db_temp

        return grads

    def update_parameters(parameters, grads, learning_rate):
        L = len(parameters) // 2

        for l in range(L):
            parameters["W" + str(l + 1)] = parameters["W" + str(l + 1)] - learning_rate * grads["dW" + str(l + 1)]
            parameters["b" + str(l + 1)] = parameters["b" + str(l + 1)] - learning_rate * grads["db" + str(l + 1)]

        return parameters

    def L_layer_model(X, Y, layers_dims, learning_rate=0.0075, num_iterations=3000, print_cost=False):
        np.random.seed(1)
        costs = []

        parameters = initialize_parameters_deep(layers_dims)

        for i in range(0, num_iterations):
            AL, caches = L_model_forward(X, parameters)
            cost = compute_cost(AL, Y)
            grads = L_model_backward(AL, Y, caches)
            parameters = update_parameters(parameters, grads, learning_rate)
            costs.append(cost)
            if print_cost and i % 100 == 0:
                print("Cost after iteration %i: %f" %(i, cost))

        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('iterations')  # the cost is recorded every iteration above
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        return parameters

    if __name__ == "__main__":
        # matplotlib inline
        plt.rcParams['figure.figsize'] = (5., 4.)
        plt.rcParams['image.interpolation'] = 'nearest'
        plt.rcParams['image.cmap'] = 'gray'

        np.random.seed(1)

        train_x_orig, train_y, test_x_orig, test_y, classes = sup.load_data()

        m_train = train_x_orig.shape[0]
        num_px = train_x_orig.shape[1]
        m_test = test_x_orig.shape[0]

        train_x_flatten = train_x_orig.reshape(train_x_orig.shape[0], -1).T
        test_x_flatten = test_x_orig.reshape(test_x_orig.shape[0], -1).T

        train_x = train_x_flatten / 255.
        test_x = test_x_flatten / 255.

        # L-layer neural network core part
        layers_dims = [train_x.shape[0], 20, 7, 5, 1]   # 5-layer model (counting the input layer), i.e. 4 layers of weights
        parameters = L_layer_model(train_x, train_y, layers_dims, num_iterations=2500, print_cost=True)

        pred_train = sup.predict(train_x, train_y, parameters)
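
        # Evaluate on the held-out test set as well, reusing the same helper as for
        # the training set (assuming sup.predict works for any (X, y) pair).
        pred_test = sup.predict(test_x, test_y, parameters)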