Neural Network from scratch

Jan 6, 2022

creating a simple neural network with numpy.

Some of the important concepts and terms in a neural network are:

  • forward propagation
  • neurons
  • hidden units
  • activation functions
  • loss functions
  • backward propagation
  • optimizers

In order to create a simple neural network, we need to implement each of the concepts listed above.

  1. import the necessary modules
import numpy as np
import pandas as pd
from numpy import random
from numpy.random.mtrand import beta, random_sample
from sklearn.base import BaseEstimator
from tqdm import tqdm

2. create a class and constructor to initialize the variables

class Network(BaseEstimator):
    """Fully connected feed-forward network trained with gradient descent or Adam.

    ``layers`` lists the number of units of every layer, input layer first;
    ``activations`` names the activation (``'sigmoid'`` or ``'relu'``) applied
    after each weight layer.
    """

    def __init__(self, learning_rate=0.01, epoches=30, activations=[], layers=[],
                 optimizer='adam', batch_size=64, beta1=0.9, beta2=0.999, epsilon=1e-8, decay_rate=0.5,
                 random_state=None, regularization='l2', keep_prob=1, lambd=1, t=2, y_reshape=False):
        """Store the hyper-parameters and allocate weights, cache and Adam state.

        Parameters
        ----------
        learning_rate : step size used by both parameter-update rules.
        epoches : number of passes over the training data.
        activations : activation name per weight layer.
        layers : unit count per layer, input layer first.
        optimizer : ``'gd'`` or ``'adam'``; selects the training loop in ``fit``.
        batch_size : mini-batch size used by the Adam training loop.
        beta1, beta2, epsilon, t : Adam decay rates, stabiliser and step counter.
        random_state : seed for NumPy's global RNG; ``None`` leaves it unseeded.
        regularization : ``'l2'`` adds weight decay to the cost and gradients.
        lambd : L2 regularisation strength.
        y_reshape : when true, targets are reshaped to the output's shape.
        decay_rate, keep_prob : stored only; not read by the methods shown here.
        """
        # NOTE: the list defaults are stored as given and never mutated here, so
        # the shared-default pitfall does not bite; keep it that way.
        self.random_state = random_state
        # ``is not None`` so that a seed of 0 is honoured too.
        if self.random_state is not None:
            np.random.seed(self.random_state)

        self.learning_rate = learning_rate
        self.layers = layers
        self.n_layers = len(layers) - 1  # number of weight layers
        self.activations = activations
        self.epoches = epoches

        # Derived state; the order matters: the cache and the Adam moments are
        # shaped after the freshly created weights and biases.
        self.weights, self.biases = self.initialize_parameters()
        self.cache = self.initialize_cache()
        self.v, self.s = self.initialize_ewa()
        self.costs = []

        self.optimizer = optimizer
        self.batch_size = batch_size
        self.beta1 = beta1
        self.beta2 = beta2
        self.decay_rate = decay_rate
        self.regularization = regularization
        self.lambd = lambd
        self.keep_prob = keep_prob
        self.epsilon = epsilon
        self.t = t
        self.y_reshape = y_reshape

3. create functions to initialize the weights, biases, cache and exponentially weighted averages (EWA)

def initialize_parameters(self):
    """Create the initial weights and biases for every weight layer.

    Returns
    -------
    (w, b) : two lists, one entry per consecutive pair in ``self.layers``.
        ``w[i]`` has shape ``(layers[i + 1], layers[i])`` and holds small
        Gaussian values (scaled by 0.01); ``b[i]`` is a zero column vector
        of shape ``(layers[i + 1], 1)``.
    """
    w = [
        np.random.randn(next_layer, current_layer) * 0.01
        for current_layer, next_layer in zip(self.layers[:-1], self.layers[1:])
    ]
    b = [
        np.zeros((next_layer, 1)) for next_layer in self.layers[1:]
    ]

    return w, b
def initialize_ewa(self):
    """Allocate the zeroed moment estimates used by the Adam update.

    Returns
    -------
    (v, s) : two dicts keyed ``'dw<l>'`` / ``'db<l>'`` (layers numbered from 1),
        each value a zero array shaped like the matching weight or bias.
    """
    v = {}
    s = {}
    for layer, (w, b) in enumerate(zip(self.weights, self.biases), start=1):
        v[f'dw{layer}'] = np.zeros(w.shape)
        v[f'db{layer}'] = np.zeros(b.shape)
        s[f'dw{layer}'] = np.zeros(w.shape)
        s[f'db{layer}'] = np.zeros(b.shape)
    return v, s


def initialize_cache(self):
    """Build the per-layer cache shared by the forward and backward passes.

    For every layer ``l`` (numbered from 1) the cache holds the parameters
    (``w<l>``, ``b<l>``) plus empty slots for the pre-activation (``z<l>``),
    the activation (``activation<l>``) and the gradients (``da<l>``,
    ``dw<l>``, ``db<l>``).
    """
    c = {}
    for layer, (w, b) in enumerate(zip(self.weights, self.biases), start=1):
        # Forward-pass slots, filled in by ``forward``.
        c[f'z{layer}'] = None
        c[f'activation{layer}'] = None
        # The parameters are stored by reference, not copied.
        c[f'w{layer}'] = w
        c[f'b{layer}'] = b

        # Backward-pass slots, filled in by ``backward``.
        c[f'da{layer}'] = None
        c[f'dw{layer}'] = None
        c[f'db{layer}'] = None

    return c

4. next we need to create a method to propagate through the model ( forward pass )

def activate(self, z, activation):
    """Apply the named activation function to the pre-activation ``z``.

    Parameters
    ----------
    z : pre-activation values.
    activation : ``'sigmoid'`` or ``'relu'``.

    Raises
    ------
    ValueError
        If ``activation`` is not one of the supported names (instead of
        silently returning ``None``, which only fails later in the pass).
    """
    if activation == 'sigmoid':
        return sigmoid(z)
    elif activation == 'relu':
        return relu(z)
    raise ValueError(f"unsupported activation: {activation!r}")
def forward(self, x):
    """Run a forward pass and return the output-layer activation.

    Every layer's pre-activation and activation are written to
    ``self.cache`` (``z<l>`` / ``activation<l>``); the input itself is stored
    as ``activation0`` so the backward pass can reach it.

    Parameters
    ----------
    x : input batch. Assumed to be laid out as (features, samples), since it
        is left-multiplied by each weight matrix.
    """
    a_prev = x
    self.cache['activation0'] = a_prev
    for i in range(self.n_layers):
        w, b = self.cache[f"w{i + 1}"], self.cache[f"b{i + 1}"]
        # Affine step followed by the layer's non-linearity.
        z = np.dot(w, a_prev) + b
        a_prev = self.activate(z, self.activations[i])
        self.cache[f'z{i + 1}'] = z
        self.cache[f'activation{i + 1}'] = a_prev

    return a_prev

only implementing sigmoid and relu activation functions to keep it simple

def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of ``z``, element-wise."""
    return 1 / (1 + np.exp(-z))

def relu(z):
    """Return the rectified linear unit ``max(0, z)`` of ``z``, element-wise."""
    return np.maximum(0, z)

make sure that the above two functions are outside the class Network

5. Now implement a cost function — cross entropy

def cross_entropy_loss(self, a, y):
    """Return the mean binary cross-entropy between predictions and targets.

    Parameters
    ----------
    a : predicted probabilities (output-layer activations).
    y : targets; reshaped to ``a.shape`` first when ``self.y_reshape`` is set.
    """
    if self.y_reshape:
        y = y.reshape(a.shape)
    # Keep the predictions strictly inside (0, 1): a saturated sigmoid would
    # otherwise produce log(0) and turn the whole loss into inf/nan.
    tiny = 1e-15
    a = np.clip(a, tiny, 1 - tiny)
    loss = np.multiply(y, np.log(a)) + np.multiply((1 - y), np.log(1 - a))
    return -np.mean(loss)
def compute_cost(self, a, y):
    """Return the training cost: cross-entropy plus the optional L2 penalty.

    With ``self.regularization == 'l2'`` the penalty
    ``lambd / (2 * m) * sum(||W_l||^2)`` is added, where ``m = a.shape[1]``
    and the weights are read from ``self.cache``.
    """
    loss = self.cross_entropy_loss(a, y)
    if self.regularization == 'l2':
        squared_weights = sum(
            np.sum(np.square(self.cache[f'w{l}']))
            for l in range(1, self.n_layers + 1)
        )
        l2_loss = (self.lambd / (2 * a.shape[1])) * squared_weights
        loss = loss + l2_loss
    return loss

The loss function above is also called log loss (binary cross-entropy); it is the loss used for binary classification.

6. Now implement a method to propagate backwards

def linear_backward(self, dZ, cache):
    """Back-propagate through the affine part of one layer.

    Parameters
    ----------
    dZ : gradient of the cost with respect to the layer's pre-activation.
    cache : tuple ``(A_prev, W, b)`` — the layer's input and parameters.

    Returns
    -------
    (dA_prev, dW, db) : gradients with respect to the layer's input, weights
        and bias. ``dW`` includes the L2 term when regularisation is enabled.
    """
    A_prev, W, _ = cache  # the bias value itself is not needed for the gradients
    m = A_prev.shape[1]
    dW = np.dot(dZ, A_prev.T) / m
    if self.regularization == 'l2':
        dW = dW + (self.lambd / m) * W
    db = np.mean(dZ, keepdims=True, axis=1)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db

def activate_prime(self, da, layer, activation):
    """Back-propagate through one full layer (activation, then affine part).

    Parameters
    ----------
    da : gradient of the cost with respect to this layer's activation.
    layer : 1-based layer index used to look up cached values.
    activation : ``'sigmoid'`` or ``'relu'``.

    Returns
    -------
    (dA_prev, dW, db) as produced by ``linear_backward``.

    Raises
    ------
    ValueError
        If ``activation`` is not supported (previously this surfaced as an
        unbound-variable error).
    """
    if activation == 'sigmoid':
        dz = sigmoid_prime(da, self.cache[f"z{layer}"])
    elif activation == 'relu':
        dz = relu_prime(da, self.cache[f"z{layer}"])
    else:
        raise ValueError(f"unsupported activation: {activation!r}")

    cache = self.cache[f'activation{layer - 1}'], self.cache[f'w{layer}'], self.cache[f'b{layer}']
    return self.linear_backward(dz, cache)
def backward(self, a, y):
    """Run back-propagation and store every layer's gradients in the cache.

    Starting from the derivative of the cross-entropy loss with respect to
    the output activation, walks the layers from last to first and fills
    ``da<l>``, ``dw<l>`` and ``db<l>`` in ``self.cache``.

    Parameters
    ----------
    a : output of the forward pass.
    y : targets; reshaped to ``a.shape`` first when ``self.y_reshape`` is set.
    """
    L = self.n_layers
    if self.y_reshape:
        y = y.reshape(a.shape)
    # Keep the predictions strictly inside (0, 1) so a saturated output does
    # not cause a division by zero below.
    tiny = 1e-15
    a = np.clip(a, tiny, 1 - tiny)
    self.cache["da" + str(L)] = - (np.divide(y, a) - np.divide((1 - y), (1 - a)))

    # The output layer needs no special handling: each step consumes da<l>
    # and produces da<l-1> for the next iteration.
    for l in range(L, 0, -1):
        da = self.cache[f'da{l}']
        da_prev, dw, db = self.activate_prime(da, l, self.activations[l - 1])
        self.cache["da" + str(l - 1)] = da_prev
        self.cache["dw" + str(l)] = dw
        self.cache["db" + str(l)] = db


Just as the forward pass has functions to compute sigmoid and relu, we also need to implement the derivatives of those activation functions.

def sigmoid_prime(da, z):
    """Return ``da`` multiplied by the sigmoid derivative at ``z`` (chain rule)."""
    s = sigmoid(z)  # evaluated once and reused for both factors
    return da * s * (1 - s)
def relu_prime(da, z):
    """Return ``da`` with entries zeroed wherever ``z <= 0`` (ReLU chain rule).

    Works on a copy, so the caller's ``da`` is left untouched.
    """
    dz = np.array(da, copy=True)
    dz[z <= 0] = 0
    return dz

make sure that the above two functions are outside the class Network as well

7. now we need to implement a method to update the parameters after the back propagation

def update_parameters(self):
    """Take one plain gradient-descent step on every cached weight and bias.

    Reads the gradients ``dw<l>`` / ``db<l>`` from ``self.cache`` and writes
    the updated ``w<l>`` / ``b<l>`` back to it.
    """
    for l in range(1, self.n_layers + 1):
        self.cache[f"w{l}"] = self.cache[f"w{l}"] - (self.learning_rate * self.cache[f"dw{l}"])
        self.cache[f"b{l}"] = self.cache[f"b{l}"] - (self.learning_rate * self.cache[f"db{l}"])
def update_paramters_with_adam(self):
    """Take one Adam step on every cached weight and bias.

    Updates the first/second moment estimates in ``self.v`` / ``self.s`` from
    the cached gradients, applies bias correction for step ``self.t``, writes
    the new parameters to ``self.cache`` and then advances ``self.t``.
    """
    # The bias-correction denominators depend only on the step counter, so
    # they are computed once per call instead of once per parameter.
    v_scale = 1 - self.beta1 ** self.t
    s_scale = 1 - self.beta2 ** self.t
    for l in range(1, self.n_layers + 1):
        # Weights and biases follow exactly the same rule.
        for param, grad in ((f"w{l}", f"dw{l}"), (f"b{l}", f"db{l}")):
            g = self.cache[grad]
            self.v[grad] = (self.beta1 * self.v[grad]) + ((1 - self.beta1) * g)
            self.s[grad] = (self.beta2 * self.s[grad]) + ((1 - self.beta2) * g ** 2)
            v_corrected = self.v[grad] / v_scale
            s_corrected = self.s[grad] / s_scale
            self.cache[param] = self.cache[param] - (
                self.learning_rate * (v_corrected / (np.sqrt(s_corrected) + self.epsilon))
            )
    # Advance the step counter; without this the bias correction stays frozen
    # at its initial value for the whole training run.
    self.t += 1

The parameter update above has two different methods: one for gradient descent and the other for the Adam optimizer.

8. next implement the optimizers, in our case we need to implement gradient descent and adam

def GD(self, X, y):
    """Train with full-batch gradient descent for ``self.epoches`` epochs.

    Each epoch runs a forward pass over all of ``X``, computes the cost,
    back-propagates and applies one gradient-descent update.
    """
    for epoch in tqdm(range(self.epoches)):
        yhat = self.forward(X)
        cost = self.compute_cost(yhat, y)
        self.backward(yhat, y)
        # NOTE(review): the published snippet lost the update call and the body
        # of the ``if`` below; both are reconstructed here. Confirm that
        # recording the cost is what the original did every 100th epoch.
        self.update_parameters()

        if epoch % 100 == 0:
            self.costs.append(cost)

def adam(self, X, Y):
    """Train with mini-batch Adam for ``self.epoches`` epochs.

    Every epoch shuffles the columns of ``X`` and ``Y`` with the same
    permutation, then runs forward pass, cost, back-propagation and an Adam
    update on consecutive slices of ``self.batch_size`` columns.

    Parameters
    ----------
    X : inputs; columns are treated as samples.
    Y : targets; must be 2-D, since it is indexed as ``Y[:, idx]``.
    """
    n_samples = X.shape[1]
    for epoch in tqdm(range(self.epoches)):
        idx = np.random.permutation(n_samples)
        shuffled_X = X[:, idx]
        shuffled_y = Y[:, idx]

        cost = None  # stays None when there is no data to iterate over
        # Stepping by batch_size visits every non-empty batch exactly once,
        # including the shorter remainder batch at the end.
        for st in range(0, n_samples, self.batch_size):
            ed = st + self.batch_size
            batch_X = shuffled_X[:, st:ed]
            batch_Y = shuffled_y[:, st:ed]
            yhat = self.forward(batch_X)
            cost = self.compute_cost(yhat, batch_Y)
            self.backward(yhat, batch_Y)
            self.update_paramters_with_adam()

        # NOTE(review): the body of this ``if`` was lost in the published
        # snippet; recording the last batch cost is a reconstruction — confirm.
        if epoch % 100 == 0 and cost is not None:
            self.costs.append(cost)

9. finally implement the fit and predict methods

def fit(self, X, y):
    """Train the network on ``X`` / ``y`` with the configured optimizer.

    Both inputs are converted to NumPy arrays first. Returns ``self`` so
    calls can be chained.

    Raises
    ------
    ValueError
        If ``self.optimizer`` is neither ``'gd'`` nor ``'adam'``.
    """
    X = np.array(X)
    y = np.array(y)
    # Exactly one training loop runs; previously 'gd' fell through and
    # trained with Adam as well.
    if self.optimizer == 'gd':
        self.GD(X, y)
    elif self.optimizer == 'adam':
        self.adam(X, y)
    else:
        raise ValueError(f"unsupported optimizer: {self.optimizer!r}")
    return self

def predict(self, X):
    """Return the network's output activations for ``X``.

    ``X`` is converted to a NumPy array first, mirroring ``fit``. The result
    is the raw output of the forward pass, not thresholded class labels.
    """
    return self.forward(np.asarray(X))

