import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
# Shared one-hot encoder: turns a column of integer class labels into one
# indicator column per class.
encoder = OneHotEncoder()
The first step is to split the data into training and testing sets and to one-hot encode the class labels.
We will work with the well-known Iris dataset, loaded from scikit-learn (sklearn).
# Load the Iris features and integer class labels as NumPy arrays.
iris = datasets.load_iris()
data = np.array(iris['data'])
target = np.array(iris['target'])

# Hold out 33% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.33, random_state=10
)

# One-hot encode the labels. The encoder is fitted on the TRAINING labels only
# and then reused (transform, not fit_transform) for the test labels, so both
# sets are guaranteed to share the same class-to-column mapping.
y_train = encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
y_test = encoder.transform(y_test.reshape(-1, 1)).toarray()
Because sigmoid units saturate for large inputs, we scale each predictor so that its values lie roughly between 0 and 1.
We can do this by applying a lambda function to each column (feature) of both sets, dividing the column by a maximum value.
def scale_col(c, c_max=None):
    """Divide a column (or a matrix of columns) by a maximum value.

    Parameters
    ----------
    c : array-like of numbers
        A single feature column, or a 2-D array whose columns are features.
    c_max : scalar or array-like, optional
        Value(s) to divide by. Defaults to ``np.max(c)``, i.e. the overall
        maximum of ``c``, which is the original single-column behaviour.

    Returns
    -------
    numpy.ndarray
        ``c`` divided element-wise by ``c_max``.
    """
    return c / (np.max(c) if c_max is None else c_max)


# Per-feature maxima are taken from the TRAINING set only and reused for the
# test set: both sets end up on the same scale, and no test-set statistics
# leak into preprocessing.
train_max = np.max(X_train, axis=0)
X_train = scale_col(X_train, train_max)
X_test = scale_col(X_test, train_max)
Our activation function is the sigmoid, $\sigma(x) = \frac{1}{1 + e^{-x}}$.
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))``, element-wise.

    The value is computed as ``exp(-log(1 + exp(-x)))`` via ``np.logaddexp``,
    which is mathematically identical but does not overflow (or emit a
    RuntimeWarning) for large negative ``x``.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Pre-activation value(s).

    Returns
    -------
    numpy.floating or numpy.ndarray
        Value(s) in the closed interval [0, 1], same shape as ``x``.
    """
    # logaddexp(0, -x) == log(exp(0) + exp(-x)) == log(1 + exp(-x))
    return np.exp(-np.logaddexp(0.0, -x))
# One forward pass through the network with random weights (illustration).

# Weight Matrix 1: input layer (4 features) -> hidden layer (3 units)
W1 = np.random.rand(4, 3)
B1 = np.random.rand(3, 1)
# Column of ones with one row per training sample; J @ B.T repeats the bias
# row for every sample. Sized from the data rather than hard-coded to 100.
J = np.ones((len(X_train), 1))
Z2 = (X_train @ W1) + (J @ B1.T)
# Activation Function
A2 = sigmoid(Z2)
# Weight Matrix 2: hidden layer (3 units) -> output layer (3 classes)
W2 = np.random.rand(3, 3)
B2 = np.random.rand(3, 1)
Z3 = (A2 @ W2) + (J @ B2.T)
# The prediction is the activation of the OUTPUT layer (Z3), not of the
# hidden layer (Z2).
y_hat = sigmoid(Z3)
def cost(y, yh):
    """Return half the sum of squared errors between targets and predictions.

    Parameters
    ----------
    y : array-like
        Target values.
    yh : array-like
        Predicted values; must be broadcastable against ``y``.

    Returns
    -------
    numpy.floating
        ``0.5 * sum((y - yh) ** 2)`` over all elements.
    """
    # asarray lets plain lists/tuples be passed as well as ndarrays.
    residual = np.asarray(y) - np.asarray(yh)
    return 0.5 * np.sum(residual ** 2)
def sigmoidprime(x):
    """Return the derivative of the sigmoid at ``x``, element-wise.

    Uses the identity ``sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))``.

    Parameters
    ----------
    x : scalar or numpy.ndarray
        Pre-activation value(s).

    Returns
    -------
    numpy.floating or numpy.ndarray
        Derivative value(s), same shape as ``x``.
    """
    # Evaluate the sigmoid once and reuse it for both factors.
    s = sigmoid(x)
    return s * (1 - s)
# Backpropagation for a single forward pass (illustration).
# delta3: derivative of the cost w.r.t. Z3 (output-layer pre-activations).
delta3 = (-(y_train - y_hat)) * sigmoidprime(Z3)
# delta2: the output error pushed back through W2 to the hidden layer.
delta2 = (delta3 @ W2.T) * sigmoidprime(Z2)
# Weight gradients: layer input (transposed) times that layer's delta.
djdw2 = A2.T @ delta3
# Bias gradients: sum of the deltas over all samples, reshaped to a column so
# they have the same (units, 1) shape as B1/B2 and can be subtracted directly.
djdb2 = delta3.sum(axis=0).reshape(-1, 1)
djdw1 = X_train.T @ delta2
djdb1 = delta2.sum(axis=0).reshape(-1, 1)
# Number of observations (rows) in the training set
n = X_train.shape[0]
# Number of gradient-descent iterations
N = 10_000
# Constant gamma: the step size applied to every gradient update
gamma = 0.15
# Generate initial random weights, uniform in [0, 1).
# The shapes are listed in the order W1, B1, W2, B2 so the random stream is
# consumed in that order.
W1, B1, W2, B2 = (
    np.random.rand(rows, cols)
    for rows, cols in ((4, 3), (3, 1), (3, 3), (3, 1))
)
# Start Training!
# Full-batch gradient descent: every iteration uses all training samples.
# `costs` records the cost before each update so convergence can be inspected.
costs = []
for i in range(N):
    # Calculate the predicted y values with the current weights.
    # B.T has shape (1, units) and broadcasts over the rows, adding the bias
    # to every sample without needing a separately sized column of ones.
    Z2 = (X_train @ W1) + B1.T
    A2 = sigmoid(Z2)
    Z3 = (A2 @ W2) + B2.T
    y_hat = sigmoid(Z3)
    costs.append(cost(y_train, y_hat))

    # Calculate the gradient with the current weights (backpropagation).
    delta3 = (-(y_train - y_hat)) * sigmoidprime(Z3)
    delta2 = (delta3 @ W2.T) * sigmoidprime(Z2)
    djdw2 = A2.T @ delta3
    # Bias gradients: sum the deltas over all samples, as a (units, 1) column
    # matching the shape of B1/B2.
    djdb2 = delta3.sum(axis=0).reshape(-1, 1)
    djdw1 = X_train.T @ delta2
    djdb1 = delta2.sum(axis=0).reshape(-1, 1)

    # Take a step against the gradient, scaled by the step size gamma.
    W1 = W1 - djdw1 * gamma
    W2 = W2 - djdw2 * gamma
    B1 = B1 - djdb1 * gamma
    B2 = B2 - djdb2 * gamma
# Forward pass on the held-out test set using the trained weights.
# Column of ones with one row per test sample, used to repeat the bias rows.
J = np.ones(len(X_test)).reshape(-1, 1)
Z2 = (X_test @ W1) + (J @ (B1.T))
A2 = sigmoid(Z2)
Z3 = (A2 @ W2) + (J @ B2.T)
y_hat = sigmoid(Z3)
# Threshold the (rounded) activations into 0/1 indicators. The builtin `int`
# is used as the dtype: the `np.int` alias was removed in NumPy 1.24.
y_hat = np.array((np.round(y_hat, decimals=2) > 0.5), dtype=int)
# Show the first five predicted indicator rows.
y_hat[:5]
array([[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 1, 0], [1, 0, 0]])
# Collapse each indicator row back to a single class index, then tabulate
# true classes (rows) against predicted classes (columns).
true_labels = y_test.argmax(axis=1)
predicted_labels = y_hat.argmax(axis=1)
confusion_matrix(true_labels, predicted_labels)
array([[15, 0, 0], [ 0, 19, 0], [ 0, 2, 14]], dtype=int64)