Implementing Gradient Descent for Logistic Regression¶
This notebook follows the topics discussed in the logistic regression course notes; please refer to that page for context. It implements the same concepts in Python instead of MATLAB/Octave. I have borrowed some inspiration and code from this blog.
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
%matplotlib inline
Plot sigmoid function¶
To bound our probability predictions between 0 and 1, we use the sigmoid function.
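Its definition is $$ g(z) = \frac{1}{1 + e^{-z}} $$ which maps any real-valued input to the open interval $(0, 1)$. The next cell plots it over a range of inputs.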
vals = np.arange(-10, 10, 0.2)
gz = 1 / (1 + np.exp(0 - vals))   # g(z) = 1 / (1 + e^-z)
plt.plot(vals, gz)
plt.title('Sigmoid function');
Plot loss function for logistic regression¶
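For a single training example, the loss being plotted is the standard logistic regression cost, with the hypothesis $h_{\theta}(x)$ on the horizontal axis: $$ \mathrm{Cost}(h_{\theta}(x), y) = \begin{cases} -\log(h_{\theta}(x)) & \text{if } y = 1 \\ -\log(1 - h_{\theta}(x)) & \text{if } y = 0 \end{cases} $$ The loss grows without bound as the prediction approaches the wrong extreme, so confident wrong answers are penalized heavily.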
xvals = np.arange(0.01, 1, 0.01)   # start just above 0 to avoid log(0)
y1vals = 0-np.log(xvals)
y0vals = 0-np.log(1-xvals)
plt.plot(xvals, y1vals, 'b', label='y=1')
plt.plot(xvals, y0vals, 'g', label='y=0')
plt.title('Loss functions of logistic regression')
plt.legend()
plt.xlabel('Hypothesis: $h_\\theta(x)$')
plt.ylabel('Loss');
Load and visualize training data¶
from numpy import loadtxt, where
#load the dataset
data_path = '/Users/atma6951/Documents/code/pychakras/pyChakras/ml/coursera-ml-matlabonline/machine-learning-ex/ex2'
data = loadtxt(f'{data_path}/ex2data1.txt', delimiter=',')
data.shape
(100, 3)
# set dependent and independent variables.
X = data[:, 0:2]
y = data[:, 2]
y = y.reshape(y.shape[0],1)
# find indices of positive (admitted) and negative (not admitted) cases
pos = where(y.ravel() == 1)[0]
neg = where(y.ravel() == 0)[0]
# plot the data
plt.scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
plt.scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend(['Admitted', 'Not Admitted']);
X.shape, y.shape
((100, 2), (100, 1))
Define sigmoid and cost functions¶
def sigmoid(z):
    # logistic function: g(z) = 1 / (1 + e^-z)
    denom = 1 + np.exp(0 - z)
    return 1 / denom
# test sigmoid function for a range of values
[sigmoid(-5), sigmoid(0), sigmoid(5)]
[0.006692850924284857, 0.5, 0.9933071490757153]
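The next cell implements the standard (vectorized) logistic regression cost and gradient, where $g$ is the sigmoid defined above and $h_{\theta}(x) = g(\theta^{T}x)$: $$ J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[ y^{(i)}\log\left(h_{\theta}(x^{(i)})\right) + (1 - y^{(i)})\log\left(1 - h_{\theta}(x^{(i)})\right)\right] \\ \nabla_{\theta}J(\theta) = \frac{1}{m} X^{T}\left(g(X\theta) - y\right) $$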
def cost_function(theta, X, y):
    """Return the logistic regression cost J and its gradient."""
    m = np.size(y)                         # number of training samples
    h_theta_x = sigmoid(np.dot(X, theta))  # predicted probabilities
    term1 = (0-y)*np.log(h_theta_x)
    term2 = (1-y)*np.log(1-h_theta_x)
    J = (np.sum(term1-term2))/m
    grad = np.dot(np.transpose(h_theta_x - y), X)  # (h - y)^T X, shape (1, n)
    grad = grad/m
    return (J, grad)
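As an aside, if $h_{\theta}(x)$ ever saturates to exactly 0 or 1 (which can happen for extreme inputs), the logs above return -inf. Below is a minimal sketch of a clipped variant that guards against this; the helper name and the epsilon value are my own choices, not part of the original notebook.
def cost_function_clipped(theta, X, y, eps=1e-12):
    # same cost as above, but keep h away from exactly 0 or 1 so log() stays finite
    m = np.size(y)
    h = np.clip(sigmoid(np.dot(X, theta)), eps, 1 - eps)
    J = np.sum((0-y)*np.log(h) - (1-y)*np.log(1-h)) / m
    grad = np.dot(np.transpose(h - y), X) / m
    return (J, grad)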
Apply¶
# Create an array for X_0
x_0 = np.ones(X.shape[0]).reshape(X.shape[0],1)
x_0.shape
(100, 1)
# Splice this with existing array X
X2 = np.concatenate((x_0, X), axis=1)
X2.shape
(100, 3)
# initialize the coefficients to 0
initial_theta = np.zeros(X.shape[1]+1).reshape(X.shape[1]+1,1)
initial_theta
array([[0.], [0.], [0.]])
# compute cost and gradient
(J, grad) = cost_function(initial_theta, X2, y)
print(f'Cost at initial theta: {J}')
print(f'Gradient at initial theta: {grad.flatten()}')
Cost at initial theta: 0.6931471805599453
Gradient at initial theta: [ -0.1 -12.00921659 -11.26284221]
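As a sanity check: with $\theta = 0$, $h_{\theta}(x) = g(0) = 0.5$ for every sample, so the cost is $-\log(0.5) \approx 0.693$, matching the value printed above.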
def gradient_descent(X, y, theta=initial_theta,
                     alpha=0.01, num_iterations=1500):
    """
    Solve for theta using the gradient descent optimization technique.
    Alpha is the learning rate.
    """
    m = len(y)
    J_history = []
    theta0_history = []
    theta1_history = []
    theta2_history = []
    theta = theta.reshape(3, 1)
    for i in range(num_iterations):
        # prediction error: h_theta(x) - y
        error = sigmoid(np.dot(X, theta)) - y
        term0 = (alpha/m) * np.sum(error * X[:, 0].reshape(m, 1))
        term1 = (alpha/m) * np.sum(error * X[:, 1].reshape(m, 1))
        term2 = (alpha/m) * np.sum(error * X[:, 2].reshape(m, 1))
        # update theta
        term_vector = np.array([[term0], [term1], [term2]])
        theta = theta - term_vector.reshape(3, 1)
        # store history values
        theta0_history.append(theta[0].tolist()[0])
        theta1_history.append(theta[1].tolist()[0])
        theta2_history.append(theta[2].tolist()[0])
        J_history.append(cost_function(theta, X, y)[0])
    return (theta, J_history, theta0_history, theta1_history, theta2_history)
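The loop above updates each component of $\theta$ separately. The same update can also be written in fully vectorized form; here is a minimal sketch, assuming the sigmoid and cost_function defined earlier and y as a column vector (the function name is my own):
def gradient_descent_vectorized(X, y, theta, alpha=0.01, num_iterations=1500):
    # vectorized update: theta := theta - (alpha/m) * X^T (g(X theta) - y)
    m = len(y)
    theta = theta.reshape(-1, 1)
    J_history = []
    for _ in range(num_iterations):
        error = sigmoid(np.dot(X, theta)) - y
        theta = theta - (alpha/m) * np.dot(X.T, error)
        J_history.append(cost_function(theta, X, y)[0])
    return theta, J_history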
%%time
num_iterations = 150
alpha = 0.0002
theta, J_history, theta0_history, \
theta1_history, theta2_history = gradient_descent(X2, y, initial_theta,
                                                  alpha, num_iterations)
CPU times: user 13.4 ms, sys: 2.6 ms, total: 16 ms Wall time: 13.6 ms
theta.flatten()
array([-0.00142996, 0.0058575 , 0.00403084])
fig, ax1 = plt.subplots(figsize=(8,5))
# plot thetas over time
color='tab:blue'
ax1.plot(theta0_history, label='$\\theta_{0}$', linestyle='--', color=color)
ax1.plot(theta1_history, label='$\\theta_{1}$', linestyle='-', color=color)
ax1.plot(theta2_history, label='$\\theta_{2}$', linestyle='-.', color=color)
# ax1.legend()
ax1.set_xlabel('Iterations'); ax1.set_ylabel('$\\theta$', color=color);
ax1.tick_params(axis='y', labelcolor=color)
# plot loss function over time
color='tab:red'
ax2 = ax1.twinx()
ax2.plot(J_history, label='Loss function', color=color)
ax2.set_title('Values of $\\theta$ and $J(\\theta)$ over iterations')
ax2.set_ylabel('Loss: $J(\\theta)$', color=color)
ax2.tick_params(axis='y', labelcolor=color)
# ax2.legend();
fig.legend();
Prediction and plot decision boundary¶
Using gradient descent, we found the values of theta. The decision boundary lies where $h_{\theta}(x) = 0.5$, i.e. where $\theta^{T}x = 0$. Thus, we write its equation as
$$ \theta_{0} + \theta_{1}x_{1} + \theta_{2}x_{2} = 0 \\ -0.00142996 + 0.0058575x_{1} + 0.00403084x_{2} = 0 \\ 0.0058575x_{1} + 0.00403084x_{2} = 0.00142996 $$
Substituting $x_{1}=0$ and solving for $x_{2}$, and then vice versa, gives the intercept points $(0, 0.355)$ and $(0.244, 0)$. These are far outside the plotting range, so instead we evaluate the boundary at $x_{1}$ values within the exam-score range of roughly 30-100.
# when x1 = 40, find x2: x2 = (-theta0 - theta1*40)/theta2
t = theta.flatten()
x2 = (0-t[0] - t[1]*40)/t[2]
print(x2)
# when x1 = 100, find x2: x2 = (-theta0 - theta1*100)/theta2
t = theta.flatten()
x2 = (0-t[0] - t[1]*100)/t[2]
print(x2)
-57.77211067097919
-144.96240929648445
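To see this on the training data, here is a minimal sketch that overlays the fitted boundary line on the scatter plot, reusing X, pos, neg and t from above; the endpoints 30 and 100 are just the plotting range:
# overlay the learned decision boundary on the training data
x1_vals = np.array([30, 100])
x2_vals = (0 - t[0] - t[1]*x1_vals) / t[2]   # solve theta0 + theta1*x1 + theta2*x2 = 0 for x2
plt.scatter(X[pos, 0], X[pos, 1], marker='o', c='b', label='Admitted')
plt.scatter(X[neg, 0], X[neg, 1], marker='x', c='r', label='Not Admitted')
plt.plot(x1_vals, x2_vals, 'k--', label='Decision boundary')
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend();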
Note: At this point, I realize my gradient descent is not really optimizing well: the equation of the decision boundary line is way off, most likely because the learning rate is tiny, the number of iterations is small, and the features are unscaled. Hence, I solve the same problem using Scikit-Learn and see what its parameters are.
Logistic regression using Scikit-Learn¶
Using LogisticRegression from Scikit-Learn, we fit the same data and examine the fitted parameters.
from sklearn.linear_model import LogisticRegression
# specify the solver explicitly and pass y as a 1-D array to avoid warnings
clf = LogisticRegression(solver='liblinear', random_state=0).fit(X, y.ravel())
clf.coef_ # theta1, theta2
array([[0.03844482, 0.03101855]])
# mean test accuracy
clf.score(X, y)
0.87
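score here is simply the mean accuracy on the data passed in; the equivalent manual computation would be:
# fraction of samples whose predicted class matches the true label
np.mean(clf.predict(X) == y.ravel())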
clf.predict(X)
array([0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.])
# plot function copied from https://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
h = .02 # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(8, 6))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)
# Plot also the training points
# plt.scatter(X[:, 0], X[:, 1], edgecolors='k', cmap=plt.cm.Paired)
plt.scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
plt.scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Logistic regression decision boundary');