Implementing logistic regression in Python

This notebook follows the topics discussed in the logistic regression course notes; please refer to that page for context. It implements the same concepts in Python instead of MATLAB/Octave. I have borrowed some inspiration and code from this blog.

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

%matplotlib inline

Plot sigmoid function

To bound our probability predictions between 0 and 1, we use the sigmoid function. Its definition is below.
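In equation form (the standard definition, matching the expression coded in the next cell):

$$ g(z) = \frac{1}{1 + e^{-z}} $$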

In [2]:
vals = np.arange(-10,10,0.2)
gz= 1/(1+np.e**(0-vals))
plt.plot(vals, gz)
plt.title('Sigmoid function');

Plot loss function for logistic regression
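For a single training example, the curves plotted in the next cell are the two branches of the standard logistic (cross-entropy) loss:

$$ \mathrm{Cost}(h_{\theta}(x), y) = \begin{cases} -\log(h_{\theta}(x)) & \text{if } y = 1 \\ -\log(1 - h_{\theta}(x)) & \text{if } y = 0 \end{cases} $$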

In [3]:
xvals = np.arange(0.01, 1, 0.01)  # start just above 0 to avoid log(0)
y1vals = 0-np.log(xvals)
y0vals = 0-np.log(1-xvals)
plt.plot(xvals, y1vals, 'b', label='y=1')
plt.plot(xvals, y0vals, 'g', label='y=0')
plt.title('Loss functions of logistic regression')
plt.legend()
plt.xlabel('Hypothesis: $h_\\theta(x)$')
plt.ylabel('Loss');

Load and visualize training data

In [4]:
from numpy import loadtxt, where

#load the dataset
data_path = '/Users/atma6951/Documents/code/pychakras/pyChakras/ml/coursera-ml-matlabonline/machine-learning-ex/ex2'
data = loadtxt(f'{data_path}/ex2data1.txt', delimiter=',')
data.shape
Out[4]:
(100, 3)
In [224]:
# set dependent and independent variables.
X = data[:, 0:2]
y = data[:, 2]
y = y.reshape(y.shape[0],1)

# find positive and negative cases
pos = where(y == 1)
neg = where(y == 0)

# plot the data
plt.scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
plt.scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.legend(['Admitted', 'Not Admitted']);
In [225]:
X.shape, y.shape
Out[225]:
((100, 2), (100, 1))

Define sigmoid and cost functions

In [6]:
def sigmoid(z):
    denom = 1+np.e**(0-z)
    return 1/denom
In [28]:
# test sigmoid function for a range of values
[sigmoid(-5), sigmoid(0), sigmoid(5)]
Out[28]:
[0.006692850924284857, 0.5, 0.9933071490757153]
In [33]:
def cost_function(theta, X, y):
    m = np.size(y)  # number of training samples
    
    # hypothesis and the two log-loss terms
    h_theta_x = sigmoid(np.dot(X, theta))
    term1 = (0-y)*np.log(h_theta_x)
    term2 = (1-y)*np.log(1-h_theta_x)
    J = (np.sum(term1-term2))/m
    
    # gradient of the cost with respect to theta (a 1 x (n+1) row vector)
    grad = np.dot(np.transpose(h_theta_x - y), X)
    grad = grad/m
    
    return (J, grad)
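In equation form, the cost and gradient computed above are

$$ J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\left[ -y^{(i)}\log\left(h_{\theta}(x^{(i)})\right) - \left(1 - y^{(i)}\right)\log\left(1 - h_{\theta}(x^{(i)})\right) \right] $$

$$ \frac{\partial J(\theta)}{\partial \theta_{j}} = \frac{1}{m}\sum_{i=1}^{m}\left(h_{\theta}(x^{(i)}) - y^{(i)}\right)x_{j}^{(i)} $$

where $h_{\theta}(x) = g(\theta^{T}x)$.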

Apply the cost function to the training data

In [20]:
# Create an array for X_0
x_0 = np.ones(X.shape[0]).reshape(X.shape[0],1)
x_0.shape
Out[20]:
(100, 1)
In [21]:
# Concatenate this intercept column with the existing array X
X2 = np.concatenate((x_0, X), axis=1)
X2.shape
Out[21]:
(100, 3)
In [201]:
# initialize the coefficients to 0
initial_theta = np.zeros(X.shape[1]+1).reshape(X.shape[1]+1,1)
initial_theta
Out[201]:
array([[0.],
       [0.],
       [0.]])
In [202]:
# compute cost and gradient
(J, grad) = cost_function(initial_theta, X2, y)
print(f'Cost at initial theta: {J}')
print(f'Gradient at initial theta: {grad.flatten()}')
Cost at initial theta: 0.6931471805599453
Gradient at initial theta: [ -0.1        -12.00921659 -11.26284221]

Minimize the cost function using gradient descent

Note: The gradient descent update rule for logistic regression has the same form as that for linear regression, as seen here; the difference is that the hypothesis $h_{\theta}(x)$ is now the sigmoid of the linear combination rather than the linear combination itself.
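For reference, the update rule (repeated until convergence, updating all $j$ simultaneously) is

$$ \theta_{j} := \theta_{j} - \frac{\alpha}{m}\sum_{i=1}^{m}\left(h_{\theta}(x^{(i)}) - y^{(i)}\right)x_{j}^{(i)} $$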

In [109]:
def gradient_descent(X, y, theta=initial_theta,
                    alpha=0.01, num_iterations=1500):
    """
    Solve for theta using Gradient Descent optimiztion technique. 
    Alpha is the learning rate
    """
    m = len(y)
    J_history = []
    theta0_history = []
    theta1_history = []
    theta2_history = []
    theta = theta.reshape(3,1)
    
    for i in range(num_iterations):
        # NOTE: the sigmoid is not applied to np.dot(X, theta) here, so this is
        # effectively the linear-regression error term; see the sketch below.
        error = (np.dot(X, theta) - y)
        
        term0 = (alpha/m) * np.sum(error* X[:,0].reshape(m,1))
        term1 = (alpha/m) * np.sum(error* X[:,1].reshape(m,1))
        term2 = (alpha/m) * np.sum(error* X[:,2].reshape(m,1))
        
        # update theta
        term_vector = np.array([[term0],[term1], [term2]])
#         print(term_vector)
        theta = theta - term_vector.reshape(3,1)
        
        # store history values
        theta0_history.append(theta[0].tolist()[0])
        theta1_history.append(theta[1].tolist()[0])
        theta2_history.append(theta[2].tolist()[0])
        J_history.append(cost_function(theta,X,y)[0])
        
    return (theta, J_history, theta0_history, theta1_history, theta2_history)
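For comparison, here is a minimal corrected sketch (not the run recorded below) that applies the sigmoid inside the loop and vectorizes the update. The name gradient_descent_logistic is mine; it assumes the sigmoid and cost_function defined above.

def gradient_descent_logistic(X, y, theta, alpha=0.01, num_iterations=1500):
    """Gradient descent using the logistic hypothesis sigmoid(X.dot(theta))."""
    m = len(y)
    J_history = []
    theta = theta.reshape(-1, 1)

    for _ in range(num_iterations):
        # logistic hypothesis, then the error term h_theta(x) - y
        error = sigmoid(np.dot(X, theta)) - y
        # simultaneous, vectorized update of all parameters
        theta = theta - (alpha/m) * np.dot(X.T, error)
        J_history.append(cost_function(theta, X, y)[0])

    return (theta, J_history)

Called with the same X2, y and initial_theta, the cost history from this version should decrease steadily for a small enough alpha.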
In [216]:
%%time
num_iterations=150
initial_theta2 = (np.ones(3)-0.5).reshape(3,1)
alpha=0.0002
theta, J_history, theta0_history, \
theta1_history, theta2_history = gradient_descent(X2,y,initial_theta,
                                                  alpha,num_iterations)
CPU times: user 13.4 ms, sys: 2.6 ms, total: 16 ms
Wall time: 13.6 ms
In [217]:
theta.flatten()
Out[217]:
array([-0.00142996,  0.0058575 ,  0.00403084])
In [218]:
fig, ax1 = plt.subplots(figsize=(8,5))

# plot thetas over time
color='tab:blue'
ax1.plot(theta0_history, label='$\\theta_{0}$', linestyle='--', color=color)
ax1.plot(theta1_history, label='$\\theta_{1}$', linestyle='-', color=color)
ax1.plot(theta2_history, label='$\\theta_{2}$', linestyle='-.', color=color)
# ax1.legend()
ax1.set_xlabel('Iterations'); ax1.set_ylabel('$\\theta$', color=color);
ax1.tick_params(axis='y', labelcolor=color)

# plot loss function over time
color='tab:red'
ax2 = ax1.twinx()
ax2.plot(J_history, label='Loss function', color=color)
ax2.set_title('Values of $\\theta$ and $J(\\theta)$ over iterations')
ax2.set_ylabel('Loss: $J(\\theta)$', color=color)
ax2.tick_params(axis='y', labelcolor=color)

# ax2.legend();
fig.legend();

Predict and plot the decision boundary

Using gradient descent, we found the values of theta. The decision boundary lies where $h_{\theta}(x) = 0.5$, i.e. where $\theta^{T}x = 0$. Thus, we write the equation as

$$ \theta_{0} + \theta_{1}x_{1} + \theta_{2}x_{2} = 0 \\ -0.00142996 + 0.0058575\,x_{1} + 0.00403084\,x_{2} = 0 \\ 0.0058575\,x_{1} + 0.00403084\,x_{2} = 0.00142996 $$

Substituting $x_{1}=0$ and solving for $x_{2}$, and vice versa, gives the intercepts $(0, 0.3548)$ and $(0.2441, 0)$. But these are out of bounds to plot. Instead, we calculate values within the 30-100 range, as these are exam scores.

In [220]:
# when x1 = 40, find x2. x2 = (-theta0 - theta1*40)/theta2
t = theta.flatten()
x2 = (0-t[0] - t[1]*40)/t[2]
print(x2)

# when x1 = 100, find x2. x2= (-theta0 - theta1*100)/theta2
t = theta.flatten()
x2 = (0-t[0] - t[1]*100)/t[2]
print(x2)
-57.77211067097919
-144.96240929648445

Note: At this point, I realize my gradient descent is not really optimizing well: the equation of the decision boundary line is way off. Looking back at gradient_descent, the error term is computed from np.dot(X, theta) without applying the sigmoid, and with this small learning rate and iteration count theta has barely moved from zero. Hence I solve this problem using Scikit-Learn instead and see what its parameters are.
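An alternative worth noting (a minimal sketch, not part of the run above) is to hand cost_function directly to scipy.optimize.minimize, a Python counterpart of Octave's fminunc. The wrapper cost_only is a name I am introducing here; the sketch assumes X2, y and cost_function from the cells above, and it may emit log/overflow warnings as the sigmoid saturates.

from scipy.optimize import minimize

def cost_only(t):
    # minimize() passes theta as a flat array; reshape for cost_function
    J, _ = cost_function(t.reshape(-1, 1), X2, y)
    return J

res = minimize(cost_only, np.zeros(3), method='Nelder-Mead')
print(res.x)  # theta found by the optimizer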

Logistic regression using Scikit-Learn

Using LogisticRegression from Scikit-Learn, we fit the same data and explore the fitted parameters.

In [243]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X, y)
/Users/atma6951/anaconda3/envs/pychakras/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
/Users/atma6951/anaconda3/envs/pychakras/lib/python3.7/site-packages/sklearn/utils/validation.py:724: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
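A small tweak that should silence both warnings (assuming the same scikit-learn version): specify the solver explicitly and pass y as a 1-d array.

clf = LogisticRegression(random_state=0, solver='lbfgs').fit(X, y.ravel())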
In [244]:
clf.coef_ # theta1, theta2
Out[244]:
array([[0.03844482, 0.03101855]])
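The intercept, $\theta_{0}$ in the earlier notation, is stored separately:

clf.intercept_  # theta0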
In [246]:
# mean accuracy on the training data
clf.score(X, y)
Out[246]:
0.87
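The score above is just the mean accuracy; a quick equivalent check with the arrays already defined:

np.mean(clf.predict(X) == y.ravel())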
In [247]:
clf.predict(X)
Out[247]:
array([0., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
       0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1.])
In [257]:
# plot function copied from https://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5

h = .02  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(8, 6))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Plot also the training points
# plt.scatter(X[:, 0], X[:, 1], edgecolors='k', cmap=plt.cm.Paired)
plt.scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
plt.scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Logistic regression decision boundary');
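As a final cross-check (a sketch I am adding, using clf.intercept_ and clf.coef_ from the fitted model), the same boundary can be drawn as an explicit line, mirroring the hand derivation above:

# theta0 from the intercept, theta1/theta2 from the coefficients
t0 = clf.intercept_[0]
t1, t2 = clf.coef_[0]

# boundary: t0 + t1*x1 + t2*x2 = 0  ->  x2 = -(t0 + t1*x1)/t2
x1_line = np.array([30, 100])
x2_line = -(t0 + t1 * x1_line) / t2

plt.scatter(X[pos, 0], X[pos, 1], marker='o', c='b')
plt.scatter(X[neg, 0], X[neg, 1], marker='x', c='r')
plt.plot(x1_line, x2_line, 'k--')
plt.xlabel('Exam 1 score')
plt.ylabel('Exam 2 score')
plt.title('Decision boundary as an explicit line');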