I’m trying to implement a multiclass logistic regression classifier that distinguishes between k
different classes.
This is my code.
import numpy as np
from scipy.special import expit

def cost(X,y,theta,regTerm):
    (m,n) = X.shape
    J = (np.dot(-(y.T),np.log(expit(np.dot(X,theta))))-np.dot((np.ones((m,1))-y).T,np.log(np.ones((m,1)) - (expit(np.dot(X,theta))).reshape((m,1))))) / m + (regTerm / (2 * m)) * np.linalg.norm(theta[1:])
    return J

def gradient(X,y,theta,regTerm):
    (m,n) = X.shape
    grad = np.dot(((expit(np.dot(X,theta))).reshape(m,1) - y).T,X)/m + (np.concatenate(([0],theta[1:].T),axis=0)).reshape(1,n)
    return np.asarray(grad)

def train(X,y,regTerm,learnRate,epsilon,k):
    (m,n) = X.shape
    theta = np.zeros((k,n))
    for i in range(0,k):
        previousCost = 0
        currentCost = cost(X,y,theta[i,:],regTerm)
        while(np.abs(currentCost-previousCost) > epsilon):
            print(theta[i,:])
            theta[i,:] = theta[i,:] - learnRate*gradient(X,y,theta[i,:],regTerm)
            print(theta[i,:])
            previousCost = currentCost
            currentCost = cost(X,y,theta[i,:],regTerm)
    return theta

trX = np.load('trX.npy')
trY = np.load('trY.npy')
theta = train(trX,trY,2,0.1,0.1,4)
I can verify that cost and gradient return values with the right dimensions (cost returns a scalar, and gradient returns a 1-by-n row vector), but I get the error
RuntimeWarning: divide by zero encountered in log
  J = (np.dot(-(y.T),np.log(expit(np.dot(X,theta))))-np.dot((np.ones((m,1))-y).T,np.log(np.ones((m,1)) - (expit(np.dot(X,theta))).reshape((m,1))))) / m + (regTerm / (2 * m)) * np.linalg.norm(theta[1:])
Why is this happening, and how can I avoid it?
Answer
You can clean up the formula by making good use of broadcasting, the operator * for elementwise products of vectors, and the operator @ for matrix multiplication, and by breaking the expression into intermediate steps as suggested in the comments.
Here is your cost function:
def cost(X, y, theta, regTerm):
    m = X.shape[0]  # or y.shape[0], or even p.shape[0] after the next line: the number of training examples
    p = expit(X @ theta)
    log_loss = -np.average(y*np.log(p) + (1-y)*np.log(1-p))
    J = log_loss + regTerm * np.linalg.norm(theta[1:]) / (2*m)
    return J
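As for the RuntimeWarning itself: for large |X @ theta|, expit saturates to exactly 0.0 or 1.0 in floating point, so one of the np.log calls receives 0 and returns -inf. A common workaround, sketched below with an arbitrarily chosen eps (my addition, not part of your original code), is to clip the probabilities away from 0 and 1 before taking logs:

def cost(X, y, theta, regTerm, eps=1e-12):
    # eps is an assumed small constant; tune it to your data
    m = X.shape[0]
    p = np.clip(expit(X @ theta), eps, 1 - eps)  # keep log() arguments strictly positive
    log_loss = -np.average(y*np.log(p) + (1-y)*np.log(1-p))
    return log_loss + regTerm * np.linalg.norm(theta[1:]) / (2*m)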
You can clean up your gradient function along the same lines.
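For example, here is a minimal sketch of the gradient in the same style, assuming y and theta are 1-D arrays of shapes (m,) and (n,), and that you want the usual regTerm/m penalty term that skips the intercept (your original code scales the penalty differently):

def gradient(X, y, theta, regTerm):
    m = X.shape[0]
    p = expit(X @ theta)           # predicted probabilities, shape (m,)
    grad = X.T @ (p - y) / m       # unregularized gradient, shape (n,)
    reg = (regTerm / m) * theta    # L2 penalty gradient ...
    reg[0] = 0                     # ... except for the intercept term
    return grad + reg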
By the way, are you sure you want np.linalg.norm(theta[1:])? If you're trying to do L2 regularization, the term should be np.linalg.norm(theta[1:]) ** 2.
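For reference, the penalty term in the cost would then read like this (a sketch, assuming the conventional lambda/(2m) scaling):

J = log_loss + regTerm * np.linalg.norm(theta[1:]) ** 2 / (2*m)
# equivalently: regTerm * np.sum(theta[1:] ** 2) / (2*m)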