import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
!pip3 install ipympl
%matplotlib widget
%config InlineBackend.figure_format = 'retina'
Gradient Descent
For a multivariable function \(F(\mathbf{v})\) in \(N\) dimensions, differentiable at a point \(\mathbf{v}\), \(F\) decreases fastest in the direction of the negative gradient at that point, denoted \(-\nabla F(\mathbf{v})\).
Evaluating the descent
\[\begin{equation} \mathbf{v}_{i + 1} = \mathbf{v}_{i} - L \nabla F(\mathbf{v}_{i}) \end{equation}\] where \(L \in \mathbb{R}_{+}\) is the learning rate, \(\mathbf{v} = \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_N \end{bmatrix}\), and \(\nabla F(\mathbf{v}) = \begin{bmatrix} \frac{\partial F}{\partial x_1} \\ \frac{\partial F}{\partial x_2} \\ \vdots \\ \frac{\partial F}{\partial x_N} \end{bmatrix}\). Hence the equation becomes:
\[\begin{equation} \begin{bmatrix} x_1^{i + 1} \\ x_2^{i + 1} \\ \vdots \\ x_N^{i + 1} \end{bmatrix} = \begin{bmatrix} x_1^i \\ x_2^i \\ \vdots \\ x_N^i \end{bmatrix} - L \begin{bmatrix}\frac{\partial F}{\partial x_1} \\ \frac{\partial F}{\partial x_2} \\ \vdots \\ \frac{\partial F}{\partial x_N} \end{bmatrix} \end{equation}\]
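In code, this update is a single vectorized step. Below is a minimal NumPy sketch added for illustration; gradient_F is a hypothetical callable returning \(\nabla F(\mathbf{v})\) as an array:
def gradient_descent(gradient_F, v0, L=0.1, epochs=100):
    # Repeatedly apply v_{i+1} = v_i - L * grad F(v_i)
    v = np.asarray(v0, dtype=float)
    for _ in range(epochs):
        v = v - L * gradient_F(v)
    return v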
Descent to a minimum
Notice that the decrease in \(F(\mathbf{v})\) is guaranteed only down to the nearest well, which may or may not be the global minimum. The iteration may run for a specified number of epochs, or terminate early once the update falls below a set tolerance.
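When a tolerance is used, one common criterion (an assumption here, not specified above) is to stop once the step size drops below a threshold tol:
def gradient_descent_tol(gradient_F, v0, L=0.1, epochs=100, tol=1e-8):
    # Terminate early once the update L * grad F(v) becomes negligibly small
    v = np.asarray(v0, dtype=float)
    for _ in range(epochs):
        step = L * gradient_F(v)
        if np.linalg.norm(step) < tol:
            break
        v = v - step
    return v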
2-dimensional example
Let \(F(\mathbf{v}) = F\left(\begin{bmatrix} x \\ y \end{bmatrix}\right) = \sin(x)\cos(y)\); then the gradient is \(\nabla F(\mathbf{v}) = \begin{bmatrix}\frac{\partial F}{\partial x} \\ \frac{\partial F}{\partial y} \end{bmatrix} = \begin{bmatrix} \cos(x)\cos(y) \\ -\sin(x)\sin(y) \end{bmatrix}\), and starting from an initial point \(\mathbf{v}_0\) we may reach the nearest local minimum as:
\[\begin{equation} \begin{bmatrix} \bar x \\ \bar y \end{bmatrix} = \begin{bmatrix} x \\ y \end{bmatrix} - L \begin{bmatrix} \cos(x)\cos(y) \\ -\sin(x)\sin(y) \end{bmatrix} \end{equation}\]
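As a quick sanity check (an addition, not part of the original notebook), the analytic gradient can be compared against central finite differences at a sample point, here \((1, 0.5)\), with a small step h:
h = 1e-6
x0, y0 = 1.0, 0.5
# Central-difference approximations of the partial derivatives of sin(x)cos(y)
dFdx = (np.sin(x0 + h) * np.cos(y0) - np.sin(x0 - h) * np.cos(y0)) / (2 * h)
dFdy = (np.sin(x0) * np.cos(y0 + h) - np.sin(x0) * np.cos(y0 - h)) / (2 * h)
print(np.allclose([dFdx, dFdy], [np.cos(x0) * np.cos(y0), -np.sin(x0) * np.sin(y0)]))  # True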
def gradient_func2d(x, y):
    # Analytic gradient of F(x, y) = sin(x) * cos(y)
    return np.cos(x) * np.cos(y), -np.sin(x) * np.sin(y)

def func2d(x, y):
    return np.sin(x) * np.cos(y)
The 2D Function \(F(x, y)\)
x = np.arange(-5, 5, 0.1)
y = np.arange(-5, 5, 0.1)
X, Y = np.meshgrid(x, y)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(projection="3d", computed_zorder=False)
ax.plot_surface(X, Y, func2d(X, Y), cmap="viridis")
plt.show()
3 initial points on \(F(x, y)\)
p1 = (1, 0.5)
p2 = (-1, 3)
p3 = (-4, -0.5)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(projection="3d", computed_zorder=False)
ax.plot_surface(X, Y, func2d(X, Y), cmap="viridis", zorder=0)
ax.scatter(p1[0], p1[1], func2d(p1[0], p1[1]), c="red", s=75, zorder=1)
ax.scatter(p2[0], p2[1], func2d(p2[0], p2[1]), c="green", s=75, zorder=1)
ax.scatter(p3[0], p3[1], func2d(p3[0], p3[1]), c="cyan", s=75, zorder=1)
plt.show()
Descent of the points
def iterate_gradient(point, L):
    # One descent step: move against the gradient, scaled by the learning rate L
    Xgrad, Ygrad = gradient_func2d(point[0], point[1])
    Xnew, Ynew = point[0] - L * Xgrad, point[1] - L * Ygrad
    return (Xnew, Ynew, func2d(Xnew, Ynew))
L, epochs = 0.1, 100
ax = plt.subplot(projection="3d", computed_zorder=False)
for i in range(epochs):
    ax.clear()  # wipe the previous frame before drawing the next one
    p1 = iterate_gradient(p1, L)
    p2 = iterate_gradient(p2, L)
    p3 = iterate_gradient(p3, L)
    ax.plot_surface(X, Y, func2d(X, Y), cmap="viridis", zorder=0)
    ax.scatter(p1[0], p1[1], func2d(p1[0], p1[1]), c="red", s=75, zorder=1)
    ax.scatter(p2[0], p2[1], func2d(p2[0], p2[1]), c="green", s=75, zorder=1)
    ax.scatter(p3[0], p3[1], func2d(p3[0], p3[1]), c="cyan", s=75, zorder=1)
    ax.set_title(f"Epoch No: {i}")
    plt.pause(0.001)
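As an alternative to the plt.pause loop, the same frames can be rendered with matplotlib.animation.FuncAnimation and saved to a file. This is a sketch of a different approach, not the notebook's original method:
from matplotlib import animation

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(projection="3d", computed_zorder=False)
points = [(1, 0.5), (-1, 3), (-4, -0.5)]  # restart from the initial points
colors = ["red", "green", "cyan"]

def update(i):
    global points
    ax.clear()
    # Step every point downhill, then redraw the surface and markers
    points = [iterate_gradient(p, L) for p in points]
    ax.plot_surface(X, Y, func2d(X, Y), cmap="viridis", zorder=0)
    for p, c in zip(points, colors):
        ax.scatter(p[0], p[1], p[2], c=c, s=75, zorder=1)
    ax.set_title(f"Epoch No: {i}")

anim = animation.FuncAnimation(fig, update, frames=epochs, interval=10)
anim.save("gradient_descent.gif", writer="pillow")  # requires Pillow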