import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D   # registers the 3D projection (only needed on older matplotlib)

!pip3 install ipympl

# Use the interactive ipympl backend so the animation loop at the end updates in place;
# switch to %matplotlib inline instead if you only want static (retina) figures.
%matplotlib widget
%config InlineBackend.figure_format = 'retina'
Gradient Descent
For a multivariable function \(F(\mathbf{v})\) in \(N\) dimensions, differentiable at a point \(\mathbf{v}\), \(F\) decreases fastest in the direction of the negative of the gradient at that point, denoted \(-\nabla F(\mathbf{v})\).
Evaluating the descent
\[\begin{equation} \mathbf{v}_{i + 1} = \mathbf{v}_{i} - L \nabla F(\mathbf{v}_{i}) \end{equation}\] where \(L \in \mathbb{R}_{+}\) is the learning rate, \(\mathbf{v} = \begin{bmatrix} x_1 \\ x_2 \\ \vdots \\ x_N \end{bmatrix}\) and \(\nabla F(\mathbf{v}) = \begin{bmatrix} \frac{\partial F}{\partial x_1} \\ \frac{\partial F}{\partial x_2} \\ \vdots \\ \frac{\partial F}{\partial x_N} \end{bmatrix}\). Hence the equation becomes:
\[\begin{equation} \begin{bmatrix} x_1^{i + 1} \\ x_2^{i + 1} \\ \vdots \\ x_N^{i + 1} \end{bmatrix} = \begin{bmatrix} x_1^i \\ x_2^i \\ \vdots \\ x_N^i \end{bmatrix} - L \begin{bmatrix}\frac{\partial F}{\partial x_1} \\ \frac{\partial F}{\partial x_2} \\ \vdots \\ \frac{\partial F}{\partial x_N} \end{bmatrix} \end{equation}\]
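To make the update rule concrete before moving to higher dimensions, here is a minimal one-dimensional sketch (a warm-up of our own, not the example developed below) on \(F(x) = x^2\), where \(\nabla F(x) = 2x\) and the iterates contract toward the minimum at \(x = 0\):

x = 5.0                      # initial guess
for i in range(25):
    x = x - 0.1 * (2 * x)    # v_{i+1} = v_i - L * grad F(v_i), with L = 0.1
print(x)                     # ~0.019, approaching the minimum at x = 0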
Descent to a minimum
Notice that the decrease in \(F(\mathbf{v})\) is guaranteed only down to the nearest well, which may or may not be the global minimum. We may run for a specified number of epochs, or terminate once the gradient falls below a set tolerance, as sketched below.
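As a sketch of the tolerance-based stopping rule (the helper descend and its parameter names are illustrative, not part of the notebook, which uses a fixed epoch count instead):

def descend(grad_F, v0, L = 0.1, tol = 1e-6, max_epochs = 10_000):
    # descend until the gradient (almost) vanishes or the epoch budget runs out
    v = np.asarray(v0, dtype = float)
    for _ in range(max_epochs):
        g = grad_F(v)
        if np.linalg.norm(g) < tol:
            break
        v = v - L * g
    return v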
2-dimensional example
Let \(F(\mathbf{v}) = F\left(\begin{bmatrix} x \\ y \end{bmatrix}\right) = \sin(x)\cos(y)\). Then the gradient is \(\nabla F(\mathbf{v}) = \begin{bmatrix}\frac{\partial F}{\partial x} \\ \frac{\partial F}{\partial y} \end{bmatrix} = \begin{bmatrix} \cos(x)\cos(y) \\ -\sin(x)\sin(y) \end{bmatrix}\), and starting from an initial point \(\mathbf{v}_0\) we may reach the nearest local minimum as:
\[\begin{equation} \begin{bmatrix} \bar x \\ \bar y \end{bmatrix} = \begin{bmatrix} x \\ y \end{bmatrix} - L \begin{bmatrix} \cos(x)\cos(y) \\ -\sin(x)\sin(y) \end{bmatrix} \end{equation}\]
def gradient_func2d(x, y):
    # analytic gradient of F(x, y) = sin(x)cos(y)
    return np.cos(x) * np.cos(y), - np.sin(x) * np.sin(y)

def func2d(x, y):
    # F(x, y) = sin(x)cos(y)
    return np.sin(x) * np.cos(y)
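As a quick sanity check (our addition; the step size h and the test point are arbitrary), the analytic gradient should match a central finite difference of func2d:

h = 1e-6
x0, y0 = 0.7, -1.2   # arbitrary test point
fd = ((func2d(x0 + h, y0) - func2d(x0 - h, y0)) / (2 * h),
      (func2d(x0, y0 + h) - func2d(x0, y0 - h)) / (2 * h))
print(np.allclose(fd, gradient_func2d(x0, y0)))   # True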
The 2D function \(F(x, y)\)
x = np.arange(-5, 5, 0.1)
y = np.arange(-5, 5, 0.1)
X, Y = np.meshgrid(x, y)
fig = plt.figure(figsize = (10, 8))
ax = fig.add_subplot(projection="3d", computed_zorder=False)
ax.plot_surface(X, Y, func2d(X, Y), cmap="viridis")
plt.show()
3 initial points on \(F(x, y)\)
p1 = (1, 0.5)
p2 = (-1, 3)
p3 = (-4, -0.5)

fig = plt.figure(figsize = (10, 8))
ax = fig.add_subplot(projection="3d", computed_zorder = False)
ax.plot_surface(X, Y, func2d(X, Y), cmap="viridis", zorder = 0)
ax.scatter(p1[0], p1[1], func2d(p1[0], p1[1]), c = "red", s = 75, zorder = 1)
ax.scatter(p2[0], p2[1], func2d(p2[0], p2[1]), c = "green", s = 75, zorder = 1)
ax.scatter(p3[0], p3[1], func2d(p3[0], p3[1]), c = "cyan", s = 75, zorder = 1)
plt.show()
Descent of the points
def iterate_gradient(point, L):
    # one gradient-descent step: v_{i+1} = v_i - L * grad F(v_i)
    Xgrad, Ygrad = gradient_func2d(point[0], point[1])
    Xnew, Ynew = point[0] - L * Xgrad, point[1] - L * Ygrad
    return (Xnew, Ynew, func2d(Xnew, Ynew))
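As a quick usage check (an aside, not part of the descent below), a few hand-run steps from p1 show the function value falling toward the nearest well:

p = p1
for i in range(5):
    p = iterate_gradient(p, 0.1)
    print(f"step {i}: x = {p[0]:.3f}, y = {p[1]:.3f}, F = {p[2]:.4f}")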
L, epochs = 0.1, 100

ax = plt.subplot(projection="3d", computed_zorder = False)

for i in range(epochs):
    # step each point downhill
    p1 = iterate_gradient(p1, L)
    p2 = iterate_gradient(p2, L)
    p3 = iterate_gradient(p3, L)

    # redraw the surface and the three descending points
    ax.plot_surface(X, Y, func2d(X, Y), cmap="viridis", zorder = 0)
    ax.scatter(p1[0], p1[1], func2d(p1[0], p1[1]), c = "red", s = 75, zorder = 1)
    ax.scatter(p2[0], p2[1], func2d(p2[0], p2[1]), c = "green", s = 75, zorder = 1)
    ax.scatter(p3[0], p3[1], func2d(p3[0], p3[1]), c = "cyan", s = 75, zorder = 1)

    ax.set_title(f"Epoch No: {i}")

    plt.pause(0.001)
    ax.clear()
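After the final epoch, the three points should have settled near local minima of \(F\), where \(F = -1\) and the gradient vanishes (for instance \((\pi/2, \pi)\) or \((-\pi/2, 0)\) inside the plotted window). A short check (our addition) prints the final states:

for name, p in zip(("p1", "p2", "p3"), (p1, p2, p3)):
    gnorm = np.hypot(*gradient_func2d(p[0], p[1]))
    print(f"{name}: x = {p[0]:.3f}, y = {p[1]:.3f}, F = {p[2]:.3f}, |grad F| = {gnorm:.1e}")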