by Fanyu

Optimiser¶

Linear Regression¶

Notation:

  • $Y$ is the true value,
  • $T$ is the estimate.
$$Y = XW + b + e$$

Let $T = XW + b$. (Note that the code below uses the opposite naming: lowercase y is the model's estimate and t is the true target.)

$$ \text{loss} = e^\top e = (Y-T)^\top (Y-T) $$

Differentiating with respect to $W$ and $b$ (since $T = XW + b$, the inner derivative flips the sign, so we write the gradients in terms of the residual $T - Y$, which is exactly what the code computes as y - t):

$$ \frac{\partial\, \text{loss}}{\partial W} = 2X^\top (T-Y) \quad, \quad \frac{\partial\, \text{loss}}{\partial b} = 2\cdot\mathbf{1}^\top (T-Y) $$

One more thing to consider here!

We do not want the number of samples to affect the size of the loss (and of its gradients), so we take the average instead of the sum.

$$ \frac{\partial\, \text{loss}}{\partial W} = \frac{2}{N}\, X^\top (T-Y) \quad, \quad \frac{\partial\, \text{loss}}{\partial b} = \frac{1}{N}\, \mathbf{1}^\top (T-Y) $$

(The constant factor of 2 on the bias gradient is dropped; it only rescales the learning rate, and this matches np.mean(y - t) in the code below.)
In [15]:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

def model(x, w, b):
    """
    Linear Regression
    """
    y = x @ w + b
    return y

def grad(x, w, b, t):
    """
    Gradients of the mean-squared-error loss w.r.t. w and b.
    """
    y = model(x, w, b)                  # current estimate
    dw = 2 / len(x) * x.T @ (y - t)     # (2/N) X^T (T - Y)
    db = np.mean(y - t, axis=0)         # (1/N) 1^T (T - Y)
    return dw, db
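
As a quick sanity check of the formulas above, we can compare the analytic gradients against central finite differences of the mean-squared error. This helper is not part of the original notebook; the name check_grad and the step size h are illustrative.

# Sanity check (illustrative, not from the original notebook): compare the
# analytic gradient with a central finite-difference estimate.
# Remember that grad() drops the factor of 2 on db, so the finite-difference
# bias gradient should be roughly 2 * db.
def check_grad(x, w, b, t, h=1e-6):
    def loss(w_, b_):
        return np.mean((model(x, w_, b_) - t) ** 2)

    dw, db = grad(x, w, b, t)
    w_plus, w_minus = w.copy(), w.copy()
    w_plus[0, 0] += h
    w_minus[0, 0] -= h
    dw_fd = (loss(w_plus, b) - loss(w_minus, b)) / (2 * h)   # d loss / d w[0,0]
    db_fd = (loss(w, b + h) - loss(w, b - h)) / (2 * h)      # d loss / d b
    print('dw[0,0]:', dw[0, 0], ' finite diff:', dw_fd)
    print('db:     ', db[0], ' finite diff (~ 2*db):', db_fd)

# Example usage on a tiny random problem:
# check_grad(np.random.randn(50, 1), np.random.rand(1, 1), np.zeros([1]), np.random.randn(50, 1))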

Forward¶

In [16]:
################## Initialise Values
x = np.random.randn(1000, 1)
w = np.random.rand(1, 1)
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])

y = model(x, w, b) # the estimate 
t = y + e # true value

################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red')
plt.show()
In [17]:
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1)  # Alter Dimension
w = np.random.rand(2, 1)  # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])

y = model(x, w, b) # the estimate 
t = y + e # true value

################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1)  # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red')
plt.show()

Gradient Descent¶

We find that the model fits the data noticeably better after running the optimiser (red is the initial model; blue is the fit after gradient descent).

In [18]:
eta = 0.1
################## Initialise Values
x = np.random.randn(1000, 1)
w = np.random.rand(1, 1)
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])

y = model(x, w, b) # the estimate 
t = y * ((2)) + e ############## Alter Parameters Here  - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red')
################## GD #################
for step in range(50):
    gw, gb =grad(x, w, b, t)
    w = w - eta * gw
    b = b - eta * gb
################## Plotting Again #############
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='b')
plt.show()
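
To back up the claim above with a number, here is a minimal sketch (not in the original notebook; mse, w_gd and b_gd are illustrative names) that prints the mean-squared error before and after the gradient-descent loop, reusing x, t, eta, model and grad from the cells above:

# Sketch (not in the original notebook): quantify the improvement by printing
# the mean-squared error before and after gradient descent.
def mse(x, w, b, t):
    return float(np.mean((model(x, w, b) - t) ** 2))

w_gd, b_gd = np.random.rand(1, 1), np.zeros([1])   # fresh random start
print('loss before GD:', mse(x, w_gd, b_gd, t))
for step in range(50):
    gw, gb = grad(x, w_gd, b_gd, t)
    w_gd, b_gd = w_gd - eta * gw, b_gd - eta * gb
print('loss after GD: ', mse(x, w_gd, b_gd, t))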
In [19]:
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1)  # Alter Dimension
w = np.random.rand(2, 1)  # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])

y = model(x, w, b) # the estimate 
t = y + ((2*x**2 -2*x))  + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1)  # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red')
################## GD #################
for step in range(50):
    gw, gb =grad(x, w, b, t)
    w = w - eta * gw
    b = b - eta * gb
################## Plotting Again #############
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='b')
plt.show()

Stochastic Gradient Descent¶

Mini-batches are introduced: each update step uses a small random subset of the samples instead of the whole dataset, which makes every step cheaper at the cost of a noisier gradient.

In [25]:
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1)  # Alter Dimension
w = np.random.rand(2, 1)  # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])

y = model(x, w, b) # the estimate 
t = y + ((2*x**2 -2*x))  + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1)  # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red', label = 'Origin')
################## GD #################
eta = 0.1
w_control, b_control = w, b
for step in range(10):
    gw, gb =grad(x, w_control, b_control, t)
    w_control = w_control - eta * gw
    b_control = b_control - eta * gb
    
yplt = model(xlim, w_control, b_control)
plt.plot(xlim[:,0], yplt[:,0], c='b', label='GD')
###########     SGD <- Batch      ###############
batch_size = 10
w_batch, b_batch = w, b
for step in range(10):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_batch, b_batch, t_idx)
    w_batch = w_batch - eta * gw
    b_batch = b_batch - eta * gb

yplt = model(xlim, w_batch, b_batch)
plt.plot(xlim[:,0], yplt[:,0], c='g', label='SGD')
plt.legend()
plt.show()
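
The cell above samples each batch with replacement via np.random.randint. A common alternative, shown here as a sketch that is not part of the original notebook (iterate_batches is an illustrative name), is to shuffle the indices once per epoch and sweep through the data in consecutive mini-batches:

# Sketch (not in the original notebook): epoch-style mini-batching without replacement.
def iterate_batches(x, t, batch_size):
    idx = np.random.permutation(len(x))          # shuffle once per epoch
    for start in range(0, len(x), batch_size):
        batch = idx[start:start + batch_size]
        yield x[batch], t[batch]

# Example usage: one epoch of SGD updates with the same grad() as above.
# for x_b, t_b in iterate_batches(x, t, batch_size=10):
#     gw, gb = grad(x_b, w_batch, b_batch, t_b)
#     w_batch, b_batch = w_batch - eta * gw, b_batch - eta * gb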

Momentum SGD¶

  • Apply Momentum to the Gradient, $\nabla$.
$$\text{Gradient Descent:} \quad w_t = w_{t-1} -\eta g_w$$

$$ \text{Momentum:}\quad v_t = \beta_1 v_{t-1} + (1-\beta_1)g_w $$

At the beginning of the iterations $v_0=0$, which biases $v_t$ towards zero, so we apply the bias correction $\hat{v}_t$:

$$ \hat{v_t} = \frac{v_t}{1-\beta_1^t} $$

where $\beta_1$ weights the previous value against the current gradient.

Replace $g_w$ with $\hat{v}_t$:

$$ w_t = w_{t-1} -\eta \hat{v}_t $$
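
A tiny numeric illustration (not part of the original notebook; the _demo names are only used here) of why the bias correction matters: with $v_0 = 0$ and a constant gradient of 1, the raw moving average $v_t$ starts far below 1, while $\hat{v}_t$ recovers 1 exactly.

# Illustration (not from the original notebook): bias correction on a constant gradient.
beta1_demo, v_demo = 0.9, 0.0
for step in range(1, 6):
    v_demo = beta1_demo * v_demo + (1 - beta1_demo) * 1.0   # EMA of a constant gradient g = 1
    v_hat_demo = v_demo / (1 - beta1_demo ** step)          # bias-corrected estimate
    print(step, round(v_demo, 4), round(v_hat_demo, 4))
# prints v_t = 0.1, 0.19, 0.271, ... while v_hat_t stays at 1.0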
In [26]:
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1)  # Alter Dimension
w = np.random.rand(2, 1)  # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])

y = model(x, w, b) # the estimate 
t = y + ((2*x**2 -2*x))  + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1)  # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red', label = 'Origin')


###############Optimiser#############

Epoch = 30
batch_size = 32
eta = 0.01
################## GD #################
w_control, b_control = w, b
for step in range(Epoch):
    gw, gb =grad(x, w_control, b_control, t)
    w_control = w_control - eta * gw
    b_control = b_control - eta * gb
    
yplt = model(xlim, w_control, b_control)
plt.plot(xlim[:,0], yplt[:,0], c='b', label = 'GD')
###########     SGD <- Batch      ###############
w_batch, b_batch = w, b
for step in range(Epoch):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_batch, b_batch, t_idx)
    w_batch = w_batch - eta * gw
    b_batch = b_batch - eta * gb

yplt = model(xlim, w_batch, b_batch)
plt.plot(xlim[:,0], yplt[:,0], c='g', label = 'SGD')
###########     Momentum <- batch + v      ###############
w_momentum, b_momentum = w, b
v_w, v_b = 0, 0
beta1 = 0.9
for step in range(Epoch):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_momentum, b_momentum, t_idx)
    v_w = beta1 * v_w + (1 - beta1)*gw
    v_b = beta1 * v_b + (1 - beta1)*gb
    v_w_hat = v_w / (1 - beta1**(step+1))
    v_b_hat = v_b / (1 - beta1**(step+1))
    w_momentum = w_momentum - eta * v_w_hat
    b_momentum = b_momentum - eta * v_b_hat

yplt = model(xlim, w_momentum, b_momentum)
plt.plot(xlim[:,0], yplt[:,0], c='y', label = 'Momentum SGD')
plt.legend()
plt.show()

RMSprop¶

  • Apply a Transformation to the Gradient, $\nabla$.
$$\text{Gradient Descent:} \quad w_t = w_{t-1} -\eta g_w$$

$$ \text{RMSprop:}\quad m_t = \beta_2 m_{t-1} + (1-\beta_2)g^2_w $$

At the beginning of the iterations $m_0=0$, which biases $m_t$ towards zero, so we apply the bias correction $\hat{m}_t$:

$$ \hat{m}_t = \frac{m_t}{1-\beta_2^t} $$

where $\beta_2$ weights the previous value against the squared gradient.

Divide the gradient by the square root of $\hat{m}_t$ (a small $\epsilon$ is added to avoid division by zero):

$$ w_t = w_{t-1} -\eta \frac{g_w}{\sqrt{\hat{m}_t}+\epsilon} $$
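
A compact sketch of a single RMSprop update matching the equations above (not part of the original notebook; the function name rmsprop_step and the default values are illustrative):

# Sketch (not in the original notebook): one RMSprop update for a parameter array.
import numpy as np

def rmsprop_step(param, g, m, step, eta=0.01, beta2=0.999, eps=1e-8):
    m = beta2 * m + (1 - beta2) * g ** 2       # moving average of squared gradients
    m_hat = m / (1 - beta2 ** step)            # bias correction (step starts at 1)
    param = param - eta * g / (np.sqrt(m_hat) + eps)
    return param, m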
In [27]:
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1)  # Alter Dimension
w = np.random.rand(2, 1)  # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])

y = model(x, w, b) # the estimate 
t = y + ((2*x**2 -2*x))  + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1)  # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red', label = 'Origin')


###############Optimiser#############

Epoch = 30
batch_size = 32
eta = 0.01
################## GD #################
w_control, b_control = w, b
for step in range(Epoch):
    gw, gb =grad(x, w_control, b_control, t)
    w_control = w_control - eta * gw
    b_control = b_control - eta * gb
    
yplt = model(xlim, w_control, b_control)
plt.plot(xlim[:,0], yplt[:,0], c='b', label = 'GD')
###########     SGD <- Batch      ###############
w_batch, b_batch = w, b
for step in range(Epoch):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_batch, b_batch, t_idx)
    w_batch = w_batch - eta * gw
    b_batch = b_batch - eta * gb

yplt = model(xlim, w_batch, b_batch)
plt.plot(xlim[:,0], yplt[:,0], c='g', label = 'SGD')
###########     Momentum <- batch + v      ###############
w_momentum, b_momentum = w, b
v_w, v_b = 0, 0
beta1 = 0.9
for step in range(Epoch):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_momentum, b_momentum, t_idx)
    v_w = beta1 * v_w + (1 - beta1)*gw
    v_b = beta1 * v_b + (1 - beta1)*gb
    v_w_hat = v_w / (1 - beta1**(step+1))
    v_b_hat = v_b / (1 - beta1**(step+1))
    w_momentum = w_momentum - eta * v_w_hat
    b_momentum = b_momentum - eta * v_b_hat

yplt = model(xlim, w_momentum, b_momentum)
plt.plot(xlim[:,0], yplt[:,0], c='y', label = 'Momentum SGD')
###########     RMSprop <- batch + m      ###############
w_res, b_res = w, b
v_w, v_b = 0, 0
m_w, m_b = 0, 0
beta1 = 0.9
beta2 = 0.999
for step in range(Epoch):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_res, b_res, t_idx)
    v_w = beta1 * v_w + (1 - beta1)*gw
    v_b = beta1 * v_b + (1 - beta1)*gb
    v_w_hat = v_w / (1 - beta1**(step+1))
    v_b_hat = v_b / (1 - beta1**(step+1))
    m_w = beta2 * m_w + (1 - beta2)*gw**2
    m_b = beta2 * m_b + (1 - beta2)*gb**2
    m_w_hat = m_w / (1 - beta2**(step+1))
    m_b_hat = m_b / (1 - beta2**(step+1))
    w_res = w_res - eta * gw / (np.sqrt(m_w_hat) + 1e-8)  # divide by the RMS scale
    b_res = b_res - eta * gb / (np.sqrt(m_b_hat) + 1e-8)

yplt = model(xlim, w_res, b_res)
plt.plot(xlim[:,0], yplt[:,0], c='pink', label = 'RMSprop')
plt.legend()
plt.show()

Adam¶

Adam combines the momentum estimate $\hat{v}_t$ with the RMSprop scaling $\hat{m}_t$:

$$ w_t = w_{t-1} - \eta \cdot \frac{ \hat{v}_t }{\sqrt{\hat{m}_t}+\epsilon} $$
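
A compact sketch of one Adam update combining the two estimates above (not part of the original notebook; adam_step and the default values are illustrative):

# Sketch (not in the original notebook): one Adam update for a parameter array.
import numpy as np

def adam_step(param, g, v, m, step, eta=0.01, beta1=0.9, beta2=0.999, eps=1e-8):
    v = beta1 * v + (1 - beta1) * g            # first moment (momentum)
    m = beta2 * m + (1 - beta2) * g ** 2       # second moment (RMSprop)
    v_hat = v / (1 - beta1 ** step)            # bias corrections (step starts at 1)
    m_hat = m / (1 - beta2 ** step)
    param = param - eta * v_hat / (np.sqrt(m_hat) + eps)
    return param, v, m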
In [29]:
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1)  # Alter Dimension
w = np.random.rand(2, 1)  # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])

y = model(x, w, b) # the estimate 
t = y + ((2*x**2 -2*x))  + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1)  # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
#plt.plot(xlim[:,0], yplt[:,0], c='red', label = 'Origin')


###############Optimiser#############

Epoch = 30
batch_size = 32
eta = 0.01
################## GD #################
w_control, b_control = w, b
for step in range(Epoch):
    gw, gb =grad(x, w_control, b_control, t)
    w_control = w_control - eta * gw
    b_control = b_control - eta * gb
    
yplt = model(xlim, w_control, b_control)
plt.plot(xlim[:,0], yplt[:,0], c='b', label = 'GD')
###########     SGD <- Batch      ###############
w_batch, b_batch = w, b
for step in range(Epoch):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_batch, b_batch, t_idx)
    w_batch = w_batch - eta * gw
    b_batch = b_batch - eta * gb

yplt = model(xlim, w_batch, b_batch)
plt.plot(xlim[:,0], yplt[:,0], c='g', label = 'SGD')
###########     Momentum <- batch + v      ###############
w_momentum, b_momentum = w, b
v_w, v_b = 0, 0
beta1 = 0.9
for step in range(Epoch):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_momentum, b_momentum, t_idx)
    v_w = beta1 * v_w + (1 - beta1)*gw
    v_b = beta1 * v_b + (1 - beta1)*gb
    v_w_hat = v_w / (1 - beta1**(step+1))
    v_b_hat = v_b / (1 - beta1**(step+1))
    w_momentum = w_momentum - eta * v_w_hat
    b_momentum = b_momentum - eta * v_b_hat

yplt = model(xlim, w_momentum, b_momentum)
plt.plot(xlim[:,0], yplt[:,0], c='y', label = 'Momentum SGD')
###########     RMSprop <- batch + m      ###############
w_res, b_res = w, b
v_w, v_b = 0, 0
m_w, m_b = 0, 0
beta1 = 0.9
beta2 = 0.999
for step in range(Epoch):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_res, b_res, t_idx)
    v_w = beta1 * v_w + (1 - beta1)*gw
    v_b = beta1 * v_b + (1 - beta1)*gb
    v_w_hat = v_w / (1 - beta1**(step+1))
    v_b_hat = v_b / (1 - beta1**(step+1))
    m_w = beta2 * m_w + (1 - beta2)*gw**2
    m_b = beta2 * m_b + (1 - beta2)*gb**2
    m_w_hat = m_w / (1 - beta2**(step+1))
    m_b_hat = m_b / (1 - beta2**(step+1))
    w_res = w_res - eta * gw / (np.sqrt(m_w_hat) + 1e-8)  # divide by the RMS scale
    b_res = b_res - eta * gb / (np.sqrt(m_b_hat) + 1e-8)

yplt = model(xlim, w_res, b_res)
plt.plot(xlim[:,0], yplt[:,0], c='pink', label = 'RMSprop')

###########     ADAM     ###############
w_adam, b_adam = w, b
v_w, v_b = 0, 0
m_w, m_b = 0, 0
beta1 = 0.9
beta2 = 0.999
for step in range(Epoch):
    # Sample a random mini-batch (with replacement)
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb =grad(x_idx, w_adam, b_adam, t_idx)
    v_w = beta1 * v_w + (1 - beta1)*gw
    v_b = beta1 * v_b + (1 - beta1)*gb
    v_w_hat = v_w / (1 - beta1**(step+1))
    v_b_hat = v_b / (1 - beta1**(step+1))
    m_w = beta2 * m_w + (1 - beta2)*gw**2
    m_b = beta2 * m_b + (1 - beta2)*gb**2
    m_w_hat = m_w / (1 - beta2**(step+1))
    m_b_hat = m_b / (1 - beta2**(step+1))
    w_adam = w_adam - eta * v_w_hat / (np.sqrt(m_w_hat) + 1e-8)  # divide momentum by the RMS scale
    b_adam = b_adam - eta * v_b_hat / (np.sqrt(m_b_hat) + 1e-8)

yplt = model(xlim, w_adam, b_adam)
plt.plot(xlim[:,0], yplt[:,0], c='purple', label = 'Adam')
plt.legend()
plt.show()
In [ ]: