by Fanyu
Notation:
Let the prediction be $Y = XW + b$ and the target be $T$, so the residual is $e = Y - T$.
$$ loss = e^T e $$$$ loss = (Y-T)^T (Y-T) = (XW+b-T)^T (XW+b-T) $$$$ \frac{\partial loss}{\partial W} = 2X^T (Y-T) \quad, \quad \frac{\partial loss}{\partial b} = 2 \cdot 1^T (Y-T) $$
Just One More Thing We Consider Here!
We do not want the number of samples to affect the loss value, so we take the average over the $N$ samples instead of the sum:
$$ \frac{\partial loss}{\partial W} = \frac{2}{N}\, X^T (Y-T) \quad, \quad \frac{\partial loss}{\partial b} = \frac{2}{N}\, 1^T (Y-T) $$
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
def model(x, w, b):
    """
    Linear regression: y = x @ w + b
    """
    y = x @ w + b
    return y

def grad(x, w, b, t):
    """Gradients of the mean squared error with respect to w and b."""
    y = model(x, w, b)
    dw = 2 / len(x) * x.T @ (y - t)
    db = 2 * np.mean(y - t, axis=0)
    return dw, db
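As a quick sanity check (our addition, not in the original notebook), we can define the loss explicitly and compare the analytic gradients from grad against central finite differences; the two should agree to several decimal places.
def loss(x, w, b, t):
    """Mean squared error between model output and target."""
    return np.mean((model(x, w, b) - t) ** 2)

# Finite-difference check on a small random problem (shapes are illustrative)
x_chk = np.random.randn(50, 1)
w_chk = np.random.rand(1, 1)
b_chk = np.zeros([1])
t_chk = 2 * x_chk + np.random.normal(0, 0.1, [50, 1])
dw, db = grad(x_chk, w_chk, b_chk, t_chk)
h = 1e-6
dw_num = (loss(x_chk, w_chk + h, b_chk, t_chk) - loss(x_chk, w_chk - h, b_chk, t_chk)) / (2 * h)
db_num = (loss(x_chk, w_chk, b_chk + h, t_chk) - loss(x_chk, w_chk, b_chk - h, t_chk)) / (2 * h)
print(dw.item(), dw_num)   # analytic vs numeric gradient for w
print(db.item(), db_num)   # analytic vs numeric gradient for b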
################## Initialise Values
x = np.random.randn(1000, 1)
w = np.random.rand(1, 1)
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])
y = model(x, w, b) # the estimate
t = y + e # true value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red')
plt.show()
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1) # Alter Dimension
w = np.random.rand(2, 1) # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])
y = model(x, w, b) # the estimate
t = y + e # true value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1) # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red')
plt.show()
We find the model fits noticeably better after running the optimiser, as the next cells show.
eta = 0.1
################## Initialise Values
x = np.random.randn(1000, 1)
w = np.random.rand(1, 1)
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])
y = model(x, w, b) # the estimate
t = y * 2 + e ############## Alter Parameters Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red')
################## GD #################
for step in range(50):
    gw, gb = grad(x, w, b, t)
    w = w - eta * gw
    b = b - eta * gb
################## Plotting Again #############
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='b')
plt.show()
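To put a number on the improvement, here is a quick before/after comparison of the mean squared error (a sketch reusing x, t and eta from the cell above, plus the loss helper defined earlier):
w0, b0 = np.random.rand(1, 1), np.zeros([1])   # a fresh random start
print('loss before GD:', loss(x, w0, b0, t))
for step in range(50):
    gw, gb = grad(x, w0, b0, t)
    w0 = w0 - eta * gw
    b0 = b0 - eta * gb
print('loss after GD: ', loss(x, w0, b0, t))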
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1) # Alter Dimension
w = np.random.rand(2, 1) # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])
y = model(x, w, b) # the estimate
t = y + 2*x[:, [0]]**2 - 2*x[:, [0]] + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1) # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red')
################## GD #################
for step in range(50):
    gw, gb = grad(x, w, b, t)
    w = w - eta * gw
    b = b - eta * gb
################## Plotting Again #############
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='b')
plt.show()
Next, mini-batches are added.
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1) # Alter Dimension
w = np.random.rand(2, 1) # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])
y = model(x, w, b) # the estimate
t = y + 2*x[:, [0]]**2 - 2*x[:, [0]] + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1) # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red', label = 'Origin')
################## GD #################
eta = 0.1
w_control, b_control = w, b
for step in range(10):
    gw, gb = grad(x, w_control, b_control, t)
    w_control = w_control - eta * gw
    b_control = b_control - eta * gb
yplt = model(xlim, w_control, b_control)
plt.plot(xlim[:,0], yplt[:,0], c='b', label='GD')
########### SGD <- Batch ###############
batch_size = 10
w_batch, b_batch = w, b
for step in range(10):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_batch, b_batch, t_idx)
    w_batch = w_batch - eta * gw
    b_batch = b_batch - eta * gb
yplt = model(xlim, w_batch, b_batch)
plt.plot(xlim[:,0], yplt[:,0], c='g', label='SGD')
plt.legend()
plt.show()
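The two fitted curves typically end up close; the difference is in the per-step behaviour. Here is a sketch (our addition) that tracks the full-data loss of both optimisers from the same starting point:
hist_gd, hist_sgd = [], []
w_g, b_g = w, b
w_s, b_s = w, b
for step in range(50):
    gw, gb = grad(x, w_g, b_g, t)                   # full-batch step
    w_g, b_g = w_g - eta * gw, b_g - eta * gb
    idx = np.random.randint(0, len(x), batch_size)  # mini-batch step
    gw, gb = grad(x[idx], w_s, b_s, t[idx])
    w_s, b_s = w_s - eta * gw, b_s - eta * gb
    hist_gd.append(loss(x, w_g, b_g, t))
    hist_sgd.append(loss(x, w_s, b_s, t))
plt.plot(hist_gd, label='GD')    # smooth decrease
plt.plot(hist_sgd, label='SGD')  # noisier, but each step is 100x cheaper here
plt.legend()
plt.show()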
At the beginning of the iteration, $v_0 = 0$ biases the early averages toward zero, so we amend $v_t$ to the bias-corrected $\hat{v}_t$:
$$ v_t = \beta_1 v_{t-1} + (1-\beta_1)\, g_t \quad, \quad \hat{v}_t = \frac{v_t}{1-\beta_1^t} $$
where $\beta_1$ assigns the weights between the previous value and the current gradient. For example, at $t=1$ the raw $v_1 = (1-\beta_1)\, g_1$ is shrunk, while $\hat{v}_1 = v_1 / (1-\beta_1) = g_1$ recovers the full step.
Replace $g_w$ with $\hat{v}_t$ in the update:
$$ w_t = w_{t-1} - \eta\, \hat{v}_t $$
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1) # Alter Dimension
w = np.random.rand(2, 1) # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])
y = model(x, w, b) # the estimate
t = y + 2*x[:, [0]]**2 - 2*x[:, [0]] + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1) # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red', label = 'Origin')
############### Optimiser #############
Epoch = 30
batch_size = 32
eta = 0.01
################## GD #################
w_control, b_control = w, b
for step in range(Epoch):
    gw, gb = grad(x, w_control, b_control, t)
    w_control = w_control - eta * gw
    b_control = b_control - eta * gb
yplt = model(xlim, w_control, b_control)
plt.plot(xlim[:,0], yplt[:,0], c='b', label='GD')
########### SGD <- Batch ###############
w_batch, b_batch = w, b
for step in range(Epoch):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_batch, b_batch, t_idx)
    w_batch = w_batch - eta * gw
    b_batch = b_batch - eta * gb
yplt = model(xlim, w_batch, b_batch)
plt.plot(xlim[:,0], yplt[:,0], c='g', label='SGD')
########### Momentum <- batch + v ###############
w_momentum, b_momentum = w, b
v_w, v_b = 0, 0
beta1 = 0.9
for step in range(Epoch):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_momentum, b_momentum, t_idx)
    v_w = beta1 * v_w + (1 - beta1) * gw
    v_b = beta1 * v_b + (1 - beta1) * gb
    v_w_hat = v_w / (1 - beta1**(step + 1))   # bias correction
    v_b_hat = v_b / (1 - beta1**(step + 1))
    w_momentum = w_momentum - eta * v_w_hat
    b_momentum = b_momentum - eta * v_b_hat
yplt = model(xlim, w_momentum, b_momentum)
plt.plot(xlim[:,0], yplt[:,0], c='y', label='Momentum SGD')
plt.legend()
plt.show()
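The effect of the bias correction is easiest to see on a constant gradient $g = 1$ (a toy example of ours, independent of the data above): the raw $v_t$ starts far below 1, while the corrected $\hat{v}_t$ equals 1 from the very first step.
beta1, v = 0.9, 0.0
for t_step in range(1, 6):
    v = beta1 * v + (1 - beta1) * 1.0
    print(t_step, round(v, 4), round(v / (1 - beta1**t_step), 4))  # raw vs corrected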
Likewise, $m_0 = 0$ biases the early averages toward zero, so we amend $m_t$ to the bias-corrected $\hat{m}_t$:
$$ m_t = \beta_2 m_{t-1} + (1-\beta_2)\, g_t^2 \quad, \quad \hat{m}_t = \frac{m_t}{1-\beta_2^t} $$
where $\beta_2$ assigns the weights between the previous value and the squared gradient.
Add a square root of $\hat{m}_t$ to the denominator of the update (with a small $\epsilon$ for numerical stability):
$$ w_t = w_{t-1} - \eta\, \frac{g_w}{\sqrt{\hat{m}_t}+\epsilon} $$
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1) # Alter Dimension
w = np.random.rand(2, 1) # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])
y = model(x, w, b) # the estimate
t = y + 2*x[:, [0]]**2 - 2*x[:, [0]] + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1) # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
plt.plot(xlim[:,0], yplt[:,0], c='red', label = 'Origin')
############### Optimiser #############
Epoch = 30
batch_size = 32
eta = 0.01
################## GD #################
w_control, b_control = w, b
for step in range(Epoch):
    gw, gb = grad(x, w_control, b_control, t)
    w_control = w_control - eta * gw
    b_control = b_control - eta * gb
yplt = model(xlim, w_control, b_control)
plt.plot(xlim[:,0], yplt[:,0], c='b', label='GD')
########### SGD <- Batch ###############
w_batch, b_batch = w, b
for step in range(Epoch):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_batch, b_batch, t_idx)
    w_batch = w_batch - eta * gw
    b_batch = b_batch - eta * gb
yplt = model(xlim, w_batch, b_batch)
plt.plot(xlim[:,0], yplt[:,0], c='g', label='SGD')
########### Momentum <- batch + v ###############
w_momentum, b_momentum = w, b
v_w, v_b = 0, 0
beta1 = 0.9
for step in range(Epoch):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_momentum, b_momentum, t_idx)
    v_w = beta1 * v_w + (1 - beta1) * gw
    v_b = beta1 * v_b + (1 - beta1) * gb
    v_w_hat = v_w / (1 - beta1**(step + 1))   # bias correction
    v_b_hat = v_b / (1 - beta1**(step + 1))
    w_momentum = w_momentum - eta * v_w_hat
    b_momentum = b_momentum - eta * v_b_hat
yplt = model(xlim, w_momentum, b_momentum)
plt.plot(xlim[:,0], yplt[:,0], c='y', label='Momentum SGD')
########### RMSprop <- batch + m ###############
w_rms, b_rms = w, b
m_w, m_b = 0, 0
beta2 = 0.999
eps = 1e-8
for step in range(Epoch):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_rms, b_rms, t_idx)
    m_w = beta2 * m_w + (1 - beta2) * gw**2
    m_b = beta2 * m_b + (1 - beta2) * gb**2
    m_w_hat = m_w / (1 - beta2**(step + 1))   # bias correction
    m_b_hat = m_b / (1 - beta2**(step + 1))
    w_rms = w_rms - eta * gw / (np.sqrt(m_w_hat) + eps)
    b_rms = b_rms - eta * gb / (np.sqrt(m_b_hat) + eps)
yplt = model(xlim, w_rms, b_rms)
plt.plot(xlim[:,0], yplt[:,0], c='pink', label='RMSprop')
plt.legend()
plt.show()
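Why divide by $\sqrt{\hat{m}_t}$? It makes the update roughly scale-free: for a steady gradient $g$, $\hat{m}_t \approx g^2$, so $g / \sqrt{\hat{m}_t} \approx \pm 1$ whatever the magnitude of $g$. A toy illustration (our addition):
beta2, eps = 0.999, 1e-8
for g in (0.01, 10.0):                       # gradients of very different scales
    m = 0.0
    for t_step in range(1, 101):
        m = beta2 * m + (1 - beta2) * g**2
    m_hat = m / (1 - beta2**100)
    print(g, g / (np.sqrt(m_hat) + eps))     # effective step size is ~1.0 in both cases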
################## Initialise Values
x = np.random.randn(1000, 1)
x = np.concatenate([x, x**2], axis =1) # Alter Dimension
w = np.random.rand(2, 1) # Alter the Dimension
b = np.zeros([1])
e = np.random.normal(0, 0.1, [1000, 1])
y = model(x, w, b) # the estimate
t = y + 2*x[:, [0]]**2 - 2*x[:, [0]] + e ############## Alter Parameter Here - True Value
################## Plotting
xlim = np.linspace(-3,3,1000).reshape(-1,1)
xlim = np.concatenate([xlim, xlim**2], axis =1) # Alter Dimension
plt.scatter(x[:,0], t[:,0], c='black')
yplt = model(xlim, w, b)
#plt.plot(xlim[:,0], yplt[:,0], c='red', label = 'Origin')
############### Optimiser #############
Epoch = 30
batch_size = 32
eta = 0.01
################## GD #################
w_control, b_control = w, b
for step in range(Epoch):
    gw, gb = grad(x, w_control, b_control, t)
    w_control = w_control - eta * gw
    b_control = b_control - eta * gb
yplt = model(xlim, w_control, b_control)
plt.plot(xlim[:,0], yplt[:,0], c='b', label='GD')
########### SGD <- Batch ###############
w_batch, b_batch = w, b
for step in range(Epoch):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_batch, b_batch, t_idx)
    w_batch = w_batch - eta * gw
    b_batch = b_batch - eta * gb
yplt = model(xlim, w_batch, b_batch)
plt.plot(xlim[:,0], yplt[:,0], c='g', label='SGD')
########### Momentum <- batch + v ###############
w_momentum, b_momentum = w, b
v_w, v_b = 0, 0
beta1 = 0.9
for step in range(Epoch):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_momentum, b_momentum, t_idx)
    v_w = beta1 * v_w + (1 - beta1) * gw
    v_b = beta1 * v_b + (1 - beta1) * gb
    v_w_hat = v_w / (1 - beta1**(step + 1))   # bias correction
    v_b_hat = v_b / (1 - beta1**(step + 1))
    w_momentum = w_momentum - eta * v_w_hat
    b_momentum = b_momentum - eta * v_b_hat
yplt = model(xlim, w_momentum, b_momentum)
plt.plot(xlim[:,0], yplt[:,0], c='y', label='Momentum SGD')
########### RMSprop <- batch + m ###############
w_rms, b_rms = w, b
m_w, m_b = 0, 0
beta2 = 0.999
eps = 1e-8
for step in range(Epoch):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_rms, b_rms, t_idx)
    m_w = beta2 * m_w + (1 - beta2) * gw**2
    m_b = beta2 * m_b + (1 - beta2) * gb**2
    m_w_hat = m_w / (1 - beta2**(step + 1))   # bias correction
    m_b_hat = m_b / (1 - beta2**(step + 1))
    w_rms = w_rms - eta * gw / (np.sqrt(m_w_hat) + eps)
    b_rms = b_rms - eta * gb / (np.sqrt(m_b_hat) + eps)
yplt = model(xlim, w_rms, b_rms)
plt.plot(xlim[:,0], yplt[:,0], c='pink', label='RMSprop')
########### Adam <- momentum + RMS ###############
w_adam, b_adam = w, b
v_w, v_b = 0, 0
m_w, m_b = 0, 0
beta1 = 0.9
beta2 = 0.999
eps = 1e-8
for step in range(Epoch):
    # Sample a random mini-batch
    idx = np.random.randint(0, 1000, batch_size)
    x_idx = x[idx]
    t_idx = t[idx]
    gw, gb = grad(x_idx, w_adam, b_adam, t_idx)
    v_w = beta1 * v_w + (1 - beta1) * gw
    v_b = beta1 * v_b + (1 - beta1) * gb
    v_w_hat = v_w / (1 - beta1**(step + 1))   # bias-corrected first moment
    v_b_hat = v_b / (1 - beta1**(step + 1))
    m_w = beta2 * m_w + (1 - beta2) * gw**2
    m_b = beta2 * m_b + (1 - beta2) * gb**2
    m_w_hat = m_w / (1 - beta2**(step + 1))   # bias-corrected second moment
    m_b_hat = m_b / (1 - beta2**(step + 1))
    w_adam = w_adam - eta * v_w_hat / (np.sqrt(m_w_hat) + eps)
    b_adam = b_adam - eta * v_b_hat / (np.sqrt(m_b_hat) + eps)
yplt = model(xlim, w_adam, b_adam)
plt.plot(xlim[:,0], yplt[:,0], c='purple', label='Adam')   # 'blue' would clash with the GD line
plt.legend()
plt.show()
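Beyond the visual comparison, we can print the final full-data loss of each optimiser (a sketch assuming the variables from the cell above and the loss helper are still in scope):
for name, (wi, bi) in [('GD', (w_control, b_control)),
                       ('SGD', (w_batch, b_batch)),
                       ('Momentum', (w_momentum, b_momentum)),
                       ('RMSprop', (w_rms, b_rms)),
                       ('Adam', (w_adam, b_adam))]:
    print(f'{name:9s} loss = {loss(x, wi, bi, t):.4f}')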