By Fanyu
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(-10,10,0.01)
# dL/dy, i.e. the partial derivative passed down from upstream in the chain
dupstream = np.zeros_like(x)
We consider the composition $Loss\big(Softmax\big(a(f(x))\big)\big)$,
where $f(x) = xw + b$ is the affine transformation,
where $a(x)$ is the activation function,
and where $Loss(x)$ is the loss function.
We aim to calculate the parameters of $f(\cdot)$ by minimising the loss, so we take the first-order condition (F.O.C.) of $Loss(\cdot)$:
$$ \frac{\partial Loss(w)}{\partial w} = 0 $$By the chain rule,
$$ \frac{\partial Loss(w)}{\partial w} = \frac{\partial Loss(.)}{\partial Softmax} \cdot \frac{\partial Softmax(.)}{\partial a} \cdot \frac{\partial a}{\partial f} \cdot \frac{\partial f}{\partial a} \cdots \frac{\partial f}{\partial w} $$Let's decompose each part in the following.
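Before decomposing each part, here is a minimal sketch of the pattern used throughout (the toy Square and PlusOne layers are made up for illustration, not part of the derivation): every layer exposes a forward() and a backward(), and backward() multiplies the upstream gradient by the layer's local derivative, contributing exactly one factor of the product above.
class Square:                     # toy layer: f(x) = x**2, local derivative 2x
    def forward(self, x):
        self.x = x
        return x ** 2
    def backward(self, dupstream):
        return dupstream * 2 * self.x

class PlusOne:                    # toy layer: g(x) = x + 1, local derivative 1
    def forward(self, x):
        return x + 1
    def backward(self, dupstream):
        return dupstream * 1

sq, p1 = Square(), PlusOne()
print(p1.forward(sq.forward(3.0)))      # forward pass: (3**2) + 1 = 10.0
print(sq.backward(p1.backward(1.0)))    # backward pass: 1 * 1 * 2*3 = 6.0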
class Relu:
    def __init__(self):
        self.mask = None
        self.out = None

    def forward(self, x):
        """
        Run the ReLU, and save the index of the entries with x <= 0.
        """
        self.mask = (x <= 0)
        self.out = x.copy()
        self.out[self.mask] = 0
        return self.out

    def backward(self, dupstream):
        """
        Return the derivative of ReLU:
        the local derivative is 1 where x > 0 and 0 otherwise;
        by the chain rule, we multiply it by the upstream gradient.
        """
        dupstream[self.mask] = 0
        dx = dupstream
        return dx
# forward
relu = Relu()
test_result = relu.forward(x = x)
plt.plot(x, test_result)
# backward
# the gradient is either 10 or 0: the 10 comes from 1 * dupstream, with the upstream gradient set to 10
dx_relu = relu.backward(dupstream+10)
plt.plot(x, dx_relu)
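As a quick sanity check (a sketch, not in the original notebook): with an upstream gradient of all ones, the backward pass should reproduce the indicator function of x > 0.
print(np.allclose(relu.backward(np.ones_like(x)), (x > 0).astype(float)))   # True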
class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dupstream):
        # with y = self.out: dx = dupstream * y * (1 - y)
        dx = dupstream * (1 - self.out) * self.out
        return dx
# forward
sigmoid = Sigmoid()
plt.plot(x, sigmoid.forward(x))
plt.plot(x,np.tanh(x))
# backward
# manually shift one curve up a little, so the two can be distinguished
plt.plot(x, sigmoid.backward(dupstream+1)+0.1)
plt.plot(x, sigmoid.out * (1-sigmoid.out))
# without the offset, the two lines would coincide:
# with an upstream gradient of 1, backward() returns exactly y * (1 - y)
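The analytic gradient can also be cross-checked numerically. A sketch with a central-difference approximation (the names sig_check, num_grad, ana_grad are made up here):
h = 1e-5
sig_check = Sigmoid()
num_grad = (sig_check.forward(x + h) - sig_check.forward(x - h)) / (2 * h)
y_val = sig_check.forward(x)
ana_grad = y_val * (1 - y_val)
print(np.abs(num_grad - ana_grad).max())   # tiny, on the order of 1e-10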
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None
        self.out = None

    def forward(self, x):
        self.x = x
        self.out = np.dot(x, self.W) + self.b
        return self.out

    def backward(self, dupstream):
        dx = np.dot(dupstream, self.W.T)       # dL/dx = dupstream . W^T (right-multiply)
        self.dW = np.dot(self.x.T, dupstream)  # dL/dW = x^T . dupstream (left-multiply)
        self.db = np.sum(dupstream, axis=0)    # dL/db: sum over the batch axis
        return dx
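A small shape check (a sketch with made-up dimensions): for a batch of 2 samples with 3 features mapped to 4 outputs, backward() must return gradients with the same shapes as x, W and b respectively.
W_demo = np.random.randn(3, 4)
b_demo = np.random.randn(4)
affine_demo = Affine(W_demo, b_demo)
out_demo = affine_demo.forward(np.random.randn(2, 3))            # shape (2, 4)
dx_demo = affine_demo.backward(np.ones_like(out_demo))           # upstream gradient of ones
print(dx_demo.shape, affine_demo.dW.shape, affine_demo.db.shape) # (2, 3) (3, 4) (4,)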
The derivation is as follows,
https://blog.csdn.net/Wild_Young/article/details/121912675
$\frac{\partial Y}{\partial X}$:
$$Softmax(x)_k = \frac{e^{x_k}}{\sum_i e^{x_i}}$$Let $X$ be a vector with shape = (n,1), and $Y = Softmax(X)$.
That means, for each element $y_k$ of $Y$,
$$ y_k = \frac{e^{x_k}}{\sum_i^N e^{x_i}} $$So, we need
$$ \frac{\partial Y}{\partial x_k} $$If $j = i$,
$$ \frac{\partial y_j}{\partial x_i} = \frac{\partial }{\partial x_i} \Bigg( \frac{e^{x_i}}{\sum e^{x_i}} \Bigg) $$$$ = \frac{ (e^{x_i})'\sum e^{x_i} - e^{x_i} (\sum e^{x_i})' }{\big( \sum e^{x_i} \big)^2} $$$$ =\frac{e^{x_i}}{\sum e^{x_i}} - \frac{e^{x_i}}{\sum e^{x_i}} \cdot \frac{e^{x_i}}{\sum e^{x_i}}$$$$= y_i - y_i^2 =y_i(1-y_i)$$If $j \neq i$,
$$ \frac{\partial y_j}{\partial x_i} = \frac{\partial }{\partial x_i} \Bigg( \frac{e^{x_j}}{\sum e^{x_i}} \Bigg) $$$$ = \frac{-e^{x_i} e^{x_j}}{(\sum e^{x_i})^2} $$$$ =- \frac{e^{x_j}}{\sum e^{x_i}} \cdot \frac{e^{x_i}}{\sum e^{x_i}} $$$$ = -y_j y_i $$$$ \frac{\partial y_j}{\partial x_i}=\left\{ \begin{aligned} & y_i - y_i^2 \quad \text{if $i = j$} \\ & -y_j y_i \quad \text{if $i \neq j$}\\ \end{aligned} \right. $$Therefore, for $ \frac{\partial Y}{\partial X} $, we get the Jacobian matrix,
$$\frac{\partial \vec{Y}}{\partial \vec{X}} = Diag(Y) - Y^T Y$$However, in the backward propagation of Softmax, we only need the diagonal, which is the case $i = j$.
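A numeric illustration (a sketch with a made-up probability vector): build the full Jacobian $Diag(Y) - Y^T Y$ and confirm that its diagonal equals $y_i(1-y_i)$, the part kept below.
y_demo = np.array([0.1, 0.2, 0.3, 0.4])                        # already sums to 1
jacobian = np.diag(y_demo) - np.outer(y_demo, y_demo)          # Diag(Y) - Y^T Y
print(np.allclose(np.diag(jacobian), y_demo * (1 - y_demo)))   # True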
$$ \frac{\partial y_i}{\partial x_i} = y_i-y_i^2 =y_i(1-y_i)$$
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[:, t] + 1e-7)) / batch_size
def softmax(x):
    c = x.max(axis = 0)   # subtract the max to avoid overflow
    x = x - c
    if x.ndim != 1:
        temp = np.exp(x)
        out = temp/temp.sum(axis = 0)
        return out
    temp = np.exp(x)
    out = temp/temp.sum()
    return out
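A quick illustration (a sketch) of why the max is subtracted first: shifting the input by a constant leaves softmax unchanged, while np.exp(1000) on its own would overflow.
logits = np.array([1000.0, 1001.0, 1002.0])
print(softmax(logits))                                        # finite, sums to 1
print(np.allclose(softmax(logits), softmax(logits - 1000)))   # True: shift-invariant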
class Softmax:
    def __init__(self):
        self.out = None
        self.dx = None

    def forward(self, x):
        c = x.max(axis = 0)   # subtract the max to avoid overflow
        x = x - c
        if x.ndim != 1:
            temp = np.exp(x)
            self.out = temp/temp.sum(axis = 0)
            return self.out
        temp = np.exp(x)
        self.out = temp/temp.sum()
        return self.out

    def backward(self, dupstream):
        """
        Full Jacobian matrix (only its diagonal is used below):
        self.dx = np.diag(self.out) - \
                  np.dot(self.out.reshape(-1,1) , \
                         self.out.reshape(1,-1) )
        """
        self.dx = self.out - np.square(self.out)
        return self.dx   # diagonal of the Jacobian, same shape as self.out
softmax = Softmax()
softmax_out = softmax.forward(x)
plt.plot(x, softmax_out)
softmax_dx = softmax.backward(dupstream)
plt.plot(x, softmax_dx)
print(softmax_out.sum())
0.9999999999999999
# wait to be continued
class LossFunc:
    def __init__(self):
        self.cross_entropy_out = None
        self.MSE_out = None
        self.y = None
        self.t = None

    def cross_entropy(self, y, t):
        self.y = y
        self.t = t
        if self.y.ndim == 1:
            self.t = t.reshape(1, t.size)
            self.y = y.reshape(1, y.size)
        batch_size = self.y.shape[0]
        self.cross_entropy_out = -np.sum(np.log(self.y[:, t] + 1e-7)) / batch_size
        return self.cross_entropy_out

    def MSE(self, y, t):
        pass
The Cross-Entropy Loss function is, $$ L= - \sum_i t_i \ ln(y_i) =-\bigg(\vec{\mathbf{1}}^T\bigg)_{1\times n} \cdot \ \Bigg( \mathbf{t}_{n\times k} \cdot \big(\vec{Y_{log}}^T\big)_{k\times n}\Bigg)_{n\times n} \cdot \bigg(\vec{\mathbf{1}}\bigg)_{n\times 1}$$
, where $\vec{Y_{log}}^T$ applies $\log$ to each element of $\vec{Y}$ and then takes the transpose.
https://www.matrixcalculus.org
$$\frac{\partial L}{\partial \vec{Y}} = -\vec{\mathbf{t}}^T \cdot Diag(Y_{1/Y}) $$, where $Y_{1/Y}$ is $1/y_i$ for each element of $Y$.
$$ \frac{\partial L}{\partial x_k } = \sum_i \frac{\partial L}{\partial y_i}\cdot \frac{\partial y_i}{\partial x_k} $$$$\frac{\partial L}{\partial x_k } =- \sum_i \frac{\partial }{\partial {y}_i} \bigg( t_i \ \ln(y_i) \bigg) \cdot \frac{\partial y_i}{\partial x_k} $$$$\frac{\partial L}{\partial x_k } =- \sum_i \frac{t_i }{{ y}_i} \cdot \frac{\partial {y}_i}{\partial x_k} $$Plugging in the derivatives of Softmax, $ \frac{\partial y}{\partial x}$:
$$\frac{\partial L}{\partial x_k } = - \frac{t_k }{{ y}_k} \cdot \frac{\partial {y}_k}{\partial x_k} - \sum_{i\neq k} \frac{t_i }{{ y}_i} \cdot \frac{\partial {y}_i}{\partial x_k} $$$$ = -\frac{t_k }{{ y}_k} \cdot {y}_k (1-{y}_k) - \sum_{i\neq k} \frac{t_i }{{ y}_i} \cdot (- {y}_i {y}_k) $$$$ = - t_k (1-{y}_k) + \sum_{i\neq k} t_i {y}_k $$$$ = - t_k + \sum_{i} t_i \ {y}_k $$$$ = - t_k + {y}_k \sum_{i} t_i \quad\text{since the tag $t$ sums to 1} $$$$ \frac{\partial L}{\partial x_k } ={y}_k - t_k$$So, we get $ \frac{\partial L}{\partial x_k} ={y}_k - t_k $.
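This result can be verified numerically. A sketch for a single sample, using small standalone helpers (_softmax_1d and _ce_1d are made up here) rather than the classes below: perturb each logit and compare the finite-difference slope of the cross-entropy with $y - t$.
def _softmax_1d(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def _ce_1d(v, t_vec):
    return -np.sum(t_vec * np.log(_softmax_1d(v)))

x_demo = np.array([0.3, -1.2, 2.0, 0.5])
t_demo = np.array([0.0, 0.0, 1.0, 0.0])        # one-hot tag
h_fd = 1e-6
num_grad_x = np.array([
    (_ce_1d(x_demo + h_fd * np.eye(4)[k], t_demo)
     - _ce_1d(x_demo - h_fd * np.eye(4)[k], t_demo)) / (2 * h_fd)
    for k in range(4)])
print(np.allclose(num_grad_x, _softmax_1d(x_demo) - t_demo, atol=1e-6))   # True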
class CrossEntropy:
    def __init__(self):
        self.out = None
        self.dx = None
        self.y = None
        self.t = None

    def _reform_t(self, t):
        if self.t.size != self.y.size:
            t_docker = np.zeros_like(self.y)   # transform the location tag to a one-hot tag
            for i in range(len(t_docker)):
                t_docker[i, t[i]] = 1
            self.t = t_docker

    def forward(self, y, t):
        self.y = y
        self.t = t
        self._reform_t(t)
        # if y.ndim == 1:
        #     t = t.reshape(1, t.size)
        #     y = y.reshape(1, y.size)
        # self.out = -np.sum(np.log(y[:, t] + 1e-7)) / len(y)
        """
        Loss = ave( t dot y.log.T )
        """
        self.out = - np.dot(self.t, np.log(self.y).T).sum() / len(self.y)
        return self.out

    def forward_matrix(self, y, t):
        """
        Basically gives the same result as above,
        but this method applies pure matrix calculation.
        """
        self.y = y
        self.t = t
        self._reform_t(t)
        middle = - np.dot(self.t, np.log(self.y).T)
        first = np.ones(len(self.y)).reshape(1, -1)   # ones vector^T, 1*n
        after = first.reshape(-1, 1)                  # ones vector, n*1
        self.out = np.dot(np.dot(first, middle), after).item() / len(y)
        return self.out

    def backward(self, dupstream = 1):
        """
        Wait to be continued.
        """
        first = np.ones(len(self.y)).reshape(1, -1)   # ones vector^T, 1*n
        after = first.reshape(-1, 1)                  # ones vector, n*1
        pass
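A quick usage check (a sketch with made-up numbers): forward() and forward_matrix() return the same value, since summing every entry of $-\mathbf{t}\cdot\big(\vec{Y_{log}}^T\big)$ is exactly the $\vec{\mathbf{1}}^T(\cdot)\vec{\mathbf{1}}$ product written above.
ce_demo = CrossEntropy()
y_probs = np.array([[0.7, 0.2, 0.1],
                    [0.1, 0.8, 0.1]])
t_loc_demo = np.array([0, 1])                       # location tags
print(ce_demo.forward(y_probs, t_loc_demo))
print(ce_demo.forward_matrix(y_probs, t_loc_demo))  # same number as forward()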
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None   # CrossEntropy output - loss
        self.y = None      # Softmax(x) = y
        self.t = None      # tag
        self.dx = None

    def softmax(self, x):
        c = x.max(axis = 0)
        x = x - c
        if x.ndim != 1:
            temp = np.exp(x)
            out = temp/temp.sum(axis = 0)
            return out
        temp = np.exp(x)
        out = temp/temp.sum()
        return out

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)
        if t.size == y.size:
            t = t.argmax(axis=1)
        batch_size = y.shape[0]
        return -np.sum(np.log(y[:, t] + 1e-7)) / batch_size

    def forward(self, x, t):
        self.t = t
        # no need to instantiate the separate Softmax class; softmax is a method here
        # softmax = Softmax()
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dupstream = 1):
        # the loss function is the top of the upstream,
        # so we set the default dupstream = 1
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:   # one-hot-vector case
            self.dx = (self.y - self.t) / batch_size
        else:                            # location-tag case
            self.dx = self.y.copy()
            self.dx[np.arange(batch_size), self.t] -= 1   # gradient = y - t, where t = 0 or 1
            self.dx = self.dx / batch_size
        return self.dx
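A quick usage check of backward() (a sketch with made-up data): the location-tag branch and the one-hot branch should return the same gradient for the same inputs.
swl_demo = SoftmaxWithLoss()
x_batch = np.random.randn(6, 4)
t_loc_batch = np.array([0, 2, 1, 3, 0, 2])
t_hot_batch = np.eye(4)[t_loc_batch]               # one-hot version of the same tags
swl_demo.forward(x_batch, t_hot_batch)
grad_hot = swl_demo.backward()
swl_demo.forward(x_batch, t_loc_batch)
grad_loc = swl_demo.backward()
print(np.allclose(grad_hot, grad_loc))             # True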
# 0. Initialise Parameters and Data
x = np.random.randint(100, size = (1_000,2))
t = np.random.choice(np.arange(4), size = (1000,))
t_docker = np.zeros([1000,4]) # transform the location tag to one-hot tag
for i in range(len(t_docker)):
    t_docker[i, t[i]] = 1
w = np.random.randn(2,4)
b = np.random.rand(1,4)*10
# 1. Affine
affine = Affine(w,b)
affine_out = affine.forward(x)
# 2. ActivationFunc - Sigmoid
sigmoid = Sigmoid()
a = sigmoid.forward(affine_out)
# 3. Softmax
softmax_class = Softmax()
y = softmax_class.forward(a)
# 4. Loss - CrossEntropy
loss = LossFunc()
loss.cross_entropy(y, t)
6952.599686519977
# P.S. Or, Directly apply SoftmaxWithLoss
sandL = SoftmaxWithLoss()
print('Location tag: ', sandL.forward(a, t))
print('One-hot tag: ', sandL.forward(a, t_docker))
Location tag:  6952.599686519977 One-hot tag:  6952.599686519977
dx_snl = sandL.backward()
dx_sigmoid = sigmoid.backward(dx_snl)
dx_affine = affine.backward(dx_sigmoid)
affine.dW
array([[-1.16446106e-01, -3.55403493e-03, -2.43096462e-04, -3.15079329e-02], [-2.58415644e-01, -1.62226686e-03, -1.76893941e-05, -1.18305684e-01]])
affine.db
array([-4.47625279e-03, -4.99360245e-04, -4.87843433e-05, -2.34348217e-03])
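With the gradients from the backward pass in hand, a single gradient-descent update is just a step against the gradient (a sketch; the learning rate lr is made up here, and a full training loop would repeat forward, backward and update).
# 5. Parameter update with the gradients from the backward pass
lr = 0.1
w -= lr * affine.dW    # w is the same array the Affine layer holds as self.W
b -= lr * affine.db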
func_1 = lambda x: x**2 +5
func_2 = lambda x: x[0]**2 + x[1]**3 +1
x_lim = np.arange(-5,5,0.01)
input_val = np.array([2.0,3.0])
class Differentiate:
    def __init__(self):
        self.h = 1e-5
        self.dx = None

    def d1_diff(self, f, x):
        # central-difference approximation for a function of one variable
        fh1 = f(x + self.h)
        fh2 = f(x - self.h)
        self.dx = (fh1 - fh2) / (2 * self.h)
        return self.dx

    def tangent(self, series, f, x_loc):
        """
        Return the tangent line of f at x_loc, for plotting.
        """
        y_loc = f(x_loc)
        self.d1_diff(f, x_loc)
        b = y_loc - self.dx * x_loc
        y_series = self.dx * series + b
        return y_series

    # for f(x1, x2, x3, ...)
    def dn_diff(self, f, x):
        grad = np.zeros_like(x)
        for i in range(len(x)):
            temp_val = x[i]
            x[i] = temp_val + self.h
            fxh1 = f(x)
            x[i] = temp_val - self.h
            fxh2 = f(x)
            grad[i] = (fxh1 - fxh2) / (2 * self.h)
            x[i] = temp_val   # restore the original value
        self.dx = grad
        return self.dx

    def gradient_descent(self, f, init_x, lr = 0.01, step_num = 1000):
        x = init_x
        for i in range(step_num):
            self.dn_diff(f, x)
            x -= lr * self.dx
        return x
diff = Differentiate()
diff.d1_diff(func_1, 1)
2.0000000000131024
plt.plot(x_lim, func_1(x_lim))
plt.plot(x_lim, diff.tangent(x_lim, func_1, 1))
diff.dn_diff(func_2, input_val)
array([ 4., 27.])
diff.gradient_descent(func_2, input_val)
array([3.36597239e-09, 3.28189298e-02])
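As a cross-check (a sketch), the numerical gradient of func_2 can be compared with the hand-derived gradient [2*x0, 3*x1**2] at a fresh point (input_val itself has already been moved in place by gradient_descent above).
point = np.array([2.0, 3.0])
print(diff.dn_diff(func_2, point))                # numerical gradient, ~[ 4. 27.]
print(np.array([2 * point[0], 3 * point[1]**2]))  # analytic gradient,   [ 4. 27.]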