By Fanyu
import numpy as np
import matplotlib.pyplot as plt
x = np.arange(-10,10,0.01)
# dL/dy, i.e. the partial derivative passed down from upstream in the chain
dupstream = np.zeros_like(x)
We consider the composition $Loss\big(Softmax\big(a(f(x))\big)\big)$,
where $f(x) = xw + b$ is the affine transformation,
where $a(x)$ is the activation function,
and where $Loss(x)$ is the loss function.
We aim to calculate the parameters of $f(\cdot)$ by minimising the loss, so we take the first-order condition (F.O.C.) of $Loss(\cdot)$:
$$ \frac{\partial Loss(w)}{\partial w} = 0 $$By the chain rule,
$$ \frac{\partial Loss(w)}{\partial w} = \frac{\partial Loss(.)}{\partial Softmax} \cdot \frac{\partial Softmax(.)}{\partial a} \cdot \frac{\partial a}{\partial f} \cdot \frac{\partial f}{\partial a} \cdots \frac{\partial f}{\partial w} $$Let's decompose each part in the following.
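Before decomposing each part, here is a minimal sketch of the pattern used throughout (the toy Square and PlusOne layers are made up for illustration, not part of the derivation): every layer exposes a forward() and a backward(), and backward() multiplies the upstream gradient by the layer's local derivative, contributing exactly one factor of the product above.
class Square:                     # toy layer: f(x) = x**2, local derivative 2x
    def forward(self, x):
        self.x = x
        return x ** 2
    def backward(self, dupstream):
        return dupstream * 2 * self.x

class PlusOne:                    # toy layer: g(x) = x + 1, local derivative 1
    def forward(self, x):
        return x + 1
    def backward(self, dupstream):
        return dupstream * 1

sq, p1 = Square(), PlusOne()
print(p1.forward(sq.forward(3.0)))      # forward pass: (3**2) + 1 = 10.0
print(sq.backward(p1.backward(1.0)))    # backward pass: 1 * 1 * 2*3 = 6.0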
class Relu:
    def __init__(self):
        self.mask = None
        self.out = None

    def forward(self, x):
        """
        Run the ReLU, and save the index of the entries with x <= 0.
        """
        self.mask = (x <= 0)
        self.out = x.copy()
        self.out[self.mask] = 0
        return self.out

    def backward(self, dupstream):
        """
        Return the derivative of ReLU:
        the local derivative is 1 where x > 0 and 0 otherwise;
        by the chain rule, we multiply it by the upstream gradient.
        """
        dupstream[self.mask] = 0
        dx = dupstream
        return dx
# forward
relu = Relu()
test_result = relu.forward(x = x)
plt.plot(x, test_result)
# backward
# the gradient is either 10 or 0: the 10 comes from 1 * dupstream, with the upstream gradient set to 10
dx_relu = relu.backward(dupstream+10)
plt.plot(x, dx_relu)
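As a quick sanity check (a sketch, not in the original notebook): with an upstream gradient of all ones, the backward pass should reproduce the indicator function of x > 0.
print(np.allclose(relu.backward(np.ones_like(x)), (x > 0).astype(float)))   # True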
class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dupstream):
        # with y = self.out: dx = dupstream * y * (1 - y)
        dx = dupstream * (1 - self.out) * self.out
        return dx
# forward
sigmoid = Sigmoid()
plt.plot(x, sigmoid.forward(x))
plt.plot(x,np.tanh(x))
# backward
# manually shift one curve up a little, so the two can be distinguished
plt.plot(x, sigmoid.backward(dupstream+1)+0.1)
plt.plot(x, sigmoid.out * (1-sigmoid.out))
# without the offset, the two lines would coincide:
# with an upstream gradient of 1, backward() returns exactly y * (1 - y)
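The analytic gradient can also be cross-checked numerically. A sketch with a central-difference approximation (the names sig_check, num_grad, ana_grad are made up here):
h = 1e-5
sig_check = Sigmoid()
num_grad = (sig_check.forward(x + h) - sig_check.forward(x - h)) / (2 * h)
y_val = sig_check.forward(x)
ana_grad = y_val * (1 - y_val)
print(np.abs(num_grad - ana_grad).max())   # tiny, on the order of 1e-10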
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None
        self.out = None

    def forward(self, x):
        self.x = x
        self.out = np.dot(x, self.W) + self.b
        return self.out

    def backward(self, dupstream):
        dx = np.dot(dupstream, self.W.T)       # dL/dx = dupstream . W^T (right-multiply)
        self.dW = np.dot(self.x.T, dupstream)  # dL/dW = x^T . dupstream (left-multiply)
        self.db = np.sum(dupstream, axis=0)    # dL/db: sum over the batch axis
        return dx
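A small shape check (a sketch with made-up dimensions): for a batch of 2 samples with 3 features mapped to 4 outputs, backward() must return gradients with the same shapes as x, W and b respectively.
W_demo = np.random.randn(3, 4)
b_demo = np.random.randn(4)
affine_demo = Affine(W_demo, b_demo)
out_demo = affine_demo.forward(np.random.randn(2, 3))            # shape (2, 4)
dx_demo = affine_demo.backward(np.ones_like(out_demo))           # upstream gradient of ones
print(dx_demo.shape, affine_demo.dW.shape, affine_demo.db.shape) # (2, 3) (3, 4) (4,)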
The derivation is as follows,
https://blog.csdn.net/Wild_Young/article/details/121912675
$\frac{\partial Y}{\partial X}$:
$$Softmax(x)_k = \frac{e^{x_k}}{\sum_i e^{x_i}}$$Let $X$ be a vector with shape = (n,1), and $Y = Softmax(X)$.
That means, for each element $y_k$ of $Y$,
$$ y_k = \frac{e^{x_k}}{\sum_i^N e^{x_i}} $$So, we need
$$ \frac{\partial Y}{\partial x_k} $$If $j = i$,
$$ \frac{\partial y_j}{\partial x_i} = \frac{\partial }{\partial x_i} \Bigg( \frac{e^{x_i}}{\sum e^{x_i}} \Bigg) $$$$ = \frac{ (e^{x_i})'\sum e^{x_i} - e^{x_i} (\sum e^{x_i})' }{\big( \sum e^{x_i} \big)^2} $$$$ =\frac{e^{x_i}}{\sum e^{x_i}} - \frac{e^{x_i}}{\sum e^{x_i}} \cdot \frac{e^{x_i}}{\sum e^{x_i}}$$$$= y_i - y_i^2 =y_i(1-y_i)$$If $j \neq i$,
$$ \frac{\partial y_j}{\partial x_i} = \frac{\partial }{\partial x_i} \Bigg( \frac{e^{x_j}}{\sum e^{x_i}} \Bigg) $$$$ = \frac{-e^{x_i} e^{x_j}}{(\sum e^{x_i})^2} $$$$ =- \frac{e^{x_j}}{\sum e^{x_i}} \cdot \frac{e^{x_i}}{\sum e^{x_i}} $$$$ = -y_j y_i $$$$ \frac{\partial y_j}{\partial x_i}=\left\{ \begin{aligned} & y_i - y_i^2 \quad \text{if $i = j$} \\ & -y_j y_i \quad \text{if $i \neq j$}\\ \end{aligned} \right. $$Therefore, for $ \frac{\partial Y}{\partial X} $, we get the Jacobian matrix,
$$\frac{\partial \vec{Y}}{\partial \vec{X}} = Diag(Y) - Y^T Y$$However, in the backward propagation of Softmax, we only need the diagonal, which is the case $i = j$.
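A numeric illustration (a sketch with a made-up probability vector): build the full Jacobian $Diag(Y) - Y^T Y$ and confirm that its diagonal equals $y_i(1-y_i)$, the part kept below.
y_demo = np.array([0.1, 0.2, 0.3, 0.4])                        # already sums to 1
jacobian = np.diag(y_demo) - np.outer(y_demo, y_demo)          # Diag(Y) - Y^T Y
print(np.allclose(np.diag(jacobian), y_demo * (1 - y_demo)))   # True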
$$ \frac{\partial y_i}{\partial x_i} = y_i-y_i^2 =y_i(1-y_i)$$
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[:, t] + 1e-7)) / batch_size
def softmax(x):
    c = x.max(axis = 0)   # subtract the max to avoid overflow
    x = x - c
    if x.ndim != 1:
        temp = np.exp(x)
        out = temp/temp.sum(axis = 0)
        return out
    temp = np.exp(x)
    out = temp/temp.sum()
    return out
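A quick illustration (a sketch) of why the max is subtracted first: shifting the input by a constant leaves softmax unchanged, while np.exp(1000) on its own would overflow.
logits = np.array([1000.0, 1001.0, 1002.0])
print(softmax(logits))                                        # finite, sums to 1
print(np.allclose(softmax(logits), softmax(logits - 1000)))   # True: shift-invariant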
class Softmax:
    def __init__(self):
        self.out = None
        self.dx = None

    def forward(self, x):
        c = x.max(axis = 0)   # subtract the max to avoid overflow
        x = x - c
        if x.ndim != 1:
            temp = np.exp(x)
            self.out = temp/temp.sum(axis = 0)
            return self.out
        temp = np.exp(x)
        self.out = temp/temp.sum()
        return self.out

    def backward(self, dupstream):
        """
        Full Jacobian matrix (only its diagonal is used below):
        self.dx = np.diag(self.out) - \
                  np.dot(self.out.reshape(-1,1) , \
                         self.out.reshape(1,-1) )
        """
        self.dx = self.out - np.square(self.out)
        return self.dx   # diagonal of the Jacobian, same shape as self.out
softmax = Softmax()
softmax_out = softmax.forward(x)
plt.plot(x, softmax_out)
softmax_dx = softmax.backward(dupstream)
plt.plot(x, softmax_dx)
print(softmax_out.sum())
0.9999999999999999
# wait to be continued
class LossFunc:
    def __init__(self):
        self.cross_entropy_out = None
        self.MSE_out = None
        self.y = None
        self.t = None

    def cross_entropy(self, y, t):
        self.y = y
        self.t = t
        if self.y.ndim == 1:
            self.t = t.reshape(1, t.size)
            self.y = y.reshape(1, y.size)
        batch_size = self.y.shape[0]
        self.cross_entropy_out = -np.sum(np.log(self.y[:, t] + 1e-7)) / batch_size
        return self.cross_entropy_out

    def MSE(self, y, t):
        pass
The Cross-Entropy Loss function is, $$ L= - \sum_i t_i \ ln(y_i) =-\bigg(\vec{\mathbf{1}}^T\bigg)_{1\times n} \cdot \ \Bigg( \mathbf{t}_{n\times k} \cdot \big(\vec{Y_{log}}^T\big)_{k\times n}\Bigg)_{n\times n} \cdot \bigg(\vec{\mathbf{1}}\bigg)_{n\times 1}$$
, where $\vec{Y_{log}}^T$ applies $\log$ to each element of $\vec{Y}$ and then takes the transpose.
https://www.matrixcalculus.org
$$\frac{\partial L}{\partial \vec{Y}} = -\vec{\mathbf{t}}^T \cdot Diag(Y_{1/Y}) $$, where $Y_{1/Y}$ is $1/y_i$ for each element of $Y$.
$$ \frac{\partial L}{\partial x_k } = \sum_i \frac{\partial L}{\partial y_i}\cdot \frac{\partial y_i}{\partial x_k} $$$$\frac{\partial L}{\partial x_k } =- \sum_i \frac{\partial }{\partial {y}_i} \bigg( t_i \ \ln(y_i) \bigg) \cdot \frac{\partial y_i}{\partial x_k} $$$$\frac{\partial L}{\partial x_k } =- \sum_i \frac{t_i }{{ y}_i} \cdot \frac{\partial {y}_i}{\partial x_k} $$Plugging in the derivatives of Softmax, $ \frac{\partial y}{\partial x}$:
$$\frac{\partial L}{\partial x_k } = - \frac{t_k }{{ y}_k} \cdot \frac{\partial {y}_k}{\partial x_k} - \sum_{i\neq k} \frac{t_i }{{ y}_i} \cdot \frac{\partial {y}_i}{\partial x_k} $$$$ = -\frac{t_k }{{ y}_k} \cdot {y}_k (1-{y}_k) - \sum_{i\neq k} \frac{t_i }{{ y}_i} \cdot (- {y}_i {y}_k) $$$$ = - t_k (1-{y}_k) + \sum_{i\neq k} t_i {y}_k $$$$ = - t_k + \sum_{i} t_i \ {y}_k $$$$ = - t_k + {y}_k \sum_{i} t_i \quad\text{since the tag $t$ sums to 1} $$$$ \frac{\partial L}{\partial x_k } ={y}_k - t_k$$So, we get $ \frac{\partial L}{\partial x_k} ={y}_k - t_k $.
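This result can be verified numerically. A sketch for a single sample, using small standalone helpers (_softmax_1d and _ce_1d are made up here) rather than the classes below: perturb each logit and compare the finite-difference slope of the cross-entropy with $y - t$.
def _softmax_1d(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def _ce_1d(v, t_vec):
    return -np.sum(t_vec * np.log(_softmax_1d(v)))

x_demo = np.array([0.3, -1.2, 2.0, 0.5])
t_demo = np.array([0.0, 0.0, 1.0, 0.0])        # one-hot tag
h_fd = 1e-6
num_grad_x = np.array([
    (_ce_1d(x_demo + h_fd * np.eye(4)[k], t_demo)
     - _ce_1d(x_demo - h_fd * np.eye(4)[k], t_demo)) / (2 * h_fd)
    for k in range(4)])
print(np.allclose(num_grad_x, _softmax_1d(x_demo) - t_demo, atol=1e-6))   # True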
class CrossEntropy:
    def __init__(self):
        self.out = None
        self.dx = None
        self.y = None
        self.t = None

    def _reform_t(self, t):
        if self.t.size != self.y.size:
            t_docker = np.zeros_like(self.y)   # transform the location tag to a one-hot tag
            for i in range(len(t_docker)):
                t_docker[i, t[i]] = 1
            self.t = t_docker

    def forward(self, y, t):
        self.y = y
        self.t = t
        self._reform_t(t)
        # if y.ndim == 1:
        #     t = t.reshape(1, t.size)
        #     y = y.reshape(1, y.size)
        # self.out = -np.sum(np.log(y[:, t] + 1e-7)) / len(y)
        """
        Loss = ave( t dot y.log.T )
        """
        self.out = - np.dot(self.t, np.log(self.y).T).sum() / len(self.y)
        return self.out

    def forward_matrix(self, y, t):
        """
        Basically gives the same result as above,
        but this method applies pure matrix calculation.
        """
        self.y = y
        self.t = t
        self._reform_t(t)
        middle = - np.dot(self.t, np.log(self.y).T)
        first = np.ones(len(self.y)).reshape(1, -1)   # ones vector^T, 1*n
        after = first.reshape(-1, 1)                  # ones vector, n*1
        self.out = np.dot(np.dot(first, middle), after).item() / len(y)
        return self.out

    def backward(self, dupstream = 1):
        """
        Wait to be continued.
        """
        first = np.ones(len(self.y)).reshape(1, -1)   # ones vector^T, 1*n
        after = first.reshape(-1, 1)                  # ones vector, n*1
        pass
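A quick usage check (a sketch with made-up numbers): forward() and forward_matrix() return the same value, since summing every entry of $-\mathbf{t}\cdot\big(\vec{Y_{log}}^T\big)$ is exactly the $\vec{\mathbf{1}}^T(\cdot)\vec{\mathbf{1}}$ product written above.
ce_demo = CrossEntropy()
y_probs = np.array([[0.7, 0.2, 0.1],
                    [0.1, 0.8, 0.1]])
t_loc_demo = np.array([0, 1])                       # location tags
print(ce_demo.forward(y_probs, t_loc_demo))
print(ce_demo.forward_matrix(y_probs, t_loc_demo))  # same number as forward()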
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None   # CrossEntropy output - loss
        self.y = None      # Softmax(x) = y
        self.t = None      # tag
        self.dx = None

    def softmax(self, x):
        c = x.max(axis = 0)
        x = x - c
        if x.ndim != 1:
            temp = np.exp(x)
            out = temp/temp.sum(axis = 0)
            return out
        temp = np.exp(x)
        out = temp/temp.sum()
        return out

    def cross_entropy_error(self, y, t):
        if y.ndim == 1:
            t = t.reshape(1, t.size)
            y = y.reshape(1, y.size)
        if t.size == y.size:
            t = t.argmax(axis=1)
        batch_size = y.shape[0]
        return -np.sum(np.log(y[:, t] + 1e-7)) / batch_size

    def forward(self, x, t):
        self.t = t
        # no need to instantiate the separate Softmax class; softmax is a method here
        # softmax = Softmax()
        self.y = self.softmax(x)
        self.loss = self.cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dupstream = 1):
        # the loss function is the top of the upstream,
        # so we set the default dupstream = 1
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:   # one-hot-vector case
            self.dx = (self.y - self.t) / batch_size
        else:                            # location-tag case
            self.dx = self.y.copy()
            self.dx[np.arange(batch_size), self.t] -= 1   # gradient = y - t, where t = 0 or 1
            self.dx = self.dx / batch_size
        return self.dx
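A quick usage check of backward() (a sketch with made-up data): the location-tag branch and the one-hot branch should return the same gradient for the same inputs.
swl_demo = SoftmaxWithLoss()
x_batch = np.random.randn(6, 4)
t_loc_batch = np.array([0, 2, 1, 3, 0, 2])
t_hot_batch = np.eye(4)[t_loc_batch]               # one-hot version of the same tags
swl_demo.forward(x_batch, t_hot_batch)
grad_hot = swl_demo.backward()
swl_demo.forward(x_batch, t_loc_batch)
grad_loc = swl_demo.backward()
print(np.allclose(grad_hot, grad_loc))             # True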
# 0. Initialise Parameters and Data
x = np.random.randint(100, size = (1_000,2))
t = np.random.choice(np.arange(4), size = (1000,))
t_docker = np.zeros([1000,4]) # transform the location tag to one-hot tag
for i in range(len(t_docker)):
    t_docker[i, t[i]] = 1
w = np.random.randn(2,4)
b = np.random.rand(1,4)*10
# 1. Affine
affine = Affine(w,b)
affine_out = affine.forward(x)
# 2. ActivationFunc - Sigmoid
sigmoid = Sigmoid()
a = sigmoid.forward(affine_out)
# 3. Softmax
softmax_class = Softmax()
y = softmax_class.forward(a)
# 4. Loss - CrossEntropy
loss = LossFunc()
loss.cross_entropy(y, t)
6952.599686519977
# P.S. Or, Directly apply SoftmaxWithLoss
sandL = SoftmaxWithLoss()
print('Location tag: ', sandL.forward(a, t))
print('One-hot tag: ', sandL.forward(a, t_docker))
Location tag:  6952.599686519977 One-hot tag:  6952.599686519977
dx_snl = sandL.backward()
dx_sigmoid = sigmoid.backward(dx_snl)
dx_affine = affine.backward(dx_sigmoid)
affine.dW
array([[-1.16446106e-01, -3.55403493e-03, -2.43096462e-04, -3.15079329e-02], [-2.58415644e-01, -1.62226686e-03, -1.76893941e-05, -1.18305684e-01]])
affine.db
array([-4.47625279e-03, -4.99360245e-04, -4.87843433e-05, -2.34348217e-03])
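With the gradients from the backward pass in hand, a single gradient-descent update is just a step against the gradient (a sketch; the learning rate lr is made up here, and a full training loop would repeat forward, backward and update).
# 5. Parameter update with the gradients from the backward pass
lr = 0.1
w -= lr * affine.dW    # w is the same array the Affine layer holds as self.W
b -= lr * affine.db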
func_1 = lambda x: x**2 +5
func_2 = lambda x: x[0]**2 + x[1]**3 +1
x_lim = np.arange(-5,5,0.01)
input_val = np.array([2.0,3.0])
class Differentiate:
    def __init__(self):
        self.h = 1e-5
        self.dx = None

    def d1_diff(self, f, x):
        # central-difference approximation for a function of one variable
        fh1 = f(x + self.h)
        fh2 = f(x - self.h)
        self.dx = (fh1 - fh2) / (2 * self.h)
        return self.dx

    def tangent(self, series, f, x_loc):
        """
        Return the tangent line of f at x_loc, for plotting.
        """
        y_loc = f(x_loc)
        self.d1_diff(f, x_loc)
        b = y_loc - self.dx * x_loc
        y_series = self.dx * series + b
        return y_series

    # for f(x1, x2, x3, ...)
    def dn_diff(self, f, x):
        grad = np.zeros_like(x)
        for i in range(len(x)):
            temp_val = x[i]
            x[i] = temp_val + self.h
            fxh1 = f(x)
            x[i] = temp_val - self.h
            fxh2 = f(x)
            grad[i] = (fxh1 - fxh2) / (2 * self.h)
            x[i] = temp_val   # restore the original value
        self.dx = grad
        return self.dx

    def gradient_descent(self, f, init_x, lr = 0.01, step_num = 1000):
        x = init_x
        for i in range(step_num):
            self.dn_diff(f, x)
            x -= lr * self.dx
        return x
diff = Differentiate()
diff.d1_diff(func_1, 1)
2.0000000000131024
plt.plot(x_lim, func_1(x_lim))
plt.plot(x_lim, diff.tangent(x_lim, func_1, 1))
diff.dn_diff(func_2, input_val)
array([ 4., 27.])
diff.gradient_descent(func_2, input_val)
array([3.36597239e-09, 3.28189298e-02])
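As a cross-check (a sketch), the numerical gradient of func_2 can be compared with the hand-derived gradient [2*x0, 3*x1**2] at a fresh point (input_val itself has already been moved in place by gradient_descent above).
point = np.array([2.0, 3.0])
print(diff.dn_diff(func_2, point))                # numerical gradient, ~[ 4. 27.]
print(np.array([2 * point[0], 3 * point[1]**2]))  # analytic gradient,   [ 4. 27.]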