Recently I encountered a variant of the usual linear neural layer: instead of $Z = XW + B$, the layer computes $Z = (X - A)W + B$. So there is a 'pre-bias' $A$ that is subtracted from the incoming activations (the previous layer's output) before they are multiplied by the weights. I don't understand the backpropagation equations for $dA$ and $dB$ ($dW$ is as expected).
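For concreteness, writing $\delta Y = \partial L/\partial Y$ for the error coming back from the layer above, a straightforward chain-rule derivation (my own working, so I may be missing something) gives

$$dW = (X - A)^{\top}\,\delta Y, \qquad dB = \sum_{i} \delta Y_{i}, \qquad dA = -\sum_{i} \bigl(\delta Y\, W^{\top}\bigr)_{i},$$

where the sums run over the samples $i$ in the batch. The code below computes $dW$ exactly like this, but its $dA$ and $dB$ look nothing like these expressions.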
Here is the original paper in which it appeared (although the paper itself isn't actually that relevant): https://papers.nips.cc/paper/4830-learning-invariant-representations-of-molecules-for-atomization-energy-prediction.pdf
Here is the link to the full code of the neural network: http://www.quantum-machine.org/code/nn-qm7.tar.gz
import numpy

class Linear(Module):                      # Module is the base class defined elsewhere in the linked code
    def __init__(self, m, n):
        self.tr = m**.5 / n**.5            # scaling applied to the error passed back to the previous layer
        self.lr = 1 / m**.5                # per-layer learning-rate multiplier used in update()
        self.W = numpy.random.normal(0, 1 / m**.5, [m, n]).astype('float32')
        self.A = numpy.zeros([m]).astype('float32')    # the 'pre-bias'
        self.B = numpy.zeros([n]).astype('float32')    # the usual bias

    def forward(self, X):
        self.X = X
        Y = numpy.dot(X - self.A, self.W) + self.B     # Y = (X - A)W + B
        return Y

    def backward(self, DY):
        self.DW = numpy.dot((self.X - self.A).T, DY)   # this is the gradient I expect
        self.DA = -(self.X - self.A).sum(axis=0)       # ??? does not involve DY at all
        self.DB = DY.sum(axis=0) + numpy.dot(self.DA, self.W)   # ??? has an extra DA.W term
        DX = self.tr * numpy.dot(DY, self.W.T)
        return DX

    def update(self, lr):
        self.W -= lr * self.lr * self.DW
        self.B -= lr * self.lr * self.DB
        self.A -= lr * self.lr * self.DA

    def average(self, nn, a):
        # blend this layer's parameters with those of another layer nn
        self.W = a * nn.W + (1 - a) * self.W
        self.B = a * nn.B + (1 - a) * self.B
        self.A = a * nn.A + (1 - a) * self.A
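For comparison, here is a small finite-difference check I put together. It is my own toy example, not part of the linked code: the sizes, the random data and the squared-error loss are all made up. It reproduces the chain-rule $dA$ and $dB$ from above, which is exactly why the `DA` and `DB` in `backward()` confuse me.

import numpy

numpy.random.seed(0)
m, n, batch = 4, 3, 5                      # made-up layer sizes
X = numpy.random.randn(batch, m)
W = numpy.random.randn(m, n)
A = numpy.random.randn(m)
B = numpy.random.randn(n)
T = numpy.random.randn(batch, n)           # arbitrary targets for a toy squared-error loss

def loss(A, B, W):
    Y = numpy.dot(X - A, W) + B            # the layer: Y = (X - A)W + B
    return 0.5 * ((Y - T) ** 2).sum()

# analytic gradients from the chain rule
Y = numpy.dot(X - A, W) + B
DY = Y - T                                 # dL/dY for this particular loss
dW = numpy.dot((X - A).T, DY)
dB = DY.sum(axis=0)
dA = -numpy.dot(DY, W.T).sum(axis=0)

# numerical gradients of A[0] and B[0] by central differences
eps = 1e-6
Ap, Am = A.copy(), A.copy()
Ap[0] += eps; Am[0] -= eps
Bp, Bm = B.copy(), B.copy()
Bp[0] += eps; Bm[0] -= eps
print(dA[0], (loss(Ap, B, W) - loss(Am, B, W)) / (2 * eps))   # these should agree
print(dB[0], (loss(A, Bp, W) - loss(A, Bm, W)) / (2 * eps))   # these should agree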