I have the below code for gated attention:
class Attn_Net_Gated(nn.Module):
    """Attention network with sigmoid gating (3 fully connected layers).

    Two parallel linear projections of the input are computed: one passed
    through ``tanh`` (values in (-1, 1)) and one passed through ``sigmoid``
    (values in (0, 1)). Their element-wise product lets the sigmoid branch
    act as a per-feature gate on the tanh branch. A final linear layer maps
    the gated hidden representation to one score per class. No softmax is
    applied here; the returned scores are unnormalized.

    Args:
        L: input feature dimension (size of the last axis of ``x``).
        D: hidden layer dimension of both branches.
        dropout: whether to append dropout (p = 0.25) after each branch's
            activation.
        n_classes: number of classes, i.e. number of scores per input row.
    """

    def __init__(self, L=1024, D=256, dropout=False, n_classes=1):
        super().__init__()
        # Build the layer lists locally so that only real modules are ever
        # assigned as attributes (no temporary plain-list attributes).
        tanh_branch = [nn.Linear(L, D), nn.Tanh()]
        gate_branch = [nn.Linear(L, D), nn.Sigmoid()]
        if dropout:
            tanh_branch.append(nn.Dropout(0.25))
            gate_branch.append(nn.Dropout(0.25))

        # Attribute names and registration order are kept so state_dict keys
        # stay "attention_a.*", "attention_b.*", "attention_c.*".
        self.attention_a = nn.Sequential(*tanh_branch)
        self.attention_b = nn.Sequential(*gate_branch)
        self.attention_c = nn.Linear(D, n_classes)

    def forward(self, x):
        """Compute gated attention scores for ``x``.

        Args:
            x: input features whose last dimension is ``L``
                (presumably N x L, one row per instance -- confirm with caller).

        Returns:
            A tuple ``(A, x)``: ``A`` holds the unnormalized attention scores
            (N x n_classes) and ``x`` is the input, returned unchanged.
        """
        a = self.attention_a(x)  # tanh branch
        b = self.attention_b(x)  # sigmoid gate
        A = a.mul(b)  # element-wise gating
        A = self.attention_c(A)  # N x n_classes
        return A, x
As input, I have a feature vector of my image of size (1, 1024), which I obtained from a ResNet50 encoder. I do not really understand the idea behind the element-wise multiplication of the tanh and sigmoid outputs. This code is adapted from the paper https://arxiv.org/pdf/1802.04712.pdf.