Add BERT model
graykode committed Feb 2, 2019
1 parent cb67363 commit 1a0f1e0
Showing 7 changed files with 281 additions and 42 deletions.
20 changes: 10 additions & 10 deletions 4-2.Seq2Seq(Attention)/Seq2Seq(Attention)-Tensor.py
@@ -26,34 +26,34 @@ def make_batch(sentences):
return input_batch, output_batch, target_batch

# Model
-enc_inputs = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, max_len, n_class]
-dec_inputs = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, max_len, n_class]
-targets = tf.placeholder(tf.int64, [1, n_step]) # [batch_size, max_len], not one-hot
+enc_inputs = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, n_step, n_class]
+dec_inputs = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, n_step, n_class]
+targets = tf.placeholder(tf.int64, [1, n_step]) # [batch_size, n_step], not one-hot

# Linear for attention
attn = tf.Variable(tf.random_normal([n_hidden, n_hidden]))
out = tf.Variable(tf.random_normal([n_hidden * 2, n_class]))

-def get_att_score(dec_output, enc_output): # enc_output [max_len, n_hidden]
+def get_att_score(dec_output, enc_output): # enc_output [n_step, n_hidden]
score = tf.squeeze(tf.matmul(enc_output, attn), 0) # score : [n_hidden]
dec_output = tf.squeeze(dec_output, [0, 1]) # dec_output : [n_hidden]
return tf.tensordot(dec_output, score, 1) # inner product make scalar value

def get_att_weight(dec_output, enc_outputs):
-attn_scores = [] # list of attention scalar : [max_len]
-enc_outputs = tf.transpose(enc_outputs, [1, 0, 2]) # enc_outputs : [max_len, batch_size, n_hidden]
+attn_scores = [] # list of attention scalar : [n_step]
+enc_outputs = tf.transpose(enc_outputs, [1, 0, 2]) # enc_outputs : [n_step, batch_size, n_hidden]
for i in range(n_step):
attn_scores.append(get_att_score(dec_output, enc_outputs[i]))

# Normalize scores to weights in range 0 to 1
-return tf.reshape(tf.nn.softmax(attn_scores), [1, 1, -1]) # [1, 1, max_len]
+return tf.reshape(tf.nn.softmax(attn_scores), [1, 1, -1]) # [1, 1, n_step]

model = []
Attention = []
with tf.variable_scope('encode'):
enc_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
enc_cell = tf.nn.rnn_cell.DropoutWrapper(enc_cell, output_keep_prob=0.5)
-# enc_outputs : [batch_size(=1), max_len(=decoder_step), n_hidden(=128)]
+# enc_outputs : [batch_size(=1), n_step(=decoder_step), n_hidden(=128)]
# enc_hidden : [batch_size(=1), n_hidden(=128)]
enc_outputs, enc_hidden = tf.nn.dynamic_rnn(enc_cell, enc_inputs, dtype=tf.float32)

Expand All @@ -75,10 +75,10 @@ def get_att_weight(dec_output, enc_outputs):
dec_output = tf.squeeze(dec_output, 0) # [1, n_step]
context = tf.squeeze(context, 1) # [1, n_hidden]

-model.append(tf.matmul(tf.concat((dec_output, context), 1), out)) # [max_len, batch_size(=1), n_class]
+model.append(tf.matmul(tf.concat((dec_output, context), 1), out)) # [n_step, batch_size(=1), n_class]

trained_attn = tf.stack([Attention[0], Attention[1], Attention[2], Attention[3], Attention[4]], 0) # to show attention matrix
-model = tf.transpose(model, [1, 0, 2]) # model : [max_len, n_class]
+model = tf.transpose(model, [1, 0, 2]) # model : [n_step, n_class]
prediction = tf.argmax(model, 2)
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model, labels=targets))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
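Note: in the file above, get_att_score and get_att_weight score each encoder step against the current decoder state through the learned matrix attn and softmax-normalize the scores. The short PyTorch snippet below is an illustrative sketch of that bilinear scoring, not part of the commit; the toy tensor sizes are assumptions.

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn.functional as F

n_step, n_hidden = 5, 128                    # toy sizes
attn = torch.randn(n_hidden, n_hidden)       # plays the role of the learned `attn` matrix
enc_outputs = torch.randn(n_step, n_hidden)  # one encoder output per source step
dec_output = torch.randn(n_hidden)           # decoder state at the current step

# score_i = dec_output . (enc_output_i @ attn), then softmax over the n_step scores
scores = torch.stack([dec_output @ (enc_outputs[i] @ attn) for i in range(n_step)])
attn_weights = F.softmax(scores, dim=0)      # [n_step], sums to 1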
8 changes: 4 additions & 4 deletions 4-2.Seq2Seq(Attention)/Seq2Seq(Attention)-Torch.py
@@ -41,10 +41,10 @@ def __init__(self):
self.out = nn.Linear(n_hidden * 2, n_class)

def forward(self, enc_inputs, hidden, dec_inputs):
-enc_inputs = enc_inputs.transpose(0, 1) # enc_inputs: [max_len(=n_step, time step), batch_size, n_hidden]
-dec_inputs = dec_inputs.transpose(0, 1) # dec_inputs: [max_len(=n_step, time step), batch_size, n_hidden]
+enc_inputs = enc_inputs.transpose(0, 1) # enc_inputs: [n_step(=n_step, time step), batch_size, n_hidden]
+dec_inputs = dec_inputs.transpose(0, 1) # dec_inputs: [n_step(=n_step, time step), batch_size, n_hidden]

-# enc_outputs : [max_len, batch_size, num_directions(=1) * n_hidden], matrix F
+# enc_outputs : [n_step, batch_size, num_directions(=1) * n_hidden], matrix F
# enc_hidden : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
enc_outputs, enc_hidden = self.enc_cell(enc_inputs, hidden)

Expand All @@ -54,7 +54,7 @@ def forward(self, enc_inputs, hidden, dec_inputs):
model = Variable(torch.empty([n_step, 1, n_class]))

for i in range(n_step): # each time step
-# dec_output : [max_len(=1), batch_size(=1), num_directions(=1) * n_hidden]
+# dec_output : [n_step(=1), batch_size(=1), num_directions(=1) * n_hidden]
# hidden : [num_layers(=1) * num_directions(=1), batch_size(=1), n_hidden]
dec_output, hidden = self.dec_cell(dec_inputs[i].unsqueeze(0), hidden)
attn_weights = self.get_att_weight(dec_output, enc_outputs) # attn_weights : [1, 1, n_step]
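Note: in the PyTorch version above, each decoder step turns its attention weights into a context vector with a batched matmul and concatenates it with the decoder output before the output projection. Below is a minimal sketch of that single step, not part of the commit; the sizes and the stand-in weights are assumptions (batch_size = 1 as in the tutorial).

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn as nn

n_step, n_hidden, n_class = 5, 128, 11           # toy sizes
out = nn.Linear(n_hidden * 2, n_class)           # plays the role of self.out

enc_outputs = torch.randn(n_step, 1, n_hidden)   # [n_step, batch_size(=1), n_hidden]
dec_output = torch.randn(1, 1, n_hidden)         # decoder output for one time step
attn_weights = torch.softmax(torch.randn(1, 1, n_step), dim=2)  # stand-in weights

context = attn_weights.bmm(enc_outputs.transpose(0, 1))         # [1, 1, n_hidden]
logits = out(torch.cat((dec_output.squeeze(0), context.squeeze(1)), 1))  # [1, n_class]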
8 changes: 4 additions & 4 deletions 4-3.Bi-LSTM(Attention)/Bi-LSTM(Attention)-Tensor.py
@@ -50,12 +50,12 @@
final_hidden_state = tf.concat([final_state[1][0], final_state[1][1]], 1) # final_hidden_state : [batch_size, n_hidden * num_directions(=2)]
final_hidden_state = tf.expand_dims(final_hidden_state, 2) # final_hidden_state : [batch_size, n_hidden * num_directions(=2), 1]

-attn_weights = tf.squeeze(tf.matmul(output, final_hidden_state), 2) # attn_weights : [batch_size, len_seq]
+attn_weights = tf.squeeze(tf.matmul(output, final_hidden_state), 2) # attn_weights : [batch_size, n_step]
soft_attn_weights = tf.nn.softmax(attn_weights, 1)
-new_hidden_state = tf.matmul(tf.transpose(output, [0, 2, 1]), tf.expand_dims(soft_attn_weights, 2)) # new_hidden_state : [batch_size, n_hidden * num_directions(=2), 1]
-new_hidden_state = tf.squeeze(new_hidden_state, 2) # [batch_size, n_hidden * num_directions(=2)]
+context = tf.matmul(tf.transpose(output, [0, 2, 1]), tf.expand_dims(soft_attn_weights, 2)) # context : [batch_size, n_hidden * num_directions(=2), 1]
+context = tf.squeeze(context, 2) # [batch_size, n_hidden * num_directions(=2)]

-model = tf.matmul(new_hidden_state, out)
+model = tf.matmul(context, out)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
10 changes: 6 additions & 4 deletions 4-3.Bi-LSTM(Attention)/Bi-LSTM(Attention)-Torch.py
@@ -45,12 +45,14 @@ def __init__(self):
self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)
self.out = nn.Linear(n_hidden * 2, num_classes)

-def attention_net(self, lstm_output, final_state): # lstm_output : [batch_size, len_seq, n_hidden * num_directions(=2)]
-hidden = final_state.view(-1, n_hidden * 2, 1) # hidden : [batch_size, n_hidden * num_directions(=2), 1]
+# lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
+def attention_net(self, lstm_output, final_state):
+hidden = final_state.view(-1, n_hidden * 2, 1) # hidden : [batch_size, n_hidden * num_directions(=2), 1(=n_layer)]
attn_weights = torch.bmm(lstm_output, hidden).squeeze(2) # attn_weights : [batch_size, n_step]
soft_attn_weights = F.softmax(attn_weights, 1)
-new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
-return new_hidden_state, soft_attn_weights.data.numpy() # new_hidden_state : [batch_size, n_hidden * num_directions(=2)]
+# [batch_size, n_hidden * num_directions(=2), n_step] * [batch_size, n_step, 1] = [batch_size, n_hidden * num_directions(=2), 1]
+context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
+return context, soft_attn_weights.data.numpy() # context : [batch_size, n_hidden * num_directions(=2)]

def forward(self, X):
input = self.embedding(X) # input : [batch_size, len_seq, embedding_dim]
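Note: attention_net above pools the BiLSTM outputs into a single vector, the quantity this commit renames from new_hidden_state to context. A minimal sketch with assumed toy sizes, following the shape comment added in the diff (illustrative only, not part of the commit):

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn.functional as F

batch_size, n_step, n_hidden = 3, 6, 5
lstm_output = torch.randn(batch_size, n_step, n_hidden * 2)   # bidirectional outputs
final_hidden = torch.randn(batch_size, n_hidden * 2, 1)       # concatenated last states

attn_weights = torch.bmm(lstm_output, final_hidden).squeeze(2)  # [batch_size, n_step]
soft_attn_weights = F.softmax(attn_weights, 1)
# [batch, 2h, n_step] x [batch, n_step, 1] -> [batch, 2h, 1] -> [batch, 2h]
context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)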
21 changes: 12 additions & 9 deletions 4-4.Transformer/Transformer-Torch.py
@@ -24,9 +24,11 @@
number_dict = {i: w for i, w in enumerate(set((sentences[1]+' '+sentences[2]).split()))}
tgt_vocab_size = len(tgt_vocab)

-n_step = 5 # number of Step
+src_len = 5
+tgt_len = 5
+
d_model = 512 # Embedding Size
-d_inner = 2048
+d_ff = 2048 # FeedForward dimension
d_k = d_v = 64 # dimension of K(=Q), V
n_layers = 6 # number of Encoder of Decoder Layer
n_heads = 8 # number of heads in Multi-Head Attention
@@ -75,6 +77,7 @@ def __init__(self):
def forward(self, Q, K, V, attn_mask=None):
# q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
residual, batch_size = Q, Q.size(0)
+# (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) # q_s: [batch_size x n_heads x len_q x d_k]
k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2) # k_s: [batch_size x n_heads x len_k x d_k]
v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2) # v_s: [batch_size x n_heads x len_k x d_v]
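Note: the comment added above describes how each projection is split into attention heads. A standalone sketch of that reshape, using the hyperparameters defined earlier in this file; the toy batch and sequence length are assumptions, and the snippet is illustrative only, not part of the commit.

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn as nn

batch_size, seq_len = 2, 5                     # assumed toy sizes
d_model, n_heads, d_k = 512, 8, 64             # as defined earlier in this file
W_Q = nn.Linear(d_model, d_k * n_heads)

Q = torch.randn(batch_size, seq_len, d_model)        # (B, S, D)
q_s = W_Q(Q)                                         # (B, S, D)    project
q_s = q_s.view(batch_size, -1, n_heads, d_k)         # (B, S, H, W) split into heads
q_s = q_s.transpose(1, 2)                            # (B, H, S, W) one matrix per head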
Expand All @@ -90,8 +93,8 @@ def forward(self, Q, K, V, attn_mask=None):
class PoswiseFeedForwardNet(nn.Module):
def __init__(self):
super(PoswiseFeedForwardNet, self).__init__()
-self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_inner, kernel_size=1)
-self.conv2 = nn.Conv1d(in_channels=d_inner, out_channels=d_model, kernel_size=1)
+self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)

def forward(self, inputs):
residual = inputs # inputs : [batch_size, len_q, d_model]
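Note: the layer above (with d_inner renamed to the conventional d_ff, the Transformer's 2048-dim feed-forward width) uses kernel-size-1 convolutions, which act as the same linear map applied independently at every position. A minimal sketch with assumed toy sizes, illustrative only and not part of the commit:

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn as nn

batch_size, len_q, d_model, d_ff = 2, 5, 512, 2048
conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)

inputs = torch.randn(batch_size, len_q, d_model)
hidden = torch.relu(conv1(inputs.transpose(1, 2)))   # Conv1d wants [batch, channels, length]
output = conv2(hidden).transpose(1, 2)               # back to [batch_size, len_q, d_model]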
@@ -127,7 +130,7 @@ class Encoder(nn.Module):
def __init__(self):
super(Encoder, self).__init__()
self.src_emb = nn.Embedding(src_vocab_size, d_model)
-self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(n_step+1 , d_model),freeze=True)
+self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_len+1 , d_model),freeze=True)
self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

def forward(self, enc_inputs): # enc_inputs : [batch_size x source_len]
Expand All @@ -142,7 +145,7 @@ class Decoder(nn.Module):
def __init__(self):
super(Decoder, self).__init__()
self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
-self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(n_step+1 , d_model),freeze=True)
+self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(tgt_len+1 , d_model),freeze=True)
self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

def forward(self, dec_inputs, enc_inputs, enc_outputs): # dec_inputs : [batch_size x target_len]
Expand Down Expand Up @@ -198,11 +201,11 @@ def showgraph(attn):
predict = predict.data.max(1, keepdim=True)[1]
print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])

-print('first head of last enc_self_attns')
+print('first head of last state enc_self_attns')
showgraph(enc_self_attns)

-print('first head of last dec_self_attns')
+print('first head of last state dec_self_attns')
showgraph(dec_self_attns)

-print('first head of last dec_enc_attns')
+print('first head of last state dec_enc_attns')
showgraph(dec_enc_attns)
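Note: with n_step split into src_len and tgt_len, the Encoder and Decoder above size their frozen position tables as src_len+1 and tgt_len+1 rows respectively. Below is a minimal sketch of such a sinusoid table; the helper name sinusoid_table and the exact indexing are assumptions for illustration, not the repository's get_sinusoid_encoding_table, and the snippet is not part of the commit.

# --- illustrative sketch, not part of the diff ---
import numpy as np
import torch

def sinusoid_table(n_position, d_model):
    # PE[pos, 2i]   = sin(pos / 10000^(2i / d_model))
    # PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
    pos = np.arange(n_position)[:, None]
    i = np.arange(d_model)[None, :]
    angle = pos / np.power(10000, 2 * (i // 2) / d_model)
    table = np.zeros((n_position, d_model))
    table[:, 0::2] = np.sin(angle[:, 0::2])
    table[:, 1::2] = np.cos(angle[:, 1::2])
    return torch.FloatTensor(table)

src_len, d_model = 5, 512
pos_emb = torch.nn.Embedding.from_pretrained(sinusoid_table(src_len + 1, d_model), freeze=True)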