Add BERT model
graykode committed Feb 2, 2019
1 parent cb67363 commit 1a0f1e0
Showing 7 changed files with 281 additions and 42 deletions.
20 changes: 10 additions & 10 deletions 4-2.Seq2Seq(Attention)/Seq2Seq(Attention)-Tensor.py
@@ -26,34 +26,34 @@ def make_batch(sentences):
return input_batch, output_batch, target_batch

# Model
-enc_inputs = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, max_len, n_class]
-dec_inputs = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, max_len, n_class]
-targets = tf.placeholder(tf.int64, [1, n_step]) # [batch_size, max_len], not one-hot
+enc_inputs = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, n_step, n_class]
+dec_inputs = tf.placeholder(tf.float32, [None, None, n_class]) # [batch_size, n_step, n_class]
+targets = tf.placeholder(tf.int64, [1, n_step]) # [batch_size, n_step], not one-hot

# Linear for attention
attn = tf.Variable(tf.random_normal([n_hidden, n_hidden]))
out = tf.Variable(tf.random_normal([n_hidden * 2, n_class]))

-def get_att_score(dec_output, enc_output): # enc_output [max_len, n_hidden]
+def get_att_score(dec_output, enc_output): # enc_output [n_step, n_hidden]
score = tf.squeeze(tf.matmul(enc_output, attn), 0) # score : [n_hidden]
dec_output = tf.squeeze(dec_output, [0, 1]) # dec_output : [n_hidden]
return tf.tensordot(dec_output, score, 1) # inner product make scalar value

def get_att_weight(dec_output, enc_outputs):
-attn_scores = [] # list of attention scalar : [max_len]
-enc_outputs = tf.transpose(enc_outputs, [1, 0, 2]) # enc_outputs : [max_len, batch_size, n_hidden]
+attn_scores = [] # list of attention scalar : [n_step]
+enc_outputs = tf.transpose(enc_outputs, [1, 0, 2]) # enc_outputs : [n_step, batch_size, n_hidden]
for i in range(n_step):
attn_scores.append(get_att_score(dec_output, enc_outputs[i]))

# Normalize scores to weights in range 0 to 1
-return tf.reshape(tf.nn.softmax(attn_scores), [1, 1, -1]) # [1, 1, max_len]
+return tf.reshape(tf.nn.softmax(attn_scores), [1, 1, -1]) # [1, 1, n_step]

model = []
Attention = []
with tf.variable_scope('encode'):
enc_cell = tf.nn.rnn_cell.BasicRNNCell(n_hidden)
enc_cell = tf.nn.rnn_cell.DropoutWrapper(enc_cell, output_keep_prob=0.5)
-# enc_outputs : [batch_size(=1), max_len(=decoder_step), n_hidden(=128)]
+# enc_outputs : [batch_size(=1), n_step(=decoder_step), n_hidden(=128)]
# enc_hidden : [batch_size(=1), n_hidden(=128)]
enc_outputs, enc_hidden = tf.nn.dynamic_rnn(enc_cell, enc_inputs, dtype=tf.float32)

Expand All @@ -75,10 +75,10 @@ def get_att_weight(dec_output, enc_outputs):
dec_output = tf.squeeze(dec_output, 0) # [1, n_step]
context = tf.squeeze(context, 1) # [1, n_hidden]

-model.append(tf.matmul(tf.concat((dec_output, context), 1), out)) # [max_len, batch_size(=1), n_class]
+model.append(tf.matmul(tf.concat((dec_output, context), 1), out)) # [n_step, batch_size(=1), n_class]

trained_attn = tf.stack([Attention[0], Attention[1], Attention[2], Attention[3], Attention[4]], 0) # to show attention matrix
-model = tf.transpose(model, [1, 0, 2]) # model : [max_len, n_class]
+model = tf.transpose(model, [1, 0, 2]) # model : [n_step, n_class]
prediction = tf.argmax(model, 2)
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=model, labels=targets))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
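Note: in the file above, get_att_score and get_att_weight score each encoder step against the current decoder state through the learned matrix attn and softmax-normalize the scores. The short PyTorch snippet below is an illustrative sketch of that bilinear scoring, not part of the commit; the toy tensor sizes are assumptions.

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn.functional as F

n_step, n_hidden = 5, 128                    # toy sizes
attn = torch.randn(n_hidden, n_hidden)       # plays the role of the learned `attn` matrix
enc_outputs = torch.randn(n_step, n_hidden)  # one encoder output per source step
dec_output = torch.randn(n_hidden)           # decoder state at the current step

# score_i = dec_output . (enc_output_i @ attn), then softmax over the n_step scores
scores = torch.stack([dec_output @ (enc_outputs[i] @ attn) for i in range(n_step)])
attn_weights = F.softmax(scores, dim=0)      # [n_step], sums to 1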
8 changes: 4 additions & 4 deletions 4-2.Seq2Seq(Attention)/Seq2Seq(Attention)-Torch.py
@@ -41,10 +41,10 @@ def __init__(self):
self.out = nn.Linear(n_hidden * 2, n_class)

def forward(self, enc_inputs, hidden, dec_inputs):
-enc_inputs = enc_inputs.transpose(0, 1) # enc_inputs: [max_len(=n_step, time step), batch_size, n_hidden]
-dec_inputs = dec_inputs.transpose(0, 1) # dec_inputs: [max_len(=n_step, time step), batch_size, n_hidden]
+enc_inputs = enc_inputs.transpose(0, 1) # enc_inputs: [n_step(=n_step, time step), batch_size, n_hidden]
+dec_inputs = dec_inputs.transpose(0, 1) # dec_inputs: [n_step(=n_step, time step), batch_size, n_hidden]

-# enc_outputs : [max_len, batch_size, num_directions(=1) * n_hidden], matrix F
+# enc_outputs : [n_step, batch_size, num_directions(=1) * n_hidden], matrix F
# enc_hidden : [num_layers(=1) * num_directions(=1), batch_size, n_hidden]
enc_outputs, enc_hidden = self.enc_cell(enc_inputs, hidden)

Expand All @@ -54,7 +54,7 @@ def forward(self, enc_inputs, hidden, dec_inputs):
model = Variable(torch.empty([n_step, 1, n_class]))

for i in range(n_step): # each time step
-# dec_output : [max_len(=1), batch_size(=1), num_directions(=1) * n_hidden]
+# dec_output : [n_step(=1), batch_size(=1), num_directions(=1) * n_hidden]
# hidden : [num_layers(=1) * num_directions(=1), batch_size(=1), n_hidden]
dec_output, hidden = self.dec_cell(dec_inputs[i].unsqueeze(0), hidden)
attn_weights = self.get_att_weight(dec_output, enc_outputs) # attn_weights : [1, 1, n_step]
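Note: in the PyTorch version above, each decoder step turns its attention weights into a context vector with a batched matmul and concatenates it with the decoder output before the output projection. Below is a minimal sketch of that single step, not part of the commit; the sizes and the stand-in weights are assumptions (batch_size = 1 as in the tutorial).

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn as nn

n_step, n_hidden, n_class = 5, 128, 11           # toy sizes
out = nn.Linear(n_hidden * 2, n_class)           # plays the role of self.out

enc_outputs = torch.randn(n_step, 1, n_hidden)   # [n_step, batch_size(=1), n_hidden]
dec_output = torch.randn(1, 1, n_hidden)         # decoder output for one time step
attn_weights = torch.softmax(torch.randn(1, 1, n_step), dim=2)  # stand-in weights

context = attn_weights.bmm(enc_outputs.transpose(0, 1))         # [1, 1, n_hidden]
logits = out(torch.cat((dec_output.squeeze(0), context.squeeze(1)), 1))  # [1, n_class]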
8 changes: 4 additions & 4 deletions 4-3.Bi-LSTM(Attention)/Bi-LSTM(Attention)-Tensor.py
@@ -50,12 +50,12 @@
final_hidden_state = tf.concat([final_state[1][0], final_state[1][1]], 1) # final_hidden_state : [batch_size, n_hidden * num_directions(=2)]
final_hidden_state = tf.expand_dims(final_hidden_state, 2) # final_hidden_state : [batch_size, n_hidden * num_directions(=2), 1]

-attn_weights = tf.squeeze(tf.matmul(output, final_hidden_state), 2) # attn_weights : [batch_size, len_seq]
+attn_weights = tf.squeeze(tf.matmul(output, final_hidden_state), 2) # attn_weights : [batch_size, n_step]
soft_attn_weights = tf.nn.softmax(attn_weights, 1)
-new_hidden_state = tf.matmul(tf.transpose(output, [0, 2, 1]), tf.expand_dims(soft_attn_weights, 2)) # new_hidden_state : [batch_size, n_hidden * num_directions(=2), 1]
-new_hidden_state = tf.squeeze(new_hidden_state, 2) # [batch_size, n_hidden * num_directions(=2)]
+context = tf.matmul(tf.transpose(output, [0, 2, 1]), tf.expand_dims(soft_attn_weights, 2)) # context : [batch_size, n_hidden * num_directions(=2), 1]
+context = tf.squeeze(context, 2) # [batch_size, n_hidden * num_directions(=2)]

-model = tf.matmul(new_hidden_state, out)
+model = tf.matmul(context, out)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=model, labels=Y))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
10 changes: 6 additions & 4 deletions 4-3.Bi-LSTM(Attention)/Bi-LSTM(Attention)-Torch.py
@@ -45,12 +45,14 @@ def __init__(self):
self.lstm = nn.LSTM(embedding_dim, n_hidden, bidirectional=True)
self.out = nn.Linear(n_hidden * 2, num_classes)

-def attention_net(self, lstm_output, final_state): # lstm_output : [batch_size, len_seq, n_hidden * num_directions(=2)]
-hidden = final_state.view(-1, n_hidden * 2, 1) # hidden : [batch_size, n_hidden * num_directions(=2), 1]
+# lstm_output : [batch_size, n_step, n_hidden * num_directions(=2)], F matrix
+def attention_net(self, lstm_output, final_state):
+hidden = final_state.view(-1, n_hidden * 2, 1) # hidden : [batch_size, n_hidden * num_directions(=2), 1(=n_layer)]
attn_weights = torch.bmm(lstm_output, hidden).squeeze(2) # attn_weights : [batch_size, n_step]
soft_attn_weights = F.softmax(attn_weights, 1)
-new_hidden_state = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
-return new_hidden_state, soft_attn_weights.data.numpy() # new_hidden_state : [batch_size, n_hidden * num_directions(=2)]
+# [batch_size, n_hidden * num_directions(=2), n_step] * [batch_size, n_step, 1] = [batch_size, n_hidden * num_directions(=2), 1]
+context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
+return context, soft_attn_weights.data.numpy() # context : [batch_size, n_hidden * num_directions(=2)]

def forward(self, X):
input = self.embedding(X) # input : [batch_size, len_seq, embedding_dim]
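Note: attention_net above pools the BiLSTM outputs into a single vector, the quantity this commit renames from new_hidden_state to context. A minimal sketch with assumed toy sizes, following the shape comment added in the diff (illustrative only, not part of the commit):

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn.functional as F

batch_size, n_step, n_hidden = 3, 6, 5
lstm_output = torch.randn(batch_size, n_step, n_hidden * 2)   # bidirectional outputs
final_hidden = torch.randn(batch_size, n_hidden * 2, 1)       # concatenated last states

attn_weights = torch.bmm(lstm_output, final_hidden).squeeze(2)  # [batch_size, n_step]
soft_attn_weights = F.softmax(attn_weights, 1)
# [batch, 2h, n_step] x [batch, n_step, 1] -> [batch, 2h, 1] -> [batch, 2h]
context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)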
21 changes: 12 additions & 9 deletions 4-4.Transformer/Transformer-Torch.py
@@ -24,9 +24,11 @@
number_dict = {i: w for i, w in enumerate(set((sentences[1]+' '+sentences[2]).split()))}
tgt_vocab_size = len(tgt_vocab)

-n_step = 5 # number of Step
+src_len = 5
+tgt_len = 5
+
d_model = 512 # Embedding Size
-d_inner = 2048
+d_ff = 2048 # FeedForward dimension
d_k = d_v = 64 # dimension of K(=Q), V
n_layers = 6 # number of Encoder of Decoder Layer
n_heads = 8 # number of heads in Multi-Head Attention
@@ -75,6 +77,7 @@ def __init__(self):
def forward(self, Q, K, V, attn_mask=None):
# q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
residual, batch_size = Q, Q.size(0)
+# (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) # q_s: [batch_size x n_heads x len_q x d_k]
k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2) # k_s: [batch_size x n_heads x len_k x d_k]
v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2) # v_s: [batch_size x n_heads x len_k x d_v]
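Note: the comment added above describes how each projection is split into attention heads. A standalone sketch of that reshape, using the hyperparameters defined earlier in this file; the toy batch and sequence length are assumptions, and the snippet is illustrative only, not part of the commit.

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn as nn

batch_size, seq_len = 2, 5                     # assumed toy sizes
d_model, n_heads, d_k = 512, 8, 64             # as defined earlier in this file
W_Q = nn.Linear(d_model, d_k * n_heads)

Q = torch.randn(batch_size, seq_len, d_model)        # (B, S, D)
q_s = W_Q(Q)                                         # (B, S, D)    project
q_s = q_s.view(batch_size, -1, n_heads, d_k)         # (B, S, H, W) split into heads
q_s = q_s.transpose(1, 2)                            # (B, H, S, W) one matrix per head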
Expand All @@ -90,8 +93,8 @@ def forward(self, Q, K, V, attn_mask=None):
class PoswiseFeedForwardNet(nn.Module):
def __init__(self):
super(PoswiseFeedForwardNet, self).__init__()
-self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_inner, kernel_size=1)
-self.conv2 = nn.Conv1d(in_channels=d_inner, out_channels=d_model, kernel_size=1)
+self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
+self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)

def forward(self, inputs):
residual = inputs # inputs : [batch_size, len_q, d_model]
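Note: the layer above (with d_inner renamed to the conventional d_ff, the Transformer's 2048-dim feed-forward width) uses kernel-size-1 convolutions, which act as the same linear map applied independently at every position. A minimal sketch with assumed toy sizes, illustrative only and not part of the commit:

# --- illustrative sketch, not part of the diff ---
import torch
import torch.nn as nn

batch_size, len_q, d_model, d_ff = 2, 5, 512, 2048
conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1)
conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1)

inputs = torch.randn(batch_size, len_q, d_model)
hidden = torch.relu(conv1(inputs.transpose(1, 2)))   # Conv1d wants [batch, channels, length]
output = conv2(hidden).transpose(1, 2)               # back to [batch_size, len_q, d_model]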
@@ -127,7 +130,7 @@ class Encoder(nn.Module):
def __init__(self):
super(Encoder, self).__init__()
self.src_emb = nn.Embedding(src_vocab_size, d_model)
-self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(n_step+1 , d_model),freeze=True)
+self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(src_len+1 , d_model),freeze=True)
self.layers = nn.ModuleList([EncoderLayer() for _ in range(n_layers)])

def forward(self, enc_inputs): # enc_inputs : [batch_size x source_len]
Expand All @@ -142,7 +145,7 @@ class Decoder(nn.Module):
def __init__(self):
super(Decoder, self).__init__()
self.tgt_emb = nn.Embedding(tgt_vocab_size, d_model)
-self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(n_step+1 , d_model),freeze=True)
+self.pos_emb = nn.Embedding.from_pretrained(get_sinusoid_encoding_table(tgt_len+1 , d_model),freeze=True)
self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])

def forward(self, dec_inputs, enc_inputs, enc_outputs): # dec_inputs : [batch_size x target_len]
Expand Down Expand Up @@ -198,11 +201,11 @@ def showgraph(attn):
predict = predict.data.max(1, keepdim=True)[1]
print(sentences[0], '->', [number_dict[n.item()] for n in predict.squeeze()])

-print('first head of last enc_self_attns')
+print('first head of last state enc_self_attns')
showgraph(enc_self_attns)

-print('first head of last dec_self_attns')
+print('first head of last state dec_self_attns')
showgraph(dec_self_attns)

-print('first head of last dec_enc_attns')
+print('first head of last state dec_enc_attns')
showgraph(dec_enc_attns)
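Note: with n_step split into src_len and tgt_len, the Encoder and Decoder above size their frozen position tables as src_len+1 and tgt_len+1 rows respectively. Below is a minimal sketch of such a sinusoid table; the helper name sinusoid_table and the exact indexing are assumptions for illustration, not the repository's get_sinusoid_encoding_table, and the snippet is not part of the commit.

# --- illustrative sketch, not part of the diff ---
import numpy as np
import torch

def sinusoid_table(n_position, d_model):
    # PE[pos, 2i]   = sin(pos / 10000^(2i / d_model))
    # PE[pos, 2i+1] = cos(pos / 10000^(2i / d_model))
    pos = np.arange(n_position)[:, None]
    i = np.arange(d_model)[None, :]
    angle = pos / np.power(10000, 2 * (i // 2) / d_model)
    table = np.zeros((n_position, d_model))
    table[:, 0::2] = np.sin(angle[:, 0::2])
    table[:, 1::2] = np.cos(angle[:, 1::2])
    return torch.FloatTensor(table)

src_len, d_model = 5, 512
pos_emb = torch.nn.Embedding.from_pretrained(sinusoid_table(src_len + 1, d_model), freeze=True)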