diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/models/commons/__init__.py b/models/commons/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/commons/initializer.py b/models/commons/initializer.py new file mode 100644 index 0000000..1aa22c6 --- /dev/null +++ b/models/commons/initializer.py @@ -0,0 +1,29 @@ +import torch.nn as nn + +from utils import constant + + +def init_rnn_wt(rnn): + for names in rnn._all_weights: + for name in names: + if name.startswith('weight_'): + wt = getattr(rnn, name) + nn.init.xavier_uniform_(wt) + # wt.data.uniform_(-constant.rand_unif_init_mag, constant.rand_unif_init_mag) + elif name.startswith('bias_'): + # set forget bias to 1 + bias = getattr(rnn, name) + n = bias.size(0) + start, end = n // 4, n // 2 + bias.data.fill_(0.) + bias.data[start:end].fill_(1.) + +def init_linear_wt(linear): + # linear.weight.data.normal_(std=constant.trunc_norm_init_std) + nn.init.xavier_uniform_(linear.weight) + if linear.bias is not None: + n = linear.bias.size(0) + start, end = n // 4, n // 2 + linear.bias.data.fill_(0.) + linear.bias.data[start:end].fill_(1.) + # linear.bias.data.nomral_(std=constant.trunc_norm_init_std) diff --git a/models/commons/vae_lib.py b/models/commons/vae_lib.py new file mode 100644 index 0000000..abb5c21 --- /dev/null +++ b/models/commons/vae_lib.py @@ -0,0 +1,71 @@ +import os +import math +import time +import pprint +import random + +from tqdm import tqdm +import dill as pickle +import numpy as np +from numpy import random + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence + +from utils import constant + + +def gumbel_softmax(logits, dim, tau=1.0): + """ + Sample z ~ log p(z) + G(0, 1) + """ + eps=1e-20 + noise = torch.rand(logits.size()) + noise = -torch.log(-torch.log(noise + eps) + eps) # gumble noise + if constant.USE_CUDA: + noise = noise.float().cuda() + return F.softmax((logits + noise) / tau, dim=dim) + +def reparameterization(mu, logvar, z_dim): + """ + Reparameterization trick: z = mu + std*eps; eps ~ N(0, I) + """ + eps = torch.randn(z_dim) + eps = eps.cuda() if constant.USE_CUDA else eps + return mu + torch.exp(logvar/2) * eps + +def split_z(z, B, M, K): + return z.view(B, M, K) + +def merge_z(z, B, M, K): + return z.view(B, M * K) + +def cat_mi(p, q): + pass + +def cat_kl(logp, logq, dim=1): + """ + \sum q * log(q/p) + """ + if logq.dim() > 2: + logq = logq.squeeze() + + q = torch.exp(logq) + kl = torch.sum(q * (logq - logp), dim=dim) + return torch.mean(kl) + +def norm_kl(recog_mu, recog_logvar, prior_mu=None, prior_logvar=None): + # find the KL divergence between two Gaussian distributions (defaults to standard normal for prior) + if prior_mu is None: + prior_mu = torch.zeros(1) + prior_logvar = torch.ones(1) + if constant.USE_CUDA: + prior_mu = prior_mu.cuda() + prior_logvar = prior_logvar.cuda() + loss = 1.0 + (recog_logvar - prior_logvar) + loss -= torch.div(torch.pow(prior_mu - recog_mu, 2), torch.exp(prior_logvar)) + loss -= torch.div(torch.exp(recog_logvar), torch.exp(prior_logvar)) + kl_loss = -0.5 * torch.mean(loss, dim=1) + return torch.mean(kl_loss) diff --git a/models/decoders/__init__.py b/models/decoders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/decoders/rnn_decoder.py b/models/decoders/rnn_decoder.py new file mode 100644 index 0000000..8596877 --- 
/dev/null +++ b/models/decoders/rnn_decoder.py @@ -0,0 +1,74 @@ +import math + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + +from models.commons.attention import Attention +from models.commons.initializer import init_rnn_wt, init_linear_wt +from utils import constant + + +class RNNDecoder(nn.Module): + def __init__(self, V, D, H, L=1, embedding=None): + super(RNNDecoder, self).__init__() + self.V = V + self.H = H + self.L = L + self.D = D + if constant.attn != 'none': + self.attention = Attention(H, constant.attn) + # self.dropout = nn.Dropout(constant.dropout) + + self.cuda = constant.USE_CUDA + self.embeddings_cpu = constant.embeddings_cpu + + if embedding is not None: + self.embedding = embedding + else: + self.embedding = nn.Embedding(V, D) + self.embedding.weight.requires_grad = True + + if constant.lstm: + self.rnn = nn.LSTM(D, H, L, batch_first=True, bidirectional=False) + else: + self.rnn = nn.GRU(D, H, L, batch_first=True, bidirectional=False) + + self.out = nn.Linear(H, V) + if constant.weight_tie: + self.out = nn.Linear(H, V) + self.out.weight = self.embedding.weight # Assuming H == D. They share the weight, and updated together + + def forward(self, x_t, last_h, src_hs=None, use_attn=False): + # Note: we run this in a for loop (mulitple batches over single token at a time) + # batch_size = x_t.size(0) + x = self.embedding(x_t) + if self.cuda and self.embeddings_cpu: + x = x.cuda() + # x = self.dropout(x) + # x = x.view(1, batch_size, self.H) # S=1 x B x N + outputs, dec_h_t = self.rnn(x.unsqueeze(1), last_h) # [B, 1, H] & [1, B, H] + + if use_attn: + h, _ = self.attention(src_hs, src_hs, outputs) + # output = self.out(self.linear(h)) + output = self.out(h) + else: + # output = self.out(self.linear(outputs)) + output = self.out(outputs) + + return output.squeeze(), dec_h_t + + def predict_one(self, x_t, last_h, src_hs=None, use_attn=False): + with torch.no_grad(): + x = self.embedding(x_t) + outputs, dec_h_t = self.rnn(x.unsqueeze(1), last_h) # [B, 1, H] & [1, B, H] + if use_attn: + h, _ = self.attention(src_hs, src_hs, outputs) + output = self.out(h) + else: + output = self.out(outputs) + return output.squeeze(), dec_h_t \ No newline at end of file diff --git a/models/encoders/__init__.py b/models/encoders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/encoders/rnn_encoder.py b/models/encoders/rnn_encoder.py new file mode 100644 index 0000000..83cd0d6 --- /dev/null +++ b/models/encoders/rnn_encoder.py @@ -0,0 +1,80 @@ +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence + +from models.commons.initializer import init_rnn_wt +from utils import constant + + +class RNNEncoder(nn.Module): + def __init__(self, V, D, H, L=1, embedding=None): + super(RNNEncoder, self).__init__() + self.V = V + self.H = H + self.L = L + self.D = D + self.bi = True if constant.bi == 'bi' else False + self.use_lstm = constant.lstm + # self.dropout = nn.Dropout(constant.dropout) + + self.cuda = constant.USE_CUDA + + if embedding is not None: + self.embedding = embedding + else: + self.embedding = nn.Embedding(V, D) + self.embedding.weight.requires_grad = True + + self.embedding_dropout = nn.Dropout(constant.dropout) + + if constant.lstm: + self.rnn = nn.LSTM(D, H, L, batch_first=True, bidirectional=self.bi) + else: + self.rnn = 
nn.GRU(D, H, L, batch_first=True, bidirectional=self.bi) + + def soft_embed(self, x): + # x: (T, B, V), (B, V) or (V) + return (x.unsqueeze(len(x.shape)) * self.embedding.weight).sum(dim=len(x.shape)-1) + + def forward(self, seqs, lens, soft_encode=False, logits=None): + # Note: we run this all at once (over multiple batches of multiple sequences) + # x, lens = pad_packed_sequence(pack_sequence(seqs)) + if not soft_encode: + x = self.embedding(seqs) + x = self.embedding_dropout(x) + else: + x = self.soft_embed(logits).transpose(0, 1).contiguous() + x = pack_padded_sequence(x, lens, batch_first=True) + outputs, hidden = self.rnn(x) + outputs, _ = pad_packed_sequence(outputs, batch_first=True) + + if self.use_lstm: + h, c = hidden + + if self.bi: + # [2, B, H] => [B, 2H] + if self.use_lstm: + h = h.transpose(0, 1).contiguous().view(-1, 2*self.H) + c = c.transpose(0, 1).contiguous().view(-1, 2*self.H) + # h = torch.cat((h[0], h[1]), 1) + # c = torch.cat((c[0], c[1]), 1) + return outputs, h.squeeze(), c.squeeze() + else: + h = torch.cat((hidden[0], hidden[1]), 1) + return outputs, h.squeeze() + else: + return outputs, hidden.squeeze() + + def predict_one(self, seq): + with torch.no_grad(): + x = self.embedding(seq) + outputs, hidden = self.rnn(x) + if self.bi: + # [2, B, H] => [B, 2H] + hidden = torch.cat((hidden[0], hidden[1]), 1) + return outputs, hidden + else: + return outputs, hidden \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..07e2f98 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,9 @@ +from .utils import * +from .dataset import * +from .sentiment_dataset import * +from .lang import * +from .bleu import * +from .beam_omt import * +from .rouge import * +from .masked_cross_entropy import * +from .embedding_metrics import * \ No newline at end of file diff --git a/utils/beam.py b/utils/beam.py new file mode 100644 index 0000000..fa73d18 --- /dev/null +++ b/utils/beam.py @@ -0,0 +1,202 @@ +import sys +import os +import time + +import torch +import torch.nn.functional as F + +try: + from utils import constant +except ImportError: + import constant + +"""Beam search implementation in PyTorch.""" +# +# +# hyp1#-hyp1---hyp1 -hyp1 +# \ / +# hyp2 \-hyp2 /-hyp2#hyp2 +# / \ +# hyp3#-hyp3---hyp3 -hyp3 +# ======================== +# +# Takes care of beams, back pointers, and scores. + +# Code borrowed from PyTorch OpenNMT example +# https://github.com/pytorch/examples/blob/master/OpenNMT/onmt/Beam.py + + +class Beam(object): + """Ordered beam of candidate outputs.""" + + def __init__(self, size): + """Initialize params.""" + self.size = size + self.done = False + self.pad = constant.pad_idx + self.bos = constant.sou_idx + self.eos = constant.eou_idx + self.t = torch.cuda if constant.USE_CUDA else torch + + # The score for each translation on the beam. + self.scores = self.t.FloatTensor(size).zero_() + + # The backpointers at each time-step. + self.prevKs = [] + + # The outputs at each time-step. + self.nextYs = [self.t.LongTensor(size).fill_(self.pad)] + self.nextYs[0][0] = self.bos + + def __str__(self): + s = " \n \ + Beam Search Object: \n \ + Beam Size: {}\n \ + Pad IDX: {}\n \ + Start IDX: {}\n \ + End IDX: {}\n \ + Scores: {}\n \ + Prev Ks: {}\n \ + Next Ys: {}\n \ + " + return s.format(self.size, self.pad, self.bos, self.eos, \ + self.scores, self.prevKs, self.nextYs) + + # Get the outputs for the current timestep. 
+ def get_current_state(self): + """Get state of beam.""" + return self.nextYs[-1] + + # Get the backpointers for the current timestep. + def get_current_origin(self): + """Get the backpointer to the beam at this step.""" + return self.prevKs[-1] + + # Given prob over words for every last beam `wordLk` + # : Compute and update the beam search. + # + # Parameters: + # + # * `wordLk`- + log probs of advancing from the last step (K x V) + # K is what? => beam size + # Returns: True if beam search is complete. + + def advance(self, wordLk): + """Advance the beam.""" + num_words = wordLk.size(1) + + # # force the output to be longer than self.min_length + # cur_len = len(self.next_ys) + # if cur_len < self.min_length: + # for k in range(len(word_probs)): + # word_probs[k][self._eos] = -1e20 + + # Sum the previous scores. + if len(self.prevKs) > 0: + beam_lk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) + # Don't let EOS have children. + for i in range(self.nextYs[-1].size(0)): + if self.nextYs[-1][i] == self.eos: + beam_lk[i] = -1e20 + else: + beam_lk = wordLk[0] + + print(beam_lk) + flat_beam_lk = beam_lk.view(-1) # squeeze + + bestScores, bestScoresId = flat_beam_lk.topk(self.size, 0, True, True) + self.scores = bestScores + + # bestScoresId is flattened (K, K*V) array, so calculate which + # word and beam each score came from + prev_k = bestScoresId / num_words + # print(bestScores) + # print(bestScoresId) + # print(prev_k) + # print(prev_k * num_words) + # print(bestScoresId - prev_k * num_words) + self.prevKs.append(prev_k) + self.nextYs.append(bestScoresId - prev_k * num_words) # V+1th word => 0th word + + # End condition is when top-of-beam is EOS. + if self.nextYs[-1][0] == self.eos: + self.done = True + + for i in range(1, self.size): + if self.nextYs[-1][i] == self.eos: + self.scores[i] = -1e10 + + return self.done + + def sort_best(self): + """Sort the beam.""" + return torch.sort(self.scores, 0, True) + + # Get the score of the best in the beam. + def get_best(self): + """Get the most likely candidate.""" + scores, ids = self.sort_best() + return scores[1], ids[1] + + # Walk back to construct the full hypothesis. + # + # Parameters. + # + # * `k` - the position in the beam to construct. + # + # Returns. 
+ # + # The hypothesis + def get_hyp(self, k): + """Get hypotheses.""" + hyp = [] + # print(len(self.prevKs), len(self.nextYs)) + for j in range(len(self.prevKs) - 1, -1, -1): + hyp.append(self.nextYs[j + 1][k]) + k = self.prevKs[j][k] + + return hyp[::-1] + + +if __name__ == "__main__": + beam = Beam(constant.beam_size) + print(beam) + + V = 5 + words = [''] + probs = beam.t.distributions.normal.Normal(1.0, 2).sample((V,)) + # probs = beam.t.distributions.normal.Normal(1.0, 2).sample((beam.size, V)) + probs = F.log_softmax(probs, dim=0) + probs = beam.t.Tensor([0.35, 0.3, 0.2, 0.1, 0.05]) + # print(probs.max(), probs.argmax()) + # print(probs.shape) + # probs = probs.unsqueeze(1) + # print(probs.shape) + probs = probs.repeat(beam.size, 1) + print(probs.shape) + # probs = probs.unsqueeze(1).repeat(1, beam.size, 1) + # print(probs.shape) + topv, topi = probs.topk(beam.size, 1, True, True) + print(topv) + print(topi) + print() + while not beam.done: + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + print(beam.get_hyp(0)) + print(beam.get_best()) + # beam.advance(probs) + # print(beam) + # beam.advance(probs) + # print(beam) + # beam.advance(probs) + # print(beam) + break \ No newline at end of file diff --git a/utils/beam_omt.py b/utils/beam_omt.py new file mode 100644 index 0000000..b055655 --- /dev/null +++ b/utils/beam_omt.py @@ -0,0 +1,374 @@ +from __future__ import division + +import torch +import torch.nn.functional as F + +try: + from utils import constant +except ImportError: + import constant + +# Code borrowed from OpenNMT +# https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/translate + + +class Beam(object): + """ + Class for managing the internals of the beam search process. + Takes care of beams, back pointers, and scores. + Args: + size (int): beam size + pad, bos, eos (int): indices of padding, beginning, and ending. + n_best (int): nbest size to use + cuda (bool): use gpu + global_scorer (:obj:`GlobalScorer`) + """ + + def __init__(self, size, + n_best=1, + global_scorer=None, + min_length=0, + stepwise_penalty=False, + block_ngram_repeat=0, + exclusion_tokens=set()): + + self.size = size + self.tt = torch.cuda if constant.USE_CUDA else torch + + # The score for each translation on the beam. + self.scores = self.tt.FloatTensor(size).zero_() + self.all_scores = [] + + # The backpointers at each time-step. + self.prev_ks = [] + + # The outputs at each time-step. + self.next_ys = [self.tt.LongTensor(size) + # .fill_(constant.pad_idx)] + .fill_(constant.sou_idx)] + self.next_ys[0][0] = constant.sou_idx + + # Has EOS topped the beam yet. + self._eos = constant.eou_idx + self.eos_top = False + + # The attentions (matrix) for each time. + self.attn = [] + + # Time and k pair for finished. + self.finished = [] + self.n_best = n_best + + # Information for global scoring. 
+ self.global_scorer = global_scorer + self.global_state = {} + + # Minimum prediction length + self.min_length = min_length + + # Apply Penalty at every step + self.stepwise_penalty = stepwise_penalty + self.block_ngram_repeat = block_ngram_repeat + self.exclusion_tokens = exclusion_tokens + + def __str__(self): + s = " \n \ + Beam Search Object: \n \ + Beam Size: {}\n \ + End IDX: {}\n \ + Scores: {}\n \ + Prev Ks: {}\n \ + Next Ys: {}\n \ + " + return s.format(self.size, self._eos, self.scores, \ + self.prev_ks, self.next_ys)#, self.finished) + + def get_current_state(self): + "Get the outputs for the current timestep." + return self.next_ys[-1] + + def get_current_origin(self): + "Get the backpointers for the current timestep." + return self.prev_ks[-1] + + def advance(self, word_probs, attn_out=None): + """ + Given prob over words for every last beam `wordLk` and attention + `attn_out`: Compute and update the beam search. + Parameters: + * `word_probs`- probs of advancing from the last step (K x words) + * `attn_out`- attention at the last step + Returns: True if beam search is complete. + """ + num_words = word_probs.size(1) + # if self.stepwise_penalty: + # self.global_scorer.update_score(self, attn_out) + # force the output to be longer than self.min_length + cur_len = len(self.next_ys) + if cur_len < self.min_length: + for k in range(len(word_probs)): + word_probs[k][self._eos] = -1e20 + # Sum the previous scores. + if len(self.prev_ks) > 0: + beam_scores = word_probs + self.scores.unsqueeze(1) + # Don't let EOS have children. + for i in range(self.next_ys[-1].size(0)): + if self.next_ys[-1][i] == self._eos: + beam_scores[i] = -1e20 + + # Block ngram repeats + if self.block_ngram_repeat > 0: + ngrams = [] + le = len(self.next_ys) + for j in range(self.next_ys[-1].size(0)): + hyp, _ = self.get_hyp(le - 1, j) + ngrams = set() + fail = False + gram = [] + for i in range(le - 1): + # Last n tokens, n = block_ngram_repeat + gram = (gram + + [hyp[i].item()])[-self.block_ngram_repeat:] + # Skip the blocking if it is in the exclusion list + if set(gram) & self.exclusion_tokens: + continue + if tuple(gram) in ngrams: + fail = True + ngrams.add(tuple(gram)) + if fail: + beam_scores[j] = -10e20 + else: + beam_scores = word_probs[0] + flat_beam_scores = beam_scores.view(-1) + best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, + True, True) + + self.all_scores.append(self.scores) + self.scores = best_scores + + # best_scores_id is flattened (K, K*V) array, so calculate which + # word and beam each score came from + prev_k = best_scores_id / num_words + self.prev_ks.append(prev_k) + self.next_ys.append((best_scores_id - prev_k * num_words)) # V+1th word => 0th word + # self.attn.append(attn_out.index_select(0, prev_k)) + # self.global_scorer.update_global_state(self) + + for i in range(self.next_ys[-1].size(0)): + if self.next_ys[-1][i] == self._eos: + global_scores = self.global_scorer.score(self, self.scores) + s = global_scores[i] + self.finished.append((s, len(self.next_ys) - 1, i)) + + # End condition is when top-of-beam is EOS and no global score. + if self.next_ys[-1][0] == self._eos: + self.all_scores.append(self.scores) + self.eos_top = True + + def done(self): + return self.eos_top and len(self.finished) >= self.n_best + + def sort_finished(self, minimum=None): + if minimum is not None: + i = 0 + # Add from beam until we have minimum outputs. 
+ while len(self.finished) < minimum: + global_scores = self.global_scorer.score(self, self.scores) + s = global_scores[i] + self.finished.append((s, len(self.next_ys) - 1, i)) + i += 1 + + self.finished.sort(key=lambda a: -a[0]) + scores = [sc for sc, _, _ in self.finished] + ks = [(t, k) for _, t, k in self.finished] + return scores, ks + + def get_hyp(self, timestep, k): + """ + Walk back to construct the full hypothesis. + """ + hyp, attn = [], [] + for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1): + hyp.append(self.next_ys[j + 1][k]) + # attn.append(self.attn[j][k]) + k = self.prev_ks[j][k] + return hyp[::-1], None #, torch.stack(attn[::-1]) + + +class GNMTGlobalScorer(object): + """ + NMT re-ranking score from + "Google's Neural Machine Translation System" :cite:`wu2016google` + Args: + alpha (float): length parameter + beta (float): coverage parameter + coverage_penalty (float): coverage_penalty + length_penalty (float): length_penalty + """ + + def __init__(self, alpha=0.8, beta=5, coverage_penalty='none', length_penalty='wu'): + self.alpha = alpha + self.beta = beta + penalty_builder = PenaltyBuilder(coverage_penalty, length_penalty) + # Term will be subtracted from probability + self.cov_penalty = penalty_builder.coverage_penalty() + # Probability will be divided by this + self.length_penalty = penalty_builder.length_penalty() + + def score(self, beam, logprobs): + """ + Rescores a prediction based on penalty functions + """ + normalized_probs = self.length_penalty(beam, + logprobs, + self.alpha) + if not beam.stepwise_penalty: + penalty = self.cov_penalty(beam, + None, + #beam.global_state["coverage"], + self.beta) + normalized_probs -= penalty + + return normalized_probs + + def update_score(self, beam, attn): + """ + Function to update scores of a Beam that is not finished + """ + if "prev_penalty" in beam.global_state.keys(): + beam.scores.add_(beam.global_state["prev_penalty"]) + penalty = self.cov_penalty(beam, + beam.global_state["coverage"], #+ attn, + self.beta) + beam.scores.sub_(penalty) + + def update_global_state(self, beam): + "Keeps the coverage vector as sum of attentions" + if len(beam.prev_ks) == 1: + beam.global_state["prev_penalty"] = beam.scores.clone().fill_(0.0) + beam.global_state["coverage"] = beam.attn[-1] + self.cov_total = beam.attn[-1].sum(1) + else: + self.cov_total += torch.min(beam.attn[-1], + beam.global_state['coverage']).sum(1) + beam.global_state["coverage"] = beam.global_state["coverage"] \ + .index_select(0, beam.prev_ks[-1]).add(beam.attn[-1]) + + prev_penalty = self.cov_penalty(beam, + beam.global_state["coverage"], + self.beta) + beam.global_state["prev_penalty"] = prev_penalty + + +class PenaltyBuilder(object): + """ + Returns the Length and Coverage Penalty function for Beam Search. 
+ Args: + length_pen (str): option name of length pen + cov_pen (str): option name of cov pen + """ + + def __init__(self, cov_pen, length_pen): + self.length_pen = length_pen + self.cov_pen = cov_pen + + def coverage_penalty(self): + if self.cov_pen == "wu": + return self.coverage_wu + elif self.cov_pen == "summary": + return self.coverage_summary + else: + return self.coverage_none + + def length_penalty(self): + if self.length_pen == "wu": + return self.length_wu + elif self.length_pen == "avg": + return self.length_average + else: + return self.length_none + + """ + Below are all the different penalty terms implemented so far + """ + + def coverage_wu(self, beam, cov, beta=0.): + """ + NMT coverage re-ranking score from + "Google's Neural Machine Translation System" :cite:`wu2016google`. + """ + penalty = -torch.min(cov, cov.clone().fill_(1.0)).log().sum(1) + return beta * penalty + + def coverage_summary(self, beam, cov, beta=0.): + """ + Our summary penalty. + """ + penalty = torch.max(cov, cov.clone().fill_(1.0)).sum(1) + penalty -= cov.size(1) + return beta * penalty + + def coverage_none(self, beam, cov, beta=0.): + """ + returns zero as penalty + """ + return beam.scores.clone().fill_(0.0) + + def length_wu(self, beam, logprobs, alpha=0.): + """ + NMT length re-ranking score from + "Google's Neural Machine Translation System" :cite:`wu2016google`. + """ + + modifier = (((5 + len(beam.next_ys)) ** alpha) / + ((5 + 1) ** alpha)) + return (logprobs / modifier) + + def length_average(self, beam, logprobs, alpha=0.): + """ + Returns the average probability of tokens in a sequence. + """ + return logprobs / len(beam.next_ys) + + def length_none(self, beam, logprobs, alpha=0., beta=0.): + """ + Returns unmodified scores. + """ + return logprobs + + +if __name__ == "__main__": + beam = Beam(constant.beam_size, + global_scorer=GNMTGlobalScorer(), + cuda=constant.USE_CUDA) + print(beam) + + V = 5 + words = [''] + probs = beam.tt.distributions.normal.Normal(1.0, 2).sample((V,)) + # probs = beam.t.distributions.normal.Normal(1.0, 2).sample((beam.size, V)) + probs = F.log_softmax(probs, dim=0) + probs = beam.tt.Tensor([0.35, 0.3, 0.2, 0.1, 0.05]) + # print(probs.max(), probs.argmax()) + # print(probs.shape) + # probs = probs.unsqueeze(1) + # print(probs.shape) + probs = probs.repeat(beam.size, 1) + print(probs.shape) + # probs = probs.unsqueeze(1).repeat(1, beam.size, 1) + # print(probs.shape) + topv, topi = probs.topk(beam.size, 1, True, True) + print(topv) + print(topi) + print() + + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + # beam.advance(probs) + # print(beam) + # beam.advance(probs) + # print(beam) \ No newline at end of file diff --git a/utils/beam_ptr.py b/utils/beam_ptr.py new file mode 100644 index 0000000..98f3944 --- /dev/null +++ b/utils/beam_ptr.py @@ -0,0 +1,193 @@ +import os +import sys +import time + +import torch +import torch.nn.functional as F + +try: + from utils import constant +except ImportError: + import constant + + +class Beam(object): + def __init__(self, tokens, log_probs, state, context, coverage): + self.tokens = tokens + self.log_probs = log_probs + self.state = state + self.context = context + self.coverage = coverage + + def extend(self, token, log_prob, state, context, coverage): + return Beam(tokens = self.tokens + [token], + log_probs = self.log_probs + [log_prob], + state = state, + context = context, + coverage = coverage) + + @property + def latest_token(self): + return self.tokens[-1] + + 
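For reference: `PenaltyBuilder.length_wu` above rescales a hypothesis' summed log-probability by the GNMT modifier `((5 + len) ** alpha) / ((5 + 1) ** alpha)`, so longer hypotheses are not automatically dominated by shorter ones. A minimal standalone sketch of that rescaling (illustrative scores and lengths, not taken from the diff), using the same default `alpha=0.8` as `GNMTGlobalScorer`:

```python
# Sketch of the Wu et al. (GNMT) length normalization implemented by length_wu above.
# The hypothesis scores and lengths are illustrative only.

def wu_length_modifier(length: int, alpha: float = 0.8) -> float:
    return ((5 + length) ** alpha) / ((5 + 1) ** alpha)

short_hyp = -4.0   # summed log-prob of a 4-token hypothesis
long_hyp = -6.5    # summed log-prob of a 9-token hypothesis

print(short_hyp / wu_length_modifier(4))  # ~ -2.89
print(long_hyp / wu_length_modifier(9))   # ~ -3.30, closer to the short one than the raw sums suggest
```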
@property + def avg_log_prob(self): + return sum(self.log_probs) / len(self.tokens) + +def dup_batch(batch, idx, dup_times): + new_batch = {} + input_len = batch["input_lengths"][idx] + for key in ["input_batch", "target_batch"]: + new_batch[key] = batch[key][:input_len, idx:idx+1].repeat(1, dup_times) + + if "input_ext_vocab_batch" in batch: + for key in ["input_ext_vocab_batch", "target_ext_vocab_batch"]: + new_batch[key] = batch[key][:input_len, idx:idx+1].repeat(1, dup_times) + new_batch["article_oovs"] = [batch["article_oovs"][idx] for _ in range(dup_times)] + new_batch["max_art_oovs"] = batch["max_art_oovs"] + + for key in ["input_txt", "target_txt"]: + new_batch[key] = [batch[key][idx] for _ in range(dup_times)] + for key in ["input_lengths", "target_lengths"]: + new_batch[key] = batch[key][idx:idx+1].repeat(dup_times) + + return new_batch + +class BeamSearch(object): + def __init__(self, model, lang): + + self.model = model + self.lang = lang + self.vocab_size = lang.n_words + + def sort_beams(self, beams): + return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True) + + def beam_search(self, batch): + + batch_size = batch["input_lengths"].size(0) + decoded_sents = [] + + for i in range(batch_size): + new_batch = dup_batch(batch, i, constant.beam_size) + enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = self.model.get_input_from_batch(new_batch) + # Run beam search to get best Hypothesis + best_summary = self.beam_search_sample(enc_batch, enc_padding_mask, enc_lens, + enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0) + + # Extract the output ids from the hypothesis and convert back to words + output_ids = [int(t) for t in best_summary.tokens[1:]] + if constant.pointer_gen: + art_oovs = batch["article_oovs"][i] + len_oovs = len(art_oovs) + decoded_words = [] + for idx in output_ids: + if idx < self.vocab_size: + decoded_words.append(self.lang.index2word[idx]) + elif idx - self.vocab_size < len_oovs: + decoded_words.append(art_oovs[idx - self.vocab_size]) + else: + raise ValueError("invalid output id") + else: + decoded_words = [self.lang.index2word[idx] for idx in output_ids] + + # Remove the [STOP] token from decoded_words, if necessary + try: + fst_stop_idx = decoded_words.index('EOS') + decoded_words = decoded_words[:fst_stop_idx] + except ValueError: + decoded_words = decoded_words + + decoded_sents.append(decoded_words) + return decoded_sents + + def beam_search_sample(self, enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0): + #batch should have only one example by duplicate + + encoder_outputs, encoder_hidden = self.model.encoder(enc_batch, enc_lens) + s_t_0 = self.model.reduce_state(encoder_hidden) + + dec_h, dec_c = s_t_0 # 1 x 2*hidden_size + dec_h = dec_h.squeeze(0) + dec_c = dec_c.squeeze(0) + #decoder batch preparation, it has beam_size example initially everything is repeated + beams = [Beam(tokens=[constant.SOS_idx], + log_probs=[0.0], + state=(dec_h[0], dec_c[0]), + context = c_t_0[0], + coverage=(coverage_t_0[0] if constant.is_coverage else None)) + for _ in range(constant.beam_size)] + results = [] + steps = 0 + while steps < constant.max_dec_step and len(results) < constant.beam_size: + latest_tokens = [h.latest_token for h in beams] + latest_tokens = [t if t < self.vocab_size else constant.UNK_idx \ + for t in latest_tokens] + y_t_1 = torch.LongTensor(latest_tokens) + if constant.USE_CUDA: + y_t_1 = y_t_1.cuda() + all_state_h =[] + all_state_c = [] + + 
all_context = [] + + for h in beams: + state_h, state_c = h.state + all_state_h.append(state_h) + all_state_c.append(state_c) + + all_context.append(h.context) + + s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0)) + c_t_1 = torch.stack(all_context, 0) + + coverage_t_1 = None + if constant.is_coverage: + all_coverage = [] + for h in beams: + all_coverage.append(h.coverage) + coverage_t_1 = torch.stack(all_coverage, 0) + final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(y_t_1, s_t_1, + encoder_outputs, enc_padding_mask, c_t_1, + extra_zeros, enc_batch_extend_vocab, coverage_t_1, steps, training=False) + + topk_log_probs, topk_ids = torch.topk(final_dist, constant.beam_size * 2) + + dec_h, dec_c = s_t + dec_h = dec_h.squeeze() + dec_c = dec_c.squeeze() + + all_beams = [] + num_orig_beams = 1 if steps == 0 else len(beams) + for i in range(num_orig_beams): + h = beams[i] + state_i = (dec_h[i], dec_c[i]) + context_i = c_t[i] + coverage_i = (coverage_t[i] if constant.is_coverage else None) + + for j in range(constant.beam_size * 2): # for each of the top 2*beam_size hyps: + new_beam = h.extend(token=topk_ids[i, j].item(), + log_prob=topk_log_probs[i, j].item(), + state=state_i, + context=context_i, + coverage=coverage_i) + all_beams.append(new_beam) + + beams = [] + for h in self.sort_beams(all_beams): + if h.latest_token == constant.EOS_idx: + if steps >= constant.min_dec_steps: + results.append(h) + else: + beams.append(h) + if len(beams) == constant.beam_size or len(results) == constant.beam_size: + break + + steps += 1 + + if len(results) == 0: + results = beams + + beams_sorted = self.sort_beams(results) + + return beams_sorted[0] diff --git a/utils/bleu.py b/utils/bleu.py new file mode 100644 index 0000000..5e706df --- /dev/null +++ b/utils/bleu.py @@ -0,0 +1,131 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import numpy as np + +import os +import re +import subprocess +import tempfile +import numpy as np + +from six.moves import urllib + +def wer(r, h): + """ + This is a function that calculate the word error rate in ASR. + You can use it like this: wer("what is it".split(), "what is".split()) + """ + #build the matrix + d = numpy.zeros((len(r)+1)*(len(h)+1), dtype=numpy.uint8).reshape((len(r)+1, len(h)+1)) + for i in range(len(r)+1): + for j in range(len(h)+1): + if i == 0: d[0][j] = j + elif j == 0: d[i][0] = i + for i in range(1,len(r)+1): + for j in range(1, len(h)+1): + if r[i-1] == h[j-1]: + d[i][j] = d[i-1][j-1] + else: + substitute = d[i-1][j-1] + 1 + insert = d[i][j-1] + 1 + delete = d[i-1][j] + 1 + d[i][j] = min(substitute, insert, delete) + result = float(d[len(r)][len(h)]) / len(r) * 100 + # result = str("%.2f" % result) + "%" + return result + +# -*- coding: utf-8 -*- +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BLEU metric implementation. 
+""" + + +def moses_multi_bleu(hypotheses, references, lowercase=False): + """Calculate the bleu score for hypotheses and references + using the MOSES ulti-bleu.perl script. + Args: + hypotheses: A numpy array of strings where each string is a single example. + references: A numpy array of strings where each string is a single example. + lowercase: If true, pass the "-lc" flag to the multi-bleu script + Returns: + The BLEU score as a float32 value. + """ + + if np.size(hypotheses) == 0: + return np.float32(0.0) + + + # Get MOSES multi-bleu script + try: + multi_bleu_path, _ = urllib.request.urlretrieve( + "https://raw.githubusercontent.com/moses-smt/mosesdecoder/" + "master/scripts/generic/multi-bleu.perl") + os.chmod(multi_bleu_path, 0o755) + except: #pylint: disable=W0702 + print("Unable to fetch multi-bleu.perl script, using local.") + metrics_dir = os.path.dirname(os.path.realpath(__file__)) + bin_dir = os.path.abspath(os.path.join(metrics_dir, "..", "..", "bin")) + multi_bleu_path = os.path.join(bin_dir, "tools/multi-bleu.perl") + + + # Dump hypotheses and references to tempfiles + hypothesis_file = tempfile.NamedTemporaryFile() + hypothesis_file.write("\n".join(hypotheses).encode("utf-8")) + hypothesis_file.write(b"\n") + hypothesis_file.flush() + reference_file = tempfile.NamedTemporaryFile() + reference_file.write("\n".join(references).encode("utf-8")) + reference_file.write(b"\n") + reference_file.flush() + + + # Calculate BLEU using multi-bleu script + with open(hypothesis_file.name, "r") as read_pred: + bleu_cmd = [multi_bleu_path] + if lowercase: + bleu_cmd += ["-lc"] + bleu_cmd += [reference_file.name] + try: + bleu_out = subprocess.check_output(bleu_cmd, stdin=read_pred, stderr=subprocess.STDOUT) + bleu_out = bleu_out.decode("utf-8") + re_bleu = re.search(r"BLEU = (.+?), (.+?)/(.+?)/(.+?)/(.+?) 
", bleu_out) + bleu_score = re_bleu.group(1) + bleu_1 = re_bleu.group(2) + bleu_2 = re_bleu.group(3) + bleu_3 = re_bleu.group(4) + bleu_4 = re_bleu.group(5) + bleu_score = float(bleu_score) + bleu_1 = float(bleu_1) + bleu_2 = float(bleu_2) + bleu_3 = float(bleu_3) + bleu_4 = float(bleu_4) + except subprocess.CalledProcessError as error: + if error.output is not None: + print("multi-bleu.perl script returned non-zero exit code") + print(error.output) + bleu_score = np.float32(0.0) + bleu_1 = np.float32(0.0) + bleu_2 = np.float32(0.0) + bleu_3 = np.float32(0.0) + bleu_4 = np.float32(0.0) + + bleus = [bleu_1, bleu_2, bleu_3, bleu_4] + + # Close temp files + hypothesis_file.close() + reference_file.close() + return bleu_score, bleus \ No newline at end of file diff --git a/utils/constant.py b/utils/constant.py new file mode 100644 index 0000000..7855c98 --- /dev/null +++ b/utils/constant.py @@ -0,0 +1,199 @@ +import argparse +import random +import numpy as np +import torch + +parser = argparse.ArgumentParser() + +parser.add_argument("--model", type=str, default="RNN") # RNN, LVED + +# Hyperparams +parser.add_argument("--C", type=int, default=1) # number of classes +parser.add_argument("--H", type=int, default=300) +parser.add_argument("--D", type=int, default=300) +parser.add_argument("--B", type=int, default=32) +parser.add_argument("--L", type=int, default=1) +parser.add_argument("--M", type=int, default=3) # number of latent variables +parser.add_argument("--K", type=int, default=5) # dimension of latent variable +parser.add_argument("--CD", type=int, default=256) # dimension of ICM +parser.add_argument("--beta", type=float, default=0.5) # aux reward lambda +parser.add_argument("--lambda_aux", type=float, default=0.5) # aux reward lambda +parser.add_argument("--lambda_emo", type=float, default=0.5) # emo loss lambda +parser.add_argument("--lambda_gen", type=float, default=0.5) # gen loss lambda +parser.add_argument("--lambda_mle", type=float, default=0.5) # mle loss lambda +parser.add_argument("--lr", type=float, default=0.001) +parser.add_argument("--tau", type=float, default=1.0) # softmax temperature +parser.add_argument("--bi", type=str, default="none") # none, bi +parser.add_argument("--mlp", action="store_true") +parser.add_argument("--lstm", action="store_true") +parser.add_argument("--dropout", type=float, default=0.5) + +# Train Settings +parser.add_argument("--attn", type=str, default="none") # none, dot, concat (luong), general +parser.add_argument("--cuda", action="store_true") +parser.add_argument("--optim", type=str, default="Adam") # Adam, SGD +parser.add_argument("--epochs", type=int, default=100) +parser.add_argument("--round_robin", action="store_true") +parser.add_argument("--parse", type=str, default="none") # none, user, system +parser.add_argument("--eval_parse", action="store_true") # eval as parse or not +parser.add_argument("--embedding", type=str, default="random") # random, fasttext +parser.add_argument("--share_rnn", action="store_true") +parser.add_argument("--weight_tie", action="store_true") +parser.add_argument("--embeddings_cpu", action="store_true") +parser.add_argument("--share_embeddings", action="store_true") +parser.add_argument("--update_embeddings", action="store_true") + +# Beam Search +parser.add_argument("--beam", action="store_true") +parser.add_argument("--beam_size", type=int, default=5) +parser.add_argument("--topk", action="store_true") +parser.add_argument("--topk_size", type=int, default=40) +parser.add_argument("--max_grad_norm", 
type=float, default=2.0) +parser.add_argument("--max_enc_steps", type=int, default=400) +parser.add_argument("--max_dec_steps", type=int, default=20) +parser.add_argument("--min_dec_steps", type=int, default=5) + +## Data & Task: Single vs Mutli +parser.add_argument("--data", type=str, default="dailydialog") # "dailydialog", "empathetic-dialogue", "personachat", "ed+dd", "all", "sst" +parser.add_argument("--eval_data", type=str, default="empathetic-dialogue") # "dailydialog", "empathetic-dialogue", "personachat", "ed+dd", "all", "sst" +parser.add_argument("--task", type=str, default="emotion") # "emotion", "sentiment", "seq2seq", "multiseq", "rlseq", "lved" +parser.add_argument("--split", type=str, default="dev") # train, dev, test +parser.add_argument("--shuffle", action="store_true") +parser.add_argument("--discrete", action="store_true") # use emotion_t. Otherwise, emtion_(t+1) +parser.add_argument("--use_arl", action="store_true") # Auto-tune RL +parser.add_argument("--use_baseline", action="store_true") # baseline reward +parser.add_argument("--use_binary", action="store_true") # use binary traces +parser.add_argument("--use_bpr", action="store_true") # batch prior regularization +parser.add_argument("--use_bow", action="store_true") # BoW loss +parser.add_argument("--use_bert", action="store_true") # Use pre-trained BERT for sentiment +parser.add_argument("--use_context", action="store_true") +parser.add_argument("--use_current", action="store_true") +parser.add_argument("--use_curiosity", action="store_true") # curiosity reward +parser.add_argument("--use_cycle", action="store_true") # cycle consistency +parser.add_argument("--use_emotion", action="store_true") +parser.add_argument("--use_hybrid", action="store_true") # use hybrid loss +parser.add_argument("--use_lang", action="store_true") +parser.add_argument("--use_kl_anneal", action="store_true") +parser.add_argument("--use_sentiment", action="store_true") +parser.add_argument("--use_sentiment_agreement", action="store_true") +parser.add_argument("--use_self_critical", action="store_true") # use self critical baseline +parser.add_argument("--use_topic", action="store_true") # use topic info for LVED +parser.add_argument("--use_tau_anneal", action="store_true") +parser.add_argument("--use_user", action="store_true") # use user simulation +parser.add_argument("--pretrain_curiosity", action="store_true") +parser.add_argument("--reset_linear", action="store_true") +parser.add_argument("--conditional_vae", action="store_true") +parser.add_argument("--grid_search", action="store_true") + +# Save/Load +parser.add_argument("--restore", action="store_true") +parser.add_argument("--restore_path", type=str, default="") +parser.add_argument("--test", action="store_true") +parser.add_argument("--test_path", type=str, default="") +parser.add_argument("--lang_path", type=str, default="") # '_shared' vs '' +parser.add_argument("--policy_model", type=str, default="") +parser.add_argument("--reward_model", type=str, default="") +parser.add_argument("--user_model", type=str, default="") +parser.add_argument("--aux_reward_model", type=str, default="") +parser.add_argument("--sentiment_clf", type=str, default="") + + +arg = parser.parse_args() +print(arg) +model = arg.model + +# Hyperparameters +C = arg.C +H = arg.H +D = arg.D +B = arg.B +L = arg.L +M = arg.M +K = arg.K +CD = arg.CD +beta = arg.beta +lambda_aux = arg.lambda_aux +lambda_emo = arg.lambda_emo +lambda_gen = arg.lambda_gen +lambda_mle = arg.lambda_mle +bi=arg.bi +lr=arg.lr +tau = arg.tau 
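Because `utils/constant.py` calls `parser.parse_args()` at import time and re-exports every flag as a module-level name, downstream modules pick up the run configuration simply by importing it. A minimal sketch of that pattern; the `train.py` entry-point name and the flag values are hypothetical, and it assumes the package's own dependencies are installed:

```python
# Hypothetical invocation: utils/constant.py parses sys.argv the moment it is imported.
import sys
sys.argv = ["train.py", "--cuda", "--lstm", "--attn", "dot", "--B", "64"]  # hypothetical flags

from utils import constant  # argparse runs here, at import time

print(constant.USE_CUDA)  # True
print(constant.lstm)      # True
print(constant.attn)      # "dot"
print(constant.B)         # 64
```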
+mlp = arg.mlp +lstm = arg.lstm +beam_size = arg.beam_size +topk = arg.topk +topk_size = arg.topk_size + +attn = arg.attn +beam = arg.beam +optim = arg.optim +parse = arg.parse +eval_parse = arg.eval_parse +dropout = arg.dropout +embedding = arg.embedding +round_robin = arg.round_robin +epochs = arg.epochs +share_rnn = arg.share_rnn +weight_tie = arg.weight_tie +embeddings_cpu = arg.embeddings_cpu +share_embeddings = arg.share_embeddings +update_embeddings = arg.update_embeddings + +rand_unif_init_mag=0.02 +trunc_norm_init_std=1e-4 + +max_grad_norm = arg.max_grad_norm +max_enc_steps = arg.max_enc_steps +max_dec_steps = arg.max_dec_steps +min_dec_steps = arg.min_dec_steps + +USE_CUDA = arg.cuda + +unk_idx = 0 +pad_idx = 1 +sou_idx = 2 +eou_idx = 3 + +data = arg.data +eval_data = arg.eval_data +task = arg.task +split = arg.split +shuffle = arg.shuffle +discrete = arg.discrete +use_arl = arg.use_arl +use_baseline = arg.use_baseline +use_bpr = arg.use_bpr +use_bow = arg.use_bow +use_bert = arg.use_bert +use_cycle = arg.use_cycle +use_curiosity = arg.use_curiosity +use_lang = arg.use_lang +use_topic = arg.use_topic +use_binary = arg.use_binary +use_current = arg.use_current +use_context = arg.use_context +use_hybrid = arg.use_hybrid +use_emotion = arg.use_emotion +use_sentiment = arg.use_sentiment +use_sentiment_agreement = arg.use_sentiment_agreement +use_self_critical = arg.use_self_critical +use_user = arg.use_user +reset_linear = arg.reset_linear +use_kl_anneal = arg.use_kl_anneal +use_tau_anneal = arg.use_tau_anneal +pretrain_curiosity = arg.pretrain_curiosity +conditional_vae = arg.conditional_vae +grid_search = arg.grid_search + +restore = arg.restore +restore_path = arg.restore_path + +test = arg.test +test_path = arg.test_path +lang_path = arg.lang_path +policy_model = arg.policy_model +reward_model = arg.reward_model +user_model = arg.user_model +aux_reward_model = arg.aux_reward_model +sentiment_clf = arg.sentiment_clf \ No newline at end of file diff --git a/utils/dataset.py b/utils/dataset.py new file mode 100644 index 0000000..8aa7d91 --- /dev/null +++ b/utils/dataset.py @@ -0,0 +1,180 @@ +import re + +import numpy as np +import dill as pickle + +import torch +import torch.utils.data as data + +from utils import constant + + +class DialogDataset(data.Dataset): + def __init__(self, mode='train', dataset='empathetic-dialogue', usr=False, sys=False, path=None, load_fasttext=False): + self.mode = mode + self.dataset = dataset + self.usr = usr + self.sys = sys + self.fasttext = None + self.load_fasttext = load_fasttext + self.use_emotion = constant.use_emotion + self.use_sentiment = constant.use_sentiment or constant.use_sentiment_agreement + + self._from_file(path) + + def __len__(self): + if self.sys and self.dataset == 'empathetic-dialogue': + return self.sys_target_lens.shape[0] + elif self.usr and self.dataset == 'empathetic-dialogue': + return self.usr_target_lens.shape[0] + return self.target_lens.shape[0] + + def __getitem__(self, i): + dialog = None + dialog_len = None + target = None + target_len = None + emotion = None + sentiments = None + if self.sys and self.dataset == 'empathetic-dialogue': + dialog = self.sys_dialogs[i] + dialog_len = self.sys_dialog_lens[i] + target = self.sys_targets[i] + target_len = self.sys_target_lens[i] + if self.use_emotion: + emotion = self.sys_emotions[i] + elif self.use_sentiment: + emotion = self.sys_sentiments[i] + sentiments = self.sys_sentiments_b[i] + elif self.usr and self.dataset == 'empathetic-dialogue': + dialog = self.usr_dialogs[i] 
+ dialog_len = self.usr_dialog_lens[i] + target = self.usr_targets[i] + target_len = self.usr_target_lens[i] + if self.use_emotion: + emotion = self.sys_emotions[i] + elif self.use_sentiment: + emotion = self.sys_sentiments[i] + else: + dialog = self.dialogs[i] + dialog_len = self.dialog_lens[i] + target = self.targets[i] + target_len = self.target_lens[i] + if self.use_emotion: + emotion = self.emotions[i] + + return dialog, dialog_len, target, target_len, emotion, sentiments + + def _from_file(self, path=None): + def load_npy(path): + return np.load(path) + + load_path = path if path else 'data/prep/{}/{}.{}.npy' + + if self.use_emotion: + self.emotions = load_npy(load_path.format(self.dataset, 'emotions', self.mode)) + + if self.dataset == 'empathetic-dialogue': + self.usr_dialogs = load_npy(load_path.format(self.dataset, 'usr_dialogs', self.mode)) + self.usr_dialog_lens = load_npy(load_path.format(self.dataset, 'usr_dialog_lens', self.mode)) + self.sys_dialogs = load_npy(load_path.format(self.dataset, 'sys_dialogs', self.mode)) + self.sys_dialog_lens = load_npy(load_path.format(self.dataset, 'sys_dialog_lens', self.mode)) + self.usr_targets = load_npy(load_path.format(self.dataset, 'usr_targets', self.mode)) + self.usr_target_lens = load_npy(load_path.format(self.dataset, 'usr_target_lens', self.mode)) + self.sys_targets = load_npy(load_path.format(self.dataset, 'sys_targets', self.mode)) + self.sys_target_lens = load_npy(load_path.format(self.dataset, 'sys_target_lens', self.mode)) + self.sys_emotions = load_npy(load_path.format(self.dataset, 'sys_emotions', self.mode)) + self.sys_sentiments = load_npy(load_path.format(self.dataset, 'sys_sentiments', self.mode)) + self.sys_sentiments_b = load_npy(load_path.format(self.dataset, 'sys_sentiments_binary', self.mode)) + else: + self.dialogs = load_npy(load_path.format(self.dataset, 'dialogs', self.mode)) + self.dialog_lens = load_npy(load_path.format(self.dataset, 'dialog_lens', self.mode)) + self.targets = load_npy(load_path.format(self.dataset, 'targets', self.mode)) + self.target_lens = load_npy(load_path.format(self.dataset, 'target_lens', self.mode)) + + if self.mode == 'train': + if self.load_fasttext: + self.fasttext = load_npy('data/prep/{}/fasttext.npy'.format(self.dataset)) + + with open('data/prep/{}/lang{}.pkl'.format(self.dataset, constant.lang_path), 'rb') as f: + self.lang = pickle.load(f) + + +def make_dialog_data_loader(dataset, cuda, embeddings_cpu, batch_size, pad_idx=1, shuffle=True): + return torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, + collate_fn=collate_fn(mode=dataset.mode, cuda=cuda, + embeddings_cpu=embeddings_cpu, pad_idx=pad_idx, V=len(dataset.lang), + use_emotion=dataset.use_emotion, use_sentiment=dataset.use_sentiment)) + + +def collate_fn(mode='train', cuda=False, embeddings_cpu=False, pad_idx=1, V=None, use_emotion=False, use_sentiment=False): + def collate_inner(batch): + """ + Input + - batch[0]: dialogs -> [B x UTT_LEN] + - batch[1]: dialog_lens -> [B] + - batch[2]: targets -> [B x TGT_LEN] + - batch[3]: target_lens -> [B] + - batch[4]: emotions -> [B] + + Returns + - dialogs -> Ready for embedding lookup and packing + - Original: Tensor of [B x MAX_TURN x MAX_UTT_LEN], padded with PAD words and PAD arrays + Use pack_padded_sequence => Transform to [B * MAX_TURN x MAX_UTT_LEN] later for tensor computation + - Flattened: Tensor of [B x MAX_SEQ_LEN], where MAX_SEQ_LEN is max flattened seq len in current batch + Use pack_sequence + - labels -> Tensor of [B] 
indicating index of correct emotion + """ + + # Unzip data (returns tuple of batches) + dialogs, dialog_lens, targets, target_lens, emotions, sentiments = zip(*batch) + + sort = np.argsort(dialog_lens)[::-1].tolist() + unsort = np.argsort(sort).tolist() + dialogs = np.array(dialogs, dtype='object')[sort].tolist() + lens = np.array(dialog_lens)[sort]#.tolist() + targets = np.array(targets, dtype='object')[sort]#.tolist() + target_lens = np.array(target_lens)[sort]#.tolist() + + bow_targets, x_sort, x_unsort = None, None, None + # x_sort = np.argsort(target_lens)[::-1].tolist() + # x_unsort = np.argsort(x_sort).tolist() + # bow_targets = np.zeros((len(targets), V)) + # for i, target in enumerate(targets): + # bow_targets[i][target] = 1 + # bow_targets = torch.from_numpy(bow_targets).float() + if use_emotion: + emotions = torch.from_numpy(np.array(emotions)[sort]).long() + elif use_sentiment: + emotions = torch.from_numpy(np.array(emotions)[sort]).float() + sentiments = torch.from_numpy(np.array(sentiments)[sort]).float() + + # Pad dialogs and targets to their respective max batch lens + B = len(dialogs) + LD = lens[0] + LT = np.max(target_lens) + if pad_idx == 0: + padded_dialogs = torch.zeros((B, LD)) + padded_targets = torch.zeros((B, LT)) + else: + padded_dialogs = torch.ones((B, LD)) * pad_idx + padded_targets = torch.ones((B, LT)) * pad_idx + for b in range(B): + padded_dialogs[b, :lens[b]] = torch.from_numpy(np.array(dialogs[b])) + padded_targets[b, :target_lens[b]] = torch.from_numpy(np.array(targets[b])) + + padded_dialogs = padded_dialogs.long() + padded_targets = padded_targets.long() + + target_lens = torch.LongTensor(target_lens) + if not embeddings_cpu and cuda: + padded_dialogs = padded_dialogs.cuda() + padded_targets = padded_targets.cuda() + target_lens = target_lens.cuda() + if use_emotion or use_sentiment: + emotions = emotions.cuda() + if use_sentiment: + sentiments = sentiments.cuda() + + return padded_dialogs, lens, padded_targets, unsort, bow_targets, emotions, sentiments, x_sort, x_unsort + return collate_inner diff --git a/utils/embedding_metrics.py b/utils/embedding_metrics.py new file mode 100644 index 0000000..66c0569 --- /dev/null +++ b/utils/embedding_metrics.py @@ -0,0 +1,83 @@ +import numpy as np +import torch +import torch.nn.functional as F +from sklearn.metrics.pairwise import cosine_similarity as cosine +from collections import Counter + +class EmbeddingSim: + """ + """ + def __init__(self, word2vec): + """ + :param word2vec - a numpy array of word2vec with shape [vocab_size x emb_size] + """ + super(EmbeddingSim, self).__init__() + self.word2vec = word2vec + + def embedding(self, seqs): + """ + A numpy version of embedding + :param seqs - ndarray [batch_sz x seqlen] + """ + batch_size, seqlen = seqs.shape + seqs = np.reshape(seqs, (-1)) # convert to 1-d indexes [(batch_sz*seqlen)] + embs = self.word2vec[seqs] # lookup [(batch_sz*seqlen) x emb_sz] + embs = np.reshape(embs, (batch_size, seqlen, -1)) # recover the shape [batch_sz x seqlen x emb_sz] + return embs + + def extrema(self, embs, lens): # embs: [batch_size x seq_len x emb_size] lens: [batch_size] + """ + computes the value of every single dimension in the word vectors which has the greatest + difference from zero. 
+ :param seq: sequence + :param seqlen: length of sequence + """ + # Find minimum and maximum value for every dimension in predictions + batch_size, seq_len, emb_size = embs.shape + max_mask = np.zeros((batch_size, seq_len, emb_size), dtype=np.int) + for i,length in enumerate(lens): + max_mask[i,:length,:]=1 + min_mask = 1-max_mask + seq_max = (embs*max_mask).max(1) # [batch_sz x emb_sz] + seq_min = (embs+min_mask).min(1) + # Find the maximum absolute value in min and max data + comp_mask = seq_max >= np.abs(seq_min)# [batch_sz x emb_sz] + # Add vectors for finding final sequence representation for predictions + extrema_emb = seq_max* comp_mask + seq_min* np.logical_not(comp_mask) + return extrema_emb + + def mean(self, embs, lens): + batch_size, seq_len, emb_size=embs.shape + mask = np.zeros((batch_size, seq_len, emb_size), dtype=np.int) + for i,length in enumerate(lens): + mask[i,:length,:]=1 + return (embs*mask).sum(1)/(mask.sum(1)+1e-8) + + def sim_bow(self, pred, pred_lens, ref, ref_lens): + """ + :param pred - ndarray [batch_size x seqlen] + :param pred_lens - list of integers + :param ref - ndarray [batch_size x seqlen] + """ + # look up word embeddings for prediction and reference + emb_pred = self.embedding(pred) # [batch_sz x seqlen1 x emb_sz] + emb_ref = self.embedding(ref) # [batch_sz x seqlen2 x emb_sz] + + ext_emb_pred=self.extrema(emb_pred, pred_lens) + ext_emb_ref=self.extrema(emb_ref, ref_lens) + bow_extrema=cosine(ext_emb_pred, ext_emb_ref) # [batch_sz_pred x batch_sz_ref] + + avg_emb_pred = self.mean(emb_pred, pred_lens) # Calculate mean over seq + avg_emb_ref = self.mean(emb_ref, ref_lens) + bow_avg = cosine(avg_emb_pred, avg_emb_ref) # [batch_sz_pred x batch_sz_ref] + + batch_pred, seqlen_pred, emb_size=emb_pred.shape + batch_ref, seqlen_ref, emb_size=emb_ref.shape + cos_sim = cosine(emb_pred.reshape((-1, emb_size)), emb_ref.reshape((-1, emb_size))) # [(batch_sz*seqlen1)x(batch_sz*seqlen2)] + cos_sim = cos_sim.reshape((batch_pred, seqlen_pred, batch_ref, seqlen_ref)) + # Find words with max cosine similarity + max12 = cos_sim.max(1).mean(2) # max over seqlen_pred + max21 = cos_sim.max(3).mean(1) # max over seqlen_ref + bow_greedy=(max12+max21)/2 # [batch_pred x batch_ref(1)] + return np.max(bow_extrema), np.max(bow_avg), np.max(bow_greedy) + \ No newline at end of file diff --git a/utils/lang.py b/utils/lang.py new file mode 100644 index 0000000..12e5c33 --- /dev/null +++ b/utils/lang.py @@ -0,0 +1,94 @@ +import nltk +import spacy + + +class Lang: + def __init__(self): + self.unk_idx = 0 + self.pad_idx = 1 + self.sou_idx = 2 + self.eou_idx = 3 + + self.word2index = {'__unk__': self.unk_idx, '__pad__': self.pad_idx, '__sou__': self.sou_idx, '__eou__': self.eou_idx} + self.word2count = {'__unk__': 0, '__pad__': 0, '__sou__': 0, '__eou__': 0} + self.index2word = {self.unk_idx: "__unk__", self.pad_idx: "__pad__", self.sou_idx: "__sou__", self.eou_idx: "__eou__"} + self.n_words = 4 # Count default tokens + + self.nlp = spacy.load("en_core_web_sm") + # add special case rule + special_case = [{spacy.symbols.ORTH: u"__eou__"}] + self.nlp.tokenizer.add_special_case(u"__eou__", special_case) + + def __len__(self): + return self.n_words + + def tokenize(self, s): + # return nltk.word_tokenize(s) + return self.nlp.tokenizer(s) + + def addSentence(self, sentence): + for word in self.tokenize(sentence): + self.addWord(word.text) + + def addSentences(self, sentences): + for sentence in sentences: + for word in self.tokenize(sentence): + self.addWord(word.text) + + def addWord(self, 
word): + if word not in self.word2index: + self.word2index[word] = self.n_words + self.word2count[word] = 1 + self.index2word[self.n_words] = word + self.n_words += 1 + else: + self.word2count[word] += 1 + + def transform(self, sentences): + # given unokenized sentences (or iterator), transform to idx mapping + return [[self.word2index[token.text] for token in self.tokenize(sentence) if not token.is_space] for sentence in sentences] + + def transform_one(self, sentence): + try: + # given unokenized sentence, transform to idx mapping + return [self.word2index[token.text] for token in self.tokenize(sentence) if not token.is_space] + except KeyError as e: + print(e) + print(sentence) + for token in self.tokenize(sentence): + if not token.is_space: + print(token.text, token.text in self.word2index) + exit(1) + + def transform_unk(self, sentence): + # transform with unk + ret = [] + for token in self.tokenize(sentence): + if token.text in self.word2index: + ret.append(self.word2index[token.text]) + else: + ret.append(self.unk_idx) + return ret + + def reverse(self, sentences): + # given transformed sentences, reverse it + return [[self.index2word[idx] for idx in sentence] for sentence in sentences] + + def reverse_one(self, sentence): + # given transformed sentence, reverse it + return [self.index2word[idx] for idx in sentence] + + # def trim(self, min_freq=100): + # print('vocab size before trimming: ', len(self)) + # self.word2count[self.unk_idx] = min_freq + # self.word2count[self.pad_idx] = min_freq + # self.word2count[self.sou_idx] = min_freq + # self.word2count[self.eou_idx] = min_freq + + # self.word2count = {k: v for k, v in self.word2count if v >= 100} + # trimmed_word2index = {'__unk__': self.unk_idx, '__pad__': self.pad_idx, '__sou__': self.sou_idx, '__eou__': self.eou_idx} + # trimmed_index2word = {self.unk_idx: "__unk__", self.pad_idx: "__pad__", self.sou_idx: "__sou__", self.eou_idx: "__eou__"} + + # self.word2index = trimmed_word2index + # print('vocab size after trimming: ', len(self)) + # return self diff --git a/utils/masked_cross_entropy.py b/utils/masked_cross_entropy.py new file mode 100644 index 0000000..dcf348e --- /dev/null +++ b/utils/masked_cross_entropy.py @@ -0,0 +1,47 @@ +import torch +from torch.nn import functional + + +def sequence_mask(sequence_length, max_len=None): + if max_len is None: + max_len = sequence_length.data.max() + batch_size = sequence_length.size(0) + seq_range = torch.arange(0, max_len).long() + seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) + # seq_range_expand = Variable(seq_range_expand) + if sequence_length.is_cuda: + seq_range_expand = seq_range_expand.cuda() + seq_length_expand = (sequence_length.unsqueeze(1).expand_as(seq_range_expand)) + return seq_range_expand < seq_length_expand + + +def masked_cross_entropy(logits, target, length): + """ + Args: + logits: A Variable containing a FloatTensor of size + (batch, max_len, num_classes) which contains the + unnormalized probability for each class. + target: A LongTensor of size + (batch, max_len) which contains the index of the true + class for each corresponding step. + length: A LongTensor of size (batch,) + which contains the length of each data in a batch. + Returns: + loss: An average loss value masked by the length. 
+
+
+def masked_cross_entropy(logits, target, length):
+    """
+    Args:
+        logits: A Variable containing a FloatTensor of size
+            (batch, max_len, num_classes) which contains the
+            unnormalized probability for each class.
+        target: A LongTensor of size
+            (batch, max_len) which contains the index of the true
+            class for each corresponding step.
+        length: A LongTensor of size (batch,)
+            which contains the length of each data in a batch.
+    Returns:
+        loss: An average loss value masked by the length.
+    """
+
+    # logits_flat: (batch * max_len, num_classes)
+    logits_flat = logits.view(-1, logits.size(-1))  # -1 means inferred from the other dimensions
+    # log_probs_flat: (batch * max_len, num_classes)
+    log_probs_flat = functional.log_softmax(logits_flat, dim=1)
+    # target_flat: (batch * max_len, 1)
+    target_flat = target.view(-1, 1)
+    # losses_flat: (batch * max_len, 1)
+    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
+    # losses: (batch, max_len)
+    losses = losses_flat.view(*target.size())
+    # mask: (batch, max_len)
+    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
+    losses = losses * mask.float()
+    loss = losses.sum() / length.float().sum()
+    return loss
diff --git a/utils/rouge.py b/utils/rouge.py
new file mode 100644
index 0000000..c83c821
--- /dev/null
+++ b/utils/rouge.py
@@ -0,0 +1,325 @@
+"""ROUGE metric implementation.
+Copy from tf_seq2seq/seq2seq/metrics/rouge.py.
+This is a modified and slightly extended version of
+https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import itertools
+import numpy as np
+
+#pylint: disable=C0103
+
+
+def _get_ngrams(n, text):
+    """Calculates n-grams.
+    Args:
+        n: which n-grams to calculate
+        text: An array of tokens
+    Returns:
+        A set of n-grams
+    """
+    ngram_set = set()
+    text_length = len(text)
+    max_index_ngram_start = text_length - n
+    for i in range(max_index_ngram_start + 1):
+        ngram_set.add(tuple(text[i:i + n]))
+    return ngram_set
+
+
+def _split_into_words(sentences):
+    """Splits multiple sentences into words and flattens the result"""
+    return list(itertools.chain(*[_.split(" ") for _ in sentences]))
+
+
+def _get_word_ngrams(n, sentences):
+    """Calculates word n-grams for multiple sentences.
+    """
+    assert len(sentences) > 0
+    assert n > 0
+
+    words = _split_into_words(sentences)
+    return _get_ngrams(n, words)
+
+
+def _len_lcs(x, y):
+    """
+    Returns the length of the Longest Common Subsequence between sequences x
+    and y.
+    Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+    Args:
+        x: sequence of words
+        y: sequence of words
+    Returns:
+        integer: Length of LCS between x and y
+    """
+    table = _lcs(x, y)
+    n, m = len(x), len(y)
+    return table[n, m]
+
+
+def _lcs(x, y):
+    """
+    Computes the length of the longest common subsequence (lcs) between two
+    strings. The implementation below uses a dynamic programming algorithm and
+    runs in O(nm) time where n = len(x) and m = len(y).
+    Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+    Args:
+        x: collection of words
+        y: collection of words
+    Returns:
+        Table of dictionary of coord and len lcs
+    """
+    n, m = len(x), len(y)
+    table = dict()
+    for i in range(n + 1):
+        for j in range(m + 1):
+            if i == 0 or j == 0:
+                table[i, j] = 0
+            elif x[i - 1] == y[j - 1]:
+                table[i, j] = table[i - 1, j - 1] + 1
+            else:
+                table[i, j] = max(table[i - 1, j], table[i, j - 1])
+    return table
+
+
+def _recon_lcs(x, y):
+    """
+    Returns the Longest Common Subsequence between x and y.
+ Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + Args: + x: sequence of words + y: sequence of words + Returns: + sequence: LCS of x and y + """ + i, j = len(x), len(y) + table = _lcs(x, y) + + def _recon(i, j): + """private recon calculation""" + if i == 0 or j == 0: + return [] + elif x[i - 1] == y[j - 1]: + return _recon(i - 1, j - 1) + [(x[i - 1], i)] + elif table[i - 1, j] > table[i, j - 1]: + return _recon(i - 1, j) + else: + return _recon(i, j - 1) + + recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) + return recon_tuple + + +def rouge_n(evaluated_sentences, reference_sentences, n=2): + """ + Computes ROUGE-N of two text collections of sentences. + Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ + papers/rouge-working-note-v1.3.1.pdf + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentences: The sentences from the referene set + n: Size of ngram. Defaults to 2. + Returns: + A tuple (f1, precision, recall) for ROUGE-N + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + reference_count = len(reference_ngrams) + evaluated_count = len(evaluated_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case. This isn't mathematically correct, but it's good enough + if evaluated_count == 0: + precision = 0.0 + else: + precision = overlapping_count / evaluated_count + + if reference_count == 0: + recall = 0.0 + else: + recall = overlapping_count / reference_count + + f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + + # return overlapping_count / reference_count + return f1_score, precision, recall + + +def _f_p_r_lcs(llcs, m, n): + """ + Computes the LCS-based F-measure score + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + Args: + llcs: Length of LCS + m: number of words in reference summary + n: number of words in candidate summary + Returns: + Float. LCS-based F-measure score + """ + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / (r_lcs + 1e-12) + num = (1 + (beta**2)) * r_lcs * p_lcs + denom = r_lcs + ((beta**2) * p_lcs) + f_lcs = num / (denom + 1e-12) + return f_lcs, p_lcs, r_lcs + + +def rouge_l_sentence_level(evaluated_sentences, reference_sentences): + """ + Computes ROUGE-L (sentence level) of two text collections of sentences. 
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentences: The sentences from the referene set + Returns: + A float: F_lcs + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + reference_words = _split_into_words(reference_sentences) + evaluated_words = _split_into_words(evaluated_sentences) + m = len(reference_words) + n = len(evaluated_words) + lcs = _len_lcs(evaluated_words, reference_words) + return _f_p_r_lcs(lcs, m, n) + + +def _union_lcs(evaluated_sentences, reference_sentence): + """ + Returns LCS_u(r_i, C) which is the LCS score of the union longest common + subsequence between reference sentence ri and candidate summary C. For example + if r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and + c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is + "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". The + union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and + LCS_u(r_i, C) = 4/5. + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentence: One of the sentences in the reference summaries + Returns: + float: LCS_u(r_i, C) + ValueError: + Raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + lcs_union = set() + reference_words = _split_into_words([reference_sentence]) + combined_lcs_length = 0 + for eval_s in evaluated_sentences: + evaluated_words = _split_into_words([eval_s]) + lcs = set(_recon_lcs(reference_words, evaluated_words)) + combined_lcs_length += len(lcs) + lcs_union = lcs_union.union(lcs) + + union_lcs_count = len(lcs_union) + union_lcs_value = union_lcs_count / combined_lcs_length + return union_lcs_value + + +def rouge_l_summary_level(evaluated_sentences, reference_sentences): + """ + Computes ROUGE-L (summary level) of two text collections of sentences. 
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + Calculated according to: + R_lcs = SUM(1, u)[LCS(r_i,C)]/m + P_lcs = SUM(1, u)[LCS(r_i,C)]/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + where: + SUM(i,u) = SUM from i through u + u = number of sentences in reference summary + C = Candidate summary made up of v sentences + m = number of words in reference summary + n = number of words in candidate summary + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentence: One of the sentences in the reference summaries + Returns: + A float: F_lcs + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + # total number of words in reference sentences + m = len(_split_into_words(reference_sentences)) + + # total number of words in evaluated sentences + n = len(_split_into_words(evaluated_sentences)) + + union_lcs_sum_across_all_references = 0 + for ref_s in reference_sentences: + union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences, + ref_s) + return _f_p_r_lcs(union_lcs_sum_across_all_references, m, n) + + +def rouge(hypotheses, references): + """Calculates average rouge scores for a list of hypotheses and + references""" + + # Filter out hyps that are of 0 length + # hyps_and_refs = zip(hypotheses, references) + # hyps_and_refs = [_ for _ in hyps_and_refs if len(_[0]) > 0] + # hypotheses, references = zip(*hyps_and_refs) + + # Calculate ROUGE-1 F1, precision, recall scores + rouge_1 = [ + rouge_n([hyp], [ref], 1) for hyp, ref in zip(hypotheses, references) + ] + rouge_1_f, rouge_1_p, rouge_1_r = map(np.mean, zip(*rouge_1)) + + # Calculate ROUGE-2 F1, precision, recall scores + rouge_2 = [ + rouge_n([hyp], [ref], 2) for hyp, ref in zip(hypotheses, references) + ] + rouge_2_f, rouge_2_p, rouge_2_r = map(np.mean, zip(*rouge_2)) + + # Calculate ROUGE-L F1, precision, recall scores + rouge_l = [ + rouge_l_sentence_level([hyp], [ref]) + for hyp, ref in zip(hypotheses, references) + ] + rouge_l_f, rouge_l_p, rouge_l_r = map(np.mean, zip(*rouge_l)) + + return { + "rouge_1/f_score": rouge_1_f, + "rouge_1/r_score": rouge_1_r, + "rouge_1/p_score": rouge_1_p, + "rouge_2/f_score": rouge_2_f, + "rouge_2/r_score": rouge_2_r, + "rouge_2/p_score": rouge_2_p, + "rouge_l/f_score": rouge_l_f, + "rouge_l/r_score": rouge_l_r, + "rouge_l/p_score": rouge_l_p, + } diff --git a/utils/sentiment_dataset.py b/utils/sentiment_dataset.py new file mode 100644 index 0000000..80d2228 --- /dev/null +++ b/utils/sentiment_dataset.py @@ -0,0 +1,108 @@ +import re + +import numpy as np +import dill as pickle + +import torch +import torch.utils.data as data +from pytorch_pretrained_bert.tokenization import BertTokenizer + +from utils import constant, text_input2bert_input + + +class SentimentDataset(data.Dataset): + def __init__(self, mode='train', dataset='sst', load_fasttext=False): + self.mode = mode + self.dataset = dataset + self.fasttext = None + self.load_fasttext = load_fasttext + self.use_bert = constant.use_bert + if self.use_bert: + self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + + self._from_file() + + def __len__(self): + return self.sentiments.shape[0] + + def __getitem__(self, i): + if self.use_bert: + input_id, input_mask, segment_id = text_input2bert_input(self.texts[i], self.bert_tokenizer, seq_length=128) 
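+            # input_id, input_mask and segment_id are 1-D LongTensors of length
+            # seq_length (128 here): WordPiece ids padded with 0, a mask that is 1
+            # on real tokens (including [CLS]/[SEP]) and 0 on padding, and all-zero
+            # segment ids for this single-sentence setup.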
+ return input_id, input_mask, segment_id, self.sentiments[i] + return self.sentences[i], self.sentence_lens[i], self.sentiments[i] + + def _from_file(self): + def load_npy(path): + return np.load(path) + + load_path = 'data/prep/{}/{}.{}.npy' + + self.sentiments = load_npy(load_path.format(self.dataset, 'sentiments', self.mode)) + + if self.use_bert: + self.texts = load_npy(load_path.format(self.dataset, 'texts', self.mode)) + else: + self.sentences = load_npy(load_path.format(self.dataset, 'sentences', self.mode)) + self.sentence_lens = load_npy(load_path.format(self.dataset, 'sentence_lens', self.mode)) + + if self.mode == 'train': + if self.load_fasttext: + self.fasttext = load_npy('data/prep/{}/fasttext.npy'.format(self.dataset)) + + with open('data/prep/{}/lang.pkl'.format(self.dataset), 'rb') as f: + self.lang = pickle.load(f) + + + +def make_sentiment_data_loader(dataset, cuda, batch_size, pad_idx=1, shuffle=True): + return torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, + collate_fn=collate_fn(cuda=cuda, bert=dataset.use_bert, pad_idx=pad_idx)) + + +def collate_fn(cuda=False, bert=False, pad_idx=1): + def collate_inner(batch): + """ + Input + - batch[0]: sentences -> [B x L] + - batch[1]: sentence_lens -> [B] + - batch[2]: sentiments -> [B] + """ + # Unzip data (returns tuple of batches) + if bert: + input_ids, input_masks, segment_ids, sentiments = zip(*batch) + input_ids = torch.stack(input_ids) + input_masks = torch.stack(input_masks) + segment_ids = torch.stack(segment_ids) + sentiments = torch.from_numpy(np.array(sentiments)).float() + if cuda: + input_ids = input_ids.cuda() + input_masks = input_masks.cuda() + segment_ids = segment_ids.cuda() + sentiments = sentiments.cuda() + return input_ids, input_masks, segment_ids, sentiments + else: + sentences, sentence_lens, sentiments = zip(*batch) + + sort = np.argsort(sentence_lens)[::-1].tolist() + sentences = np.array(sentences, dtype='object')[sort].tolist() + sentence_lens = np.array(sentence_lens)[sort]#.tolist() + sentiments = torch.from_numpy(np.array(sentiments)[sort]).float() + + # Pad dialogs and targets to their respective max batch lens + B = len(sentences) + L = sentence_lens[0] + if pad_idx == 0: + padded_sentences = torch.zeros((B, L)) + else: + padded_sentences = torch.ones((B, L)) * pad_idx + for b in range(B): + padded_sentences[b, :sentence_lens[b]] = torch.from_numpy(np.array(sentences[b])) + + padded_sentences = padded_sentences.long() + + if cuda: + padded_sentences = padded_sentences.cuda() + sentiments = sentiments.cuda() + + return padded_sentences, sentence_lens, sentiments + return collate_inner diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..74e0ceb --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,303 @@ +import math +from datetime import datetime +from functools import reduce +import operator + +import numpy as np +from nltk.util import ngrams, everygrams + +import torch + +from utils import constant + + +def tile(x, count, dim=0): + """ + Tiles x on dimension dim count times. 
+    """
+    perm = list(range(len(x.size())))
+    if dim != 0:
+        perm[0], perm[dim] = perm[dim], perm[0]
+        x = x.permute(perm).contiguous()
+    out_size = list(x.size())
+    out_size[0] *= count
+    batch = x.size(0)
+    x = x.view(batch, -1) \
+        .transpose(0, 1) \
+        .repeat(count, 1) \
+        .transpose(0, 1) \
+        .contiguous() \
+        .view(*out_size)
+    if dim != 0:
+        x = x.permute(perm).contiguous()
+    return x
+
+def save_model(model, metric, score, path=None):
+    if path is not None:
+        save_path = path
+    else:
+        save_path = 'trained/{}.{}.{}.{}.{}.{}.{}.{:.4f}.{}'  # data.task.model.H.lr.attn.metric.score.misc
+        misc = ''
+        if constant.lstm:
+            misc += 'lstm.'
+
+        save_path = save_path.format(constant.data, constant.task, constant.model, constant.H, constant.lr, constant.attn, metric, score, misc)
+    torch.save(model.state_dict(), save_path)
+    return save_path
+
+def load_model(model, path):
+    if path == "":
+        return model
+    if constant.USE_CUDA:
+        model.load_state_dict(torch.load(path))
+    else:
+        model.load_state_dict(torch.load(path, map_location='cpu'))
+    return model
+
+def save_ckpt(model, optim, epoch):
+    save_path = 'ckpt/{}.{}.{}.{}.{}.{}'  # dataset.task.epoch.lr.misc.time
+    misc = ''
+    if constant.lstm:
+        misc += 'lstm.'
+    date = datetime.now().date()
+    time = datetime.now().time()
+    dt = '{}-{}-{}-{}-{}-{}'.format(date.year, date.month, date.day, time.hour, time.minute, time.second)
+    save_path = save_path.format(constant.data, constant.task, epoch, constant.lr, misc, dt)
+
+    state = {
+        'epoch': epoch,
+        'state_dict': model.state_dict(),
+        'optimizer': optim.state_dict()
+    }
+    torch.save(state, save_path)
+    return save_path
+
+def load_ckpt(model, optim, path):
+    # Note: Input model & optimizer should be pre-defined.
+    # This routine only updates their states.
+    import os  # os is not imported at module level in this file
+    start_epoch = 0
+    if os.path.isfile(path):
+        print("=> loading checkpoint '{}'".format(path))
+        checkpoint = torch.load(path)
+        start_epoch = checkpoint['epoch']
+        model.load_state_dict(checkpoint['state_dict'])
+        optim.load_state_dict(checkpoint['optimizer'])
+        if constant.USE_CUDA:
+            for state in optim.state.values():
+                for k, v in state.items():
+                    if isinstance(v, torch.Tensor):
+                        state[k] = v.cuda()
+        print("=> loaded checkpoint '{}' (epoch {})".format(path, checkpoint['epoch']))
+    else:
+        print("=> no checkpoint found at '{}'".format(path))
+
+    return model, optim, start_epoch
+
+def to_categorical(y, num_classes):
+    """ 1-hot encodes a tensor """
+    return np.eye(num_classes, dtype='uint8')[y]
+
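As a quick sanity check of the helpers above (values worked out by hand, purely illustrative): `tile` repeats each batch entry `count` times consecutively, which is the layout beam-search-style decoding expects, and `to_categorical` one-hot encodes integer labels.

    x = torch.arange(6).view(2, 3)       # [[0, 1, 2], [3, 4, 5]]
    tile(x, 2, dim=0)                     # [[0, 1, 2], [0, 1, 2], [3, 4, 5], [3, 4, 5]]
    to_categorical(np.array([2, 0]), 3)   # [[0, 0, 1], [1, 0, 0]]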
+def get_metrics(predictions, ground, C=7, verbose=False):
+    """Given predicted labels and the respective ground truth labels, display some metrics
+    Input: shape [# of samples, NUM_CLASSES]
+        predictions : Model output. Every row has C values, with the highest belonging to the predicted class
+        ground : Ground truth labels given as integer class indices; converted to one-hot encodings below
+            (e.g. a sample belonging to the happiness class, index 4, gets a 1 in position 4)
+    Output:
+        accuracy : Average accuracy
+        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
+        microRecall : Recall calculated on a micro level
+        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification
+    """
+    one_hot = np.zeros((ground.shape[0], C))
+    one_hot[np.arange(ground.shape[0]), ground] = 1
+    ground = one_hot
+    label2emotion = {
+        0: 'none',
+        1: 'anger',
+        2: 'disgust',
+        3: 'fear',
+        4: 'happiness',
+        5: 'sadness',
+        6: 'surprise'
+    }
+    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
+    discretePredictions = to_categorical(predictions.argmax(axis=1), num_classes=C)
+
+    truePositives = np.sum(discretePredictions*ground, axis=0)
+    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
+    falseNegatives = np.sum(np.clip(ground-discretePredictions, 0, 1), axis=0)
+    if(verbose):
+        print("True Positives per class : ", truePositives)
+        print("False Positives per class : ", falsePositives)
+        print("False Negatives per class : ", falseNegatives)
+
+    # ------------- Macro level calculation ---------------
+    macroPrecision = 0
+    macroRecall = 0
+    # We ignore the "none" class (index 0) during the calculation of Precision, Recall and F1
+    for c in range(1, C):
+        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
+        macroPrecision += precision
+        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
+        macroRecall += recall
+        f1 = ( 2 * recall * precision ) / (precision + recall) if (precision+recall) > 0 else 0
+        if(verbose):
+            print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))
+
+    macroPrecision /= (C - 1)  # average over the C - 1 emotion classes
+    macroRecall /= (C - 1)
+    macroF1 = (2 * macroRecall * macroPrecision ) / (macroPrecision + macroRecall) if (macroPrecision+macroRecall) > 0 else 0
+    if(verbose):
+        print("Ignoring the none class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))
+
+    # ------------- Micro level calculation ---------------
+    truePositives = truePositives[1:].sum()
+    falsePositives = falsePositives[1:].sum()
+    falseNegatives = falseNegatives[1:].sum()
+    if(verbose):
+        print("Ignoring the none class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))
+
+    microPrecision = truePositives / (truePositives + falsePositives)
+    microRecall = truePositives / (truePositives + falseNegatives)
+
+    microF1 = ( 2 * microRecall * microPrecision ) / (microPrecision + microRecall) if (microPrecision+microRecall) > 0 else 0
+    # -----------------------------------------------------
+
+    predictions = predictions.argmax(axis=1)
+    ground = ground.argmax(axis=1)
+    accuracy = np.mean(predictions==ground)
+    if(verbose):
+        print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
+    return accuracy, microPrecision, microRecall, microF1
+
+def text_input2bert_input(text, bert_tokenizer, seq_length=512):
+    tokens_a = bert_tokenizer.tokenize(text)
+    # Account for [CLS] and [SEP] with "- 2"
+    if len(tokens_a) > seq_length - 2:
+        tokens_a = tokens_a[0:(seq_length - 2)]
+
+    tokens = []  # equals raw text tokens
+    input_type_ids = []  # equals segment_ids
+    tokens.append("[CLS]")
+    input_type_ids.append(0)
+    for token in tokens_a:
+        tokens.append(token)
+        input_type_ids.append(0)
+    tokens.append("[SEP]")
+    input_type_ids.append(0)
+
+    input_ids = bert_tokenizer.convert_tokens_to_ids(tokens)  # WordPiece embedding rep
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
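+    # id 0 is the [PAD] token in the standard pretrained BERT vocabularies, and the
+    # padded positions keep input_mask == 0, so they are never attended to downstream.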
+ while len(input_ids) < seq_length: + input_ids.append(0) + input_mask.append(0) + input_type_ids.append(0) + + input_ids_batch = torch.tensor(input_ids, dtype=torch.long) + input_mask_batch = torch.tensor(input_mask, dtype=torch.long) + segment_id_batch = torch.zeros(input_ids_batch.size(), dtype=torch.long) + + return input_ids_batch, input_mask_batch, segment_id_batch + +def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (..., vocabulary size) + top_k >0: keep only top k tokens with highest probability (top-k filtering). + top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + """ + top_k = min(top_k, logits.size(-1)) # Safety check + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + # if top_p > 0.0: + # sorted_logits, sorted_indices = torch.sort(logits, descending=True) + # cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # # Remove tokens with cumulative probability above the threshold + # sorted_indices_to_remove = cumulative_probs > top_p + # # Shift the indices to the right to keep also the first token above the threshold + # sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + # sorted_indices_to_remove[..., 0] = 0 + + # indices_to_remove = sorted_indices[sorted_indices_to_remove] + # logits[indices_to_remove] = filter_value + return logits + +def get_sentiment(sentiment_clf, sentences, tokenizer): + input_ids, input_masks, segment_ids = zip(*[text_input2bert_input(sentence, tokenizer, seq_length=128) for sentence in sentences]) + input_ids = torch.stack(input_ids) + input_masks = torch.stack(input_masks) + segment_ids = torch.stack(segment_ids) + + if constant.USE_CUDA: + input_ids = input_ids.cuda() + input_masks = input_masks.cuda() + segment_ids = segment_ids.cuda() + + # get reward with generated sentence + with torch.no_grad(): + R = sentiment_clf.predict_prob((input_ids, segment_ids, input_masks)) + + return R + +def get_user_response(user_model, refs, sents, vocab): + sents = [vocab.transform_one(sent) for sent in sents] + lens = [len(sentence) for sentence in sents] + sort = np.argsort(lens)[::-1].tolist() + unsort = np.argsort(sort).tolist() + sents = np.array(sents, dtype='object')[sort].tolist() + lens = np.array(lens)[sort] + + B = len(sents) + L = lens[0] + padded_sents = torch.ones((B, L)) * constant.pad_idx + for b in range(B): + padded_sents[b, :lens[b]] = torch.from_numpy(np.array(sents[b])) + + padded_sents = padded_sents.long() + if constant.USE_CUDA: + padded_sents = padded_sents.cuda() + + return np.array(user_model.predict_batch(padded_sents, lens, np.zeros((B, L))))[unsort].tolist() + +def distinct_ngrams(sentences): + unigram = [] + bigram = [] + trigram = [] + for sent in sentences: + s = sent.split() + unigram.append(s) + bigram.append(list(ngrams(s, 2))) + trigram.append(list(ngrams(s, 3))) + unigram = reduce(operator.concat, unigram) + bigram = reduce(operator.concat, bigram) + trigram = reduce(operator.concat, trigram) + d1 = len(set(unigram))/len(unigram) + d2 = len(set(bigram))/len(bigram) + d3 = len(set(trigram))/len(trigram) + return d1, d2, d3 + +# def get_embedding_similarity(refs, sents, vocab, encoder, mode='average', 
model='fasttext'): +# if model == 'fasttext': +# sents = [vocab.transform_one(sent) for sent in sents] +# lens = np.array(lens)[sort] + +# B = len(sents) +# L = lens[0] +# padded_sents = torch.ones((B, L)) * constant.pad_idx +# for b in range(B): +# padded_sents[b, :lens[b]] = torch.from_numpy(np.array(sents[b])) + +# padded_sents = padded_sents.long() +# if constant.USE_CUDA: +# padded_sents = padded_sents.cuda() +# elif model == 'bert': +# pass
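To make the distinct n-gram diversity metric in `utils/utils.py` concrete, here is a small worked example (numbers computed by hand, purely illustrative):

    d1, d2, d3 = distinct_ngrams(["i am fine", "i am good"])
    # unigrams: 4 unique / 6 total -> d1 = 0.667
    # bigrams:  3 unique / 4 total -> d2 = 0.75   ("i am" appears twice)
    # trigrams: 2 unique / 2 total -> d3 = 1.0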