diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/models/commons/__init__.py b/models/commons/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/commons/initializer.py b/models/commons/initializer.py new file mode 100644 index 0000000..1aa22c6 --- /dev/null +++ b/models/commons/initializer.py @@ -0,0 +1,29 @@ +import torch.nn as nn + +from utils import constant + + +def init_rnn_wt(rnn): + for names in rnn._all_weights: + for name in names: + if name.startswith('weight_'): + wt = getattr(rnn, name) + nn.init.xavier_uniform_(wt) + # wt.data.uniform_(-constant.rand_unif_init_mag, constant.rand_unif_init_mag) + elif name.startswith('bias_'): + # set forget bias to 1 + bias = getattr(rnn, name) + n = bias.size(0) + start, end = n // 4, n // 2 + bias.data.fill_(0.) + bias.data[start:end].fill_(1.) + +def init_linear_wt(linear): + # linear.weight.data.normal_(std=constant.trunc_norm_init_std) + nn.init.xavier_uniform_(linear.weight) + if linear.bias is not None: + n = linear.bias.size(0) + start, end = n // 4, n // 2 + linear.bias.data.fill_(0.) + linear.bias.data[start:end].fill_(1.) + # linear.bias.data.nomral_(std=constant.trunc_norm_init_std) diff --git a/models/commons/vae_lib.py b/models/commons/vae_lib.py new file mode 100644 index 0000000..abb5c21 --- /dev/null +++ b/models/commons/vae_lib.py @@ -0,0 +1,71 @@ +import os +import math +import time +import pprint +import random + +from tqdm import tqdm +import dill as pickle +import numpy as np +from numpy import random + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence + +from utils import constant + + +def gumbel_softmax(logits, dim, tau=1.0): + """ + Sample z ~ log p(z) + G(0, 1) + """ + eps=1e-20 + noise = torch.rand(logits.size()) + noise = -torch.log(-torch.log(noise + eps) + eps) # gumble noise + if constant.USE_CUDA: + noise = noise.float().cuda() + return F.softmax((logits + noise) / tau, dim=dim) + +def reparameterization(mu, logvar, z_dim): + """ + Reparameterization trick: z = mu + std*eps; eps ~ N(0, I) + """ + eps = torch.randn(z_dim) + eps = eps.cuda() if constant.USE_CUDA else eps + return mu + torch.exp(logvar/2) * eps + +def split_z(z, B, M, K): + return z.view(B, M, K) + +def merge_z(z, B, M, K): + return z.view(B, M * K) + +def cat_mi(p, q): + pass + +def cat_kl(logp, logq, dim=1): + """ + \sum q * log(q/p) + """ + if logq.dim() > 2: + logq = logq.squeeze() + + q = torch.exp(logq) + kl = torch.sum(q * (logq - logp), dim=dim) + return torch.mean(kl) + +def norm_kl(recog_mu, recog_logvar, prior_mu=None, prior_logvar=None): + # find the KL divergence between two Gaussian distributions (defaults to standard normal for prior) + if prior_mu is None: + prior_mu = torch.zeros(1) + prior_logvar = torch.ones(1) + if constant.USE_CUDA: + prior_mu = prior_mu.cuda() + prior_logvar = prior_logvar.cuda() + loss = 1.0 + (recog_logvar - prior_logvar) + loss -= torch.div(torch.pow(prior_mu - recog_mu, 2), torch.exp(prior_logvar)) + loss -= torch.div(torch.exp(recog_logvar), torch.exp(prior_logvar)) + kl_loss = -0.5 * torch.mean(loss, dim=1) + return torch.mean(kl_loss) diff --git a/models/decoders/__init__.py b/models/decoders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/decoders/rnn_decoder.py b/models/decoders/rnn_decoder.py new file mode 100644 index 0000000..8596877 --- 
/dev/null +++ b/models/decoders/rnn_decoder.py @@ -0,0 +1,74 @@ +import math + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence + +from models.commons.attention import Attention +from models.commons.initializer import init_rnn_wt, init_linear_wt +from utils import constant + + +class RNNDecoder(nn.Module): + def __init__(self, V, D, H, L=1, embedding=None): + super(RNNDecoder, self).__init__() + self.V = V + self.H = H + self.L = L + self.D = D + if constant.attn != 'none': + self.attention = Attention(H, constant.attn) + # self.dropout = nn.Dropout(constant.dropout) + + self.cuda = constant.USE_CUDA + self.embeddings_cpu = constant.embeddings_cpu + + if embedding is not None: + self.embedding = embedding + else: + self.embedding = nn.Embedding(V, D) + self.embedding.weight.requires_grad = True + + if constant.lstm: + self.rnn = nn.LSTM(D, H, L, batch_first=True, bidirectional=False) + else: + self.rnn = nn.GRU(D, H, L, batch_first=True, bidirectional=False) + + self.out = nn.Linear(H, V) + if constant.weight_tie: + self.out = nn.Linear(H, V) + self.out.weight = self.embedding.weight # Assuming H == D. They share the weight, and updated together + + def forward(self, x_t, last_h, src_hs=None, use_attn=False): + # Note: we run this in a for loop (mulitple batches over single token at a time) + # batch_size = x_t.size(0) + x = self.embedding(x_t) + if self.cuda and self.embeddings_cpu: + x = x.cuda() + # x = self.dropout(x) + # x = x.view(1, batch_size, self.H) # S=1 x B x N + outputs, dec_h_t = self.rnn(x.unsqueeze(1), last_h) # [B, 1, H] & [1, B, H] + + if use_attn: + h, _ = self.attention(src_hs, src_hs, outputs) + # output = self.out(self.linear(h)) + output = self.out(h) + else: + # output = self.out(self.linear(outputs)) + output = self.out(outputs) + + return output.squeeze(), dec_h_t + + def predict_one(self, x_t, last_h, src_hs=None, use_attn=False): + with torch.no_grad(): + x = self.embedding(x_t) + outputs, dec_h_t = self.rnn(x.unsqueeze(1), last_h) # [B, 1, H] & [1, B, H] + if use_attn: + h, _ = self.attention(src_hs, src_hs, outputs) + output = self.out(h) + else: + output = self.out(outputs) + return output.squeeze(), dec_h_t \ No newline at end of file diff --git a/models/encoders/__init__.py b/models/encoders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/encoders/rnn_encoder.py b/models/encoders/rnn_encoder.py new file mode 100644 index 0000000..83cd0d6 --- /dev/null +++ b/models/encoders/rnn_encoder.py @@ -0,0 +1,80 @@ +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.utils.rnn import pack_sequence, pack_padded_sequence, pad_packed_sequence + +from models.commons.initializer import init_rnn_wt +from utils import constant + + +class RNNEncoder(nn.Module): + def __init__(self, V, D, H, L=1, embedding=None): + super(RNNEncoder, self).__init__() + self.V = V + self.H = H + self.L = L + self.D = D + self.bi = True if constant.bi == 'bi' else False + self.use_lstm = constant.lstm + # self.dropout = nn.Dropout(constant.dropout) + + self.cuda = constant.USE_CUDA + + if embedding is not None: + self.embedding = embedding + else: + self.embedding = nn.Embedding(V, D) + self.embedding.weight.requires_grad = True + + self.embedding_dropout = nn.Dropout(constant.dropout) + + if constant.lstm: + self.rnn = nn.LSTM(D, H, L, batch_first=True, bidirectional=self.bi) + else: + self.rnn = 
nn.GRU(D, H, L, batch_first=True, bidirectional=self.bi) + + def soft_embed(self, x): + # x: (T, B, V), (B, V) or (V) + return (x.unsqueeze(len(x.shape)) * self.embedding.weight).sum(dim=len(x.shape)-1) + + def forward(self, seqs, lens, soft_encode=False, logits=None): + # Note: we run this all at once (over multiple batches of multiple sequences) + # x, lens = pad_packed_sequence(pack_sequence(seqs)) + if not soft_encode: + x = self.embedding(seqs) + x = self.embedding_dropout(x) + else: + x = self.soft_embed(logits).transpose(0, 1).contiguous() + x = pack_padded_sequence(x, lens, batch_first=True) + outputs, hidden = self.rnn(x) + outputs, _ = pad_packed_sequence(outputs, batch_first=True) + + if self.use_lstm: + h, c = hidden + + if self.bi: + # [2, B, H] => [B, 2H] + if self.use_lstm: + h = h.transpose(0, 1).contiguous().view(-1, 2*self.H) + c = c.transpose(0, 1).contiguous().view(-1, 2*self.H) + # h = torch.cat((h[0], h[1]), 1) + # c = torch.cat((c[0], c[1]), 1) + return outputs, h.squeeze(), c.squeeze() + else: + h = torch.cat((hidden[0], hidden[1]), 1) + return outputs, h.squeeze() + else: + return outputs, hidden.squeeze() + + def predict_one(self, seq): + with torch.no_grad(): + x = self.embedding(seq) + outputs, hidden = self.rnn(x) + if self.bi: + # [2, B, H] => [B, 2H] + hidden = torch.cat((hidden[0], hidden[1]), 1) + return outputs, hidden + else: + return outputs, hidden \ No newline at end of file diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..07e2f98 --- /dev/null +++ b/utils/__init__.py @@ -0,0 +1,9 @@ +from .utils import * +from .dataset import * +from .sentiment_dataset import * +from .lang import * +from .bleu import * +from .beam_omt import * +from .rouge import * +from .masked_cross_entropy import * +from .embedding_metrics import * \ No newline at end of file diff --git a/utils/beam.py b/utils/beam.py new file mode 100644 index 0000000..fa73d18 --- /dev/null +++ b/utils/beam.py @@ -0,0 +1,202 @@ +import sys +import os +import time + +import torch +import torch.nn.functional as F + +try: + from utils import constant +except ImportError: + import constant + +"""Beam search implementation in PyTorch.""" +# +# +# hyp1#-hyp1---hyp1 -hyp1 +# \ / +# hyp2 \-hyp2 /-hyp2#hyp2 +# / \ +# hyp3#-hyp3---hyp3 -hyp3 +# ======================== +# +# Takes care of beams, back pointers, and scores. + +# Code borrowed from PyTorch OpenNMT example +# https://github.com/pytorch/examples/blob/master/OpenNMT/onmt/Beam.py + + +class Beam(object): + """Ordered beam of candidate outputs.""" + + def __init__(self, size): + """Initialize params.""" + self.size = size + self.done = False + self.pad = constant.pad_idx + self.bos = constant.sou_idx + self.eos = constant.eou_idx + self.t = torch.cuda if constant.USE_CUDA else torch + + # The score for each translation on the beam. + self.scores = self.t.FloatTensor(size).zero_() + + # The backpointers at each time-step. + self.prevKs = [] + + # The outputs at each time-step. + self.nextYs = [self.t.LongTensor(size).fill_(self.pad)] + self.nextYs[0][0] = self.bos + + def __str__(self): + s = " \n \ + Beam Search Object: \n \ + Beam Size: {}\n \ + Pad IDX: {}\n \ + Start IDX: {}\n \ + End IDX: {}\n \ + Scores: {}\n \ + Prev Ks: {}\n \ + Next Ys: {}\n \ + " + return s.format(self.size, self.pad, self.bos, self.eos, \ + self.scores, self.prevKs, self.nextYs) + + # Get the outputs for the current timestep. 
+ def get_current_state(self): + """Get state of beam.""" + return self.nextYs[-1] + + # Get the backpointers for the current timestep. + def get_current_origin(self): + """Get the backpointer to the beam at this step.""" + return self.prevKs[-1] + + # Given prob over words for every last beam `wordLk` + # : Compute and update the beam search. + # + # Parameters: + # + # * `wordLk`- + log probs of advancing from the last step (K x V) + # K is what? => beam size + # Returns: True if beam search is complete. + + def advance(self, wordLk): + """Advance the beam.""" + num_words = wordLk.size(1) + + # # force the output to be longer than self.min_length + # cur_len = len(self.next_ys) + # if cur_len < self.min_length: + # for k in range(len(word_probs)): + # word_probs[k][self._eos] = -1e20 + + # Sum the previous scores. + if len(self.prevKs) > 0: + beam_lk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) + # Don't let EOS have children. + for i in range(self.nextYs[-1].size(0)): + if self.nextYs[-1][i] == self.eos: + beam_lk[i] = -1e20 + else: + beam_lk = wordLk[0] + + print(beam_lk) + flat_beam_lk = beam_lk.view(-1) # squeeze + + bestScores, bestScoresId = flat_beam_lk.topk(self.size, 0, True, True) + self.scores = bestScores + + # bestScoresId is flattened (K, K*V) array, so calculate which + # word and beam each score came from + prev_k = bestScoresId / num_words + # print(bestScores) + # print(bestScoresId) + # print(prev_k) + # print(prev_k * num_words) + # print(bestScoresId - prev_k * num_words) + self.prevKs.append(prev_k) + self.nextYs.append(bestScoresId - prev_k * num_words) # V+1th word => 0th word + + # End condition is when top-of-beam is EOS. + if self.nextYs[-1][0] == self.eos: + self.done = True + + for i in range(1, self.size): + if self.nextYs[-1][i] == self.eos: + self.scores[i] = -1e10 + + return self.done + + def sort_best(self): + """Sort the beam.""" + return torch.sort(self.scores, 0, True) + + # Get the score of the best in the beam. + def get_best(self): + """Get the most likely candidate.""" + scores, ids = self.sort_best() + return scores[1], ids[1] + + # Walk back to construct the full hypothesis. + # + # Parameters. + # + # * `k` - the position in the beam to construct. + # + # Returns. 
+ # + # The hypothesis + def get_hyp(self, k): + """Get hypotheses.""" + hyp = [] + # print(len(self.prevKs), len(self.nextYs)) + for j in range(len(self.prevKs) - 1, -1, -1): + hyp.append(self.nextYs[j + 1][k]) + k = self.prevKs[j][k] + + return hyp[::-1] + + +if __name__ == "__main__": + beam = Beam(constant.beam_size) + print(beam) + + V = 5 + words = [''] + probs = beam.t.distributions.normal.Normal(1.0, 2).sample((V,)) + # probs = beam.t.distributions.normal.Normal(1.0, 2).sample((beam.size, V)) + probs = F.log_softmax(probs, dim=0) + probs = beam.t.Tensor([0.35, 0.3, 0.2, 0.1, 0.05]) + # print(probs.max(), probs.argmax()) + # print(probs.shape) + # probs = probs.unsqueeze(1) + # print(probs.shape) + probs = probs.repeat(beam.size, 1) + print(probs.shape) + # probs = probs.unsqueeze(1).repeat(1, beam.size, 1) + # print(probs.shape) + topv, topi = probs.topk(beam.size, 1, True, True) + print(topv) + print(topi) + print() + while not beam.done: + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + print(beam.get_hyp(0)) + print(beam.get_best()) + # beam.advance(probs) + # print(beam) + # beam.advance(probs) + # print(beam) + # beam.advance(probs) + # print(beam) + break \ No newline at end of file diff --git a/utils/beam_omt.py b/utils/beam_omt.py new file mode 100644 index 0000000..b055655 --- /dev/null +++ b/utils/beam_omt.py @@ -0,0 +1,374 @@ +from __future__ import division + +import torch +import torch.nn.functional as F + +try: + from utils import constant +except ImportError: + import constant + +# Code borrowed from OpenNMT +# https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/translate + + +class Beam(object): + """ + Class for managing the internals of the beam search process. + Takes care of beams, back pointers, and scores. + Args: + size (int): beam size + pad, bos, eos (int): indices of padding, beginning, and ending. + n_best (int): nbest size to use + cuda (bool): use gpu + global_scorer (:obj:`GlobalScorer`) + """ + + def __init__(self, size, + n_best=1, + global_scorer=None, + min_length=0, + stepwise_penalty=False, + block_ngram_repeat=0, + exclusion_tokens=set()): + + self.size = size + self.tt = torch.cuda if constant.USE_CUDA else torch + + # The score for each translation on the beam. + self.scores = self.tt.FloatTensor(size).zero_() + self.all_scores = [] + + # The backpointers at each time-step. + self.prev_ks = [] + + # The outputs at each time-step. + self.next_ys = [self.tt.LongTensor(size) + # .fill_(constant.pad_idx)] + .fill_(constant.sou_idx)] + self.next_ys[0][0] = constant.sou_idx + + # Has EOS topped the beam yet. + self._eos = constant.eou_idx + self.eos_top = False + + # The attentions (matrix) for each time. + self.attn = [] + + # Time and k pair for finished. + self.finished = [] + self.n_best = n_best + + # Information for global scoring. 
+ self.global_scorer = global_scorer + self.global_state = {} + + # Minimum prediction length + self.min_length = min_length + + # Apply Penalty at every step + self.stepwise_penalty = stepwise_penalty + self.block_ngram_repeat = block_ngram_repeat + self.exclusion_tokens = exclusion_tokens + + def __str__(self): + s = " \n \ + Beam Search Object: \n \ + Beam Size: {}\n \ + End IDX: {}\n \ + Scores: {}\n \ + Prev Ks: {}\n \ + Next Ys: {}\n \ + " + return s.format(self.size, self._eos, self.scores, \ + self.prev_ks, self.next_ys)#, self.finished) + + def get_current_state(self): + "Get the outputs for the current timestep." + return self.next_ys[-1] + + def get_current_origin(self): + "Get the backpointers for the current timestep." + return self.prev_ks[-1] + + def advance(self, word_probs, attn_out=None): + """ + Given prob over words for every last beam `wordLk` and attention + `attn_out`: Compute and update the beam search. + Parameters: + * `word_probs`- probs of advancing from the last step (K x words) + * `attn_out`- attention at the last step + Returns: True if beam search is complete. + """ + num_words = word_probs.size(1) + # if self.stepwise_penalty: + # self.global_scorer.update_score(self, attn_out) + # force the output to be longer than self.min_length + cur_len = len(self.next_ys) + if cur_len < self.min_length: + for k in range(len(word_probs)): + word_probs[k][self._eos] = -1e20 + # Sum the previous scores. + if len(self.prev_ks) > 0: + beam_scores = word_probs + self.scores.unsqueeze(1) + # Don't let EOS have children. + for i in range(self.next_ys[-1].size(0)): + if self.next_ys[-1][i] == self._eos: + beam_scores[i] = -1e20 + + # Block ngram repeats + if self.block_ngram_repeat > 0: + ngrams = [] + le = len(self.next_ys) + for j in range(self.next_ys[-1].size(0)): + hyp, _ = self.get_hyp(le - 1, j) + ngrams = set() + fail = False + gram = [] + for i in range(le - 1): + # Last n tokens, n = block_ngram_repeat + gram = (gram + + [hyp[i].item()])[-self.block_ngram_repeat:] + # Skip the blocking if it is in the exclusion list + if set(gram) & self.exclusion_tokens: + continue + if tuple(gram) in ngrams: + fail = True + ngrams.add(tuple(gram)) + if fail: + beam_scores[j] = -10e20 + else: + beam_scores = word_probs[0] + flat_beam_scores = beam_scores.view(-1) + best_scores, best_scores_id = flat_beam_scores.topk(self.size, 0, + True, True) + + self.all_scores.append(self.scores) + self.scores = best_scores + + # best_scores_id is flattened (K, K*V) array, so calculate which + # word and beam each score came from + prev_k = best_scores_id / num_words + self.prev_ks.append(prev_k) + self.next_ys.append((best_scores_id - prev_k * num_words)) # V+1th word => 0th word + # self.attn.append(attn_out.index_select(0, prev_k)) + # self.global_scorer.update_global_state(self) + + for i in range(self.next_ys[-1].size(0)): + if self.next_ys[-1][i] == self._eos: + global_scores = self.global_scorer.score(self, self.scores) + s = global_scores[i] + self.finished.append((s, len(self.next_ys) - 1, i)) + + # End condition is when top-of-beam is EOS and no global score. + if self.next_ys[-1][0] == self._eos: + self.all_scores.append(self.scores) + self.eos_top = True + + def done(self): + return self.eos_top and len(self.finished) >= self.n_best + + def sort_finished(self, minimum=None): + if minimum is not None: + i = 0 + # Add from beam until we have minimum outputs. 
+ while len(self.finished) < minimum: + global_scores = self.global_scorer.score(self, self.scores) + s = global_scores[i] + self.finished.append((s, len(self.next_ys) - 1, i)) + i += 1 + + self.finished.sort(key=lambda a: -a[0]) + scores = [sc for sc, _, _ in self.finished] + ks = [(t, k) for _, t, k in self.finished] + return scores, ks + + def get_hyp(self, timestep, k): + """ + Walk back to construct the full hypothesis. + """ + hyp, attn = [], [] + for j in range(len(self.prev_ks[:timestep]) - 1, -1, -1): + hyp.append(self.next_ys[j + 1][k]) + # attn.append(self.attn[j][k]) + k = self.prev_ks[j][k] + return hyp[::-1], None #, torch.stack(attn[::-1]) + + +class GNMTGlobalScorer(object): + """ + NMT re-ranking score from + "Google's Neural Machine Translation System" :cite:`wu2016google` + Args: + alpha (float): length parameter + beta (float): coverage parameter + coverage_penalty (float): coverage_penalty + length_penalty (float): length_penalty + """ + + def __init__(self, alpha=0.8, beta=5, coverage_penalty='none', length_penalty='wu'): + self.alpha = alpha + self.beta = beta + penalty_builder = PenaltyBuilder(coverage_penalty, length_penalty) + # Term will be subtracted from probability + self.cov_penalty = penalty_builder.coverage_penalty() + # Probability will be divided by this + self.length_penalty = penalty_builder.length_penalty() + + def score(self, beam, logprobs): + """ + Rescores a prediction based on penalty functions + """ + normalized_probs = self.length_penalty(beam, + logprobs, + self.alpha) + if not beam.stepwise_penalty: + penalty = self.cov_penalty(beam, + None, + #beam.global_state["coverage"], + self.beta) + normalized_probs -= penalty + + return normalized_probs + + def update_score(self, beam, attn): + """ + Function to update scores of a Beam that is not finished + """ + if "prev_penalty" in beam.global_state.keys(): + beam.scores.add_(beam.global_state["prev_penalty"]) + penalty = self.cov_penalty(beam, + beam.global_state["coverage"], #+ attn, + self.beta) + beam.scores.sub_(penalty) + + def update_global_state(self, beam): + "Keeps the coverage vector as sum of attentions" + if len(beam.prev_ks) == 1: + beam.global_state["prev_penalty"] = beam.scores.clone().fill_(0.0) + beam.global_state["coverage"] = beam.attn[-1] + self.cov_total = beam.attn[-1].sum(1) + else: + self.cov_total += torch.min(beam.attn[-1], + beam.global_state['coverage']).sum(1) + beam.global_state["coverage"] = beam.global_state["coverage"] \ + .index_select(0, beam.prev_ks[-1]).add(beam.attn[-1]) + + prev_penalty = self.cov_penalty(beam, + beam.global_state["coverage"], + self.beta) + beam.global_state["prev_penalty"] = prev_penalty + + +class PenaltyBuilder(object): + """ + Returns the Length and Coverage Penalty function for Beam Search. 
+ Args: + length_pen (str): option name of length pen + cov_pen (str): option name of cov pen + """ + + def __init__(self, cov_pen, length_pen): + self.length_pen = length_pen + self.cov_pen = cov_pen + + def coverage_penalty(self): + if self.cov_pen == "wu": + return self.coverage_wu + elif self.cov_pen == "summary": + return self.coverage_summary + else: + return self.coverage_none + + def length_penalty(self): + if self.length_pen == "wu": + return self.length_wu + elif self.length_pen == "avg": + return self.length_average + else: + return self.length_none + + """ + Below are all the different penalty terms implemented so far + """ + + def coverage_wu(self, beam, cov, beta=0.): + """ + NMT coverage re-ranking score from + "Google's Neural Machine Translation System" :cite:`wu2016google`. + """ + penalty = -torch.min(cov, cov.clone().fill_(1.0)).log().sum(1) + return beta * penalty + + def coverage_summary(self, beam, cov, beta=0.): + """ + Our summary penalty. + """ + penalty = torch.max(cov, cov.clone().fill_(1.0)).sum(1) + penalty -= cov.size(1) + return beta * penalty + + def coverage_none(self, beam, cov, beta=0.): + """ + returns zero as penalty + """ + return beam.scores.clone().fill_(0.0) + + def length_wu(self, beam, logprobs, alpha=0.): + """ + NMT length re-ranking score from + "Google's Neural Machine Translation System" :cite:`wu2016google`. + """ + + modifier = (((5 + len(beam.next_ys)) ** alpha) / + ((5 + 1) ** alpha)) + return (logprobs / modifier) + + def length_average(self, beam, logprobs, alpha=0.): + """ + Returns the average probability of tokens in a sequence. + """ + return logprobs / len(beam.next_ys) + + def length_none(self, beam, logprobs, alpha=0., beta=0.): + """ + Returns unmodified scores. + """ + return logprobs + + +if __name__ == "__main__": + beam = Beam(constant.beam_size, + global_scorer=GNMTGlobalScorer(), + cuda=constant.USE_CUDA) + print(beam) + + V = 5 + words = [''] + probs = beam.tt.distributions.normal.Normal(1.0, 2).sample((V,)) + # probs = beam.t.distributions.normal.Normal(1.0, 2).sample((beam.size, V)) + probs = F.log_softmax(probs, dim=0) + probs = beam.tt.Tensor([0.35, 0.3, 0.2, 0.1, 0.05]) + # print(probs.max(), probs.argmax()) + # print(probs.shape) + # probs = probs.unsqueeze(1) + # print(probs.shape) + probs = probs.repeat(beam.size, 1) + print(probs.shape) + # probs = probs.unsqueeze(1).repeat(1, beam.size, 1) + # print(probs.shape) + topv, topi = probs.topk(beam.size, 1, True, True) + print(topv) + print(topi) + print() + + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + beam.advance(probs) + print(beam) + # beam.advance(probs) + # print(beam) + # beam.advance(probs) + # print(beam) \ No newline at end of file diff --git a/utils/beam_ptr.py b/utils/beam_ptr.py new file mode 100644 index 0000000..98f3944 --- /dev/null +++ b/utils/beam_ptr.py @@ -0,0 +1,193 @@ +import os +import sys +import time + +import torch +import torch.nn.functional as F + +try: + from utils import constant +except ImportError: + import constant + + +class Beam(object): + def __init__(self, tokens, log_probs, state, context, coverage): + self.tokens = tokens + self.log_probs = log_probs + self.state = state + self.context = context + self.coverage = coverage + + def extend(self, token, log_prob, state, context, coverage): + return Beam(tokens = self.tokens + [token], + log_probs = self.log_probs + [log_prob], + state = state, + context = context, + coverage = coverage) + + @property + def latest_token(self): + return self.tokens[-1] + + 
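For reference: `PenaltyBuilder.length_wu` above rescales a hypothesis' summed log-probability by the GNMT modifier `((5 + len) ** alpha) / ((5 + 1) ** alpha)`, so longer hypotheses are not automatically dominated by shorter ones. A minimal standalone sketch of that rescaling (illustrative scores and lengths, not taken from the diff), using the same default `alpha=0.8` as `GNMTGlobalScorer`:

```python
# Sketch of the Wu et al. (GNMT) length normalization implemented by length_wu above.
# The hypothesis scores and lengths are illustrative only.

def wu_length_modifier(length: int, alpha: float = 0.8) -> float:
    return ((5 + length) ** alpha) / ((5 + 1) ** alpha)

short_hyp = -4.0   # summed log-prob of a 4-token hypothesis
long_hyp = -6.5    # summed log-prob of a 9-token hypothesis

print(short_hyp / wu_length_modifier(4))  # ~ -2.89
print(long_hyp / wu_length_modifier(9))   # ~ -3.30, closer to the short one than the raw sums suggest
```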
@property + def avg_log_prob(self): + return sum(self.log_probs) / len(self.tokens) + +def dup_batch(batch, idx, dup_times): + new_batch = {} + input_len = batch["input_lengths"][idx] + for key in ["input_batch", "target_batch"]: + new_batch[key] = batch[key][:input_len, idx:idx+1].repeat(1, dup_times) + + if "input_ext_vocab_batch" in batch: + for key in ["input_ext_vocab_batch", "target_ext_vocab_batch"]: + new_batch[key] = batch[key][:input_len, idx:idx+1].repeat(1, dup_times) + new_batch["article_oovs"] = [batch["article_oovs"][idx] for _ in range(dup_times)] + new_batch["max_art_oovs"] = batch["max_art_oovs"] + + for key in ["input_txt", "target_txt"]: + new_batch[key] = [batch[key][idx] for _ in range(dup_times)] + for key in ["input_lengths", "target_lengths"]: + new_batch[key] = batch[key][idx:idx+1].repeat(dup_times) + + return new_batch + +class BeamSearch(object): + def __init__(self, model, lang): + + self.model = model + self.lang = lang + self.vocab_size = lang.n_words + + def sort_beams(self, beams): + return sorted(beams, key=lambda h: h.avg_log_prob, reverse=True) + + def beam_search(self, batch): + + batch_size = batch["input_lengths"].size(0) + decoded_sents = [] + + for i in range(batch_size): + new_batch = dup_batch(batch, i, constant.beam_size) + enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0 = self.model.get_input_from_batch(new_batch) + # Run beam search to get best Hypothesis + best_summary = self.beam_search_sample(enc_batch, enc_padding_mask, enc_lens, + enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0) + + # Extract the output ids from the hypothesis and convert back to words + output_ids = [int(t) for t in best_summary.tokens[1:]] + if constant.pointer_gen: + art_oovs = batch["article_oovs"][i] + len_oovs = len(art_oovs) + decoded_words = [] + for idx in output_ids: + if idx < self.vocab_size: + decoded_words.append(self.lang.index2word[idx]) + elif idx - self.vocab_size < len_oovs: + decoded_words.append(art_oovs[idx - self.vocab_size]) + else: + raise ValueError("invalid output id") + else: + decoded_words = [self.lang.index2word[idx] for idx in output_ids] + + # Remove the [STOP] token from decoded_words, if necessary + try: + fst_stop_idx = decoded_words.index('EOS') + decoded_words = decoded_words[:fst_stop_idx] + except ValueError: + decoded_words = decoded_words + + decoded_sents.append(decoded_words) + return decoded_sents + + def beam_search_sample(self, enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_0, coverage_t_0): + #batch should have only one example by duplicate + + encoder_outputs, encoder_hidden = self.model.encoder(enc_batch, enc_lens) + s_t_0 = self.model.reduce_state(encoder_hidden) + + dec_h, dec_c = s_t_0 # 1 x 2*hidden_size + dec_h = dec_h.squeeze(0) + dec_c = dec_c.squeeze(0) + #decoder batch preparation, it has beam_size example initially everything is repeated + beams = [Beam(tokens=[constant.SOS_idx], + log_probs=[0.0], + state=(dec_h[0], dec_c[0]), + context = c_t_0[0], + coverage=(coverage_t_0[0] if constant.is_coverage else None)) + for _ in range(constant.beam_size)] + results = [] + steps = 0 + while steps < constant.max_dec_step and len(results) < constant.beam_size: + latest_tokens = [h.latest_token for h in beams] + latest_tokens = [t if t < self.vocab_size else constant.UNK_idx \ + for t in latest_tokens] + y_t_1 = torch.LongTensor(latest_tokens) + if constant.USE_CUDA: + y_t_1 = y_t_1.cuda() + all_state_h =[] + all_state_c = [] + + 
all_context = [] + + for h in beams: + state_h, state_c = h.state + all_state_h.append(state_h) + all_state_c.append(state_c) + + all_context.append(h.context) + + s_t_1 = (torch.stack(all_state_h, 0).unsqueeze(0), torch.stack(all_state_c, 0).unsqueeze(0)) + c_t_1 = torch.stack(all_context, 0) + + coverage_t_1 = None + if constant.is_coverage: + all_coverage = [] + for h in beams: + all_coverage.append(h.coverage) + coverage_t_1 = torch.stack(all_coverage, 0) + final_dist, s_t, c_t, attn_dist, p_gen, coverage_t = self.model.decoder(y_t_1, s_t_1, + encoder_outputs, enc_padding_mask, c_t_1, + extra_zeros, enc_batch_extend_vocab, coverage_t_1, steps, training=False) + + topk_log_probs, topk_ids = torch.topk(final_dist, constant.beam_size * 2) + + dec_h, dec_c = s_t + dec_h = dec_h.squeeze() + dec_c = dec_c.squeeze() + + all_beams = [] + num_orig_beams = 1 if steps == 0 else len(beams) + for i in range(num_orig_beams): + h = beams[i] + state_i = (dec_h[i], dec_c[i]) + context_i = c_t[i] + coverage_i = (coverage_t[i] if constant.is_coverage else None) + + for j in range(constant.beam_size * 2): # for each of the top 2*beam_size hyps: + new_beam = h.extend(token=topk_ids[i, j].item(), + log_prob=topk_log_probs[i, j].item(), + state=state_i, + context=context_i, + coverage=coverage_i) + all_beams.append(new_beam) + + beams = [] + for h in self.sort_beams(all_beams): + if h.latest_token == constant.EOS_idx: + if steps >= constant.min_dec_steps: + results.append(h) + else: + beams.append(h) + if len(beams) == constant.beam_size or len(results) == constant.beam_size: + break + + steps += 1 + + if len(results) == 0: + results = beams + + beams_sorted = self.sort_beams(results) + + return beams_sorted[0] diff --git a/utils/bleu.py b/utils/bleu.py new file mode 100644 index 0000000..5e706df --- /dev/null +++ b/utils/bleu.py @@ -0,0 +1,131 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import numpy as np + +import os +import re +import subprocess +import tempfile +import numpy as np + +from six.moves import urllib + +def wer(r, h): + """ + This is a function that calculate the word error rate in ASR. + You can use it like this: wer("what is it".split(), "what is".split()) + """ + #build the matrix + d = numpy.zeros((len(r)+1)*(len(h)+1), dtype=numpy.uint8).reshape((len(r)+1, len(h)+1)) + for i in range(len(r)+1): + for j in range(len(h)+1): + if i == 0: d[0][j] = j + elif j == 0: d[i][0] = i + for i in range(1,len(r)+1): + for j in range(1, len(h)+1): + if r[i-1] == h[j-1]: + d[i][j] = d[i-1][j-1] + else: + substitute = d[i-1][j-1] + 1 + insert = d[i][j-1] + 1 + delete = d[i-1][j] + 1 + d[i][j] = min(substitute, insert, delete) + result = float(d[len(r)][len(h)]) / len(r) * 100 + # result = str("%.2f" % result) + "%" + return result + +# -*- coding: utf-8 -*- +# Copyright 2017 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BLEU metric implementation. 
+""" + + +def moses_multi_bleu(hypotheses, references, lowercase=False): + """Calculate the bleu score for hypotheses and references + using the MOSES ulti-bleu.perl script. + Args: + hypotheses: A numpy array of strings where each string is a single example. + references: A numpy array of strings where each string is a single example. + lowercase: If true, pass the "-lc" flag to the multi-bleu script + Returns: + The BLEU score as a float32 value. + """ + + if np.size(hypotheses) == 0: + return np.float32(0.0) + + + # Get MOSES multi-bleu script + try: + multi_bleu_path, _ = urllib.request.urlretrieve( + "https://raw.githubusercontent.com/moses-smt/mosesdecoder/" + "master/scripts/generic/multi-bleu.perl") + os.chmod(multi_bleu_path, 0o755) + except: #pylint: disable=W0702 + print("Unable to fetch multi-bleu.perl script, using local.") + metrics_dir = os.path.dirname(os.path.realpath(__file__)) + bin_dir = os.path.abspath(os.path.join(metrics_dir, "..", "..", "bin")) + multi_bleu_path = os.path.join(bin_dir, "tools/multi-bleu.perl") + + + # Dump hypotheses and references to tempfiles + hypothesis_file = tempfile.NamedTemporaryFile() + hypothesis_file.write("\n".join(hypotheses).encode("utf-8")) + hypothesis_file.write(b"\n") + hypothesis_file.flush() + reference_file = tempfile.NamedTemporaryFile() + reference_file.write("\n".join(references).encode("utf-8")) + reference_file.write(b"\n") + reference_file.flush() + + + # Calculate BLEU using multi-bleu script + with open(hypothesis_file.name, "r") as read_pred: + bleu_cmd = [multi_bleu_path] + if lowercase: + bleu_cmd += ["-lc"] + bleu_cmd += [reference_file.name] + try: + bleu_out = subprocess.check_output(bleu_cmd, stdin=read_pred, stderr=subprocess.STDOUT) + bleu_out = bleu_out.decode("utf-8") + re_bleu = re.search(r"BLEU = (.+?), (.+?)/(.+?)/(.+?)/(.+?) 
", bleu_out) + bleu_score = re_bleu.group(1) + bleu_1 = re_bleu.group(2) + bleu_2 = re_bleu.group(3) + bleu_3 = re_bleu.group(4) + bleu_4 = re_bleu.group(5) + bleu_score = float(bleu_score) + bleu_1 = float(bleu_1) + bleu_2 = float(bleu_2) + bleu_3 = float(bleu_3) + bleu_4 = float(bleu_4) + except subprocess.CalledProcessError as error: + if error.output is not None: + print("multi-bleu.perl script returned non-zero exit code") + print(error.output) + bleu_score = np.float32(0.0) + bleu_1 = np.float32(0.0) + bleu_2 = np.float32(0.0) + bleu_3 = np.float32(0.0) + bleu_4 = np.float32(0.0) + + bleus = [bleu_1, bleu_2, bleu_3, bleu_4] + + # Close temp files + hypothesis_file.close() + reference_file.close() + return bleu_score, bleus \ No newline at end of file diff --git a/utils/constant.py b/utils/constant.py new file mode 100644 index 0000000..7855c98 --- /dev/null +++ b/utils/constant.py @@ -0,0 +1,199 @@ +import argparse +import random +import numpy as np +import torch + +parser = argparse.ArgumentParser() + +parser.add_argument("--model", type=str, default="RNN") # RNN, LVED + +# Hyperparams +parser.add_argument("--C", type=int, default=1) # number of classes +parser.add_argument("--H", type=int, default=300) +parser.add_argument("--D", type=int, default=300) +parser.add_argument("--B", type=int, default=32) +parser.add_argument("--L", type=int, default=1) +parser.add_argument("--M", type=int, default=3) # number of latent variables +parser.add_argument("--K", type=int, default=5) # dimension of latent variable +parser.add_argument("--CD", type=int, default=256) # dimension of ICM +parser.add_argument("--beta", type=float, default=0.5) # aux reward lambda +parser.add_argument("--lambda_aux", type=float, default=0.5) # aux reward lambda +parser.add_argument("--lambda_emo", type=float, default=0.5) # emo loss lambda +parser.add_argument("--lambda_gen", type=float, default=0.5) # gen loss lambda +parser.add_argument("--lambda_mle", type=float, default=0.5) # mle loss lambda +parser.add_argument("--lr", type=float, default=0.001) +parser.add_argument("--tau", type=float, default=1.0) # softmax temperature +parser.add_argument("--bi", type=str, default="none") # none, bi +parser.add_argument("--mlp", action="store_true") +parser.add_argument("--lstm", action="store_true") +parser.add_argument("--dropout", type=float, default=0.5) + +# Train Settings +parser.add_argument("--attn", type=str, default="none") # none, dot, concat (luong), general +parser.add_argument("--cuda", action="store_true") +parser.add_argument("--optim", type=str, default="Adam") # Adam, SGD +parser.add_argument("--epochs", type=int, default=100) +parser.add_argument("--round_robin", action="store_true") +parser.add_argument("--parse", type=str, default="none") # none, user, system +parser.add_argument("--eval_parse", action="store_true") # eval as parse or not +parser.add_argument("--embedding", type=str, default="random") # random, fasttext +parser.add_argument("--share_rnn", action="store_true") +parser.add_argument("--weight_tie", action="store_true") +parser.add_argument("--embeddings_cpu", action="store_true") +parser.add_argument("--share_embeddings", action="store_true") +parser.add_argument("--update_embeddings", action="store_true") + +# Beam Search +parser.add_argument("--beam", action="store_true") +parser.add_argument("--beam_size", type=int, default=5) +parser.add_argument("--topk", action="store_true") +parser.add_argument("--topk_size", type=int, default=40) +parser.add_argument("--max_grad_norm", 
type=float, default=2.0) +parser.add_argument("--max_enc_steps", type=int, default=400) +parser.add_argument("--max_dec_steps", type=int, default=20) +parser.add_argument("--min_dec_steps", type=int, default=5) + +## Data & Task: Single vs Mutli +parser.add_argument("--data", type=str, default="dailydialog") # "dailydialog", "empathetic-dialogue", "personachat", "ed+dd", "all", "sst" +parser.add_argument("--eval_data", type=str, default="empathetic-dialogue") # "dailydialog", "empathetic-dialogue", "personachat", "ed+dd", "all", "sst" +parser.add_argument("--task", type=str, default="emotion") # "emotion", "sentiment", "seq2seq", "multiseq", "rlseq", "lved" +parser.add_argument("--split", type=str, default="dev") # train, dev, test +parser.add_argument("--shuffle", action="store_true") +parser.add_argument("--discrete", action="store_true") # use emotion_t. Otherwise, emtion_(t+1) +parser.add_argument("--use_arl", action="store_true") # Auto-tune RL +parser.add_argument("--use_baseline", action="store_true") # baseline reward +parser.add_argument("--use_binary", action="store_true") # use binary traces +parser.add_argument("--use_bpr", action="store_true") # batch prior regularization +parser.add_argument("--use_bow", action="store_true") # BoW loss +parser.add_argument("--use_bert", action="store_true") # Use pre-trained BERT for sentiment +parser.add_argument("--use_context", action="store_true") +parser.add_argument("--use_current", action="store_true") +parser.add_argument("--use_curiosity", action="store_true") # curiosity reward +parser.add_argument("--use_cycle", action="store_true") # cycle consistency +parser.add_argument("--use_emotion", action="store_true") +parser.add_argument("--use_hybrid", action="store_true") # use hybrid loss +parser.add_argument("--use_lang", action="store_true") +parser.add_argument("--use_kl_anneal", action="store_true") +parser.add_argument("--use_sentiment", action="store_true") +parser.add_argument("--use_sentiment_agreement", action="store_true") +parser.add_argument("--use_self_critical", action="store_true") # use self critical baseline +parser.add_argument("--use_topic", action="store_true") # use topic info for LVED +parser.add_argument("--use_tau_anneal", action="store_true") +parser.add_argument("--use_user", action="store_true") # use user simulation +parser.add_argument("--pretrain_curiosity", action="store_true") +parser.add_argument("--reset_linear", action="store_true") +parser.add_argument("--conditional_vae", action="store_true") +parser.add_argument("--grid_search", action="store_true") + +# Save/Load +parser.add_argument("--restore", action="store_true") +parser.add_argument("--restore_path", type=str, default="") +parser.add_argument("--test", action="store_true") +parser.add_argument("--test_path", type=str, default="") +parser.add_argument("--lang_path", type=str, default="") # '_shared' vs '' +parser.add_argument("--policy_model", type=str, default="") +parser.add_argument("--reward_model", type=str, default="") +parser.add_argument("--user_model", type=str, default="") +parser.add_argument("--aux_reward_model", type=str, default="") +parser.add_argument("--sentiment_clf", type=str, default="") + + +arg = parser.parse_args() +print(arg) +model = arg.model + +# Hyperparameters +C = arg.C +H = arg.H +D = arg.D +B = arg.B +L = arg.L +M = arg.M +K = arg.K +CD = arg.CD +beta = arg.beta +lambda_aux = arg.lambda_aux +lambda_emo = arg.lambda_emo +lambda_gen = arg.lambda_gen +lambda_mle = arg.lambda_mle +bi=arg.bi +lr=arg.lr +tau = arg.tau 
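Because `utils/constant.py` calls `parser.parse_args()` at import time and re-exports every flag as a module-level name, downstream modules pick up the run configuration simply by importing it. A minimal sketch of that pattern; the `train.py` entry-point name and the flag values are hypothetical, and it assumes the package's own dependencies are installed:

```python
# Hypothetical invocation: utils/constant.py parses sys.argv the moment it is imported.
import sys
sys.argv = ["train.py", "--cuda", "--lstm", "--attn", "dot", "--B", "64"]  # hypothetical flags

from utils import constant  # argparse runs here, at import time

print(constant.USE_CUDA)  # True
print(constant.lstm)      # True
print(constant.attn)      # "dot"
print(constant.B)         # 64
```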
+mlp = arg.mlp +lstm = arg.lstm +beam_size = arg.beam_size +topk = arg.topk +topk_size = arg.topk_size + +attn = arg.attn +beam = arg.beam +optim = arg.optim +parse = arg.parse +eval_parse = arg.eval_parse +dropout = arg.dropout +embedding = arg.embedding +round_robin = arg.round_robin +epochs = arg.epochs +share_rnn = arg.share_rnn +weight_tie = arg.weight_tie +embeddings_cpu = arg.embeddings_cpu +share_embeddings = arg.share_embeddings +update_embeddings = arg.update_embeddings + +rand_unif_init_mag=0.02 +trunc_norm_init_std=1e-4 + +max_grad_norm = arg.max_grad_norm +max_enc_steps = arg.max_enc_steps +max_dec_steps = arg.max_dec_steps +min_dec_steps = arg.min_dec_steps + +USE_CUDA = arg.cuda + +unk_idx = 0 +pad_idx = 1 +sou_idx = 2 +eou_idx = 3 + +data = arg.data +eval_data = arg.eval_data +task = arg.task +split = arg.split +shuffle = arg.shuffle +discrete = arg.discrete +use_arl = arg.use_arl +use_baseline = arg.use_baseline +use_bpr = arg.use_bpr +use_bow = arg.use_bow +use_bert = arg.use_bert +use_cycle = arg.use_cycle +use_curiosity = arg.use_curiosity +use_lang = arg.use_lang +use_topic = arg.use_topic +use_binary = arg.use_binary +use_current = arg.use_current +use_context = arg.use_context +use_hybrid = arg.use_hybrid +use_emotion = arg.use_emotion +use_sentiment = arg.use_sentiment +use_sentiment_agreement = arg.use_sentiment_agreement +use_self_critical = arg.use_self_critical +use_user = arg.use_user +reset_linear = arg.reset_linear +use_kl_anneal = arg.use_kl_anneal +use_tau_anneal = arg.use_tau_anneal +pretrain_curiosity = arg.pretrain_curiosity +conditional_vae = arg.conditional_vae +grid_search = arg.grid_search + +restore = arg.restore +restore_path = arg.restore_path + +test = arg.test +test_path = arg.test_path +lang_path = arg.lang_path +policy_model = arg.policy_model +reward_model = arg.reward_model +user_model = arg.user_model +aux_reward_model = arg.aux_reward_model +sentiment_clf = arg.sentiment_clf \ No newline at end of file diff --git a/utils/dataset.py b/utils/dataset.py new file mode 100644 index 0000000..8aa7d91 --- /dev/null +++ b/utils/dataset.py @@ -0,0 +1,180 @@ +import re + +import numpy as np +import dill as pickle + +import torch +import torch.utils.data as data + +from utils import constant + + +class DialogDataset(data.Dataset): + def __init__(self, mode='train', dataset='empathetic-dialogue', usr=False, sys=False, path=None, load_fasttext=False): + self.mode = mode + self.dataset = dataset + self.usr = usr + self.sys = sys + self.fasttext = None + self.load_fasttext = load_fasttext + self.use_emotion = constant.use_emotion + self.use_sentiment = constant.use_sentiment or constant.use_sentiment_agreement + + self._from_file(path) + + def __len__(self): + if self.sys and self.dataset == 'empathetic-dialogue': + return self.sys_target_lens.shape[0] + elif self.usr and self.dataset == 'empathetic-dialogue': + return self.usr_target_lens.shape[0] + return self.target_lens.shape[0] + + def __getitem__(self, i): + dialog = None + dialog_len = None + target = None + target_len = None + emotion = None + sentiments = None + if self.sys and self.dataset == 'empathetic-dialogue': + dialog = self.sys_dialogs[i] + dialog_len = self.sys_dialog_lens[i] + target = self.sys_targets[i] + target_len = self.sys_target_lens[i] + if self.use_emotion: + emotion = self.sys_emotions[i] + elif self.use_sentiment: + emotion = self.sys_sentiments[i] + sentiments = self.sys_sentiments_b[i] + elif self.usr and self.dataset == 'empathetic-dialogue': + dialog = self.usr_dialogs[i] 
+ dialog_len = self.usr_dialog_lens[i] + target = self.usr_targets[i] + target_len = self.usr_target_lens[i] + if self.use_emotion: + emotion = self.sys_emotions[i] + elif self.use_sentiment: + emotion = self.sys_sentiments[i] + else: + dialog = self.dialogs[i] + dialog_len = self.dialog_lens[i] + target = self.targets[i] + target_len = self.target_lens[i] + if self.use_emotion: + emotion = self.emotions[i] + + return dialog, dialog_len, target, target_len, emotion, sentiments + + def _from_file(self, path=None): + def load_npy(path): + return np.load(path) + + load_path = path if path else 'data/prep/{}/{}.{}.npy' + + if self.use_emotion: + self.emotions = load_npy(load_path.format(self.dataset, 'emotions', self.mode)) + + if self.dataset == 'empathetic-dialogue': + self.usr_dialogs = load_npy(load_path.format(self.dataset, 'usr_dialogs', self.mode)) + self.usr_dialog_lens = load_npy(load_path.format(self.dataset, 'usr_dialog_lens', self.mode)) + self.sys_dialogs = load_npy(load_path.format(self.dataset, 'sys_dialogs', self.mode)) + self.sys_dialog_lens = load_npy(load_path.format(self.dataset, 'sys_dialog_lens', self.mode)) + self.usr_targets = load_npy(load_path.format(self.dataset, 'usr_targets', self.mode)) + self.usr_target_lens = load_npy(load_path.format(self.dataset, 'usr_target_lens', self.mode)) + self.sys_targets = load_npy(load_path.format(self.dataset, 'sys_targets', self.mode)) + self.sys_target_lens = load_npy(load_path.format(self.dataset, 'sys_target_lens', self.mode)) + self.sys_emotions = load_npy(load_path.format(self.dataset, 'sys_emotions', self.mode)) + self.sys_sentiments = load_npy(load_path.format(self.dataset, 'sys_sentiments', self.mode)) + self.sys_sentiments_b = load_npy(load_path.format(self.dataset, 'sys_sentiments_binary', self.mode)) + else: + self.dialogs = load_npy(load_path.format(self.dataset, 'dialogs', self.mode)) + self.dialog_lens = load_npy(load_path.format(self.dataset, 'dialog_lens', self.mode)) + self.targets = load_npy(load_path.format(self.dataset, 'targets', self.mode)) + self.target_lens = load_npy(load_path.format(self.dataset, 'target_lens', self.mode)) + + if self.mode == 'train': + if self.load_fasttext: + self.fasttext = load_npy('data/prep/{}/fasttext.npy'.format(self.dataset)) + + with open('data/prep/{}/lang{}.pkl'.format(self.dataset, constant.lang_path), 'rb') as f: + self.lang = pickle.load(f) + + +def make_dialog_data_loader(dataset, cuda, embeddings_cpu, batch_size, pad_idx=1, shuffle=True): + return torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, + collate_fn=collate_fn(mode=dataset.mode, cuda=cuda, + embeddings_cpu=embeddings_cpu, pad_idx=pad_idx, V=len(dataset.lang), + use_emotion=dataset.use_emotion, use_sentiment=dataset.use_sentiment)) + + +def collate_fn(mode='train', cuda=False, embeddings_cpu=False, pad_idx=1, V=None, use_emotion=False, use_sentiment=False): + def collate_inner(batch): + """ + Input + - batch[0]: dialogs -> [B x UTT_LEN] + - batch[1]: dialog_lens -> [B] + - batch[2]: targets -> [B x TGT_LEN] + - batch[3]: target_lens -> [B] + - batch[4]: emotions -> [B] + + Returns + - dialogs -> Ready for embedding lookup and packing + - Original: Tensor of [B x MAX_TURN x MAX_UTT_LEN], padded with PAD words and PAD arrays + Use pack_padded_sequence => Transform to [B * MAX_TURN x MAX_UTT_LEN] later for tensor computation + - Flattened: Tensor of [B x MAX_SEQ_LEN], where MAX_SEQ_LEN is max flattened seq len in current batch + Use pack_sequence + - labels -> Tensor of [B] 
indicating index of correct emotion + """ + + # Unzip data (returns tuple of batches) + dialogs, dialog_lens, targets, target_lens, emotions, sentiments = zip(*batch) + + sort = np.argsort(dialog_lens)[::-1].tolist() + unsort = np.argsort(sort).tolist() + dialogs = np.array(dialogs, dtype='object')[sort].tolist() + lens = np.array(dialog_lens)[sort]#.tolist() + targets = np.array(targets, dtype='object')[sort]#.tolist() + target_lens = np.array(target_lens)[sort]#.tolist() + + bow_targets, x_sort, x_unsort = None, None, None + # x_sort = np.argsort(target_lens)[::-1].tolist() + # x_unsort = np.argsort(x_sort).tolist() + # bow_targets = np.zeros((len(targets), V)) + # for i, target in enumerate(targets): + # bow_targets[i][target] = 1 + # bow_targets = torch.from_numpy(bow_targets).float() + if use_emotion: + emotions = torch.from_numpy(np.array(emotions)[sort]).long() + elif use_sentiment: + emotions = torch.from_numpy(np.array(emotions)[sort]).float() + sentiments = torch.from_numpy(np.array(sentiments)[sort]).float() + + # Pad dialogs and targets to their respective max batch lens + B = len(dialogs) + LD = lens[0] + LT = np.max(target_lens) + if pad_idx == 0: + padded_dialogs = torch.zeros((B, LD)) + padded_targets = torch.zeros((B, LT)) + else: + padded_dialogs = torch.ones((B, LD)) * pad_idx + padded_targets = torch.ones((B, LT)) * pad_idx + for b in range(B): + padded_dialogs[b, :lens[b]] = torch.from_numpy(np.array(dialogs[b])) + padded_targets[b, :target_lens[b]] = torch.from_numpy(np.array(targets[b])) + + padded_dialogs = padded_dialogs.long() + padded_targets = padded_targets.long() + + target_lens = torch.LongTensor(target_lens) + if not embeddings_cpu and cuda: + padded_dialogs = padded_dialogs.cuda() + padded_targets = padded_targets.cuda() + target_lens = target_lens.cuda() + if use_emotion or use_sentiment: + emotions = emotions.cuda() + if use_sentiment: + sentiments = sentiments.cuda() + + return padded_dialogs, lens, padded_targets, unsort, bow_targets, emotions, sentiments, x_sort, x_unsort + return collate_inner diff --git a/utils/embedding_metrics.py b/utils/embedding_metrics.py new file mode 100644 index 0000000..66c0569 --- /dev/null +++ b/utils/embedding_metrics.py @@ -0,0 +1,83 @@ +import numpy as np +import torch +import torch.nn.functional as F +from sklearn.metrics.pairwise import cosine_similarity as cosine +from collections import Counter + +class EmbeddingSim: + """ + """ + def __init__(self, word2vec): + """ + :param word2vec - a numpy array of word2vec with shape [vocab_size x emb_size] + """ + super(EmbeddingSim, self).__init__() + self.word2vec = word2vec + + def embedding(self, seqs): + """ + A numpy version of embedding + :param seqs - ndarray [batch_sz x seqlen] + """ + batch_size, seqlen = seqs.shape + seqs = np.reshape(seqs, (-1)) # convert to 1-d indexes [(batch_sz*seqlen)] + embs = self.word2vec[seqs] # lookup [(batch_sz*seqlen) x emb_sz] + embs = np.reshape(embs, (batch_size, seqlen, -1)) # recover the shape [batch_sz x seqlen x emb_sz] + return embs + + def extrema(self, embs, lens): # embs: [batch_size x seq_len x emb_size] lens: [batch_size] + """ + computes the value of every single dimension in the word vectors which has the greatest + difference from zero. 
+ :param seq: sequence + :param seqlen: length of sequence + """ + # Find minimum and maximum value for every dimension in predictions + batch_size, seq_len, emb_size = embs.shape + max_mask = np.zeros((batch_size, seq_len, emb_size), dtype=np.int) + for i,length in enumerate(lens): + max_mask[i,:length,:]=1 + min_mask = 1-max_mask + seq_max = (embs*max_mask).max(1) # [batch_sz x emb_sz] + seq_min = (embs+min_mask).min(1) + # Find the maximum absolute value in min and max data + comp_mask = seq_max >= np.abs(seq_min)# [batch_sz x emb_sz] + # Add vectors for finding final sequence representation for predictions + extrema_emb = seq_max* comp_mask + seq_min* np.logical_not(comp_mask) + return extrema_emb + + def mean(self, embs, lens): + batch_size, seq_len, emb_size=embs.shape + mask = np.zeros((batch_size, seq_len, emb_size), dtype=np.int) + for i,length in enumerate(lens): + mask[i,:length,:]=1 + return (embs*mask).sum(1)/(mask.sum(1)+1e-8) + + def sim_bow(self, pred, pred_lens, ref, ref_lens): + """ + :param pred - ndarray [batch_size x seqlen] + :param pred_lens - list of integers + :param ref - ndarray [batch_size x seqlen] + """ + # look up word embeddings for prediction and reference + emb_pred = self.embedding(pred) # [batch_sz x seqlen1 x emb_sz] + emb_ref = self.embedding(ref) # [batch_sz x seqlen2 x emb_sz] + + ext_emb_pred=self.extrema(emb_pred, pred_lens) + ext_emb_ref=self.extrema(emb_ref, ref_lens) + bow_extrema=cosine(ext_emb_pred, ext_emb_ref) # [batch_sz_pred x batch_sz_ref] + + avg_emb_pred = self.mean(emb_pred, pred_lens) # Calculate mean over seq + avg_emb_ref = self.mean(emb_ref, ref_lens) + bow_avg = cosine(avg_emb_pred, avg_emb_ref) # [batch_sz_pred x batch_sz_ref] + + batch_pred, seqlen_pred, emb_size=emb_pred.shape + batch_ref, seqlen_ref, emb_size=emb_ref.shape + cos_sim = cosine(emb_pred.reshape((-1, emb_size)), emb_ref.reshape((-1, emb_size))) # [(batch_sz*seqlen1)x(batch_sz*seqlen2)] + cos_sim = cos_sim.reshape((batch_pred, seqlen_pred, batch_ref, seqlen_ref)) + # Find words with max cosine similarity + max12 = cos_sim.max(1).mean(2) # max over seqlen_pred + max21 = cos_sim.max(3).mean(1) # max over seqlen_ref + bow_greedy=(max12+max21)/2 # [batch_pred x batch_ref(1)] + return np.max(bow_extrema), np.max(bow_avg), np.max(bow_greedy) + \ No newline at end of file diff --git a/utils/lang.py b/utils/lang.py new file mode 100644 index 0000000..12e5c33 --- /dev/null +++ b/utils/lang.py @@ -0,0 +1,94 @@ +import nltk +import spacy + + +class Lang: + def __init__(self): + self.unk_idx = 0 + self.pad_idx = 1 + self.sou_idx = 2 + self.eou_idx = 3 + + self.word2index = {'__unk__': self.unk_idx, '__pad__': self.pad_idx, '__sou__': self.sou_idx, '__eou__': self.eou_idx} + self.word2count = {'__unk__': 0, '__pad__': 0, '__sou__': 0, '__eou__': 0} + self.index2word = {self.unk_idx: "__unk__", self.pad_idx: "__pad__", self.sou_idx: "__sou__", self.eou_idx: "__eou__"} + self.n_words = 4 # Count default tokens + + self.nlp = spacy.load("en_core_web_sm") + # add special case rule + special_case = [{spacy.symbols.ORTH: u"__eou__"}] + self.nlp.tokenizer.add_special_case(u"__eou__", special_case) + + def __len__(self): + return self.n_words + + def tokenize(self, s): + # return nltk.word_tokenize(s) + return self.nlp.tokenizer(s) + + def addSentence(self, sentence): + for word in self.tokenize(sentence): + self.addWord(word.text) + + def addSentences(self, sentences): + for sentence in sentences: + for word in self.tokenize(sentence): + self.addWord(word.text) + + def addWord(self, 
word): + if word not in self.word2index: + self.word2index[word] = self.n_words + self.word2count[word] = 1 + self.index2word[self.n_words] = word + self.n_words += 1 + else: + self.word2count[word] += 1 + + def transform(self, sentences): + # given unokenized sentences (or iterator), transform to idx mapping + return [[self.word2index[token.text] for token in self.tokenize(sentence) if not token.is_space] for sentence in sentences] + + def transform_one(self, sentence): + try: + # given unokenized sentence, transform to idx mapping + return [self.word2index[token.text] for token in self.tokenize(sentence) if not token.is_space] + except KeyError as e: + print(e) + print(sentence) + for token in self.tokenize(sentence): + if not token.is_space: + print(token.text, token.text in self.word2index) + exit(1) + + def transform_unk(self, sentence): + # transform with unk + ret = [] + for token in self.tokenize(sentence): + if token.text in self.word2index: + ret.append(self.word2index[token.text]) + else: + ret.append(self.unk_idx) + return ret + + def reverse(self, sentences): + # given transformed sentences, reverse it + return [[self.index2word[idx] for idx in sentence] for sentence in sentences] + + def reverse_one(self, sentence): + # given transformed sentence, reverse it + return [self.index2word[idx] for idx in sentence] + + # def trim(self, min_freq=100): + # print('vocab size before trimming: ', len(self)) + # self.word2count[self.unk_idx] = min_freq + # self.word2count[self.pad_idx] = min_freq + # self.word2count[self.sou_idx] = min_freq + # self.word2count[self.eou_idx] = min_freq + + # self.word2count = {k: v for k, v in self.word2count if v >= 100} + # trimmed_word2index = {'__unk__': self.unk_idx, '__pad__': self.pad_idx, '__sou__': self.sou_idx, '__eou__': self.eou_idx} + # trimmed_index2word = {self.unk_idx: "__unk__", self.pad_idx: "__pad__", self.sou_idx: "__sou__", self.eou_idx: "__eou__"} + + # self.word2index = trimmed_word2index + # print('vocab size after trimming: ', len(self)) + # return self diff --git a/utils/masked_cross_entropy.py b/utils/masked_cross_entropy.py new file mode 100644 index 0000000..dcf348e --- /dev/null +++ b/utils/masked_cross_entropy.py @@ -0,0 +1,47 @@ +import torch +from torch.nn import functional + + +def sequence_mask(sequence_length, max_len=None): + if max_len is None: + max_len = sequence_length.data.max() + batch_size = sequence_length.size(0) + seq_range = torch.arange(0, max_len).long() + seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len) + # seq_range_expand = Variable(seq_range_expand) + if sequence_length.is_cuda: + seq_range_expand = seq_range_expand.cuda() + seq_length_expand = (sequence_length.unsqueeze(1).expand_as(seq_range_expand)) + return seq_range_expand < seq_length_expand + + +def masked_cross_entropy(logits, target, length): + """ + Args: + logits: A Variable containing a FloatTensor of size + (batch, max_len, num_classes) which contains the + unnormalized probability for each class. + target: A LongTensor of size + (batch, max_len) which contains the index of the true + class for each corresponding step. + length: A LongTensor of size (batch,) + which contains the length of each data in a batch. + Returns: + loss: An average loss value masked by the length. 
+
+
+def masked_cross_entropy(logits, target, length):
+    """
+    Args:
+        logits: A Variable containing a FloatTensor of size
+            (batch, max_len, num_classes) which contains the
+            unnormalized probability for each class.
+        target: A LongTensor of size
+            (batch, max_len) which contains the index of the true
+            class for each corresponding step.
+        length: A LongTensor of size (batch,)
+            which contains the length of each data in a batch.
+    Returns:
+        loss: An average loss value masked by the length.
+    """
+
+    # logits_flat: (batch * max_len, num_classes)
+    logits_flat = logits.view(-1, logits.size(-1))  # -1 means inferred from the other dimensions
+    # log_probs_flat: (batch * max_len, num_classes)
+    log_probs_flat = functional.log_softmax(logits_flat, dim=1)
+    # target_flat: (batch * max_len, 1)
+    target_flat = target.view(-1, 1)
+    # losses_flat: (batch * max_len, 1)
+    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
+    # losses: (batch, max_len)
+    losses = losses_flat.view(*target.size())
+    # mask: (batch, max_len)
+    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
+    losses = losses * mask.float()
+    loss = losses.sum() / length.float().sum()
+    return loss
diff --git a/utils/rouge.py b/utils/rouge.py
new file mode 100644
index 0000000..c83c821
--- /dev/null
+++ b/utils/rouge.py
@@ -0,0 +1,325 @@
+"""ROUGE metric implementation.
+Copy from tf_seq2seq/seq2seq/metrics/rouge.py.
+This is a modified and slightly extended version of
+https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import itertools
+import numpy as np
+
+#pylint: disable=C0103
+
+
+def _get_ngrams(n, text):
+    """Calculates n-grams.
+    Args:
+        n: which n-grams to calculate
+        text: An array of tokens
+    Returns:
+        A set of n-grams
+    """
+    ngram_set = set()
+    text_length = len(text)
+    max_index_ngram_start = text_length - n
+    for i in range(max_index_ngram_start + 1):
+        ngram_set.add(tuple(text[i:i + n]))
+    return ngram_set
+
+
+def _split_into_words(sentences):
+    """Splits multiple sentences into words and flattens the result"""
+    return list(itertools.chain(*[_.split(" ") for _ in sentences]))
+
+
+def _get_word_ngrams(n, sentences):
+    """Calculates word n-grams for multiple sentences.
+    """
+    assert len(sentences) > 0
+    assert n > 0
+
+    words = _split_into_words(sentences)
+    return _get_ngrams(n, words)
+
+
+def _len_lcs(x, y):
+    """
+    Returns the length of the Longest Common Subsequence between sequences x
+    and y.
+    Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+    Args:
+        x: sequence of words
+        y: sequence of words
+    Returns:
+        integer: Length of LCS between x and y
+    """
+    table = _lcs(x, y)
+    n, m = len(x), len(y)
+    return table[n, m]
+
+
+def _lcs(x, y):
+    """
+    Computes the length of the longest common subsequence (lcs) between two
+    strings. The implementation below uses a dynamic programming algorithm and
+    runs in O(nm) time where n = len(x) and m = len(y).
+    Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+    Args:
+        x: collection of words
+        y: collection of words
+    Returns:
+        Table of dictionary of coord and len lcs
+    """
+    n, m = len(x), len(y)
+    table = dict()
+    for i in range(n + 1):
+        for j in range(m + 1):
+            if i == 0 or j == 0:
+                table[i, j] = 0
+            elif x[i - 1] == y[j - 1]:
+                table[i, j] = table[i - 1, j - 1] + 1
+            else:
+                table[i, j] = max(table[i - 1, j], table[i, j - 1])
+    return table
+
+
+def _recon_lcs(x, y):
+    """
+    Returns the Longest Common Subsequence between x and y.
+ Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence + Args: + x: sequence of words + y: sequence of words + Returns: + sequence: LCS of x and y + """ + i, j = len(x), len(y) + table = _lcs(x, y) + + def _recon(i, j): + """private recon calculation""" + if i == 0 or j == 0: + return [] + elif x[i - 1] == y[j - 1]: + return _recon(i - 1, j - 1) + [(x[i - 1], i)] + elif table[i - 1, j] > table[i, j - 1]: + return _recon(i - 1, j) + else: + return _recon(i, j - 1) + + recon_tuple = tuple(map(lambda x: x[0], _recon(i, j))) + return recon_tuple + + +def rouge_n(evaluated_sentences, reference_sentences, n=2): + """ + Computes ROUGE-N of two text collections of sentences. + Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ + papers/rouge-working-note-v1.3.1.pdf + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentences: The sentences from the referene set + n: Size of ngram. Defaults to 2. + Returns: + A tuple (f1, precision, recall) for ROUGE-N + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + evaluated_ngrams = _get_word_ngrams(n, evaluated_sentences) + reference_ngrams = _get_word_ngrams(n, reference_sentences) + reference_count = len(reference_ngrams) + evaluated_count = len(evaluated_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = evaluated_ngrams.intersection(reference_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case. This isn't mathematically correct, but it's good enough + if evaluated_count == 0: + precision = 0.0 + else: + precision = overlapping_count / evaluated_count + + if reference_count == 0: + recall = 0.0 + else: + recall = overlapping_count / reference_count + + f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8)) + + # return overlapping_count / reference_count + return f1_score, precision, recall + + +def _f_p_r_lcs(llcs, m, n): + """ + Computes the LCS-based F-measure score + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + Args: + llcs: Length of LCS + m: number of words in reference summary + n: number of words in candidate summary + Returns: + Float. LCS-based F-measure score + """ + r_lcs = llcs / m + p_lcs = llcs / n + beta = p_lcs / (r_lcs + 1e-12) + num = (1 + (beta**2)) * r_lcs * p_lcs + denom = r_lcs + ((beta**2) * p_lcs) + f_lcs = num / (denom + 1e-12) + return f_lcs, p_lcs, r_lcs + + +def rouge_l_sentence_level(evaluated_sentences, reference_sentences): + """ + Computes ROUGE-L (sentence level) of two text collections of sentences. 
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentences: The sentences from the referene set + Returns: + A float: F_lcs + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + reference_words = _split_into_words(reference_sentences) + evaluated_words = _split_into_words(evaluated_sentences) + m = len(reference_words) + n = len(evaluated_words) + lcs = _len_lcs(evaluated_words, reference_words) + return _f_p_r_lcs(lcs, m, n) + + +def _union_lcs(evaluated_sentences, reference_sentence): + """ + Returns LCS_u(r_i, C) which is the LCS score of the union longest common + subsequence between reference sentence ri and candidate summary C. For example + if r_i= w1 w2 w3 w4 w5, and C contains two sentences: c1 = w1 w2 w6 w7 w8 and + c2 = w1 w3 w8 w9 w5, then the longest common subsequence of r_i and c1 is + "w1 w2" and the longest common subsequence of r_i and c2 is "w1 w3 w5". The + union longest common subsequence of r_i, c1, and c2 is "w1 w2 w3 w5" and + LCS_u(r_i, C) = 4/5. + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentence: One of the sentences in the reference summaries + Returns: + float: LCS_u(r_i, C) + ValueError: + Raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + lcs_union = set() + reference_words = _split_into_words([reference_sentence]) + combined_lcs_length = 0 + for eval_s in evaluated_sentences: + evaluated_words = _split_into_words([eval_s]) + lcs = set(_recon_lcs(reference_words, evaluated_words)) + combined_lcs_length += len(lcs) + lcs_union = lcs_union.union(lcs) + + union_lcs_count = len(lcs_union) + union_lcs_value = union_lcs_count / combined_lcs_length + return union_lcs_value + + +def rouge_l_summary_level(evaluated_sentences, reference_sentences): + """ + Computes ROUGE-L (summary level) of two text collections of sentences. 
+ http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + Calculated according to: + R_lcs = SUM(1, u)[LCS(r_i,C)]/m + P_lcs = SUM(1, u)[LCS(r_i,C)]/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + where: + SUM(i,u) = SUM from i through u + u = number of sentences in reference summary + C = Candidate summary made up of v sentences + m = number of words in reference summary + n = number of words in candidate summary + Args: + evaluated_sentences: The sentences that have been picked by the summarizer + reference_sentence: One of the sentences in the reference summaries + Returns: + A float: F_lcs + Raises: + ValueError: raises exception if a param has len <= 0 + """ + if len(evaluated_sentences) <= 0 or len(reference_sentences) <= 0: + raise ValueError("Collections must contain at least 1 sentence.") + + # total number of words in reference sentences + m = len(_split_into_words(reference_sentences)) + + # total number of words in evaluated sentences + n = len(_split_into_words(evaluated_sentences)) + + union_lcs_sum_across_all_references = 0 + for ref_s in reference_sentences: + union_lcs_sum_across_all_references += _union_lcs(evaluated_sentences, + ref_s) + return _f_p_r_lcs(union_lcs_sum_across_all_references, m, n) + + +def rouge(hypotheses, references): + """Calculates average rouge scores for a list of hypotheses and + references""" + + # Filter out hyps that are of 0 length + # hyps_and_refs = zip(hypotheses, references) + # hyps_and_refs = [_ for _ in hyps_and_refs if len(_[0]) > 0] + # hypotheses, references = zip(*hyps_and_refs) + + # Calculate ROUGE-1 F1, precision, recall scores + rouge_1 = [ + rouge_n([hyp], [ref], 1) for hyp, ref in zip(hypotheses, references) + ] + rouge_1_f, rouge_1_p, rouge_1_r = map(np.mean, zip(*rouge_1)) + + # Calculate ROUGE-2 F1, precision, recall scores + rouge_2 = [ + rouge_n([hyp], [ref], 2) for hyp, ref in zip(hypotheses, references) + ] + rouge_2_f, rouge_2_p, rouge_2_r = map(np.mean, zip(*rouge_2)) + + # Calculate ROUGE-L F1, precision, recall scores + rouge_l = [ + rouge_l_sentence_level([hyp], [ref]) + for hyp, ref in zip(hypotheses, references) + ] + rouge_l_f, rouge_l_p, rouge_l_r = map(np.mean, zip(*rouge_l)) + + return { + "rouge_1/f_score": rouge_1_f, + "rouge_1/r_score": rouge_1_r, + "rouge_1/p_score": rouge_1_p, + "rouge_2/f_score": rouge_2_f, + "rouge_2/r_score": rouge_2_r, + "rouge_2/p_score": rouge_2_p, + "rouge_l/f_score": rouge_l_f, + "rouge_l/r_score": rouge_l_r, + "rouge_l/p_score": rouge_l_p, + } diff --git a/utils/sentiment_dataset.py b/utils/sentiment_dataset.py new file mode 100644 index 0000000..80d2228 --- /dev/null +++ b/utils/sentiment_dataset.py @@ -0,0 +1,108 @@ +import re + +import numpy as np +import dill as pickle + +import torch +import torch.utils.data as data +from pytorch_pretrained_bert.tokenization import BertTokenizer + +from utils import constant, text_input2bert_input + + +class SentimentDataset(data.Dataset): + def __init__(self, mode='train', dataset='sst', load_fasttext=False): + self.mode = mode + self.dataset = dataset + self.fasttext = None + self.load_fasttext = load_fasttext + self.use_bert = constant.use_bert + if self.use_bert: + self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + + self._from_file() + + def __len__(self): + return self.sentiments.shape[0] + + def __getitem__(self, i): + if self.use_bert: + input_id, input_mask, segment_id = text_input2bert_input(self.texts[i], self.bert_tokenizer, seq_length=128) 
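+            # input_id, input_mask and segment_id are 1-D LongTensors of length
+            # seq_length (128 here): WordPiece ids padded with 0, a mask that is 1
+            # on real tokens (including [CLS]/[SEP]) and 0 on padding, and all-zero
+            # segment ids for this single-sentence setup.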
+ return input_id, input_mask, segment_id, self.sentiments[i] + return self.sentences[i], self.sentence_lens[i], self.sentiments[i] + + def _from_file(self): + def load_npy(path): + return np.load(path) + + load_path = 'data/prep/{}/{}.{}.npy' + + self.sentiments = load_npy(load_path.format(self.dataset, 'sentiments', self.mode)) + + if self.use_bert: + self.texts = load_npy(load_path.format(self.dataset, 'texts', self.mode)) + else: + self.sentences = load_npy(load_path.format(self.dataset, 'sentences', self.mode)) + self.sentence_lens = load_npy(load_path.format(self.dataset, 'sentence_lens', self.mode)) + + if self.mode == 'train': + if self.load_fasttext: + self.fasttext = load_npy('data/prep/{}/fasttext.npy'.format(self.dataset)) + + with open('data/prep/{}/lang.pkl'.format(self.dataset), 'rb') as f: + self.lang = pickle.load(f) + + + +def make_sentiment_data_loader(dataset, cuda, batch_size, pad_idx=1, shuffle=True): + return torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, + collate_fn=collate_fn(cuda=cuda, bert=dataset.use_bert, pad_idx=pad_idx)) + + +def collate_fn(cuda=False, bert=False, pad_idx=1): + def collate_inner(batch): + """ + Input + - batch[0]: sentences -> [B x L] + - batch[1]: sentence_lens -> [B] + - batch[2]: sentiments -> [B] + """ + # Unzip data (returns tuple of batches) + if bert: + input_ids, input_masks, segment_ids, sentiments = zip(*batch) + input_ids = torch.stack(input_ids) + input_masks = torch.stack(input_masks) + segment_ids = torch.stack(segment_ids) + sentiments = torch.from_numpy(np.array(sentiments)).float() + if cuda: + input_ids = input_ids.cuda() + input_masks = input_masks.cuda() + segment_ids = segment_ids.cuda() + sentiments = sentiments.cuda() + return input_ids, input_masks, segment_ids, sentiments + else: + sentences, sentence_lens, sentiments = zip(*batch) + + sort = np.argsort(sentence_lens)[::-1].tolist() + sentences = np.array(sentences, dtype='object')[sort].tolist() + sentence_lens = np.array(sentence_lens)[sort]#.tolist() + sentiments = torch.from_numpy(np.array(sentiments)[sort]).float() + + # Pad dialogs and targets to their respective max batch lens + B = len(sentences) + L = sentence_lens[0] + if pad_idx == 0: + padded_sentences = torch.zeros((B, L)) + else: + padded_sentences = torch.ones((B, L)) * pad_idx + for b in range(B): + padded_sentences[b, :sentence_lens[b]] = torch.from_numpy(np.array(sentences[b])) + + padded_sentences = padded_sentences.long() + + if cuda: + padded_sentences = padded_sentences.cuda() + sentiments = sentiments.cuda() + + return padded_sentences, sentence_lens, sentiments + return collate_inner diff --git a/utils/utils.py b/utils/utils.py new file mode 100644 index 0000000..74e0ceb --- /dev/null +++ b/utils/utils.py @@ -0,0 +1,303 @@ +import math +from datetime import datetime +from functools import reduce +import operator + +import numpy as np +from nltk.util import ngrams, everygrams + +import torch + +from utils import constant + + +def tile(x, count, dim=0): + """ + Tiles x on dimension dim count times. 
+    """
+    perm = list(range(len(x.size())))
+    if dim != 0:
+        perm[0], perm[dim] = perm[dim], perm[0]
+        x = x.permute(perm).contiguous()
+    out_size = list(x.size())
+    out_size[0] *= count
+    batch = x.size(0)
+    x = x.view(batch, -1) \
+        .transpose(0, 1) \
+        .repeat(count, 1) \
+        .transpose(0, 1) \
+        .contiguous() \
+        .view(*out_size)
+    if dim != 0:
+        x = x.permute(perm).contiguous()
+    return x
+
+def save_model(model, metric, score, path=None):
+    if path is not None:
+        save_path = path
+    else:
+        save_path = 'trained/{}.{}.{}.{}.{}.{}.{}.{:.4f}.{}'  # data.task.model.H.lr.attn.metric.score.misc
+        misc = ''
+        if constant.lstm:
+            misc += 'lstm.'
+
+        save_path = save_path.format(constant.data, constant.task, constant.model, constant.H, constant.lr, constant.attn, metric, score, misc)
+    torch.save(model.state_dict(), save_path)
+    return save_path
+
+def load_model(model, path):
+    if path == "":
+        return model
+    if constant.USE_CUDA:
+        model.load_state_dict(torch.load(path))
+    else:
+        model.load_state_dict(torch.load(path, map_location='cpu'))
+    return model
+
+def save_ckpt(model, optim, epoch):
+    save_path = 'ckpt/{}.{}.{}.{}.{}.{}'  # dataset.task.epoch.lr.misc.time
+    misc = ''
+    if constant.lstm:
+        misc += 'lstm.'
+    date = datetime.now().date()
+    time = datetime.now().time()
+    dt = '{}-{}-{}-{}-{}-{}'.format(date.year, date.month, date.day, time.hour, time.minute, time.second)
+    save_path = save_path.format(constant.data, constant.task, epoch, constant.lr, misc, dt)
+
+    state = {
+        'epoch': epoch,
+        'state_dict': model.state_dict(),
+        'optimizer': optim.state_dict()
+    }
+    torch.save(state, save_path)
+    return save_path
+
+def load_ckpt(model, optim, path):
+    # Note: Input model & optimizer should be pre-defined.
+    # This routine only updates their states.
+    import os  # os is not imported at module level in this file
+    start_epoch = 0
+    if os.path.isfile(path):
+        print("=> loading checkpoint '{}'".format(path))
+        checkpoint = torch.load(path)
+        start_epoch = checkpoint['epoch']
+        model.load_state_dict(checkpoint['state_dict'])
+        optim.load_state_dict(checkpoint['optimizer'])
+        if constant.USE_CUDA:
+            for state in optim.state.values():
+                for k, v in state.items():
+                    if isinstance(v, torch.Tensor):
+                        state[k] = v.cuda()
+        print("=> loaded checkpoint '{}' (epoch {})".format(path, checkpoint['epoch']))
+    else:
+        print("=> no checkpoint found at '{}'".format(path))
+
+    return model, optim, start_epoch
+
+def to_categorical(y, num_classes):
+    """ 1-hot encodes a tensor """
+    return np.eye(num_classes, dtype='uint8')[y]
+
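As a quick sanity check of the helpers above (values worked out by hand, purely illustrative): `tile` repeats each batch entry `count` times consecutively, which is the layout beam-search-style decoding expects, and `to_categorical` one-hot encodes integer labels.

    x = torch.arange(6).view(2, 3)       # [[0, 1, 2], [3, 4, 5]]
    tile(x, 2, dim=0)                     # [[0, 1, 2], [0, 1, 2], [3, 4, 5], [3, 4, 5]]
    to_categorical(np.array([2, 0]), 3)   # [[0, 0, 1], [1, 0, 0]]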
+def get_metrics(predictions, ground, C=7, verbose=False):
+    """Given predicted labels and the respective ground truth labels, display some metrics
+    Input: shape [# of samples, NUM_CLASSES]
+        predictions : Model output. Every row has C values, with the highest belonging to the predicted class
+        ground : Ground truth labels given as integer class indices; converted to one-hot encodings below
+            (e.g. a sample belonging to the happiness class, index 4, gets a 1 in position 4)
+    Output:
+        accuracy : Average accuracy
+        microPrecision : Precision calculated on a micro level. Ref - https://datascience.stackexchange.com/questions/15989/micro-average-vs-macro-average-performance-in-a-multiclass-classification-settin/16001
+        microRecall : Recall calculated on a micro level
+        microF1 : Harmonic mean of microPrecision and microRecall. Higher value implies better classification
+    """
+    one_hot = np.zeros((ground.shape[0], C))
+    one_hot[np.arange(ground.shape[0]), ground] = 1
+    ground = one_hot
+    label2emotion = {
+        0: 'none',
+        1: 'anger',
+        2: 'disgust',
+        3: 'fear',
+        4: 'happiness',
+        5: 'sadness',
+        6: 'surprise'
+    }
+    # [0.1, 0.3 , 0.2, 0.1] -> [0, 1, 0, 0]
+    discretePredictions = to_categorical(predictions.argmax(axis=1), num_classes=C)
+
+    truePositives = np.sum(discretePredictions*ground, axis=0)
+    falsePositives = np.sum(np.clip(discretePredictions - ground, 0, 1), axis=0)
+    falseNegatives = np.sum(np.clip(ground-discretePredictions, 0, 1), axis=0)
+    if(verbose):
+        print("True Positives per class : ", truePositives)
+        print("False Positives per class : ", falsePositives)
+        print("False Negatives per class : ", falseNegatives)
+
+    # ------------- Macro level calculation ---------------
+    macroPrecision = 0
+    macroRecall = 0
+    # We ignore the "none" class (index 0) during the calculation of Precision, Recall and F1
+    for c in range(1, C):
+        precision = truePositives[c] / (truePositives[c] + falsePositives[c])
+        macroPrecision += precision
+        recall = truePositives[c] / (truePositives[c] + falseNegatives[c])
+        macroRecall += recall
+        f1 = ( 2 * recall * precision ) / (precision + recall) if (precision+recall) > 0 else 0
+        if(verbose):
+            print("Class %s : Precision : %.3f, Recall : %.3f, F1 : %.3f" % (label2emotion[c], precision, recall, f1))
+
+    macroPrecision /= (C - 1)  # average over the C - 1 emotion classes
+    macroRecall /= (C - 1)
+    macroF1 = (2 * macroRecall * macroPrecision ) / (macroPrecision + macroRecall) if (macroPrecision+macroRecall) > 0 else 0
+    if(verbose):
+        print("Ignoring the none class, Macro Precision : %.4f, Macro Recall : %.4f, Macro F1 : %.4f" % (macroPrecision, macroRecall, macroF1))
+
+    # ------------- Micro level calculation ---------------
+    truePositives = truePositives[1:].sum()
+    falsePositives = falsePositives[1:].sum()
+    falseNegatives = falseNegatives[1:].sum()
+    if(verbose):
+        print("Ignoring the none class, Micro TP : %d, FP : %d, FN : %d" % (truePositives, falsePositives, falseNegatives))
+
+    microPrecision = truePositives / (truePositives + falsePositives)
+    microRecall = truePositives / (truePositives + falseNegatives)
+
+    microF1 = ( 2 * microRecall * microPrecision ) / (microPrecision + microRecall) if (microPrecision+microRecall) > 0 else 0
+    # -----------------------------------------------------
+
+    predictions = predictions.argmax(axis=1)
+    ground = ground.argmax(axis=1)
+    accuracy = np.mean(predictions==ground)
+    if(verbose):
+        print("Accuracy : %.4f, Micro Precision : %.4f, Micro Recall : %.4f, Micro F1 : %.4f" % (accuracy, microPrecision, microRecall, microF1))
+    return accuracy, microPrecision, microRecall, microF1
+
+def text_input2bert_input(text, bert_tokenizer, seq_length=512):
+    tokens_a = bert_tokenizer.tokenize(text)
+    # Account for [CLS] and [SEP] with "- 2"
+    if len(tokens_a) > seq_length - 2:
+        tokens_a = tokens_a[0:(seq_length - 2)]
+
+    tokens = []  # equals raw text tokens
+    input_type_ids = []  # equals segment_ids
+    tokens.append("[CLS]")
+    input_type_ids.append(0)
+    for token in tokens_a:
+        tokens.append(token)
+        input_type_ids.append(0)
+    tokens.append("[SEP]")
+    input_type_ids.append(0)
+
+    input_ids = bert_tokenizer.convert_tokens_to_ids(tokens)  # WordPiece embedding rep
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
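+    # id 0 is the [PAD] token in the standard pretrained BERT vocabularies, and the
+    # padded positions keep input_mask == 0, so they are never attended to downstream.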
+ while len(input_ids) < seq_length: + input_ids.append(0) + input_mask.append(0) + input_type_ids.append(0) + + input_ids_batch = torch.tensor(input_ids, dtype=torch.long) + input_mask_batch = torch.tensor(input_mask, dtype=torch.long) + segment_id_batch = torch.zeros(input_ids_batch.size(), dtype=torch.long) + + return input_ids_batch, input_mask_batch, segment_id_batch + +def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (..., vocabulary size) + top_k >0: keep only top k tokens with highest probability (top-k filtering). + top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + """ + top_k = min(top_k, logits.size(-1)) # Safety check + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + # if top_p > 0.0: + # sorted_logits, sorted_indices = torch.sort(logits, descending=True) + # cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # # Remove tokens with cumulative probability above the threshold + # sorted_indices_to_remove = cumulative_probs > top_p + # # Shift the indices to the right to keep also the first token above the threshold + # sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + # sorted_indices_to_remove[..., 0] = 0 + + # indices_to_remove = sorted_indices[sorted_indices_to_remove] + # logits[indices_to_remove] = filter_value + return logits + +def get_sentiment(sentiment_clf, sentences, tokenizer): + input_ids, input_masks, segment_ids = zip(*[text_input2bert_input(sentence, tokenizer, seq_length=128) for sentence in sentences]) + input_ids = torch.stack(input_ids) + input_masks = torch.stack(input_masks) + segment_ids = torch.stack(segment_ids) + + if constant.USE_CUDA: + input_ids = input_ids.cuda() + input_masks = input_masks.cuda() + segment_ids = segment_ids.cuda() + + # get reward with generated sentence + with torch.no_grad(): + R = sentiment_clf.predict_prob((input_ids, segment_ids, input_masks)) + + return R + +def get_user_response(user_model, refs, sents, vocab): + sents = [vocab.transform_one(sent) for sent in sents] + lens = [len(sentence) for sentence in sents] + sort = np.argsort(lens)[::-1].tolist() + unsort = np.argsort(sort).tolist() + sents = np.array(sents, dtype='object')[sort].tolist() + lens = np.array(lens)[sort] + + B = len(sents) + L = lens[0] + padded_sents = torch.ones((B, L)) * constant.pad_idx + for b in range(B): + padded_sents[b, :lens[b]] = torch.from_numpy(np.array(sents[b])) + + padded_sents = padded_sents.long() + if constant.USE_CUDA: + padded_sents = padded_sents.cuda() + + return np.array(user_model.predict_batch(padded_sents, lens, np.zeros((B, L))))[unsort].tolist() + +def distinct_ngrams(sentences): + unigram = [] + bigram = [] + trigram = [] + for sent in sentences: + s = sent.split() + unigram.append(s) + bigram.append(list(ngrams(s, 2))) + trigram.append(list(ngrams(s, 3))) + unigram = reduce(operator.concat, unigram) + bigram = reduce(operator.concat, bigram) + trigram = reduce(operator.concat, trigram) + d1 = len(set(unigram))/len(unigram) + d2 = len(set(bigram))/len(bigram) + d3 = len(set(trigram))/len(trigram) + return d1, d2, d3 + +# def get_embedding_similarity(refs, sents, vocab, encoder, mode='average', 
model='fasttext'): +# if model == 'fasttext': +# sents = [vocab.transform_one(sent) for sent in sents] +# lens = np.array(lens)[sort] + +# B = len(sents) +# L = lens[0] +# padded_sents = torch.ones((B, L)) * constant.pad_idx +# for b in range(B): +# padded_sents[b, :lens[b]] = torch.from_numpy(np.array(sents[b])) + +# padded_sents = padded_sents.long() +# if constant.USE_CUDA: +# padded_sents = padded_sents.cuda() +# elif model == 'bert': +# pass
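To make the distinct n-gram diversity metric in `utils/utils.py` concrete, here is a small worked example (numbers computed by hand, purely illustrative):

    d1, d2, d3 = distinct_ngrams(["i am fine", "i am good"])
    # unigrams: 4 unique / 6 total -> d1 = 0.667
    # bigrams:  3 unique / 4 total -> d2 = 0.75   ("i am" appears twice)
    # trigrams: 2 unique / 2 total -> d3 = 1.0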