# Assignment 5 written questions

## Character-based convolutional encoder for NMT

### Model description and written questions

(a) The same argument applies to the convolutional network. The kernel operates on fixed-length windows of the input sequence, so no matter how long the input is, the only thing that changes is the number of positions the kernel is applied to.

(b) If we only need the convolution to produce at least one output, the padding on each side is (kernel_size - (m_model + 2)) / 2; if we want every column of the kernel's weight matrix to be able to act on every character of the input, the padding on each side is kernel_size - 1. (A small sketch of the output-length arithmetic is given below.)
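As a quick check of this arithmetic: a `Conv1d` output has length m + 2*padding - kernel_size + 1. The following throwaway sketch only illustrates that formula; the embedding size, filter count, and word lengths are made up.

```python
import torch
import torch.nn as nn

e_char, k, pad = 4, 5, 1
conv = nn.Conv1d(in_channels=e_char, out_channels=3, kernel_size=k, padding=pad)

for m in (6, 10, 21):                      # words of different lengths
    x = torch.randn(1, e_char, m)          # (batch, char_embed, word_length)
    out = conv(x)                          # (batch, n_filters, m + 2*pad - k + 1)
    print(m, out.size(-1), m + 2 * pad - k + 1)
```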
(c) Besides mitigating the vanishing-gradient problem, the highway connection also lets the network adaptively choose what information to pass forward. b_gate should be initialized so that the sigmoid initially outputs values close to 1, i.e. so that x_proj is largely preserved, which means the initialization should be positive. (A minimal highway-layer sketch with this kind of initialization follows.)
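The sketch below is one possible way to write such a layer; the class name `Highway`, the layer sizes, and the bias value are illustrative assumptions, not the assignment's starter code.

```python
import torch
import torch.nn as nn

class Highway(nn.Module):
    """x_highway = gate * x_proj + (1 - gate) * x_conv_out."""
    def __init__(self, word_embed_size, gate_bias_init=2.0):
        super().__init__()
        self.proj = nn.Linear(word_embed_size, word_embed_size)
        self.gate = nn.Linear(word_embed_size, word_embed_size)
        # A positive bias makes sigmoid(gate) start close to 1, so x_proj is mostly kept at init.
        nn.init.constant_(self.gate.bias, gate_bias_init)

    def forward(self, x_conv_out):
        x_proj = torch.relu(self.proj(x_conv_out))
        g = torch.sigmoid(self.gate(x_conv_out))
        return g * x_proj + (1 - g) * x_conv_out
```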
(d) Advantage 1: better parallelization. Advantage 2: the representation at every step has access to information from the whole sequence.

(f) After writing my own sanity_check, I found that simple check scripts are essential while building the model and its other parts: verifying that input and output dimensions match expectations, seeing which parameters each part of the network has, and getting a clearer picture of how the network computes. (A minimal shape check of this kind is sketched below.)
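For example, a throwaway shape check for the CNN module implemented later in this commit might look like this (it assumes the `CNN` class in cnn.py is importable from the working directory; the sizes are arbitrary):

```python
import torch
from cnn import CNN  # the CNN module implemented later in this commit

n_words, e_char, m_word, e_word = 8, 50, 21, 256
x = torch.randn(n_words, e_char, m_word)           # (n_words, char_embed, n_chars)
model = CNN(e_char, e_word, kernel_size=5, padding=1)
x_conv = model(x)
assert x_conv.size() == (n_words, e_word), x_conv.size()
print("CNN output shape OK:", tuple(x_conv.size()))
```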
## Analyzing NMT Systems

(a) Forms found in the vocabulary: 'traducir': 4603; 'traduzco': 40991; 'traduce': 7931. Forms not in the vocabulary: 'traduces', 'traduzca', 'traduzcas'.

Why this is bad: many words that are missing from the vocabulary but are close in form to words that are in it get mapped to `<UNK>` during translation, which severely degrades the semantic representation of the source sentence.

How this model addresses it: a word-character based model is not limited by the size of the provided vocabulary; when an out-of-vocabulary word appears, the character-level model can still capture the meaning of formally similar in-vocabulary words.

(b)
i. `word2vec all` -- nearest words for each item
* `financial`: economic, business, markets, market, money
* `neuron`: neurons, dendrites, cerebellum, nerve, excitatory
* `Francisco`: san, jose, diego, california, los
* `naturally`: occurring, easily, natural, humans, therefore
* `expectation`: operator, assumption, consequence, otherwise, implies

ii. `character-based` -- nearest words for the same items
* `financial`: vertical, informal, physical, cultural, electrical
* `neuron`: Newton, George, NBA, Delhi, golden
* `Francisco`: France, platform, tissue, Foundation, microphone
* `naturally`: practically, typically, significantly, mentally, gradually
* `expectation`: exception, indication, integration, separation, expected

iii.
Word2Vec captures semantic similarity, while the CharCNN captures similarity of surface form. Looking at how each is trained: Word2Vec learns word embeddings from context, so it reflects the semantic relations between words well; the CharCNN applies convolutions over character vectors, and since individual characters have no definite meaning on their own, the resulting similarity is most likely driven by how similar the words' constituent characters are. (The sketch below shows how such nearest-neighbor lists can be computed from an embedding matrix.)
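The lists above came from the course's embedding visualizer; purely as an illustration, a similar nearest-neighbor query could be computed with cosine similarity over an embedding matrix (the function name, toy vocabulary, and random embeddings below are made up):

```python
import torch
import torch.nn.functional as F

def nearest_words(query, vocab, embeddings, k=5):
    """Return the k words whose embeddings are most cosine-similar to `query`."""
    q = embeddings[vocab.index(query)]
    sims = F.cosine_similarity(q.unsqueeze(0), embeddings, dim=1)  # (len(vocab),)
    sims[vocab.index(query)] = -1.0                                # exclude the query itself
    return [vocab[i] for i in torch.topk(sims, k).indices.tolist()]

# toy usage with random vectors (real results require trained embeddings)
vocab = ["financial", "economic", "neuron", "nerve", "expectation", "exception"]
emb = torch.randn(len(vocab), 64)
print(nearest_words("financial", vocab, emb, k=3))
```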
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2019-20: Homework 5
"""

import torch
import torch.nn as nn


class CharDecoder(nn.Module):
    def __init__(self, hidden_size, char_embedding_size=50, target_vocab=None):
        """ Init Character Decoder.

        @param hidden_size (int): Hidden size of the decoder LSTM
        @param char_embedding_size (int): dimensionality of character embeddings
        @param target_vocab (VocabEntry): vocabulary for the target language. See vocab.py for documentation.
        """
        super(CharDecoder, self).__init__()
        self.target_vocab = target_vocab
        self.charDecoder = nn.LSTM(char_embedding_size, hidden_size)
        self.char_output_projection = nn.Linear(hidden_size, len(self.target_vocab.char2id))
        self.decoderCharEmb = nn.Embedding(len(self.target_vocab.char2id), char_embedding_size,
                                           padding_idx=self.target_vocab.char_pad)
    def forward(self, input, dec_hidden=None):
        """ Forward pass of character decoder.

        @param input (Tensor): tensor of integers, shape (length, batch_size)
        @param dec_hidden (tuple(Tensor, Tensor)): internal state of the LSTM before reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)

        @returns scores (Tensor): called s_t in the PDF, shape (length, batch_size, self.vocab_size)
        @returns dec_hidden (tuple(Tensor, Tensor)): internal state of the LSTM after reading the input characters. A tuple of two tensors of shape (1, batch, hidden_size)
        """
        ### YOUR CODE HERE for part 2a
        # Embed the characters, run the LSTM, and project hidden states to character scores.
        # nn.LSTM treats dec_hidden=None as a zero initial state, so no special-casing is needed.
        x = self.decoderCharEmb(input)                   # (length, batch, char_embedding_size)
        x, dec_hidden = self.charDecoder(x, dec_hidden)  # (length, batch, hidden_size)
        scores = self.char_output_projection(x)          # (length, batch, vocab_size)
        return scores, dec_hidden
        ### END YOUR CODE
    def train_forward(self, char_sequence, dec_hidden=None):
        """ Forward computation during training.

        @param char_sequence (Tensor): tensor of integers, shape (length, batch_size). Note that "length" here and in forward() need not be the same.
        @param dec_hidden (tuple(Tensor, Tensor)): initial internal state of the LSTM, obtained from the output of the word-level decoder. A tuple of two tensors of shape (1, batch_size, hidden_size)

        @returns The cross-entropy loss (Tensor), computed as the *sum* of cross-entropy losses of all the words in the batch.
        """
        ### YOUR CODE HERE for part 2b
        ### Hint: - Make sure padding characters do not contribute to the cross-entropy loss. Check vocab.py to find the padding token's index.
        ###       - char_sequence corresponds to the sequence x_1 ... x_{n+1} (e.g., <START>,m,u,s,i,c,<END>). Read the handout about how to construct input and target sequence of CharDecoderLSTM.
        ###       - Carefully read the documentation for nn.CrossEntropyLoss and our handout to see what this criterion has already included:
        ###             https://pytorch.org/docs/stable/nn.html#crossentropyloss
        # Input is x_1 ... x_n with <END> replaced by padding; target is x_2 ... x_{n+1}.
        # Padding positions are excluded from the loss via ignore_index.
        pad_idx = self.target_vocab.char_pad
        tgt_char_seq = char_sequence[1:, :]                                  # (length - 1, batch)
        input_char_seq = char_sequence[:-1, :].masked_fill(
            char_sequence[:-1, :] == self.target_vocab.end_of_word, pad_idx)

        target_scores, _ = self.forward(input_char_seq, dec_hidden)         # (length - 1, batch, vocab)
        criterion = nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='sum')
        loss = criterion(target_scores.reshape(-1, len(self.target_vocab.char2id)),
                         tgt_char_seq.reshape(-1))
        return loss
        ### END YOUR CODE
    def decode_greedy(self, initialStates, device, max_length=21):
        """ Greedy decoding

        @param initialStates (tuple(Tensor, Tensor)): initial internal state of the LSTM, a tuple of two tensors of size (1, batch_size, hidden_size)
        @param device: torch.device (indicates whether the model is on CPU or GPU)
        @param max_length (int): maximum length of words to decode

        @returns decodedWords (List[str]): a list (of length batch_size) of strings, each of which has length <= max_length.
                 The decoded strings should NOT contain the start-of-word and end-of-word characters.
        """
        ### YOUR CODE HERE for part 2c
        ### Hints:
        ###      - Use initialStates to get batch_size.
        ###      - Use target_vocab.char2id and target_vocab.id2char to convert between integers and characters.
        ###      - Use torch.tensor(..., device=device) to turn a list of character indices into a tensor.
        ###      - You may find torch.argmax useful.
        ###      - We use curly brackets as start-of-word and end-of-word characters. That is, use the character '{' for <START> and '}' for <END>.
        ###        Their indices are self.target_vocab.start_of_word and self.target_vocab.end_of_word, respectively.
        batch_size = initialStates[0].size()[1]
        dec_hidden = initialStates
        # Start every word with the <START> character and repeatedly feed back the argmax prediction.
        chars_in = torch.empty(1, batch_size, dtype=torch.long,
                               device=device).fill_(self.target_vocab.start_of_word)
        decodedChars = []  # decodedChars[t][b]: character decoded at step t for batch element b
        for step in range(max_length):
            scores, dec_hidden = self.forward(chars_in, dec_hidden)  # scores: (1, batch, vocab)
            chars_out = torch.argmax(scores, dim=-1)                 # (1, batch)
            decodedChars.append([self.target_vocab.id2char[idx.item()] for idx in chars_out[0]])
            chars_in = chars_out

        # Assemble the words, truncating each one at its first end-of-word character.
        decodedWords = []
        for batch_id in range(batch_size):
            word = ''
            for step in range(max_length):
                ch = decodedChars[step][batch_id]
                if self.target_vocab.char2id[ch] == self.target_vocab.end_of_word:
                    break
                word += ch
            decodedWords.append(word)

        return decodedWords
        ### END YOUR CODE
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
CS224N 2019-20: Homework 5
Usage:
    cnn.py view
    cnn.py value
    cnn.py -h
"""

from docopt import docopt
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class CNN(nn.Module):
    ### YOUR CODE HERE for part 1g
    def __init__(self, char_embedding, n_features, kernel_size=5, padding=1):
        """Define the 1-D convolution over character embeddings.

        params:
            char_embedding (int): characters' embedding dimension
            n_features (int): number of conv1d filters (the word embedding size)
            kernel_size (int): convolution window size
            padding (int): zero-padding added to both sides of the input
        """
        super(CNN, self).__init__()
        self.char_embedding = char_embedding
        self.conv = nn.Conv1d(char_embedding, n_features, kernel_size, padding=padding)

    def forward(self, x):
        """Convolve, apply ReLU, and max-pool over the character positions.

        params:
            x (n_words, char_embed, n_chars): words in a sentence with embedded characters
        return:
            x_conv (n_words, word_embedding): embedded words matrix
        """
        assert x.size()[-2] == self.char_embedding, \
            "input tensor shape invalid, should be (n_words, char_embed, n_chars)"
        x = self.conv(x)                  # (n_words, n_features, n_chars + 2*padding - kernel_size + 1)
        x = F.relu(x)
        x_conv, _ = torch.max(x, dim=-1)  # max-pool over the remaining character positions
        return x_conv
    ### END YOUR CODE

if __name__ == '__main__':
    args = docopt(__doc__)
    seed = 2020
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed // 2)

    x = torch.tensor([[[1., 1., 1., 1.],
                       [-2., -2., -2., -2.]],
                      [[2., 2., 1., 1.],
                       [0.5, 0.5, 0., 0.]]], dtype=torch.float32)
    print("input tensor shape: ", x.size())
    x = x.permute(0, 2, 1).contiguous()
    model = CNN(x.size()[-2], 3, kernel_size=2)
    if args['view']:
        print("model's parameter print...")
        for p in model.parameters():
            print(p)
    elif args['value']:
        print("value confirmation...")
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.ones_(p)
            else:
                nn.init.zeros_(p)
        x_conv = model(x)
        print("input:\n{}\nsize: {}".format(x, x.size()))
        print("output:\n{}\nsize: {}".format(x_conv, x_conv.size()))