Showing 14 changed files with 1,540 additions and 0 deletions.
@@ -0,0 +1,86 @@
# coding: UTF-8
import mindspore
import mindspore.nn as nn
import mindspore.ops as ops
import numpy as np


class Config(object):

    """Configuration parameters"""
    def __init__(self, dataset, embedding):
        self.model_name = 'DPCNN'
        self.train_path = dataset + '/data/train.txt'                          # training set
        self.dev_path = dataset + '/data/dev.txt'                              # validation set
        self.test_path = dataset + '/data/test.txt'                            # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]        # class name list
        self.vocab_path = dataset + '/data/vocab.pkl'                          # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # saved model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                 # pretrained word vectors

        self.dropout = 0.5                                       # dropout rate
        self.require_improvement = 1000                          # stop training early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 20                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length per sample (shorter padded, longer truncated)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # word vector dimension (MindSpore Tensors have no size(); use shape)
        self.num_filters = 250                                   # number of convolution kernels (channels)


'''Deep Pyramid Convolutional Neural Networks for Text Categorization'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            # MindSpore's nn.Embedding has no from_pretrained(); pass the pretrained
            # weights as the (trainable) embedding table instead.
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0], config.embed,
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        # pad_mode='valid' matches the seq_len-3+1 shape comments below
        # (MindSpore's Conv2d defaults to pad_mode='same').
        self.conv_region = nn.Conv2d(1, config.num_filters, (3, config.embed),
                                     stride=1, pad_mode='valid', has_bias=True)
        self.conv = nn.Conv2d(config.num_filters, config.num_filters, (3, 1),
                              stride=1, pad_mode='valid', has_bias=True)
        self.max_pool = nn.MaxPool2d(kernel_size=(3, 1), stride=2)
        self.padding1 = nn.ZeroPad2d((0, 0, 1, 1))  # pad top and bottom
        self.padding2 = nn.ZeroPad2d((0, 0, 0, 1))  # pad bottom
        self.relu = nn.ReLU()
        self.fc = nn.Dense(config.num_filters, config.num_classes)

    def construct(self, x):  # MindSpore Cells define construct(), not forward()
        x = x[0]
        x = self.embedding(x)    # [batch_size, seq_len, embed]
        x = x.unsqueeze(1)       # [batch_size, 1, seq_len, embed]
        x = self.conv_region(x)  # [batch_size, 250, seq_len-3+1, 1]

        x = self.padding1(x)     # [batch_size, 250, seq_len, 1]
        x = self.relu(x)
        x = self.conv(x)         # [batch_size, 250, seq_len-3+1, 1]
        x = self.padding1(x)     # [batch_size, 250, seq_len, 1]
        x = self.relu(x)
        x = self.conv(x)         # [batch_size, 250, seq_len-3+1, 1]
        while x.shape[2] > 2:    # Tensor has no size(); use shape
            x = self._block(x)
        x = x.squeeze()          # [batch_size, num_filters(250)]
        x = self.fc(x)
        return x

    def _block(self, x):
        x = self.padding2(x)
        px = self.max_pool(x)

        x = self.padding1(px)
        x = ops.relu(x)
        x = self.conv(x)

        x = self.padding1(x)
        x = ops.relu(x)
        x = self.conv(x)

        # shortcut connection
        x = x + px
        return x
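A quick way to sanity-check the pyramid shape logic is to push random token ids through the Cell. The snippet below is a minimal smoke test, not part of the commit; FakeConfig and all of its values are assumed stand-ins for the real Config, which normally reads them from the dataset directory.

# Minimal smoke test (sketch): random ids through the DPCNN Cell.
# FakeConfig and its values are assumptions standing in for the real Config.
import numpy as np
import mindspore
from mindspore import Tensor

class FakeConfig:
    embedding_pretrained = None
    n_vocab = 5000        # normally assigned at runtime from vocab.pkl
    embed = 300
    num_filters = 250
    num_classes = 10

model = Model(FakeConfig())
ids = Tensor(np.random.randint(0, 4999, (4, 32)), mindspore.int32)
logits = model((ids,))    # construct() unpacks x[0]
print(logits.shape)       # expect (4, 10)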
@@ -0,0 +1,67 @@
# coding: UTF-8
import mindspore
import mindspore.nn as nn
import mindspore.ops as ops
import numpy as np


class Config(object):

    """Configuration parameters"""
    def __init__(self, dataset, embedding):
        self.model_name = 'FastText'
        self.train_path = dataset + '/data/train.txt'                          # training set
        self.dev_path = dataset + '/data/dev.txt'                              # validation set
        self.test_path = dataset + '/data/test.txt'                            # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]        # class name list
        self.vocab_path = dataset + '/data/vocab.pkl'                          # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # saved model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                 # pretrained word vectors

        self.dropout = 0.5                                       # dropout rate
        self.require_improvement = 1000                          # stop training early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 20                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length per sample (shorter padded, longer truncated)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # word vector dimension
        self.hidden_size = 256                                   # hidden layer size
        self.n_gram_vocab = 250499                               # number of n-gram hash buckets


'''Bag of Tricks for Efficient Text Classification'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            # MindSpore's nn.Embedding has no from_pretrained(); pass the pretrained
            # weights as the (trainable) embedding table instead.
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0], config.embed,
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.embedding_ngram2 = nn.Embedding(config.n_gram_vocab, config.embed)
        self.embedding_ngram3 = nn.Embedding(config.n_gram_vocab, config.embed)
        self.dropout = nn.Dropout(p=config.dropout)  # p is the drop probability
        self.fc1 = nn.Dense(config.embed * 3, config.hidden_size)
        # self.dropout2 = nn.Dropout(p=config.dropout)
        self.fc2 = nn.Dense(config.hidden_size, config.num_classes)

    def construct(self, x):  # MindSpore Cells define construct(), not forward()
        out_word = self.embedding(x[0])            # word ids
        out_bigram = self.embedding_ngram2(x[2])   # hashed bigram ids
        out_trigram = self.embedding_ngram3(x[3])  # hashed trigram ids
        out = ops.cat((out_word, out_bigram, out_trigram), -1)

        out = out.mean(axis=1)  # average over the sequence (Tensor.mean takes axis, not dim)
        out = self.dropout(out)
        out = self.fc1(out)
        out = ops.relu(out)
        out = self.fc2(out)
        return out
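construct() expects x[2] and x[3] to carry bigram and trigram ids already hashed into n_gram_vocab buckets. The data utility that produces them is not part of this diff, so the sketch below shows one plausible hashing scheme; the multiplier constants are illustrative assumptions, not confirmed from this repository.

# Sketch (assumed, not from this diff): hash n-grams of token ids into buckets
# so they can index embedding_ngram2 / embedding_ngram3.
def bigram_hash(sequence, t, buckets):
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    return (t1 * 14918087) % buckets

def trigram_hash(sequence, t, buckets):
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    t2 = sequence[t - 2] if t - 2 >= 0 else 0
    return (t2 * 14918087 * 18408749 + t1 * 14918087) % buckets

# Per padded position t of a sample:
# bigram_ids[t]  = bigram_hash(word_ids, t, 250499)
# trigram_ids[t] = trigram_hash(word_ids, t, 250499)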
@@ -0,0 +1,67 @@
import mindspore
import mindspore.nn as nn
import mindspore.ops as ops
import numpy as np


class Config(object):

    """Configuration parameters"""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextCNN'
        self.train_path = dataset + '/data/train.txt'                          # training set
        self.dev_path = dataset + '/data/dev.txt'                              # validation set
        self.test_path = dataset + '/data/test.txt'                            # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]        # class name list
        self.vocab_path = dataset + '/data/vocab.pkl'                          # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # saved model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                 # pretrained word vectors (numpy astype takes 'float32'; Tensor.float32 does not exist)

        self.dropout = 0.5                                       # dropout rate
        self.require_improvement = 1000                          # stop training early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 20                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length per sample (shorter padded, longer truncated)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # word vector dimension
        self.filter_sizes = (2, 3, 4)                            # convolution kernel sizes
        self.num_filters = 256                                   # number of convolution kernels (channels)


'''Convolutional Neural Networks for Sentence Classification'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()  # initialize the parent nn.Cell
        if config.embedding_pretrained is not None:
            # MindSpore's nn.Embedding has no from_pretrained(); pass the pretrained
            # weights as the (trainable) embedding table instead.
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0], config.embed,
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        # has_bias belongs to Conv2d, not CellList; pad_mode='valid' gives the
        # usual seq_len-k+1 output length.
        self.convs = nn.CellList(
            [nn.Conv2d(1, config.num_filters, (k, config.embed), pad_mode='valid', has_bias=True)
             for k in config.filter_sizes])
        self.dropout = nn.Dropout(p=config.dropout)
        self.fc = nn.Dense(config.num_filters * len(config.filter_sizes), config.num_classes)

    def conv_and_pool(self, x, conv):
        x = ops.relu(conv(x))              # ReLU zeroes out negative values, keeps positive ones
        x = ops.squeeze(x, axis=3)         # drop the size-1 width axis left by the conv
        x = ops.adaptive_max_pool1d(x, 1)  # global max pooling over time (output length 1)
        x = ops.squeeze(x, axis=2)
        return x

    def construct(self, x):  # MindSpore Cells define construct(), not forward()
        out = self.embedding(x[0])
        out = out.unsqueeze(1)
        out = ops.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)  # concatenate pooled features per kernel size
        out = self.dropout(out)
        out = self.fc(out)
        return out
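As a reference for the pooling steps above, here is a shape trace of conv_and_pool for one kernel size, assuming batch_size 128, pad_size 32, embed 300, and k = 3:

# Shape trace (sketch) through conv_and_pool with k = 3:
# input x:                   [128, 1, 32, 300]  embeddings plus the channel axis from unsqueeze(1)
# conv(x):                   [128, 256, 30, 1]  valid conv over (k, embed) collapses the embed axis
# squeeze(axis=3):           [128, 256, 30]
# adaptive_max_pool1d(x, 1): [128, 256, 1]      global max over time
# squeeze(axis=2):           [128, 256]
# cat over k in (2, 3, 4):   [128, 768] -> fc -> [128, num_classes]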
@@ -0,0 +1,61 @@
# coding: UTF-8
import mindspore
import mindspore.ops as ops
import numpy as np
import mindspore.nn as nn


class Config(object):

    """Configuration parameters"""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextRCNN'
        self.train_path = dataset + '/data/train.txt'                          # training set
        self.dev_path = dataset + '/data/dev.txt'                              # validation set
        self.test_path = dataset + '/data/test.txt'                            # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]        # class name list
        self.vocab_path = dataset + '/data/vocab.pkl'                          # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # saved model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                 # pretrained word vectors

        self.dropout = 0.0                                       # LSTM inter-layer dropout; no effect with num_layers=1, and MindSpore's nn.LSTM requires a value in [0.0, 1.0), so the original 1.0 would be rejected
        self.require_improvement = 1000                          # stop training early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 10                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length per sample (shorter padded, longer truncated)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # word vector dimension; unified with the pretrained vectors when they are used
        self.hidden_size = 256                                   # LSTM hidden size
        self.num_layers = 1                                      # number of LSTM layers


'''Recurrent Convolutional Neural Networks for Text Classification'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            # MindSpore's nn.Embedding has no from_pretrained(); pass the pretrained
            # weights as the (trainable) embedding table instead.
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0], config.embed,
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers,
                            bidirectional=True, batch_first=True, dropout=config.dropout)
        self.maxpool = nn.MaxPool1d(config.pad_size)
        self.fc = nn.Dense(config.hidden_size * 2 + config.embed, config.num_classes)

    def construct(self, x):  # MindSpore Cells define construct(), not forward()
        x, _ = x                           # (token ids, seq_len); lengths unused here
        embed = self.embedding(x)          # [batch_size, seq_len, embed]
        out, _ = self.lstm(embed)          # [batch_size, seq_len, hidden_size * 2]
        out = ops.cat((embed, out), 2)     # [batch_size, seq_len, hidden_size * 2 + embed]
        out = ops.relu(out)
        out = out.permute(0, 2, 1)         # channels first for MaxPool1d
        out = self.maxpool(out).squeeze()  # max over time -> [batch_size, hidden_size * 2 + embed]
        out = self.fc(out)
        return out
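The same kind of smoke test as for DPCNN works here. Again FakeConfig and its values are assumptions, and the second tuple element (sequence lengths in the data pipeline) is discarded by construct():

# Minimal smoke test (sketch) for the TextRCNN Cell; all values are assumptions.
import numpy as np
import mindspore
from mindspore import Tensor

class FakeConfig:
    embedding_pretrained = None
    n_vocab = 5000
    embed = 300
    hidden_size = 256
    num_layers = 1
    dropout = 0.0
    pad_size = 32
    num_classes = 10

model = Model(FakeConfig())
ids = Tensor(np.random.randint(0, 4999, (8, 32)), mindspore.int32)
logits = model((ids, None))   # construct() discards the second element
print(logits.shape)           # expect (8, 10)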
@@ -0,0 +1,57 @@
# coding: UTF-8
import mindspore
import mindspore.nn as nn
import numpy as np


class Config(object):

    """Configuration parameters"""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextRNN'
        self.train_path = dataset + '/data/train.txt'                          # training set
        self.dev_path = dataset + '/data/dev.txt'                              # validation set
        self.test_path = dataset + '/data/test.txt'                            # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]        # class name list
        self.vocab_path = dataset + '/data/vocab.pkl'                          # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # saved model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                                 # pretrained word vectors

        self.dropout = 0.5                                       # dropout rate
        self.require_improvement = 1000                          # stop training early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 10                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length per sample (shorter padded, longer truncated)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # word vector dimension; unified with the pretrained vectors when they are used
        self.hidden_size = 128                                   # LSTM hidden size
        self.num_layers = 2                                      # number of LSTM layers


'''Recurrent Neural Network for Text Classification with Multi-Task Learning'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            # MindSpore's nn.Embedding has no from_pretrained(); pass the pretrained
            # weights as the (trainable) embedding table instead.
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0], config.embed,
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers,
                            bidirectional=True, batch_first=True, dropout=config.dropout)
        self.fc = nn.Dense(config.hidden_size * 2, config.num_classes)

    def construct(self, x):  # MindSpore Cells define construct(), not forward()
        x, _ = x                      # (token ids, seq_len); lengths unused here
        out = self.embedding(x)       # [batch_size, seq_len, embedding] = [128, 32, 300]
        out, _ = self.lstm(out)
        out = self.fc(out[:, -1, :])  # hidden state at the last time step
        return out
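One caveat with out[:, -1, :]: on padded batches the final time step may correspond to padding tokens. If the discarded second tuple element really carries true sequence lengths, the hidden state of the last real token can be gathered instead; a hedged sketch:

# Sketch (assumption: seq_len holds true lengths): gather the hidden state of
# the last non-padding token instead of the fixed final time step.
import mindspore.ops as ops

def last_valid_state(out, seq_len):
    # out: [batch, pad_size, hidden*2]; seq_len: [batch], int32
    idx = (seq_len - 1).reshape(-1, 1, 1)                         # [batch, 1, 1]
    idx = ops.broadcast_to(idx, (out.shape[0], 1, out.shape[2]))  # [batch, 1, hidden*2]
    return ops.gather_elements(out, 1, idx).squeeze(1)            # [batch, hidden*2]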