
Commit

Add files via upload
HawkingC authored Jun 8, 2023
1 parent ac5355d commit e3fdb21
Showing 14 changed files with 1,540 additions and 0 deletions.
86 changes: 86 additions & 0 deletions models/DPCNN.py
@@ -0,0 +1,86 @@
#coding: UTF-8
import mindspore
import mindspore.nn as nn
import mindspore.ops as ops
import numpy as np

class Config(object):

    """Configuration parameters."""
    def __init__(self, dataset, embedding):
        self.model_name = 'DPCNN'
        self.train_path = dataset + '/data/train.txt'            # training set
        self.dev_path = dataset + '/data/dev.txt'                # validation set
        self.test_path = dataset + '/data/test.txt'              # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]  # class names
        self.vocab_path = dataset + '/data/vocab.pkl'            # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                   # pre-trained embeddings

        self.dropout = 0.5                                       # dropout rate
        self.require_improvement = 1000                          # stop early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 20                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length (pad short, truncate long)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # embedding dim (MindSpore tensors use .shape, not .size())
        self.num_filters = 250                                   # number of conv filters (channels)

'''Deep Pyramid Convolutional Neural Networks for Text Categorization'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            # MindSpore's nn.Embedding has no from_pretrained(); load the weights via embedding_table
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0],
                                          config.embedding_pretrained.shape[1],
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        # pad_mode='valid' matches the shape comments below; MindSpore's Conv2d default is 'same'
        self.conv_region = nn.Conv2d(1, config.num_filters, (3, config.embed), stride=1,
                                     pad_mode='valid', has_bias=True)
        self.conv = nn.Conv2d(config.num_filters, config.num_filters, (3, 1), stride=1,
                              pad_mode='valid', has_bias=True)
        self.max_pool = nn.MaxPool2d(kernel_size=(3, 1), stride=2)
        self.padding1 = nn.ZeroPad2d((0, 0, 1, 1))  # pad top and bottom
        self.padding2 = nn.ZeroPad2d((0, 0, 0, 1))  # pad bottom
        self.relu = nn.ReLU()
        self.fc = nn.Dense(config.num_filters, config.num_classes)

    def construct(self, x):  # MindSpore Cells define construct(), not forward()
        x = x[0]
        x = self.embedding(x)    # [batch_size, seq_len, embed]
        x = x.unsqueeze(1)       # [batch_size, 1, seq_len, embed]
        x = self.conv_region(x)  # [batch_size, 250, seq_len-3+1, 1]

        x = self.padding1(x)     # [batch_size, 250, seq_len, 1]
        x = self.relu(x)
        x = self.conv(x)         # [batch_size, 250, seq_len-3+1, 1]
        x = self.padding1(x)     # [batch_size, 250, seq_len, 1]
        x = self.relu(x)
        x = self.conv(x)         # [batch_size, 250, seq_len-3+1, 1]
        while x.shape[2] > 2:    # .shape, not PyTorch's .size()
            x = self._block(x)
        x = x.squeeze()          # [batch_size, num_filters(250)]
        x = self.fc(x)
        return x

    def _block(self, x):
        x = self.padding2(x)
        px = self.max_pool(x)

        x = self.padding1(px)
        x = ops.relu(x)
        x = self.conv(x)

        x = self.padding1(x)
        x = ops.relu(x)
        x = self.conv(x)

        # shortcut (residual) connection
        x = x + px
        return x
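
A quick sanity check for the ported Cell is a single forward pass on random ids. This is a minimal sketch: the _StubConfig below is a hypothetical stand-in for the file-backed Config above, used only to avoid reading the dataset files.

import mindspore
import numpy as np

class _StubConfig:  # hypothetical smoke-test config, not part of this commit
    embedding_pretrained = None
    n_vocab = 5000
    embed = 300
    num_filters = 250
    num_classes = 10

model = Model(_StubConfig())
ids = mindspore.Tensor(np.random.randint(0, 4999, (2, 32)), mindspore.int32)
logits = model((ids,))   # construct() reads x[0]
print(logits.shape)      # expected: (2, 10)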
67 changes: 67 additions & 0 deletions models/FastText.py
@@ -0,0 +1,67 @@
#coding: UTF-8
import mindspore
import mindspore.nn as nn
import mindspore.ops as ops
import numpy as np

class Config(object):

    """Configuration parameters."""
    def __init__(self, dataset, embedding):
        self.model_name = 'FastText'
        self.train_path = dataset + '/data/train.txt'            # training set
        self.dev_path = dataset + '/data/dev.txt'                # validation set
        self.test_path = dataset + '/data/test.txt'              # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]  # class names
        self.vocab_path = dataset + '/data/vocab.pkl'            # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                   # pre-trained embeddings

        self.dropout = 0.5                                       # dropout rate
        self.require_improvement = 1000                          # stop early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 20                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length (pad short, truncate long)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # embedding dim (.shape, not PyTorch's .size())
        self.hidden_size = 256                                   # hidden layer size
        self.n_gram_vocab = 250499                               # number of n-gram hash buckets


'''Bag of Tricks for Efficient Text Classification'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            # no from_pretrained() in MindSpore; load the weights via embedding_table
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0],
                                          config.embedding_pretrained.shape[1],
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.embedding_ngram2 = nn.Embedding(config.n_gram_vocab, config.embed)
        self.embedding_ngram3 = nn.Embedding(config.n_gram_vocab, config.embed)
        self.dropout = nn.Dropout(p=config.dropout)  # MindSpore 2.x: p is the drop probability
        self.fc1 = nn.Dense(config.embed * 3, config.hidden_size)
        # self.dropout2 = nn.Dropout(p=config.dropout)
        self.fc2 = nn.Dense(config.hidden_size, config.num_classes)

    def construct(self, x):
        out_word = self.embedding(x[0])            # word ids
        out_bigram = self.embedding_ngram2(x[2])   # hashed bigram ids
        out_trigram = self.embedding_ngram3(x[3])  # hashed trigram ids
        out = ops.cat((out_word, out_bigram, out_trigram), axis=-1)

        out = out.mean(axis=1)                     # average over the sequence (axis, not dim)
        out = self.dropout(out)
        out = self.fc1(out)
        out = ops.relu(out)
        out = self.fc2(out)
        return out
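
construct() expects x to bundle word ids at x[0] and hashed bigram/trigram ids at x[2] and x[3]. A sketch of how such ids can be produced, assuming the hash scheme of the upstream PyTorch reference implementation this port follows; the helper names are illustrative and not part of this commit.

def bigram_hash(sequence, t, buckets):
    # hash the token preceding position t into one of `buckets` bigram ids
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    return (t1 * 14918087) % buckets

def trigram_hash(sequence, t, buckets):
    # hash the two preceding tokens into one of `buckets` trigram ids
    t1 = sequence[t - 1] if t - 1 >= 0 else 0
    t2 = sequence[t - 2] if t - 2 >= 0 else 0
    return (t2 * 14918087 * 18408749 + t1 * 14918087) % buckets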
67 changes: 67 additions & 0 deletions models/TextCNN.py
@@ -0,0 +1,67 @@
import mindspore
import mindspore.nn as nn
import mindspore.ops as ops
import numpy as np

class Config(object):

    """Configuration parameters."""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextCNN'
        self.train_path = dataset + '/data/train.txt'            # training set
        self.dev_path = dataset + '/data/dev.txt'                # validation set
        self.test_path = dataset + '/data/test.txt'              # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]  # class names
        self.vocab_path = dataset + '/data/vocab.pkl'            # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                   # pre-trained embeddings

        self.dropout = 0.5                                       # dropout rate
        self.require_improvement = 1000                          # stop early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 20                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length (pad short, truncate long)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # embedding dim (.shape, not PyTorch's .size())
        self.filter_sizes = (2, 3, 4)                            # convolution kernel sizes
        self.num_filters = 256                                   # number of conv filters (channels)


'''Convolutional Neural Networks for Sentence Classification'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            # no from_pretrained() in MindSpore; load the weights via embedding_table
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0],
                                          config.embedding_pretrained.shape[1],
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.convs = nn.CellList(
            # has_bias belongs to Conv2d, not CellList; pad_mode='valid' mirrors PyTorch's default
            [nn.Conv2d(1, config.num_filters, (k, config.embed), pad_mode='valid', has_bias=True)
             for k in config.filter_sizes])
        self.dropout = nn.Dropout(p=config.dropout)  # MindSpore 2.x: p is the drop probability
        self.fc = nn.Dense(config.num_filters * len(config.filter_sizes), config.num_classes)


    def conv_and_pool(self, x, conv):
        x = ops.relu(conv(x))              # ReLU zeroes negatives, keeps positives
        x = ops.squeeze(x, axis=3)         # drop the size-1 width dim -> [batch, filters, seq']
        x = ops.adaptive_max_pool1d(x, 1)  # max over the whole sequence (output length 1)
        x = ops.squeeze(x, axis=2)         # -> [batch, filters]
        return x

    def construct(self, x):
        out = self.embedding(x[0])
        out = out.unsqueeze(1)
        out = ops.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)  # concat pooled features from every kernel size
        out = self.dropout(out)
        out = self.fc(out)
        return out
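
A single conv_and_pool branch can be checked in isolation. This is a minimal sketch with illustrative sizes (batch 2, seq_len 32, embed 300, 256 filters, kernel height 3), not part of this commit:

import mindspore
import mindspore.nn as nn
import mindspore.ops as ops
import numpy as np

conv = nn.Conv2d(1, 256, (3, 300), pad_mode='valid', has_bias=True)
x = mindspore.Tensor(np.random.randn(2, 1, 32, 300).astype('float32'))
y = ops.relu(conv(x))              # [2, 256, 30, 1]
y = ops.squeeze(y, axis=3)         # [2, 256, 30]
y = ops.adaptive_max_pool1d(y, 1)  # [2, 256, 1]
y = ops.squeeze(y, axis=2)         # [2, 256]
print(y.shape)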
61 changes: 61 additions & 0 deletions models/TextRCNN.py
@@ -0,0 +1,61 @@
#coding: UTF-8
import mindspore
import mindspore.ops as ops
import numpy as np
import mindspore.nn as nn

class Config(object):

    """Configuration parameters."""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextRCNN'
        self.train_path = dataset + '/data/train.txt'            # training set
        self.dev_path = dataset + '/data/dev.txt'                # validation set
        self.test_path = dataset + '/data/test.txt'              # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]  # class names
        self.vocab_path = dataset + '/data/vocab.pkl'            # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                   # pre-trained embeddings

        self.dropout = 0.0                                       # LSTM inter-layer dropout; inert with num_layers == 1 (MindSpore requires [0, 1.0))
        self.require_improvement = 1000                          # stop early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 10                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length (pad short, truncate long)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # embedding dim; matches the pre-trained vectors when used
        self.hidden_size = 256                                   # LSTM hidden size
        self.num_layers = 1                                      # number of LSTM layers

'''Recurrent Convolutional Neural Networks for Text Classification'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            # no from_pretrained() in MindSpore; load the weights via embedding_table
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0],
                                          config.embedding_pretrained.shape[1],
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers,
                            bidirectional=True, batch_first=True, dropout=config.dropout)
        self.maxpool = nn.MaxPool1d(config.pad_size)
        self.fc = nn.Dense(config.hidden_size * 2 + config.embed, config.num_classes)

    def construct(self, x):
        x, _ = x
        embed = self.embedding(x)          # [batch_size, seq_len, embed] = [128, 32, 300]
        out, _ = self.lstm(embed)          # [batch_size, seq_len, hidden_size * 2]
        out = ops.cat((embed, out), 2)     # concat embeddings with bidirectional context
        out = ops.relu(out)
        out = out.permute(0, 2, 1)         # [batch_size, features, seq_len] for 1D pooling
        out = self.maxpool(out).squeeze()  # max over all time steps
        out = self.fc(out)
        return out
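
The concat-then-max-pool path above is easy to verify standalone. A minimal sketch with illustrative sizes (batch 4, pad_size 32, embed 300, hidden 256), not part of this commit:

import mindspore
import mindspore.nn as nn
import mindspore.ops as ops
import numpy as np

embed = mindspore.Tensor(np.random.randn(4, 32, 300).astype('float32'))
lstm = nn.LSTM(300, 256, 1, bidirectional=True, batch_first=True)
out, _ = lstm(embed)                    # [4, 32, 512]
out = ops.cat((embed, out), 2)          # [4, 32, 812]
out = ops.relu(out).permute(0, 2, 1)    # [4, 812, 32]
out = nn.MaxPool1d(32)(out).squeeze(2)  # [4, 812]: max over all 32 positions
print(out.shape)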
57 changes: 57 additions & 0 deletions models/TextRNN.py
@@ -0,0 +1,57 @@
#coding: UTF-8
import mindspore
import mindspore.nn as nn
import numpy as np


class Config(object):

    """Configuration parameters."""
    def __init__(self, dataset, embedding):
        self.model_name = 'TextRNN'
        self.train_path = dataset + '/data/train.txt'            # training set
        self.dev_path = dataset + '/data/dev.txt'                # validation set
        self.test_path = dataset + '/data/test.txt'              # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]  # class names
        self.vocab_path = dataset + '/data/vocab.pkl'            # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # model checkpoint
        self.log_path = dataset + '/log/' + self.model_name
        self.embedding_pretrained = mindspore.Tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))\
            if embedding != 'random' else None                   # pre-trained embeddings

        self.dropout = 0.5                                       # dropout rate
        self.require_improvement = 1000                          # stop early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)                  # number of classes
        self.n_vocab = 0                                         # vocabulary size, assigned at runtime
        self.num_epochs = 10                                     # number of epochs
        self.batch_size = 128                                    # mini-batch size
        self.pad_size = 32                                       # sequence length (pad short, truncate long)
        self.learning_rate = 1e-3                                # learning rate
        self.embed = self.embedding_pretrained.shape[1]\
            if self.embedding_pretrained is not None else 300    # embedding dim; matches the pre-trained vectors when used
        self.hidden_size = 128                                   # LSTM hidden size
        self.num_layers = 2                                      # number of LSTM layers


'''Recurrent Neural Network for Text Classification with Multi-Task Learning'''


class Model(nn.Cell):
    def __init__(self, config):
        super(Model, self).__init__()
        if config.embedding_pretrained is not None:
            # no from_pretrained() in MindSpore; load the weights via embedding_table
            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0],
                                          config.embedding_pretrained.shape[1],
                                          embedding_table=config.embedding_pretrained)
        else:
            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)
        self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers,
                            bidirectional=True, batch_first=True, dropout=config.dropout)
        self.fc = nn.Dense(config.hidden_size * 2, config.num_classes)

    def construct(self, x):
        x, _ = x
        out = self.embedding(x)       # [batch_size, seq_len, embed] = [128, 32, 300]
        out, _ = self.lstm(out)
        out = self.fc(out[:, -1, :])  # hidden state at the last time step
        return out
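
The readout above takes the hidden state at the final position; for right-padded inputs that position can be padding rather than the last real token, which is a known quirk of this architecture. A minimal sketch of the readout with illustrative sizes, not part of this commit:

import mindspore
import mindspore.nn as nn
import numpy as np

lstm = nn.LSTM(300, 128, 2, bidirectional=True, batch_first=True, dropout=0.5)
x = mindspore.Tensor(np.random.randn(8, 32, 300).astype('float32'))
out, _ = lstm(x)      # [8, 32, 256]
last = out[:, -1, :]  # readout at the final time step
print(last.shape)     # expected: (8, 256)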
