Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
HawkingC authored Jun 4, 2023
1 parent c8ca936 commit 1cd0773
Show file tree
Hide file tree
Showing 11 changed files with 10,947 additions and 0 deletions.
10 changes: 10 additions & 0 deletions THUCNews/data/class.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
finance
realty
stocks
education
science
society
politics
sports
game
entertainment
10,000 changes: 10,000 additions & 0 deletions THUCNews/data/dev.txt

Large diffs are not rendered by default.

Binary file added THUCNews/data/embedding_SougouNews.npz
Binary file not shown.
Binary file added THUCNews/data/embedding_Tencent.npz
Binary file not shown.
134 changes: 134 additions & 0 deletions models/DPCNN.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#coding: UTF-8\n",
"import mindspore\n",
"import mindspore.nn as nn\n",
"import mindspore.ops as ops\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"class Config(object):\n",
"    \"\"\"Configuration for the DPCNN text classifier.\n",
"\n",
"    Args:\n",
"        dataset: root directory that contains the data/ and saved_dict/ folders.\n",
"        embedding: filename of a pretrained-embedding .npz under data/, or 'random'.\n",
"    \"\"\"\n",
"    def __init__(self, dataset, embedding):\n",
"        self.model_name = 'DPCNN'\n",
"        self.train_path = dataset + '/data/train.txt'                          # training set\n",
"        self.dev_path = dataset + '/data/dev.txt'                              # validation set\n",
"        self.test_path = dataset + '/data/test.txt'                            # test set\n",
"        # BUG FIX: close the class-list file instead of leaking the handle.\n",
"        with open(dataset + '/data/class.txt', encoding='utf-8') as f:\n",
"            self.class_list = [x.strip() for x in f]                           # class names\n",
"        self.vocab_path = dataset + '/data/vocab.pkl'                          # vocabulary\n",
"        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint path\n",
"        self.log_path = dataset + '/log/' + self.model_name\n",
"        self.embedding_pretrained = mindspore.Tensor(\n",
"            np.load(dataset + '/data/' + embedding)[\"embeddings\"].astype('float32'))\\\n",
"            if embedding != 'random' else None                                 # pretrained embeddings\n",
"\n",
"        self.dropout = 0.5                                                     # dropout rate\n",
"        self.require_improvement = 1000                                        # early-stop after this many batches without improvement\n",
"        self.num_classes = len(self.class_list)                                # number of classes\n",
"        self.n_vocab = 0                                                       # vocab size, assigned at runtime\n",
"        self.num_epochs = 20                                                   # number of epochs\n",
"        self.batch_size = 128                                                  # mini-batch size\n",
"        self.pad_size = 32                                                     # pad/truncate every sentence to this length\n",
"        self.learning_rate = 1e-3                                              # learning rate\n",
"        # BUG FIX: MindSpore Tensor.size is an int property (total element\n",
"        # count), not a callable; read the embedding width from .shape[1].\n",
"        self.embed = self.embedding_pretrained.shape[1]\\\n",
"            if self.embedding_pretrained is not None else 300                  # embedding dimension\n",
"        self.num_filters = 250                                                 # number of conv filters (channels)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"'''Deep Pyramid Convolutional Neural Networks for Text Categorization'''\n",
"\n",
"\n",
"class Model(nn.Cell):\n",
"    \"\"\"DPCNN: a region-embedding convolution followed by repeated\n",
"    stride-2 downsampling blocks with additive shortcut connections.\"\"\"\n",
"    def __init__(self, config):\n",
"        super(Model, self).__init__()\n",
"        if config.embedding_pretrained is not None:\n",
"            # BUG FIX: MindSpore nn.Embedding has no from_pretrained();\n",
"            # load the pretrained weights via the embedding_table argument.\n",
"            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0],\n",
"                                          config.embedding_pretrained.shape[1],\n",
"                                          embedding_table=config.embedding_pretrained)\n",
"        else:\n",
"            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)\n",
"        # BUG FIX: MindSpore Conv2d defaults to pad_mode='same'; 'valid' is\n",
"        # required for the seq_len-3+1 shape arithmetic the manual padding\n",
"        # below relies on.\n",
"        self.conv_region = nn.Conv2d(1, config.num_filters, (3, config.embed), stride=1, pad_mode='valid', has_bias=True)\n",
"        self.conv = nn.Conv2d(config.num_filters, config.num_filters, (3, 1), stride=1, pad_mode='valid', has_bias=True)\n",
"        self.max_pool = nn.MaxPool2d(kernel_size=(3, 1), stride=2)\n",
"        self.padding1 = nn.ZeroPad2d((0, 0, 1, 1))  # pad top and bottom\n",
"        self.padding2 = nn.ZeroPad2d((0, 0, 0, 1))  # pad bottom only\n",
"        self.relu = nn.ReLU()\n",
"        self.fc = nn.Dense(config.num_filters, config.num_classes)\n",
"\n",
"    # NOTE(review): MindSpore Cells dispatch through construct(), not\n",
"    # forward() — confirm the training loop calls this method explicitly.\n",
"    def forward(self, x):\n",
"        x = x[0]                 # batch tuple -> token ids\n",
"        x = self.embedding(x)    # [batch_size, seq_len, embed]\n",
"        x = x.unsqueeze(1)       # [batch_size, 1, seq_len, embed]\n",
"        x = self.conv_region(x)  # [batch_size, 250, seq_len-3+1, 1]\n",
"\n",
"        x = self.padding1(x)     # [batch_size, 250, seq_len, 1]\n",
"        x = self.relu(x)\n",
"        x = self.conv(x)         # [batch_size, 250, seq_len-3+1, 1]\n",
"        x = self.padding1(x)     # [batch_size, 250, seq_len, 1]\n",
"        x = self.relu(x)\n",
"        x = self.conv(x)         # [batch_size, 250, seq_len-3+1, 1]\n",
"        # BUG FIX: MindSpore Tensor.size is an int property, not a callable;\n",
"        # read the sequence dimension from .shape.\n",
"        while x.shape[2] > 2:\n",
"            x = self._block(x)\n",
"        x = x.squeeze()          # [batch_size, num_filters(250)]\n",
"        x = self.fc(x)\n",
"        return x\n",
"\n",
"    def _block(self, x):\n",
"        # Halve the length (stride-2 max pool), then two convs plus shortcut.\n",
"        x = self.padding2(x)\n",
"        px = self.max_pool(x)\n",
"\n",
"        x = self.padding1(px)\n",
"        x = ops.relu(x)\n",
"        x = self.conv(x)\n",
"\n",
"        x = self.padding1(x)\n",
"        x = ops.relu(x)\n",
"        x = self.conv(x)\n",
"\n",
"        # Shortcut connection\n",
"        x = x + px\n",
"        return x"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "MindSpore",
"language": "python",
"name": "mindspore"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
114 changes: 114 additions & 0 deletions models/FastText.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"#coding: UTF-8\n",
"import mindspore\n",
"import mindspore.nn as nn\n",
"import mindspore.ops as ops\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"class Config(object):\n",
"    \"\"\"Configuration for the FastText text classifier.\n",
"\n",
"    Args:\n",
"        dataset: root directory that contains the data/ and saved_dict/ folders.\n",
"        embedding: filename of a pretrained-embedding .npz under data/, or 'random'.\n",
"    \"\"\"\n",
"    def __init__(self, dataset, embedding):\n",
"        self.model_name = 'FastText'\n",
"        self.train_path = dataset + '/data/train.txt'                          # training set\n",
"        self.dev_path = dataset + '/data/dev.txt'                              # validation set\n",
"        self.test_path = dataset + '/data/test.txt'                            # test set\n",
"        # BUG FIX: close the class-list file instead of leaking the handle.\n",
"        with open(dataset + '/data/class.txt', encoding='utf-8') as f:\n",
"            self.class_list = [x.strip() for x in f]                           # class names\n",
"        self.vocab_path = dataset + '/data/vocab.pkl'                          # vocabulary\n",
"        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint path\n",
"        self.log_path = dataset + '/log/' + self.model_name\n",
"        self.embedding_pretrained = mindspore.Tensor(\n",
"            np.load(dataset + '/data/' + embedding)[\"embeddings\"].astype('float32'))\\\n",
"            if embedding != 'random' else None                                 # pretrained embeddings\n",
"\n",
"        self.dropout = 0.5                                                     # dropout rate\n",
"        self.require_improvement = 1000                                        # early-stop after this many batches without improvement\n",
"        self.num_classes = len(self.class_list)                                # number of classes\n",
"        self.n_vocab = 0                                                       # vocab size, assigned at runtime\n",
"        self.num_epochs = 20                                                   # number of epochs\n",
"        self.batch_size = 128                                                  # mini-batch size\n",
"        self.pad_size = 32                                                     # pad/truncate every sentence to this length\n",
"        self.learning_rate = 1e-3                                              # learning rate\n",
"        # BUG FIX: MindSpore Tensor.size is an int property (total element\n",
"        # count), not a callable; read the embedding width from .shape[1].\n",
"        self.embed = self.embedding_pretrained.shape[1]\\\n",
"            if self.embedding_pretrained is not None else 300                  # embedding dimension\n",
"        self.hidden_size = 256                                                 # hidden layer size\n",
"        self.n_gram_vocab = 250499  # n-gram hash bucket count"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"'''Bag of Tricks for Efficient Text Classification'''\n",
"\n",
"\n",
"class Model(nn.Cell):\n",
"    \"\"\"FastText: averages word, bigram and trigram embeddings and\n",
"    classifies with two dense layers.\"\"\"\n",
"    def __init__(self, config):\n",
"        super(Model, self).__init__()\n",
"        if config.embedding_pretrained is not None:\n",
"            # BUG FIX: MindSpore nn.Embedding has no from_pretrained();\n",
"            # load the pretrained weights via the embedding_table argument.\n",
"            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0],\n",
"                                          config.embedding_pretrained.shape[1],\n",
"                                          embedding_table=config.embedding_pretrained)\n",
"        else:\n",
"            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)\n",
"        self.embedding_ngram2 = nn.Embedding(config.n_gram_vocab, config.embed)\n",
"        self.embedding_ngram3 = nn.Embedding(config.n_gram_vocab, config.embed)\n",
"        # BUG FIX: pass the drop probability as p=; a positional argument is\n",
"        # read as keep_prob by MindSpore, inverting the dropout rate.\n",
"        self.dropout = nn.Dropout(p=config.dropout)\n",
"        self.fc1 = nn.Dense(config.embed * 3, config.hidden_size)\n",
"        self.fc2 = nn.Dense(config.hidden_size, config.num_classes)\n",
"\n",
"    # NOTE(review): MindSpore Cells dispatch through construct(), not\n",
"    # forward() — confirm the training loop calls this method explicitly.\n",
"    def forward(self, x):\n",
"        out_word = self.embedding(x[0])            # word embeddings\n",
"        out_bigram = self.embedding_ngram2(x[2])   # bigram hash embeddings\n",
"        out_trigram = self.embedding_ngram3(x[3])  # trigram hash embeddings\n",
"        out = ops.cat((out_word, out_bigram, out_trigram), -1)\n",
"\n",
"        # BUG FIX: MindSpore Tensor.mean takes axis=, not PyTorch's dim=.\n",
"        out = out.mean(axis=1)\n",
"        out = self.dropout(out)\n",
"        out = self.fc1(out)\n",
"        out = ops.relu(out)\n",
"        out = self.fc2(out)\n",
"        return out"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "MindSpore",
"language": "python",
"name": "mindspore"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
114 changes: 114 additions & 0 deletions models/TextCNN.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import mindspore\n",
"import mindspore.nn as nn\n",
"import mindspore.ops as ops\n",
"import numpy as np\n",
"from mindspore import Tensor, CSRTensor, COOTensor"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"class Config(object):\n",
"    \"\"\"Configuration for the TextCNN text classifier.\n",
"\n",
"    Args:\n",
"        dataset: root directory that contains the data/ and saved_dict/ folders.\n",
"        embedding: filename of a pretrained-embedding .npz under data/, or 'random'.\n",
"    \"\"\"\n",
"    def __init__(self, dataset, embedding):\n",
"        self.model_name = 'TextCNN'\n",
"        self.train_path = dataset + '/data/train.txt'                          # training set\n",
"        self.dev_path = dataset + '/data/dev.txt'                              # validation set\n",
"        self.test_path = dataset + '/data/test.txt'                            # test set\n",
"        # BUG FIX: close the class-list file instead of leaking the handle.\n",
"        with open(dataset + '/data/class.txt', encoding='utf-8') as f:\n",
"            self.class_list = [x.strip() for x in f]                           # class names\n",
"        self.vocab_path = dataset + '/data/vocab.pkl'                          # vocabulary\n",
"        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # checkpoint path\n",
"        self.log_path = dataset + '/log/' + self.model_name\n",
"        # BUG FIX: mindspore.Tensor has no attribute float32, so\n",
"        # astype(Tensor.float32) raises AttributeError; use the dtype string\n",
"        # 'float32', consistent with the DPCNN and FastText notebooks.\n",
"        self.embedding_pretrained = mindspore.Tensor(\n",
"            np.load(dataset + '/data/' + embedding)[\"embeddings\"].astype('float32'))\\\n",
"            if embedding != 'random' else None                                 # pretrained embeddings\n",
"\n",
"        self.dropout = 0.5                                                     # dropout rate\n",
"        self.require_improvement = 1000                                        # early-stop after this many batches without improvement\n",
"        self.num_classes = len(self.class_list)                                # number of classes\n",
"        self.n_vocab = 0                                                       # vocab size, assigned at runtime\n",
"        self.num_epochs = 20                                                   # number of epochs\n",
"        self.batch_size = 128                                                  # mini-batch size\n",
"        self.pad_size = 32                                                     # pad/truncate every sentence to this length\n",
"        self.learning_rate = 1e-3                                              # learning rate\n",
"        # BUG FIX: MindSpore Tensor.size is an int property (total element\n",
"        # count), not a callable; read the embedding width from .shape[1].\n",
"        self.embed = self.embedding_pretrained.shape[1]\\\n",
"            if self.embedding_pretrained is not None else 300                  # embedding dimension\n",
"        self.filter_sizes = (2, 3, 4)                                          # convolution kernel sizes\n",
"        self.num_filters = 256                                                 # number of conv filters (channels)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"'''Convolutional Neural Networks for Sentence Classification'''\n",
"\n",
"\n",
"class Model(nn.Cell):\n",
"    \"\"\"TextCNN: parallel convolutions of several kernel sizes over the\n",
"    embedded sentence, globally max-pooled and concatenated for the\n",
"    final classifier.\"\"\"\n",
"    def __init__(self, config):\n",
"        super(Model, self).__init__()\n",
"        if config.embedding_pretrained is not None:\n",
"            # BUG FIX: MindSpore nn.Embedding has no from_pretrained();\n",
"            # load the pretrained weights via the embedding_table argument.\n",
"            self.embedding = nn.Embedding(config.embedding_pretrained.shape[0],\n",
"                                          config.embedding_pretrained.shape[1],\n",
"                                          embedding_table=config.embedding_pretrained)\n",
"        else:\n",
"            self.embedding = nn.Embedding(config.n_vocab, config.embed, padding_idx=config.n_vocab - 1)\n",
"        # BUG FIX: has_bias belongs to each Conv2d, not to nn.CellList\n",
"        # (CellList rejects keyword arguments). pad_mode='valid' keeps the\n",
"        # conv output width at 1 so the squeeze on axis 3 below is legal.\n",
"        self.convs = nn.CellList(\n",
"            [nn.Conv2d(1, config.num_filters, (k, config.embed), pad_mode='valid', has_bias=True)\n",
"             for k in config.filter_sizes])\n",
"        # BUG FIX: pass the drop probability as p=; a positional argument is\n",
"        # read as keep_prob by MindSpore, inverting the dropout rate.\n",
"        self.dropout = nn.Dropout(p=config.dropout)\n",
"        self.fc = nn.Dense(config.num_filters * len(config.filter_sizes), config.num_classes)\n",
"\n",
"    def conv_and_pool(self, x, conv):\n",
"        x = ops.relu(conv(x))        # [batch, num_filters, seq_len-k+1, 1]\n",
"        x = ops.squeeze(x, axis=3)   # drop the width dimension of size 1\n",
"        # BUG FIX: global max pooling must reduce the length axis to 1 (the\n",
"        # PyTorch original used max_pool1d with kernel = full length).\n",
"        # adaptive_max_pool1d(x, x.size(2)) was both a no-op pool and a type\n",
"        # error: Tensor.size is an int property in MindSpore, not callable.\n",
"        x = ops.adaptive_max_pool1d(x, 1)\n",
"        x = ops.squeeze(x, axis=2)   # [batch, num_filters]\n",
"        return x\n",
"\n",
"    # NOTE(review): MindSpore Cells dispatch through construct(), not\n",
"    # forward() — confirm the training loop calls this method explicitly.\n",
"    def forward(self, x):\n",
"        out = self.embedding(x[0])   # [batch, seq_len, embed]\n",
"        out = out.unsqueeze(1)       # [batch, 1, seq_len, embed]\n",
"        out = ops.cat([self.conv_and_pool(out, conv) for conv in self.convs], 1)  # concat pooled features\n",
"        out = self.dropout(out)\n",
"        out = self.fc(out)\n",
"        return out\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "MindSpore",
"language": "python",
"name": "mindspore"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 1cd0773

Please sign in to comment.