
Commit

version 0.0.6
celtics1863 committed Feb 18, 2022
1 parent 5c8c782 commit d028387
Showing 40 changed files with 22,243 additions and 225 deletions.
2 changes: 1 addition & 1 deletion MANIFEST.in
@@ -1,2 +1,2 @@
include *.md
recursive-include src *.txt *.py word2vec* *.vocab
recursive-include src *.txt *.py word2vec* *.vocab *.json *.csv
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name="envtext",
version="0.0.5",
version="0.0.6",
author="Bi Huaibin",
author_email="[email protected]",
description="envtext for Chinese texts analysis in Environment domain",
73 changes: 67 additions & 6 deletions src/envtext.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: envtext
Version: 0.0.5
Version: 0.0.6
Summary: envtext for Chinese texts analysis in Environment domain
Home-page: https://github.com/celtics1863/envtext
Author: Bi Huaibin
@@ -39,7 +39,8 @@ Description-Content-Type: text/markdown
- Streamlined the interface of the neural-network models, keeping only essential parameters such as batch_size and learning_rate
- Further optimized the huggingface transformers input/output interface, supporting more than 20 dataset formats
- One-line model usage, letting domain experts focus on the analysis itself


5. :five: Built on the transformers interface, making it easy to define custom models

Planned next steps:
- [ ] Dataset support: support datasets exported from common **annotation tools**
@@ -114,15 +115,13 @@ model.save_result('result.csv')
```
#### 2.2 Using an RNN

The RNN initialization interface is not yet fully aligned with the Bert interface; alignment is planned, so stay tuned.
```python
from envtext.models import RNNCLS

model = RNNCLS()
model.load('本地pytorch_model.bin所在文件夹')
model = RNNCLS('本地pytorch_model.bin所在文件夹')

#run prediction
model('气候[Mask][Mask]是各国政府都关心的话题')
model('气候变化是各国政府都关心的话题')

#export the results
model.save_result('result.csv')
@@ -167,6 +166,68 @@ model = BertCLS('celtics1863/env-bert-chinese',config)
model.train(datasets)
```

### 4. Custom models


First, define a custom Bert model for a regression task
```python
from envtext.models.bert_base import BertBase
import torch
from transformers import BertPreTrainedModel,BertModel

class MyBert(BertPreTrainedModel):
def __init__(self, config):
super(MyBert, self).__init__(config)
self.bert = BertModel(config) #Bert backbone
self.regressor = torch.nn.Linear(config.hidden_size, 1) #regression head
self.loss = torch.nn.MSELoss() #loss function

def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
position_ids=None, inputs_embeds=None, head_mask=None):
outputs = self.bert(input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds)
#use the [CLS] token
cls_output = outputs[0][:,0,:]

# compute the prediction
logits = self.regressor(cls_output)

outputs = (logits,)

#keep the interface consistent with Bert: return (loss,logits) when labels are provided, otherwise (logits,)
if labels is not None:
loss = self.loss(logits.squeeze(),labels)
outputs = (loss,) + outputs
return outputs

```
Hook the model into the envtext interface

```python
class MyBertModel(BertBase):
#override the initialization function
def initialize_bert(self,path = None,config = None,**kwargs):
super().initialize_bert(path,config,**kwargs)
self.model = MyBert.from_pretrained(self.model_path)

#override the prediction function
def predict_per_sentence(self,text, print_result = True ,save_result = True):
tokens=self.tokenizer.encode(text, return_tensors='pt',add_special_tokens=True).to(self.model.device) #tokenize
logits = self.model(tokens)[0] #get the logits

if print_result:
#print the result
print(logits[0].clone().detach().cpu())

if save_result:
#save the result
self.result[text] = logits[0].clone().detach().cpu()
```
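
With the interface hooked up, the custom model can be used much like the built-in ones. A minimal usage sketch, assuming `MyBertModel` accepts a pretrained-model path the same way `BertCLS` does (the model path and example sentence below are placeholders):

```python
#hypothetical usage sketch; the model path and input sentence are placeholders
model = MyBertModel('celtics1863/env-bert-chinese')

#run a single prediction; this prints the score and caches it in model.result
model.predict_per_sentence('气候变化是各国政府都关心的话题')

#inspect the cached results
print(model.result)
```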


For a more detailed tutorial, see our example [jupyter notebooks]('jupyter_notebooks')

3 changes: 3 additions & 0 deletions src/envtext.egg-info/SOURCES.txt
@@ -18,6 +18,9 @@ src/envtext/files/bert_vocab.txt
src/envtext/files/env_vocab.jieba.txt
src/envtext/files/onehot_vocab.txt
src/envtext/files/word2vec64
src/envtext/files/datasets/CLS_IsClimate.json
src/envtext/files/datasets/CLUENER.json
src/envtext/files/datasets/SA_Intensity.json
src/envtext/models/__init__.py
src/envtext/models/bert_base.py
src/envtext/models/bert_cls.py
Binary file modified src/envtext/data/__pycache__/load_dataset_base.cpython-39.pyc
Binary file modified src/envtext/data/__pycache__/utils.cpython-39.pyc
32 changes: 15 additions & 17 deletions src/envtext/data/load_dataset_base.py
@@ -15,7 +15,10 @@ def has_split(js):

@staticmethod
def generate_datasets(js,task,train,valid,test,text,label,sep = ' ' ,label_as_key = False):
task = _unify_task(task)
# task = _unify_task(task)
if task == 'CLUENER':
return generate_cluener_datasets(js,train,valid,test,text,label)

if not label_as_key:
if task == 'CLS':
return generate_cls_datasets(js,train,valid,test,text,label)
@@ -26,7 +29,7 @@ def generate_datasets(js,task,train,valid,test,text,label,sep = ' ' ,label_as_ke
elif task == 'KW':
return generate_keyword_datasets(js,train,valid,test,text,label,sep)
elif task == 'NER':
return generate_cluener_ner_datasets(js,train,valid,test,text,label)
return generate_keyword_datasets(js,train,valid,test,text,label)
else:
raise NotImplemented
else:
@@ -39,18 +42,6 @@ def generate_datasets(js,task,train,valid,test,text,label,sep = ' ' ,label_as_ke
else:
raise NotImplemented

#normalize the task name
def _unify_task(task):
if task.lower() in [0,'cls','classification','classify']:
return 'CLS'
elif task.lower() in [1,'reg','regression','regressor','sa','sentitive analysis']:
return 'REG'
elif task.lower() in [2,'ner','namely entity recognition']:
return 'NER'
elif task.lower() in [2,'key','kw','key word','keyword','keywords','key words']:
return 'KW'
elif task.lower() in [3,'mcls','multi-class','multiclass','multiclasses']:
return 'MCLS'

#convert labels to numeric one-hot labels
def convert_label2onehot(ids,labels):
@@ -328,7 +319,7 @@ def align_keyword_label(text,words):

return datasets,config

def generate_cluener_ner_datasets(js,train,valid,test,text,label):
def generate_cluener_datasets(js,train,valid,test,text,label):
#datasets
datasets = { 'train':defaultdict(list),'valid':defaultdict(list),'test':defaultdict(list)}
mapping = { train:'train', valid:'valid', test:'test'}
@@ -392,7 +383,10 @@ def align_keyword_label(text,annos):

#dataset config
config = {
'labels':labels,
'labels':list(label2id.keys()),
'entities':labels,
'num_labels':len(label2id),
'num_entities':len(labels),
'label2id':label2id,
'id2label':id2label,
'counter':counter,
@@ -476,4 +470,8 @@ def align_keyword_label(text,annos):
'Avg. Length':AvgL
}

return datasets,config
return datasets,config


def generate_ner_datasets(js,train,valid,test,text,label):
pass
65 changes: 51 additions & 14 deletions src/envtext/data/utils.py
@@ -1,5 +1,8 @@
from .load_dataset import *
from collections import defaultdict
from ..files import FileConfig


def sampler_dataset(dataset, p = 0.5):
import random
sampled_dataset = {'train':defaultdict(list),'valid':defaultdict(list),'test':defaultdict(list)}
@@ -13,7 +16,23 @@ def sampler_dataset(dataset, p = 0.5):
return sampled_dataset


def load_dataset(path,task,format = None,split=0.5,label_as_key = False,
#normalize the task name
def _unify_task(task):
if task.lower() in [0,'cls','classification','classify']:
return 'CLS'
elif task.lower() in [1,'reg','regression','regressor','sa','sentitive analysis']:
return 'REG'
elif task.lower() in [2,'ner','namely entity recognition']:
return 'NER'
elif task.lower() in [2,'key','kw','key word','keyword','keywords','key words']:
return 'KW'
elif task.lower() in [3,'mcls','multi-class','multiclass','multiclasses','mc','multi-choice','multichoice']:
return 'MCLS'
elif task.lower() in [4,'cluener','clue_ner','clue ner']:
return 'CLUENER'


def load_dataset(path,task = None,format = None , sampler = 1 ,split=0.5,label_as_key = False,
sep = ' ',dataset = 'dataset',train = 'train',valid = 'valid' ,test = 'test', text = 'text', label = 'label'):
'''
A general-purpose interface for loading datasets, able to handle a variety of inputs.
@@ -27,7 +46,7 @@ def load_dataset(path,task,format = None,split=0.5,label_as_key = False,
{'train':{'label_1':[], 'label_2':[], ... },...}
{'label_1':[], 'label_2':[], ... } randomly split into training and validation sets using the split ratio
format = 'json_L'
format = 'jsonL'
Suitable for json-line data of the following form:
{'text': text_1, 'label':label_1, 'dataset':'train'}
{'text': text_2, 'label':label_2, 'dataset':'train'}
@@ -100,31 +119,49 @@ def load_dataset(path,task,format = None,split=0.5,label_as_key = False,
'''
# kwargs = {'task':task,'split':split,'sep':sep,'dataset':dataset,'train':train,'valid':valid,'test':test,'label':label}
config = FileConfig()

if path.lower() in config.datasets_names:
info = config.datasets_info[config.datasets_names[path.lower()]]
task = info['task']
format = info['format']
path = info['path']

task = _unify_task(task)

if task == 'CLUENER':
format = 'jsonL'

if format is None:
if path.split('.')[-1] == 'json':
try:
return LoadJson.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadJson.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
except:
return LoadJson2.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadJson2.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
elif path.split('.')[-1] == 'csv':
return LoadExcel.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadExcel.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
else:
try:
return LoadText.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadText.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
except:
return LoadText2.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadText2.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)

elif format == 'json' and not label_as_key:
return LoadJson.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadJson.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
elif format == 'json2' or (format == 'json' and label_as_key):
return LoadJson2.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadJson2.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
elif format == 'jsonL':
return LoadJsonL.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadJsonL.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
elif format == 'text':
return LoadText.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadText.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
elif format == 'text2':
return LoadText2.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadText2.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
elif format == 'excel':
return LoadExcel.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
datasets,config = LoadExcel.load_dataset(path,task,split,sep,dataset,train,valid,test,text,label)
else:
raise NotImplemented
raise NotImplemented

if sampler and 0 < sampler < 1:
datasets = sampler_dataset(datasets,sampler)

return datasets,config
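
For reference, a minimal sketch of how the reworked loader could be called; the import path follows the file layout above, and the alias and file name used here are assumptions rather than part of this commit:

```python
#hypothetical usage sketch; import path, alias and file name are assumptions
from envtext.data.utils import load_dataset

#'cluener' is a built-in alias registered in FileConfig.datasets_names,
#so its path, task and format are resolved automatically
datasets, config = load_dataset('cluener')

#a local json file with an explicit task; sampler=0.5 keeps half of the data
datasets, config = load_dataset('my_dataset.json', task='cls', sampler=0.5)

print(list(datasets.keys()), list(config.keys()))
```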
56 changes: 56 additions & 0 deletions src/envtext/files/__init__.py
@@ -3,6 +3,10 @@
bert_vocab = './bert_vocab.txt'
word2vec64 = './word2vec64'
word2vec256 = './word2vec256'
datasets_dir = './datasets'
sa_intensity = './datasets/SA_Intensity.json'
cls_isclimate = './datasets/CLS_IsClimate.json'
cluener = './datasets/CLUENER.json'

import os

@@ -33,6 +37,58 @@ def word2vec64(self):
def word2vec256(self):
return self.get_abs_path(word2vec256)

@property
def datasets(self):
datasets = []
for file in os.listdir(os.path.join(basedir,datasets_dir)):
datasets.append(os.path.join(basedir,datasets_dir,file))
return datasets

@property
def SA_Intensity(self):
return {
'path':self.get_abs_path(sa_intensity),
'name':['SA','sa','sa_intensity','reg','regression'],
'task':'sentitive analysis',
'format':'json2'
}

@property
def CLS_IsClimate(self):
return {
'path':self.get_abs_path(cls_isclimate),
'name':['cls','classification','isclimate','cls_isclimate'],
'task':'classification',
'format':'json2'
}


@property
def CLUENER(self):
return {
'path':self.get_abs_path(cluener),
'name':['ner','namely entity recognition','clue ner','cluener'],
'task':'cluener',
'format':'jsonL'
}

@property
def datasets_info(self):
info = {
'sa_intensity':self.SA_Intensity,
'cls_isclimate':self.CLS_IsClimate,
'cluener':self.CLUENER
}
return info

@property
def datasets_names(self):
info = self.datasets_info
names = {}
for k,v in info.items():
for name in v['name']:
names[name] = k
return names

def get_word2vec_path(self,vector_size = 64):
if vector_size == 256:
Binary file modified src/envtext/files/__pycache__/__init__.cpython-39.pyc