fixed version

ZikF220 · Apr 26, 2020 · aff88ea · aff88ea
commit aff88ea
Show file tree

Hide file tree

Showing 20 changed files with 907 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,121 @@
+# project
+*.npz
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# PyCharm
+.idea/
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2019 Dasheng Ji
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
diff --git a/README.md b/README.md
@@ -0,0 +1,96 @@
+A PyTorch implementation of the BI-LSTM-CRF model.
+
+# Features:
+- Compared with the [PyTorch BI-LSTM-CRF tutorial][1], the following improvements have been made:
+    - Full support for mini-batch computation
+    - Fully vectorized implementation. In particular, all loops in the "score sentence" algorithm are removed, which dramatically improves training performance
+    - CUDA supported
+    - Very simple APIs for [CRF module](#CRF)
+        - START/STOP tags are automatically added in CRF
+        - An inner Linear layer is included, which transforms from feature space to tag space
+- Specialized for NLP sequence tagging tasks
+- Easy to train your own sequence tagging models
+- MIT License
+
+# Installation
+- dependencies
+    - Python 3
+    - [PyTorch][5]
+- install
+    ```sh
+    $ pip install bi-lstm-crf
+    ```
+
+# Training
+### corpus
+- prepare your corpus in the specified [structure and format][2]
+- there is also a sample corpus in [`bi_lstm_crf/app/sample_corpus`][3]
+
+### training
+```sh
+$ python -m bi_lstm_crf corpus_dir --model_dir "model_xxx"
+```
+- more [options][4]
+- [detail of model_dir][7]
+
+### training curve
+```python
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# the training losses are saved in the model_dir
+df = pd.read_csv(".../model_dir/loss.csv")
+df[["train_loss", "val_loss"]].ffill().plot(grid=True)
+plt.show()
+```
+
+# Prediction
+```python
+from bi_lstm_crf.app import WordsTagger
+
+model = WordsTagger(model_dir="xxx")
+tags, sequences = model(["市领导到成都..."])  # CHAR-based model
+print(tags)  
+# [["B", "B", "I", "B", "B-LOC", "I-LOC", "I-LOC", "I-LOC", "I-LOC", "B", "I", "B", "I"]]
+print(sequences)
+# [['市', '领导', '到', ('成都', 'LOC'), ...]]
+
+# model([["市", "领导", "到", "成都", ...]])  # WORD-based model
+```
+
+# <a id="CRF"></a>CRF Module
+The CRF module can be easily embedded into other models:
+```python
+from bi_lstm_crf import CRF
+
+# a BERT-CRF model for sequence tagging
+class BertCrf(nn.Module):
+    def __init__(self, ...):
+        ...
+        self.bert = BERT(...)
+        self.crf = CRF(in_features, num_tags)
+
+    def loss(self, xs, tags):
+        features, = self.bert(xs)
+        masks = xs.gt(0)
+        loss = self.crf.loss(features, tags, masks)
+        return loss
+
+    def forward(self, xs):
+        features, = self.bert(xs)
+        masks = xs.gt(0)
+        scores, tag_seq = self.crf(features, masks)
+        return scores, tag_seq
+```
+
+# References
+1. [Zhiheng Huang, Wei Xu, and Kai Yu. 2015. Bidirectional LSTM-CRF Models for Sequence Tagging][6]. arXiv:1508.01991.
+2. PyTorch tutorial [ADVANCED: MAKING DYNAMIC DECISIONS AND THE BI-LSTM CRF][1]
+
+[1]:https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
+[2]:https://github.com/jidasheng/bi-lstm-crf/wiki/corpus-structure-and-format
+[3]:https://github.com/jidasheng/bi-lstm-crf/tree/master/bi_lstm_crf/app/sample_corpus
+[4]:https://github.com/jidasheng/bi-lstm-crf/wiki/training-options
+[5]:https://pytorch.org/
+[6]:https://arxiv.org/abs/1508.01991
+[7]:https://github.com/jidasheng/bi-lstm-crf/wiki/details-of-model_dir
diff --git a/bi_lstm_crf/__init__.py b/bi_lstm_crf/__init__.py
@@ -0,0 +1,4 @@
+from .model import CRF, BiRnnCrf
+
+__version__ = '0.2.0'
+__license__ = 'MIT'
diff --git a/bi_lstm_crf/__main__.py b/bi_lstm_crf/__main__.py
@@ -0,0 +1,3 @@
+from .app.train import main
+
+main()
diff --git a/bi_lstm_crf/app/__init__.py b/bi_lstm_crf/app/__init__.py
@@ -0,0 +1,2 @@
+from .predict import WordsTagger
+from .train import train
diff --git a/bi_lstm_crf/app/predict.py b/bi_lstm_crf/app/predict.py
@@ -0,0 +1,84 @@
+import argparse
+import numpy as np
+from bi_lstm_crf.app.preprocessing import *
+from bi_lstm_crf.app.utils import *
+
+
+class WordsTagger:
+    """Load a trained model from ``model_dir`` and tag sentences with it."""
+
+    def __init__(self, model_dir, device=None):
+        """Restore the saved arguments, preprocessor and model found in ``model_dir``.
+
+        :param model_dir: directory holding the saved arguments and model files
+        :param device: forwarded to ``running_device``; defaults to ``None``
+        """
+        args_ = load_json_file(arguments_filepath(model_dir))
+        args = argparse.Namespace(**args_)
+        args.model_dir = model_dir
+        self.args = args
+
+        self.preprocessor = Preprocessor(config_dir=model_dir, verbose=False)
+        self.model = build_model(self.args, self.preprocessor, load=True, verbose=False)
+        self.device = running_device(device)
+        self.model.to(self.device)
+
+        # inference only: switch the model to evaluation mode
+        self.model.eval()
+
+    def __call__(self, sentences, begin_tags="BS"):
+        """Predict the tags of a batch of sentences.
+
+        :param sentences: a list (or tuple) of sentences
+        :param begin_tags: first characters of the tags that mark the beginning of a span
+        :return: ``(tags, tokens)`` - the decoded tags and the tokens built from them
+        :raises ValueError: if ``sentences`` is not a list or tuple
+        :raises RuntimeError: re-raised (after being printed) when prediction fails
+        """
+        if not isinstance(sentences, (list, tuple)):
+            raise ValueError("sentences must be a list of sentence")
+
+        try:
+            sent_tensor = np.asarray([self.preprocessor.sent_to_vector(s) for s in sentences])
+            sent_tensor = torch.from_numpy(sent_tensor).to(self.device)
+            with torch.no_grad():
+                _, tags = self.model(sent_tensor)
+            tags = self.preprocessor.decode_tags(tags)
+        except RuntimeError as e:
+            print("*** runtime error: {}".format(e))
+            raise  # bare raise keeps the original traceback
+        return tags, self.tokens_from_tags(sentences, tags, begin_tags=begin_tags)
+
+    @staticmethod
+    def tokens_from_tags(sentences, tags_list, begin_tags):
+        """Group each sentence into tokens according to its tags.
+
+        A tag whose first character is in ``begin_tags`` starts a new token; the text
+        after the two-character prefix (``"B-LOC"`` -> ``"LOC"``) becomes its label.
+        A run of consecutive ``"O"`` tags is merged into one unlabeled token.
+
+        :param sentences: a list of sentences (anything sliceable, e.g. ``str`` or ``list``)
+        :param tags_list: one list of tag strings per sentence
+        :param begin_tags: first characters of the tags that start a new span
+        :return: per sentence, a list of plain tokens and ``(token, label)`` pairs
+        """
+        if not tags_list:
+            return []
+
+        starters = begin_tags + "O"
+
+        def _tokens(sentence, ts):
+            if not ts:
+                return []
+            # (position, label) of every span start; "B"/"O" give the empty label
+            begins = [(pos, t[2:]) for pos, t in enumerate(ts) if t and t[0] in starters]
+            # merge each run of "O" tags: only the first "O" of a run opens a span.
+            # The check must use the tag position, not the index inside ``begins``.
+            begins = [(pos, label) for pos, label in begins
+                      if ts[pos] != "O" or pos == 0 or ts[pos - 1] != "O"]
+            if not begins or begins[0][0] != 0:
+                print('warning: tags does not begin with any of {}: \n{}\n{}'.format(begin_tags, sentence, ts))
+                begins.insert(0, (0, ""))
+            # sentinel closing the last span; appended after filtering so it is never dropped
+            begins.append((len(ts), ""))
+
+            tokens_ = [(sentence[s:e], label) for (s, label), (e, _) in zip(begins[:-1], begins[1:])]
+            return [((t, label) if label else t) for t, label in tokens_]
+
+        return [_tokens(sentence, ts) for sentence, ts in zip(sentences, tags_list)]
+
+
+def main():
+    """Command-line entry point: tag one sentence with a saved model and print the result."""
+    cli = argparse.ArgumentParser()
+    cli.add_argument("sentence", type=str, help="the sentence to be predicted")
+    cli.add_argument('--model_dir', type=str, required=True, help="the model directory for model files")
+    cli.add_argument('--device', type=str, default=None,
+                     help='the training device: "cuda:0", "cpu:0". It will be auto-detected by default')
+    options = cli.parse_args()
+
+    tagger = WordsTagger(options.model_dir, options.device)
+    outputs = tagger([options.sentence])
+    print(options.sentence)
+    # the tagger returns a (tags, tokens) pair; index 0 selects the single input sentence
+    for per_sentence in outputs:
+        print(json.dumps(per_sentence[0], ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bi_lstm_crf/app/preprocessing/__init__.py b/bi_lstm_crf/app/preprocessing/__init__.py
@@ -0,0 +1,2 @@
+from .utils import *
+from .preprocess import Preprocessor
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .predict import WordsTagger
		from .train import train
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .utils import *
		from .preprocess import Preprocessor