Revise documentation
hankcs committed Oct 26, 2021
1 parent 69a660e commit f3321ed
Showing 10 changed files with 131 additions and 69 deletions.
12 changes: 8 additions & 4 deletions README.md
@@ -112,11 +112,15 @@ The result is guaranteed to be `96.70` as the random seed is fixed. Different fr
If you use HanLP in your research, please cite this repository.

```latex
-@software{hanlp2,
-author = {Han He},
-title = {{HanLP: Han Language Processing}},
+@inproceedings{he-choi-2019,
+title = {Establishing Strong Baselines for the New Decade: Sequence Tagging, Syntactic and Semantic Parsing with BERT},
+author = {Han He and Jinho Choi},
+booktitle = {The Thirty-Third International Flairs Conference},
+conference = {Florida Artificial Intelligence Research Society Conference},
year = {2020},
-url = {https://github.com/hankcs/HanLP},
+keywords = {part-of-speech tagging, syntactic parsing, semantic parsing, Transformer, BERT},
+abstract = {This paper presents new state-of-the-art models for three tasks, part-of-speech tagging, syntactic parsing, and semantic parsing, using the cutting-edge contextualized embedding framework known as BERT. For each task, we first replicate and simplify the current state-of-the-art approach to enhance its model efficiency. We then evaluate our simplified approaches on those three tasks using token embeddings generated by BERT. 12 datasets in both English and Chinese are used for our experiments. The BERT models outperform the previously best-performing models by 2.5\% on average (7.5\% for the most significant case). All models and source codes are available in public so that researchers can improve upon and utilize them to establish strong baselines for the next decade.},
+url = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438}
}
```

99 changes: 52 additions & 47 deletions docs/references.bib

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions docs/tutorial.md
@@ -23,6 +23,14 @@ production environments. In this tutorial, we'll walk through the APIs in HanLP
HanLP offers an out-of-the-box RESTful API and a native Python API, which share very similar interfaces
but are designed for different scenarios.

+```{code-cell} ipython3
+:tags: [remove_cell]
+import hanlp_common.constant
+hanlp_common.constant.IPYTHON = False  # Avoid pretty_print emitting HTML, which doesn't play well with this theme
+```

## RESTful API

The RESTful API is an endpoint where you send your documents and get the parsed annotations back.
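To make the contrast above concrete, here is a minimal sketch of both interfaces. The endpoint URL, auth key, and the pretrained tokenizer identifier are illustrative assumptions, not part of this commit:

```python
# Sketch only: the endpoint URL, auth key and model identifier below are placeholders.
import hanlp
from hanlp_restful import HanLPClient

# RESTful API: the document is sent to a server, which returns the annotations.
HanLP = HanLPClient('https://www.hanlp.com/api', auth='YOUR_AUTH_KEY', language='zh')
print(HanLP.parse('商品和服务。'))

# Native Python API: a pretrained model is downloaded once and then runs locally.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
print(tok('商品和服务。'))
```
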
2 changes: 1 addition & 1 deletion hanlp/components/mtl/tasks/ner/tag_ner.py
@@ -64,7 +64,7 @@ def __init__(self,
r"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
NER task. It can utilize whitelist gazetteers which is dict mapping from entity name to entity type.
During decoding, it performs longest-prefix-matching of these words to override the prediction from
underlining statistical model. It also uses a blacklist to mask out mis-predicted entities.
underlying statistical model. It also uses a blacklist to mask out mis-predicted entities.
.. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can
do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.
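The longest-prefix-matching behaviour described in this docstring can be illustrated with a small self-contained sketch. The gazetteer, tokens, and helper below are made up for illustration and are not HanLP's internal code:

```python
# Illustrative sketch of longest-prefix-matching over a whitelist gazetteer,
# as described in the docstring above; not HanLP's actual implementation.
gazetteer = {'北京': 'LOCATION', '北京立方庭': 'LOCATION', '自然语义科技公司': 'ORGANIZATION'}

def longest_prefix_match(tokens, start):
    """Return (end, entity_type) of the longest gazetteer entry starting at `start`, or None."""
    best, surface = None, ''
    for end in range(start, len(tokens)):
        surface += tokens[end]
        if surface in gazetteer:
            best = (end + 1, gazetteer[surface])  # keep extending to prefer the longest match
    return best

tokens = ['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司']
i = 0
while i < len(tokens):
    match = longest_prefix_match(tokens, i)
    if match:  # override whatever the statistical model predicted for this span
        end, entity_type = match
        print(''.join(tokens[i:end]), entity_type)  # 北京立方庭 LOCATION / 自然语义科技公司 ORGANIZATION
        i = end
    else:
        i += 1
```
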
3 changes: 2 additions & 1 deletion hanlp/components/mtl/tasks/sdp.py
@@ -49,7 +49,8 @@ def __init__(self,
use_pos=False,
**kwargs) -> None:
r"""Implementation of "Stanford's graph-based neural dependency parser at
-the conll 2017 shared task" (:cite:`dozat2017stanford`).
+the conll 2017 shared task" (:cite:`dozat2017stanford`) and "Establishing Strong Baselines for the New Decade"
+(:cite:`he-choi-2019`).
Args:
trn: Path to training set.
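For orientation, the cited parser scores every (dependent, head) pair with a biaffine transformation over MLP-reduced encoder states. Below is a schematic PyTorch sketch of such an arc scorer; the dimensions and names are illustrative, not the code in this module:

```python
# Schematic biaffine arc scorer in the spirit of Dozat et al. (2017);
# a sketch for orientation only, not the implementation in this module.
import torch
import torch.nn as nn

class BiaffineArcScorer(nn.Module):
    def __init__(self, hidden_size=400, arc_size=500):
        super().__init__()
        self.head_mlp = nn.Sequential(nn.Linear(hidden_size, arc_size), nn.ReLU())
        self.dep_mlp = nn.Sequential(nn.Linear(hidden_size, arc_size), nn.ReLU())
        self.U = nn.Parameter(torch.zeros(arc_size + 1, arc_size))  # extra row acts as a head bias

    def forward(self, h):                        # h: [batch, seq_len, hidden_size] encoder states
        head = self.head_mlp(h)                  # [batch, seq_len, arc_size]
        dep = self.dep_mlp(h)                    # [batch, seq_len, arc_size]
        dep = torch.cat([dep, torch.ones_like(dep[..., :1])], dim=-1)  # append constant 1
        # score[b, i, j] = dep_i^T U head_j, one score per (dependent i, head j) pair
        return dep @ self.U @ head.transpose(-1, -2)

scores = BiaffineArcScorer()(torch.randn(2, 10, 400))
print(scores.shape)  # torch.Size([2, 10, 10])
```
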
2 changes: 1 addition & 1 deletion hanlp/components/ner/transformer_ner.py
@@ -23,7 +23,7 @@ def __init__(self, **kwargs) -> None:
(:cite:`lafferty2001conditional`) layer for
NER task. It can utilize whitelist gazetteers, which are dicts mapping from entity name to entity type.
During decoding, it performs longest-prefix-matching of these words to override the prediction from the
-underlining statistical model. It also uses a blacklist to mask out mis-predicted entities.
+underlying statistical model. It also uses a blacklist to mask out mis-predicted entities.
.. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understanding what a dictionary
can and can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.
5 changes: 3 additions & 2 deletions hanlp/components/parsers/biaffine/biaffine_sdp.py
@@ -19,8 +19,9 @@

class BiaffineSemanticDependencyParser(BiaffineDependencyParser):
def __init__(self) -> None:
"""Implementation of "Stanford's graph-based neural dependency parser at
the conll 2017 shared task" (:cite:`dozat2017stanford`).
r"""Implementation of "Stanford's graph-based neural dependency parser at
the conll 2017 shared task" (:cite:`dozat2017stanford`) and "Establishing Strong Baselines for the New Decade"
(:cite:`he-choi-2019`).
"""
super().__init__()

10 changes: 5 additions & 5 deletions hanlp/pretrained/sdp.py
@@ -4,15 +4,15 @@
from hanlp_common.constant import HANLP_URL

SEMEVAL16_NEWS_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-news-biaffine_20191231_235407.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval16 news data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 news data.'
SEMEVAL16_TEXT_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-text-biaffine_20200101_002257.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval16 text data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text data.'

SEMEVAL15_PAS_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_pas_20200103_152405.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval15 PAS data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PAS data.'
SEMEVAL15_PSD_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_psd_20200106_123009.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval15 PSD data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PSD data.'
SEMEVAL15_DM_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_dm_20200106_122808.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval15 DM data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 DM data.'

ALL = {}
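
These identifiers are typically consumed through ``hanlp.load``; a short sketch follows. Downloading the model requires network access, and the tagged input format shown here is an assumption that depends on how the model was trained:

```python
# Sketch: loading one of the pretrained SDP identifiers defined above.
# The (word, POS) input format is an assumption; it depends on the model's training config.
import hanlp
from hanlp.pretrained.sdp import SEMEVAL16_NEWS_BIAFFINE_ZH

sdp = hanlp.load(SEMEVAL16_NEWS_BIAFFINE_ZH)  # downloaded and cached on first use
print(sdp([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]))
```
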
48 changes: 43 additions & 5 deletions plugins/hanlp_restful/hanlp_restful/__init__.py
@@ -154,7 +154,15 @@ def text_style_transfer(self, text: Union[str, List[str]], target_style: str, la
HanLP.text_style_transfer(['国家对中石油抱有很大的期望.', '要用创新去推动高质量的发展。'],
target_style='gov_doc')
-# Output: ['国家对中石油寄予厚望。', '要以创新驱动高质量发展。']
+# Output:
+[
+    '国家对中石油寄予厚望。',
+    '要以创新驱动高质量发展。'
+]
+HanLP.text_style_transfer('我看到了窗户外面有白色的云和绿色的森林', target_style='modern_poetry')
+# Output:
+'我看见窗外的白云绿林'
Returns:
Text or a list of texts in the target style.
@@ -180,7 +188,12 @@ def semantic_textual_similarity(self, text: Union[Tuple[str, str], List[Tuple[st
('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),
('北京到上海的动车票', '上海到北京的动车票'),
])
-# Output: [0.9764469861984253, 0.0, 0.003458738327026367]
+# Output:
+[
+    0.9764469861984253,   # Similarity of ('看图猜一电影名', '看图猜电影')
+    0.0,                  # Similarity of ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用')
+    0.003458738327026367  # Similarity of ('北京到上海的动车票', '上海到北京的动车票')
+]
Returns:
Similarities.
@@ -219,7 +232,7 @@ def coreference_resolution(self, text: Optional[str] = None, tokens: Optional[Li
[['她的猫', 4, 7], ['它', 11, 12]]],  # refers to the cat of the speaker's elder sister
.. image:: https://file.hankcs.com/img/coref_demo_small.png
-:alt: my-picture1
+:alt: Coreference resolution visualization
Returns:
When ``text`` is specified, return the clusters and tokens. Otherwise, return just the clusters; in this case, you need to ``sum(tokens, [])`` in order to match the span indices with the tokens.
@@ -230,12 +243,37 @@ def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None) -> List[List[str]]:
return response

def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None) -> List[List[str]]:
""" Split a document into sentences and tokenize them.
""" Split a document into sentences and tokenize them. Note that it is always faster to tokenize a whole
document than to tokenize each sentence one by one. So avoid calling this method sentence by sentence but put
sentences into a ``list`` and pass them to the ``text`` argument.
Args:
-text: A document (str), or a list of sentences (List[str]).
+text: A document (``str``), or a list of sentences (``List[str]``).
coarse: Whether to perform coarse-grained or fine-grained tokenization.
+Examples::
+# Avoid tokenizing sentence by sentence; it is expensive:
+HanLP.tokenize('商品和服务。')
+[['商品', '和', '服务', '。']]
+HanLP.tokenize('阿婆主来到北京立方庭参观自然语义科技公司')
+[['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]
+# Instead, the following is much faster:
+HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司')
+[['商品', '和', '服务', '。'],
+['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]
+# To tokenize with the coarse-grained standard:
+HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True)
+[['商品', '和', '服务', '。'],
+['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司']]
+# To tokenize pre-segmented sentences:
+HanLP.tokenize(['商品和服务。', '当下雨天地面积水分外严重'])
+[['商品', '和', '服务', '。'],
+['当', '下雨天', '地面', '积水', '分', '外', '严重']]
Returns:
A list of tokenized sentences.
"""
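The ``sum(tokens, [])`` remark in the coreference docstring above is easy to miss; here is a tiny sketch of how flattening the sentence-level tokens lines up with the cluster span indices. The tokenization and offsets below follow the docstring's own example and are illustrative:

```python
# Flattening sentence-level tokens so coreference span indices can be resolved,
# as the coreference_resolution docstring suggests. Values are illustrative.
tokens = [['我', '姐', '送', '我', '她', '的', '猫', '。'],
          ['我', '很', '喜欢', '它', '。']]
flat = sum(tokens, [])                        # one flat token list across sentences
cluster = [['她的猫', 4, 7], ['它', 11, 12]]  # mention text with [start, end) offsets into `flat`
for mention, start, end in cluster:
    print(mention, '->', ''.join(flat[start:end]))  # both mentions refer to the same cat
```
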
11 changes: 8 additions & 3 deletions plugins/hanlp_restful/tests/test_client.py
@@ -31,13 +31,18 @@ def test_sents_mul(self):
doc = self.HanLP.parse(text, language='mul')

def test_tokenize(self):
-print(self.HanLP.tokenize('阿婆主来到北京立方庭参观自然语义科技公司'))
-print(self.HanLP.tokenize('阿婆主来到北京立方庭参观自然语义科技公司', coarse=True))
-print(self.HanLP.tokenize(['商品和服务', '当下雨天地面积水分外严重']))
+print(self.HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司'))
+print(self.HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True))
+print(self.HanLP.tokenize(['商品和服务', '当下雨天地面积水分外严重']))

def test_coreference_resolution(self):
print(self.HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。'))

def test_text_style_transfer(self):
print(self.HanLP.text_style_transfer('国家对中石油抱有很大的期望.', target_style='gov_doc'))
print(self.HanLP.text_style_transfer('打工人,打工魂,打工都是人上人', target_style='gov_doc'))
print(self.HanLP.text_style_transfer('我看到了窗户外面有白色的云和绿色的森林', target_style='modern_poetry'))


if __name__ == '__main__':
unittest.main()
