Revise documentation
hankcs committed Oct 26, 2021
1 parent 69a660e commit f3321ed
Showing 10 changed files with 131 additions and 69 deletions.
12 changes: 8 additions & 4 deletions README.md
@@ -112,11 +112,15 @@ The result is guaranteed to be `96.70` as the random seed is fixed. Different fr
If you use HanLP in your research, please cite this repository.

```latex
-@software{hanlp2,
-author = {Han He},
-title = {{HanLP: Han Language Processing}},
+@inproceedings{he-choi-2019,
+title = {Establishing Strong Baselines for the New Decade: Sequence Tagging, Syntactic and Semantic Parsing with BERT},
+author = {Han He and Jinho Choi},
+booktitle = {The Thirty-Third International Flairs Conference},
+conference = {Florida Artificial Intelligence Research Society Conference},
year = {2020},
-url = {https://github.com/hankcs/HanLP},
+keywords = {part-of-speech tagging, syntactic parsing, semantic parsing, Transformer, BERT},
+abstract = {This paper presents new state-of-the-art models for three tasks, part-of-speech tagging, syntactic parsing, and semantic parsing, using the cutting-edge contextualized embedding framework known as BERT. For each task, we first replicate and simplify the current state-of-the-art approach to enhance its model efficiency. We then evaluate our simplified approaches on those three tasks using token embeddings generated by BERT. 12 datasets in both English and Chinese are used for our experiments. The BERT models outperform the previously best-performing models by 2.5\% on average (7.5\% for the most significant case). All models and source codes are available in public so that researchers can improve upon and utilize them to establish strong baselines for the next decade.},
+url = {https://www.aaai.org/ocs/index.php/FLAIRS/FLAIRS20/paper/view/18438}
}
```

99 changes: 52 additions & 47 deletions docs/references.bib

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions docs/tutorial.md
@@ -23,6 +23,14 @@ production environments. In this tutorial, we'll walk through the APIs in HanLP
HanLP offers an out-of-the-box RESTful API and a native Python API, which share very similar interfaces
but are designed for different scenarios.

+```{code-cell} ipython3
+:tags: [remove_cell]
+import hanlp_common.constant
+hanlp_common.constant.IPYTHON = False  # Avoid pretty_print emitting HTML, which doesn't play well with this theme
+```

## RESTful API

The RESTful API is an endpoint where you send your documents and get the parsed annotations back.
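To make the contrast above concrete, here is a minimal sketch of both interfaces. The endpoint URL, auth key, and the pretrained tokenizer identifier are illustrative assumptions, not part of this commit:

```python
# Sketch only: the endpoint URL, auth key and model identifier below are placeholders.
import hanlp
from hanlp_restful import HanLPClient

# RESTful API: the document is sent to a server, which returns the annotations.
HanLP = HanLPClient('https://www.hanlp.com/api', auth='YOUR_AUTH_KEY', language='zh')
print(HanLP.parse('商品和服务。'))

# Native Python API: a pretrained model is downloaded once and then runs locally.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
print(tok('商品和服务。'))
```
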
2 changes: 1 addition & 1 deletion hanlp/components/mtl/tasks/ner/tag_ner.py
@@ -64,7 +64,7 @@ def __init__(self,
r"""A simple tagger using a linear layer with an optional CRF (:cite:`lafferty2001conditional`) layer for
NER task. It can utilize whitelist gazetteers which is dict mapping from entity name to entity type.
During decoding, it performs longest-prefix-matching of these words to override the prediction from
underlining statistical model. It also uses a blacklist to mask out mis-predicted entities.
underlying statistical model. It also uses a blacklist to mask out mis-predicted entities.
.. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understand what dictionary can
do and what it can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.
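The longest-prefix-matching behaviour described in this docstring can be illustrated with a small self-contained sketch. The gazetteer, tokens, and helper below are made up for illustration and are not HanLP's internal code:

```python
# Illustrative sketch of longest-prefix-matching over a whitelist gazetteer,
# as described in the docstring above; not HanLP's actual implementation.
gazetteer = {'北京': 'LOCATION', '北京立方庭': 'LOCATION', '自然语义科技公司': 'ORGANIZATION'}

def longest_prefix_match(tokens, start):
    """Return (end, entity_type) of the longest gazetteer entry starting at `start`, or None."""
    best, surface = None, ''
    for end in range(start, len(tokens)):
        surface += tokens[end]
        if surface in gazetteer:
            best = (end + 1, gazetteer[surface])  # keep extending to prefer the longest match
    return best

tokens = ['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司']
i = 0
while i < len(tokens):
    match = longest_prefix_match(tokens, i)
    if match:  # override whatever the statistical model predicted for this span
        end, entity_type = match
        print(''.join(tokens[i:end]), entity_type)  # 北京立方庭 LOCATION / 自然语义科技公司 ORGANIZATION
        i = end
    else:
        i += 1
```
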
3 changes: 2 additions & 1 deletion hanlp/components/mtl/tasks/sdp.py
@@ -49,7 +49,8 @@ def __init__(self,
use_pos=False,
**kwargs) -> None:
r"""Implementation of "Stanford's graph-based neural dependency parser at
-the conll 2017 shared task" (:cite:`dozat2017stanford`).
+the conll 2017 shared task" (:cite:`dozat2017stanford`) and "Establishing Strong Baselines for the New Decade"
+(:cite:`he-choi-2019`).
Args:
trn: Path to training set.
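For orientation, the cited parser scores every (dependent, head) pair with a biaffine transformation over MLP-reduced encoder states. Below is a schematic PyTorch sketch of such an arc scorer; the dimensions and names are illustrative, not the code in this module:

```python
# Schematic biaffine arc scorer in the spirit of Dozat et al. (2017);
# a sketch for orientation only, not the implementation in this module.
import torch
import torch.nn as nn

class BiaffineArcScorer(nn.Module):
    def __init__(self, hidden_size=400, arc_size=500):
        super().__init__()
        self.head_mlp = nn.Sequential(nn.Linear(hidden_size, arc_size), nn.ReLU())
        self.dep_mlp = nn.Sequential(nn.Linear(hidden_size, arc_size), nn.ReLU())
        self.U = nn.Parameter(torch.zeros(arc_size + 1, arc_size))  # extra row acts as a head bias

    def forward(self, h):                        # h: [batch, seq_len, hidden_size] encoder states
        head = self.head_mlp(h)                  # [batch, seq_len, arc_size]
        dep = self.dep_mlp(h)                    # [batch, seq_len, arc_size]
        dep = torch.cat([dep, torch.ones_like(dep[..., :1])], dim=-1)  # append constant 1
        # score[b, i, j] = dep_i^T U head_j, one score per (dependent i, head j) pair
        return dep @ self.U @ head.transpose(-1, -2)

scores = BiaffineArcScorer()(torch.randn(2, 10, 400))
print(scores.shape)  # torch.Size([2, 10, 10])
```
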
2 changes: 1 addition & 1 deletion hanlp/components/ner/transformer_ner.py
@@ -23,7 +23,7 @@ def __init__(self, **kwargs) -> None:
(:cite:`lafferty2001conditional`) layer for
NER task. It can utilize whitelist gazetteers, which are dicts mapping from entity name to entity type.
During decoding, it performs longest-prefix-matching of these words to override the prediction from the
-underlining statistical model. It also uses a blacklist to mask out mis-predicted entities.
+underlying statistical model. It also uses a blacklist to mask out mis-predicted entities.
.. Note:: For algorithm beginners, longest-prefix-matching is the prerequisite to understanding what a dictionary
can and can't do. The tutorial in `this book <http://nlp.hankcs.com/book.php>`_ can be very helpful.
5 changes: 3 additions & 2 deletions hanlp/components/parsers/biaffine/biaffine_sdp.py
@@ -19,8 +19,9 @@

class BiaffineSemanticDependencyParser(BiaffineDependencyParser):
def __init__(self) -> None:
"""Implementation of "Stanford's graph-based neural dependency parser at
the conll 2017 shared task" (:cite:`dozat2017stanford`).
r"""Implementation of "Stanford's graph-based neural dependency parser at
the conll 2017 shared task" (:cite:`dozat2017stanford`) and "Establishing Strong Baselines for the New Decade"
(:cite:`he-choi-2019`).
"""
super().__init__()

10 changes: 5 additions & 5 deletions hanlp/pretrained/sdp.py
@@ -4,15 +4,15 @@
from hanlp_common.constant import HANLP_URL

SEMEVAL16_NEWS_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-news-biaffine_20191231_235407.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval16 news data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 news data.'
SEMEVAL16_TEXT_BIAFFINE_ZH = HANLP_URL + 'sdp/semeval16-text-biaffine_20200101_002257.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval16 text data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval16 text data.'

SEMEVAL15_PAS_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_pas_20200103_152405.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval15 PAS data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PAS data.'
SEMEVAL15_PSD_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_psd_20200106_123009.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval15 PSD data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 PSD data.'
SEMEVAL15_DM_BIAFFINE_EN = HANLP_URL + 'sdp/semeval15_biaffine_dm_20200106_122808.zip'
-'Biaffine SDP (:cite:`bertbaseline`) trained on SemEval15 DM data.'
+'Biaffine SDP (:cite:`he-choi-2019`) trained on SemEval15 DM data.'

ALL = {}
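
These identifiers are typically consumed through ``hanlp.load``; a short sketch follows. Downloading the model requires network access, and the tagged input format shown here is an assumption that depends on how the model was trained:

```python
# Sketch: loading one of the pretrained SDP identifiers defined above.
# The (word, POS) input format is an assumption; it depends on the model's training config.
import hanlp
from hanlp.pretrained.sdp import SEMEVAL16_NEWS_BIAFFINE_ZH

sdp = hanlp.load(SEMEVAL16_NEWS_BIAFFINE_ZH)  # downloaded and cached on first use
print(sdp([('蜡烛', 'NN'), ('两', 'CD'), ('头', 'NN'), ('烧', 'VV')]))
```
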
48 changes: 43 additions & 5 deletions plugins/hanlp_restful/hanlp_restful/__init__.py
@@ -154,7 +154,15 @@ def text_style_transfer(self, text: Union[str, List[str]], target_style: str, la
HanLP.text_style_transfer(['国家对中石油抱有很大的期望.', '要用创新去推动高质量的发展。'],
target_style='gov_doc')
-# Output: ['国家对中石油寄予厚望。', '要以创新驱动高质量发展。']
+# Output:
+[
+    '国家对中石油寄予厚望。',
+    '要以创新驱动高质量发展。'
+]
+HanLP.text_style_transfer('我看到了窗户外面有白色的云和绿色的森林', target_style='modern_poetry')
+# Output:
+'我看见窗外的白云绿林'
Returns:
Text or a list of texts in the target style.
@@ -180,7 +188,12 @@ def semantic_textual_similarity(self, text: Union[Tuple[str, str], List[Tuple[st
('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用'),
('北京到上海的动车票', '上海到北京的动车票'),
])
-# Output: [0.9764469861984253, 0.0, 0.003458738327026367]
+# Output:
+[
+    0.9764469861984253,   # Similarity of ('看图猜一电影名', '看图猜电影')
+    0.0,                  # Similarity of ('无线路由器怎么无线上网', '无线上网卡和无线路由器怎么用')
+    0.003458738327026367  # Similarity of ('北京到上海的动车票', '上海到北京的动车票')
+]
Returns:
Similarities.
@@ -219,7 +232,7 @@ def coreference_resolution(self, text: Optional[str] = None, tokens: Optional[Li
[['她的猫', 4, 7], ['它', 11, 12]]],  # refers to the cat of the speaker's elder sister
.. image:: https://file.hankcs.com/img/coref_demo_small.png
-:alt: my-picture1
+:alt: Coreference resolution visualization
Returns:
When ``text`` is specified, return the clusters and tokens. Otherwise, return just the clusters; in this case, you need to ``sum(tokens, [])`` in order to match the span indices with the tokens.
@@ -230,12 +243,37 @@ def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None) -> List[List[str]]:
return response

def tokenize(self, text: Union[str, List[str]], coarse: Optional[bool] = None) -> List[List[str]]:
""" Split a document into sentences and tokenize them.
""" Split a document into sentences and tokenize them. Note that it is always faster to tokenize a whole
document than to tokenize each sentence one by one. So avoid calling this method sentence by sentence but put
sentences into a ``list`` and pass them to the ``text`` argument.
Args:
-text: A document (str), or a list of sentences (List[str]).
+text: A document (``str``), or a list of sentences (``List[str]``).
coarse: Whether to perform coarse-grained or fine-grained tokenization.
+Examples::
+# Avoid tokenizing sentence by sentence; it is expensive:
+HanLP.tokenize('商品和服务。')
+[['商品', '和', '服务', '。']]
+HanLP.tokenize('阿婆主来到北京立方庭参观自然语义科技公司')
+[['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]
+# Instead, the following is much faster:
+HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司')
+[['商品', '和', '服务', '。'],
+['阿婆主', '来到', '北京', '立方庭', '参观', '自然', '语义', '科技', '公司']]
+# To tokenize with the coarse-grained standard:
+HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True)
+[['商品', '和', '服务', '。'],
+['阿婆主', '来到', '北京', '立方庭', '参观', '自然语义科技公司']]
+# To tokenize pre-segmented sentences:
+HanLP.tokenize(['商品和服务。', '当下雨天地面积水分外严重'])
+[['商品', '和', '服务', '。'],
+['当', '下雨天', '地面', '积水', '分', '外', '严重']]
Returns:
A list of tokenized sentences.
"""
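The ``sum(tokens, [])`` remark in the coreference docstring above is easy to miss; here is a tiny sketch of how flattening the sentence-level tokens lines up with the cluster span indices. The tokenization and offsets below follow the docstring's own example and are illustrative:

```python
# Flattening sentence-level tokens so coreference span indices can be resolved,
# as the coreference_resolution docstring suggests. Values are illustrative.
tokens = [['我', '姐', '送', '我', '她', '的', '猫', '。'],
          ['我', '很', '喜欢', '它', '。']]
flat = sum(tokens, [])                        # one flat token list across sentences
cluster = [['她的猫', 4, 7], ['它', 11, 12]]  # mention text with [start, end) offsets into `flat`
for mention, start, end in cluster:
    print(mention, '->', ''.join(flat[start:end]))  # both mentions refer to the same cat
```
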
11 changes: 8 additions & 3 deletions plugins/hanlp_restful/tests/test_client.py
@@ -31,13 +31,18 @@ def test_sents_mul(self):
doc = self.HanLP.parse(text, language='mul')

def test_tokenize(self):
-print(self.HanLP.tokenize('阿婆主来到北京立方庭参观自然语义科技公司'))
-print(self.HanLP.tokenize('阿婆主来到北京立方庭参观自然语义科技公司', coarse=True))
-print(self.HanLP.tokenize(['商品和服务', '当下雨天地面积水分外严重']))
+print(self.HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司'))
+print(self.HanLP.tokenize('商品和服务。阿婆主来到北京立方庭参观自然语义科技公司', coarse=True))
+print(self.HanLP.tokenize(['商品和服务', '当下雨天地面积水分外严重']))

def test_coreference_resolution(self):
print(self.HanLP.coreference_resolution('我姐送我她的猫。我很喜欢它。'))

def test_text_style_transfer(self):
print(self.HanLP.text_style_transfer('国家对中石油抱有很大的期望.', target_style='gov_doc'))
print(self.HanLP.text_style_transfer('打工人,打工魂,打工都是人上人', target_style='gov_doc'))
print(self.HanLP.text_style_transfer('我看到了窗户外面有白色的云和绿色的森林', target_style='modern_poetry'))


if __name__ == '__main__':
unittest.main()
