Skip to content

Commit

Permalink
Extend nmod and flat chains to the right + unit tests
Browse files Browse the repository at this point in the history
  • Loading branch information
aajanki committed Jul 24, 2023
1 parent fb883fb commit 31fe5bf
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 8 deletions.
2 changes: 1 addition & 1 deletion Changelog
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Finnish language model for spaCy

<unreleased>

* The noun chunker now also includes "flat" dependencies: e.g. "maaliskuun 7. päivänä"
* The noun chunker includes chains of flats and nmods: e.g. "maaliskuun 7. päivänä"

Version 0.13.0, 2023-07-21

Expand Down
14 changes: 9 additions & 5 deletions fi/fi.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,13 @@ def potential_np_head(word):
word.dep in np_deps or word.head.pos == PRON
)

def extend_right(word):
    """Return the token index where the rightward extension of ``word`` ends.

    Starting from ``word``, repeatedly step to the last right-hand
    dependent whose ``dep`` is a member of ``extend_deps`` (a name taken
    from the enclosing scope). The walk stops at the first token that has
    no such dependent, and that token's ``i`` is returned. If ``word``
    itself has no qualifying right dependent, the result is ``word.i``.
    """
    tail = word
    while True:
        # Among the qualifying right dependents only the last one matters:
        # each earlier candidate's result would be overwritten by the next.
        followers = [child for child in tail.rights if child.dep in extend_deps]
        if not followers:
            return tail.i
        tail = followers[-1]

doc = doclike.doc # Ensure works on both Doc and Span.
if not doc.has_annotation("DEP"):
raise ValueError(Errors.E029)
Expand Down Expand Up @@ -471,12 +478,9 @@ def potential_np_head(word):
if lbracket <= prev_end:
continue

rbracket = word.i
# Try to extend the span to the right to capture
# appositions and noun modifiers
for rdep in word.rights:
if rdep.dep in extend_deps:
rbracket = rdep.i
# noun phrase extensions
rbracket = extend_right(word)
prev_end = rbracket

yield lbracket, rbracket + 1, np_label
Expand Down
25 changes: 23 additions & 2 deletions tests/unit/test_noun_chunks.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import pytest
from spacy.lang.fi import Finnish
from fi import FinnishExtended
from util import get_doc_from_text

fi_nlp = Finnish()
fi_nlp = FinnishExtended()
fi_tokenizer = fi_nlp.tokenizer


Expand Down Expand Up @@ -140,6 +140,27 @@
[5, -1, 3, 2, 1, 0],
['Lain', 'varhaiskasvatus', 'suunnitelmallista toimintaa'],
),
(
'Asiasta päätettiin maaliskuun 7. päivänä tehdyllä sopimuksella',
['NOUN', 'VERB', 'NOUN', 'ADJ', 'NOUN', 'VERB', 'NOUN'],
['obl', 'ROOT', 'obl', 'flat', 'flat', 'acl', 'obl'],
[1, 0, 3, -1, -2, 1, -5],
['Asiasta', 'maaliskuun 7. päivänä', 'sopimuksella'],
),
(
'Tutkija tuli Helsingin yliopiston fysiikan laitokselta',
['NOUN', 'VERB', 'PROPN', 'NOUN', 'NOUN', 'NOUN'],
['nsubj', 'ROOT', 'nmod:poss', 'nmod:poss', 'nmod:poss', 'obl'],
[1, 0, 1, 2, 1, -4],
['Tutkija', 'Helsingin yliopiston fysiikan laitokselta'],
),
(
'Rakenteluun tarvitsee osia optoerottimista veneen moottoreihin',
['NOUN', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'NOUN'],
['obl', 'ROOT', 'obj', 'nmod', 'nmod:poss', 'nmod'],
[1, 0, -1, -1, 1, -2],
['Rakenteluun', 'osia optoerottimista veneen moottoreihin'],
),
]

FI_NP_TEST_XFAIL_EXAMPLES = [
Expand Down

0 comments on commit 31fe5bf

Please sign in to comment.