diff --git a/Changelog b/Changelog
index 963f9d2..93e6606 100644
--- a/Changelog
+++ b/Changelog
@@ -3,6 +3,8 @@ Finnish language model for spaCy
 
 * The noun chunker includes chains of flats and nmods: e.g. "maaliskuun
   7. päivänä"
+* The parser no longer tries to detect the nsubj:outer, dislocated and
+  goeswith dependencies. There is not enough training data to learn them.
 
 Version 0.13.0, 2023-07-21
 
diff --git a/docs/tags.md b/docs/tags.md
index 142e0ea..70117ed 100644
--- a/docs/tags.md
+++ b/docs/tags.md
@@ -52,12 +52,10 @@ These are the possible values for `token.dep`:
 | det | [determiner](https://universaldependencies.org/fi/dep/det.html) |
 | dep | [unspecified dependency](https://universaldependencies.org/u/dep/dep.html) |
 | discourse | [discourse element](https://universaldependencies.org/fi/dep/discourse.html) |
-| dislocated | [dislocated elements](https://universaldependencies.org/u/dep/dislocated.html) |
 | fixed | [fixed multi-word expression](https://universaldependencies.org/fi/dep/fixed.html) |
 | flat | [flat phrase without a clear head](https://universaldependencies.org/fi/dep/flat.html) |
 | flat:foreign | [foreign words](https://universaldependencies.org/u/dep/flat-foreign.html) |
 | flat:name | [names](https://universaldependencies.org/u/dep/flat-name.html) |
-| goeswith | [relation that links two parts of a compound word that are erroneously separated](https://universaldependencies.org/fi/dep/goeswith.html) |
 | mark | [subordinating conjunction, complementizer, or comparative conjunction](https://universaldependencies.org/fi/dep/mark.html) |
 | nmod | [nominal modifier](https://universaldependencies.org/fi/dep/nmod.html) |
 | nmod:gobj | [genitive object](https://universaldependencies.org/fi/dep/nmod-gobj.html) |
@@ -65,7 +63,6 @@ These are the possible values for `token.dep`:
 | nmod:poss | [genitive modifier](https://universaldependencies.org/fi/dep/nmod-poss.html) |
 | nsubj | [nominal subject](https://universaldependencies.org/fi/dep/nsubj.html) |
 | nsubj:cop | [nominal copular subject](https://universaldependencies.org/fi/dep/nsubj-cop.html) |
-| nsubj:outer | [outer clause nominal subject](https://universaldependencies.org/u/dep/nsubj-outer.html) |
 | nummod | [numeric modifier](https://universaldependencies.org/fi/dep/nummod.html) |
 | obj | [direct object](https://universaldependencies.org/fi/dep/obj.html) |
 | obl | [oblique nominal](https://universaldependencies.org/u/dep/obl.html) |
diff --git a/project.yml b/project.yml
index 3ec74cf..5661dc5 100644
--- a/project.yml
+++ b/project.yml
@@ -142,7 +142,7 @@ commands:
     help: "Convert the data to spaCy's format"
     script:
       - "mkdir -p corpus/${vars.treebank}/preprocessed corpus/${vars.treebank}/spacy"
-      - "python tools/preprocess_UD-TDT.py assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/preprocessed/${vars.train_name}.conllu"
+      - "python tools/preprocess_UD-TDT.py --trainset assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/preprocessed/${vars.train_name}.conllu"
       - "python -m spacy convert corpus/${vars.treebank}/preprocessed/${vars.train_name}.conllu corpus/${vars.treebank}/spacy --n-sents 6"
       - "mv corpus/${vars.treebank}/spacy/${vars.train_name}.spacy corpus/${vars.treebank}/spacy/train.spacy"
       - "python tools/preprocess_UD-TDT.py assets/${vars.treebank}/${vars.dev_name}.conllu corpus/${vars.treebank}/preprocessed/${vars.dev_name}.conllu"
diff --git a/tools/preprocess_UD-TDT.py b/tools/preprocess_UD-TDT.py
index 3b6fe05..58e450a 100644
--- a/tools/preprocess_UD-TDT.py
+++ b/tools/preprocess_UD-TDT.py
@@ -3,10 +3,19 @@ import typer
 from pathlib import Path
 
+TOKEN_ID = 0
+ORTH = 1
+LEMMA = 2
+UPOS = 3
+XPOS = 4
+DEPREL = 7
+DEPS = 8
+MISC = 9
 
 
 def main(
     input_file: Path = typer.Argument(..., help='Input file'),
-    output_file: Path = typer.Argument(..., help='Output file')
+    output_file: Path = typer.Argument(..., help='Output file'),
+    trainset: bool = typer.Option(False, help='Extra preprocessing for the training set'),
 ):
     with open(input_file) as inf, open(output_file, 'w') as outf:
         for line in inf:
@@ -14,13 +23,8 @@ def main(
                 outf.write(line)
             else:
                 columns = line.rstrip('\n').split('\t')
-                token_id = columns[0]
-                orth = columns[1]
-                lemma = columns[2]
-                upos = columns[3]
-                xpos = columns[4]
 
-                if '-' in token_id:
+                if '-' in columns[TOKEN_ID]:
                     # Skip multiword tokens.
                     #
                     # Assert that the UD input has undefined data on
@@ -28,22 +32,40 @@ def main(
                     assert all(x == '_' for x in columns[2:])
                     continue
 
-                columns[2] = fix_compund_word_lemmas(orth, lemma)
+                columns[LEMMA] = fix_compund_word_lemmas(columns[ORTH], columns[LEMMA])
 
                 # error in the data?
-                if xpos == 'Adj':
-                    columns[4] = 'A'
+                if columns[XPOS] == 'Adj':
+                    columns[XPOS] = 'A'
 
                 # The fine-grained tags are sometimes more coarse than
                 # the "coarse-grained" tags
-                if upos == 'SCONJ':
-                    columns[4] = 'SC'
-                elif upos == 'PROPN':
-                    columns[4] = 'Propn'
-                elif upos == 'AUX':
-                    columns[4] = 'Aux'
-
-                columns[9] = 'O'
+                if columns[UPOS] == 'SCONJ':
+                    columns[XPOS] = 'SC'
+                elif columns[UPOS] == 'PROPN':
+                    columns[XPOS] = 'Propn'
+                elif columns[UPOS] == 'AUX':
+                    columns[XPOS] = 'Aux'
+
+                columns[MISC] = 'O'
+
+                if trainset:
+                    # The training data contains too few nsubj:outer,
+                    # dislocated and goeswith dependencies to learn
+                    # them reliably. Replace them in the training set
+                    # to avoid wasting model capacity on them.
+                    # nsubj:outer is replaced with nsubj:cop, as it
+                    # was annotated in UD-Finnish-TDT before October
+                    # 2022. The others become the generic dep.
+                    if columns[DEPREL] == 'nsubj:outer':
+                        columns[DEPREL] = 'nsubj:cop'
+                        columns[DEPS] = columns[DEPS].replace('nsubj:outer', 'nsubj:cop')
+                    elif columns[DEPREL] == 'dislocated':
+                        columns[DEPREL] = 'dep'
+                        columns[DEPS] = columns[DEPS].replace('dislocated', 'dep')
+                    elif columns[DEPREL] == 'goeswith':
+                        columns[DEPREL] = 'dep'
+                        columns[DEPS] = columns[DEPS].replace('goeswith', 'dep')
 
                 outf.write('\t'.join(columns))
                 outf.write('\n')
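
For illustration, here is a minimal standalone sketch of what the new --trainset pass does to a single CoNLL-U token line. The sample token line is made up for this example (it is not taken from UD-Finnish-TDT); the column indices mirror the DEPREL and DEPS constants introduced in the diff above.

    # Sketch of the --trainset relabeling; the input line is a
    # made-up example of a token attached with the rare 'dislocated'
    # relation.
    DEPREL = 7   # CoNLL-U column 8: dependency relation
    DEPS = 8     # CoNLL-U column 9: enhanced dependency graph

    line = '4\tkirja\tkirja\tNOUN\tN\tCase=Nom|Number=Sing\t2\tdislocated\t2:dislocated\t_'

    columns = line.split('\t')
    if columns[DEPREL] == 'dislocated':
        columns[DEPREL] = 'dep'
        # The same substring replacement keeps the basic and enhanced
        # annotation layers consistent.
        columns[DEPS] = columns[DEPS].replace('dislocated', 'dep')

    print('\t'.join(columns))
    # -> 4  kirja  kirja  NOUN  N  Case=Nom|Number=Sing  2  dep  2:dep  _

Note that str.replace operates on the whole DEPS string, so every occurrence of the relation in the enhanced dependency graph (e.g. a multi-head value like '2:dislocated|5:dislocated') is rewritten, not just the first one.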