Remove infrequent tags from the training data: nsubj:outer, dislocated, goeswith

There is no sense in spending model capacity on trying to learn these uncommon dependencies.
aajanki · Jul 25, 2023 · f1264e7 · f1264e7
1 parent 31fe5bf
commit f1264e7
Showing 4 changed files with 43 additions and 22 deletions.
diff --git a/Changelog b/Changelog
@@ -3,6 +3,8 @@ Finnish language model for spaCy
 <unreleased>
 
 * The noun chunker includes chains of flats and nmods: e.g. "maaliskuun 7. päivänä"
+* The parser doesn't try to detect nsubj:outer, dislocated and goeswith
+  dependencies anymore. There's not enough training data to learn those.
 
 Version 0.13.0, 2023-07-21
 

diff --git a/docs/tags.md b/docs/tags.md
@@ -52,20 +52,17 @@ These are the possible values for `token.dep`:
 | det          | [determiner](https://universaldependencies.org/fi/dep/det.html)                                                                             |
 | dep          | [unspecified dependency](https://universaldependencies.org/u/dep/dep.html)                                                                  |
 | discourse    | [discourse element](https://universaldependencies.org/fi/dep/discourse.html)                                                                |
-| dislocated   | [dislocated elements](https://universaldependencies.org/u/dep/dislocated.html)                                                              |
 | fixed        | [fixed multi-word expression](https://universaldependencies.org/fi/dep/fixed.html)                                                          |
 | flat         | [flat phrase without a clear head](https://universaldependencies.org/fi/dep/flat.html)                                                      |
 | flat:foreign | [foreign words](https://universaldependencies.org/u/dep/flat-foreign.html)                                                                  |
 | flat:name    | [names](https://universaldependencies.org/u/dep/flat-name.html)                                                                             |
-| goeswith     | [relation that links two parts of a compound word that are erroneously separated](https://universaldependencies.org/fi/dep/goeswith.html)   |
 | mark         | [subordinating conjunction, complementizer, or comparative conjunction](https://universaldependencies.org/fi/dep/mark.html)                 |
 | nmod         | [nominal modifier](https://universaldependencies.org/fi/dep/nmod.html)                                                                      |
 | nmod:gobj    | [genitive object](https://universaldependencies.org/fi/dep/nmod-gobj.html)                                                                  |
 | nmod:gsubj   | [genitive subject](https://universaldependencies.org/fi/dep/nmod-gsubj.html)                                                                |
 | nmod:poss    | [genitive modifier](https://universaldependencies.org/fi/dep/nmod-poss.html)                                                                |
 | nsubj        | [nominal subject](https://universaldependencies.org/fi/dep/nsubj.html)                                                                      |
 | nsubj:cop    | [nominal copular subject](https://universaldependencies.org/fi/dep/nsubj-cop.html)                                                          |
-| nsubj:outer  | [outer clause nominal subject](https://universaldependencies.org/u/dep/nsubj-outer.html)                                                    |
 | nummod       | [numeric modifier](https://universaldependencies.org/fi/dep/nummod.html)                                                                    |
 | obj          | [direct object](https://universaldependencies.org/fi/dep/obj.html)                                                                          |
 | obl          | [oblique nominal](https://universaldependencies.org/u/dep/obl.html)                                                                         |

diff --git a/project.yml b/project.yml
@@ -142,7 +142,7 @@ commands:
     help: "Convert the data to spaCy's format"
     script:
       - "mkdir -p corpus/${vars.treebank}/preprocessed corpus/${vars.treebank}/spacy"
-      - "python tools/preprocess_UD-TDT.py assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/preprocessed/${vars.train_name}.conllu"
+      - "python tools/preprocess_UD-TDT.py --trainset assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/preprocessed/${vars.train_name}.conllu"
       - "python -m spacy convert corpus/${vars.treebank}/preprocessed/${vars.train_name}.conllu corpus/${vars.treebank}/spacy --n-sents 6"
       - "mv corpus/${vars.treebank}/spacy/${vars.train_name}.spacy corpus/${vars.treebank}/spacy/train.spacy"
       - "python tools/preprocess_UD-TDT.py assets/${vars.treebank}/${vars.dev_name}.conllu corpus/${vars.treebank}/preprocessed/${vars.dev_name}.conllu"

diff --git a/tools/preprocess_UD-TDT.py b/tools/preprocess_UD-TDT.py
@@ -3,47 +3,69 @@
 import typer
 from pathlib import Path
 
+TOKEN_ID = 0
+ORTH = 1
+LEMMA = 2
+UPOS = 3
+XPOS = 4
+DEPREL = 7
+DEPS = 8
+MISC = 9
 
 def main(
         input_file: Path = typer.Argument(..., help='Input file'),
-        output_file: Path = typer.Argument(..., help='Output file')
+        output_file: Path = typer.Argument(..., help='Output file'),
+        trainset: bool = typer.Option(False, help='Extra preprocessing for the training set'),
 ):
     with open(input_file) as inf, open(output_file, 'w') as outf:
         for line in inf:
             if line == '\n' or line.startswith('#'):
                 outf.write(line)
             else:
                 columns = line.rstrip('\n').split('\t')
-                token_id = columns[0]
-                orth = columns[1]
-                lemma = columns[2]
-                upos = columns[3]
-                xpos = columns[4]
 
-                if '-' in token_id:
+                if '-' in columns[TOKEN_ID]:
                     # Skip multiword tokens.
                     #
                     # Assert that the UD input has undefined data on
                     # multiword tokens.
                     assert all(x == '_' for x in columns[2:])
                     continue
 
-                columns[2] = fix_compund_word_lemmas(orth, lemma)
+                columns[LEMMA] = fix_compund_word_lemmas(columns[ORTH], columns[LEMMA])
 
                 # error in the data?
-                if xpos == 'Adj':
-                    columns[4] = 'A'
+                if columns[XPOS] == 'Adj':
+                    columns[XPOS] = 'A'
 
                 # The fine-grained tags are some times more coarse than
                 # the "coarse-grained" tags
-                if upos == 'SCONJ':
-                    columns[4] = 'SC'
-                elif upos == 'PROPN':
-                    columns[4] = 'Propn'
-                elif upos == 'AUX':
-                    columns[4] = 'Aux'
-
-                columns[9] = 'O'
+                if columns[UPOS] == 'SCONJ':
+                    columns[XPOS] = 'SC'
+                elif columns[UPOS] == 'PROPN':
+                    columns[XPOS] = 'Propn'
+                elif columns[UPOS] == 'AUX':
+                    columns[XPOS] = 'Aux'
+
+                columns[MISC] = 'O'
+
+                if trainset:
+                    # There are too few 'nsubj:outer's, 'dislocated's
+                    # and 'goeswith's for learning. Replace them in
+                    # the training set to avoid wasting model capacity
+                    # on them. 'nsubj:outer' is replaced with
+                    # 'nsubj:cop' like they used to be in
+                    # UD-Finnish-TDT before October 2022. Others are
+                    # replaced by general 'dep's.
+                    if columns[DEPREL] == 'nsubj:outer':
+                        columns[DEPREL] = 'nsubj:cop'
+                        columns[DEPS] = columns[DEPS].replace('nsubj:outer', 'nsubj:cop')
+                    elif columns[DEPREL] == 'dislocated':
+                        columns[DEPREL] = 'dep'
+                        columns[DEPS] = columns[DEPS].replace('dislocated', 'dep')
+                    elif columns[DEPREL] == 'goeswith':
+                        columns[DEPREL] = 'dep'
+                        columns[DEPS] = columns[DEPS].replace('goeswith', 'dep')
 
                 outf.write('\t'.join(columns))
                 outf.write('\n')