
Commit

Adding Latvian (#145)
raivisdejus authored Apr 26, 2023
1 parent ae142fa commit c080f49
Showing 7 changed files with 105 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.rst
@@ -147,6 +147,7 @@ The currently supported dictionaries are:
* German - 'de'
* Russian - 'ru'
* Arabic - 'ar'
* Latvian - 'lv'

Dictionary Creation and Updating
-------------------------------------------------------------------------------
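For context (not part of the commit itself): with Latvian in the supported list above, the new dictionary should load like any other bundled language. A minimal usage sketch; the Latvian words are illustrative, and the actual suggestions depend on the shipped frequency list:

from spellchecker import SpellChecker

spell = SpellChecker(language="lv")  # load the bundled Latvian frequency list

# words missing from the dictionary are reported as unknown ...
print(spell.unknown(["labdien", "labdein"]))  # "labdein" is an intentional misspelling

# ... and correction() proposes the most likely replacement, if one exists
print(spell.correction("labdein"))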
93 changes: 91 additions & 2 deletions scripts/build_dictionary.py
@@ -11,7 +11,8 @@
French Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.fr.gz
Portuguese Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.pt.gz
Russian Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ru.gz
Arabic Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ar.gz
Latvian Input: https://huggingface.co/datasets/RaivisDejus/latvian-text
Requirements:
The script requires more than the standard library to run in its
entirety. You will also need to install the NLTK package to build a
@@ -647,6 +648,91 @@ def clean_arabic(word_frequency, filepath_exclude, filepath_include):
return word_frequency


def clean_latvian(word_frequency, filepath_exclude, filepath_include):
"""Clean a Latvian word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("aābcčdeēfgģhiījkķlļmnņoprsštuūvzž")

# remove words with invalid characters
invalid_chars = list()

for key in word_frequency:
kl = set(key)
if kl.issubset(letters):
continue
invalid_chars.append(key)
for misfit in invalid_chars:
word_frequency.pop(misfit)

# remove words without a vowel
no_vowels = list()
vowels = set("aāiīeēouū")
for key in word_frequency:
if vowels.isdisjoint(key):
no_vowels.append(key)
for misfit in no_vowels:
word_frequency.pop(misfit)

# remove ellipses
ellipses = list()
for key in word_frequency:
if ".." in key:
ellipses.append(key)
for misfit in ellipses:
word_frequency.pop(misfit)

# remove words starting with a doubled a or i
doubles = list()
for key in word_frequency:
if key.startswith("aa"):
doubles.append(key)
elif key.startswith("ii"):
doubles.append(key)
for misfit in doubles:
word_frequency.pop(misfit)

# remove single letters
single_letters = list()
for key in word_frequency:
if len(key) == 1:
single_letters.append(key)
for misfit in single_letters:
word_frequency.pop(misfit)

# TODO: other possible fixes?

# remove small numbers
small_frequency = list()
for key in word_frequency:
if word_frequency[key] <= MINIMUM_FREQUENCY:
small_frequency.append(key)
for misfit in small_frequency:
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!")
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency


def _parse_args():
"""parse arguments for command-line usage"""
import argparse
@@ -655,7 +741,7 @@ def _parse_args():
description="Build a new dictionary (word frequency) using the OpenSubtitles2018 project"
)
parser.add_argument(
"-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru", "ar"]
"-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru", "ar", "lv"]
)
parser.add_argument(
"-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
@@ -731,9 +817,12 @@ def _parse_args():
word_frequency = clean_russian(word_frequency, exclude_filepath, include_filepath)
elif args.language == "ar":
word_frequency = clean_arabic(word_frequency, exclude_filepath, include_filepath)
elif args.language == "lv":
word_frequency = clean_latvian(word_frequency, exclude_filepath, include_filepath)

# export word frequency for review!
word_frequency_path = os.path.join(script_path, "{}.json".format(args.language))
print(word_frequency_path)
export_word_frequency(word_frequency_path, word_frequency)

if args.misfit_file:
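For context (not part of the diff): clean_latvian follows the same collect-then-pop filtering pattern as the other clean_* helpers in this script. Below is a condensed, standalone sketch of that pattern on a toy Counter, reusing the Latvian letter and vowel sets from the new function; MINIMUM_FREQUENCY is given a placeholder value here, the real threshold being the module-level constant in build_dictionary.py.

from collections import Counter

MINIMUM_FREQUENCY = 15  # placeholder for this sketch only

letters = set("aābcčdeēfgģhiījkķlļmnņoprsštuūvzž")  # Latvian alphabet, as in clean_latvian
vowels = set("aāiīeēouū")

word_frequency = Counter({"labdien": 120, "xylo": 40, "bcd": 30, "r": 200, "reti": 3})

def misfits(word_frequency):
    # yield every key that fails one of the basic cleaning rules
    for word in word_frequency:
        if not set(word).issubset(letters):              # invalid characters
            yield word
        elif vowels.isdisjoint(word):                    # no vowel
            yield word
        elif len(word) == 1:                             # single letter
            yield word
        elif word_frequency[word] <= MINIMUM_FREQUENCY:  # too infrequent
            yield word

for word in list(misfits(word_frequency)):
    word_frequency.pop(word)

print(word_frequency)  # Counter({'labdien': 120})

Judging by the argparse options above, the Latvian dictionary itself would presumably be rebuilt with something like python scripts/build_dictionary.py -l lv -f <downloaded Latvian text>, with the resulting lv.json written next to the script.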
11 changes: 11 additions & 0 deletions scripts/data/lv_exclude.txt
@@ -0,0 +1,11 @@
aa
aan
ab
abd
ganrīz
ij
iz
ledusgabals
nava
šā
tuše
Binary file added scripts/data/lv_full.json.gz
Empty file added scripts/data/lv_include.txt
Binary file added spellchecker/resources/lv.json.gz
4 changes: 2 additions & 2 deletions spellchecker/spellchecker.py
@@ -18,7 +18,7 @@ class SpellChecker:
Args:
language (str): The language of the dictionary to load or None for no dictionary. Supported languages are \
- `en`, `es`, `de`, `fr`, `pt` and `ru`. Defaults to `en`. A list of languages may be provided and all \
+ `en`, `es`, `de`, `fr`, `pt`, `ru` and `lv`. Defaults to `en`. A list of languages may be provided and all \
languages will be loaded.
local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no language \
will be loaded
@@ -82,7 +82,7 @@ def __iter__(self) -> typing.Generator[str, None, None]:
@classmethod
def languages(cls) -> typing.Iterable[str]:
"""list: A list of all official languages supported by the library"""
return ["de", "en", "es", "fr", "pt", "ru", "ar"]
return ["de", "en", "es", "fr", "pt", "ru", "ar", "lv"]

@property
def word_frequency(self) -> "WordFrequency":
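Not part of the diff: per the updated docstring earlier in this file, language also accepts a list, so a combined English and Latvian checker could presumably be built as below. Whether a particular Latvian word is recognised depends on the contents of the shipped lv.json.gz.

from spellchecker import SpellChecker

print(SpellChecker.languages())  # now includes "lv"

spell = SpellChecker(language=["en", "lv"])  # load both dictionaries into one checker
print(spell.known(["hello", "labdien"]))     # words found in either loaded dictionary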
