
Commit

Adding Latvian (#145)
raivisdejus authored Apr 26, 2023
1 parent ae142fa commit c080f49
Showing 7 changed files with 105 additions and 4 deletions.
1 change: 1 addition & 0 deletions README.rst
@@ -147,6 +147,7 @@ The currently supported dictionaries are:
* German - 'de'
* Russian - 'ru'
* Arabic - 'ar'
* Latvian - 'lv'

Dictionary Creation and Updating
-------------------------------------------------------------------------------
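For context (not part of the commit itself): with Latvian in the supported list above, the new dictionary should load like any other bundled language. A minimal usage sketch; the Latvian words are illustrative, and the actual suggestions depend on the shipped frequency list:

from spellchecker import SpellChecker

spell = SpellChecker(language="lv")  # load the bundled Latvian frequency list

# words missing from the dictionary are reported as unknown ...
print(spell.unknown(["labdien", "labdein"]))  # "labdein" is an intentional misspelling

# ... and correction() proposes the most likely replacement, if one exists
print(spell.correction("labdein"))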
93 changes: 91 additions & 2 deletions scripts/build_dictionary.py
@@ -11,7 +11,8 @@
French Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.fr.gz
Portuguese Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.pt.gz
Russian Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ru.gz
Arabic Input: http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.ar.gz
Latvian Input: https://huggingface.co/datasets/RaivisDejus/latvian-text
Requirements:
The script requires more than the standard library to run in its
entirety. You will also need to install the NLTK package to build a
@@ -647,6 +648,91 @@ def clean_arabic(word_frequency, filepath_exclude, filepath_include):
return word_frequency


def clean_latvian(word_frequency, filepath_exclude, filepath_include):
"""Clean a Latvian word frequency list
Args:
word_frequency (Counter):
filepath_exclude (str):
filepath_include (str):
"""
letters = set("aābcčdeēfgģhiījkķlļmnņoprsštuūvzž")

# remove words with invalid characters
invalid_chars = list()

for key in word_frequency:
kl = set(key)
if kl.issubset(letters):
continue
invalid_chars.append(key)
for misfit in invalid_chars:
word_frequency.pop(misfit)

# remove words without a vowel
no_vowels = list()
vowels = set("aāiīeēouū")
for key in word_frequency:
if vowels.isdisjoint(key):
no_vowels.append(key)
for misfit in no_vowels:
word_frequency.pop(misfit)

# remove ellipses
ellipses = list()
for key in word_frequency:
if ".." in key:
ellipses.append(key)
for misfit in ellipses:
word_frequency.pop(misfit)

# remove words starting with a doubled a or i
doubles = list()
for key in word_frequency:
if key.startswith("aa"):
doubles.append(key)
elif key.startswith("ii"):
doubles.append(key)
for misfit in doubles:
word_frequency.pop(misfit)

# remove single letters
single_letters = list()
for key in word_frequency:
if len(key) == 1:
single_letters.append(key)
for misfit in single_letters:
word_frequency.pop(misfit)

# TODO: other possible fixes?

# remove small numbers
small_frequency = list()
for key in word_frequency:
if word_frequency[key] <= MINIMUM_FREQUENCY:
small_frequency.append(key)
for misfit in small_frequency:
word_frequency.pop(misfit)

# remove flagged misspellings
with load_file(filepath_exclude) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
word_frequency.pop(line)

# Add known missing words back in (ugh)
with load_file(filepath_include) as fobj:
for line in fobj:
line = line.strip()
if line in word_frequency:
print("{} is already found in the dictionary! Skipping!")
else:
word_frequency[line] = MINIMUM_FREQUENCY

return word_frequency


def _parse_args():
"""parse arguments for command-line usage"""
import argparse
@@ -655,7 +741,7 @@ def _parse_args():
description="Build a new dictionary (word frequency) using the OpenSubtitles2018 project"
)
parser.add_argument(
"-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru", "ar"]
"-l", "--language", required=True, help="The language being built", choices=["en", "es", "de", "fr", "pt", "ru", "ar", "lv"]
)
parser.add_argument(
"-f", "--file-path", help="The path to the downloaded text file OR the saved word frequency json"
@@ -731,9 +817,12 @@ def _parse_args():
word_frequency = clean_russian(word_frequency, exclude_filepath, include_filepath)
elif args.language == "ar":
word_frequency = clean_arabic(word_frequency, exclude_filepath, include_filepath)
elif args.language == "lv":
word_frequency = clean_latvian(word_frequency, exclude_filepath, include_filepath)

# export word frequency for review!
word_frequency_path = os.path.join(script_path, "{}.json".format(args.language))
print(word_frequency_path)
export_word_frequency(word_frequency_path, word_frequency)

if args.misfit_file:
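For context (not part of the diff): clean_latvian follows the same collect-then-pop filtering pattern as the other clean_* helpers in this script. Below is a condensed, standalone sketch of that pattern on a toy Counter, reusing the Latvian letter and vowel sets from the new function; MINIMUM_FREQUENCY is given a placeholder value here, the real threshold being the module-level constant in build_dictionary.py.

from collections import Counter

MINIMUM_FREQUENCY = 15  # placeholder for this sketch only

letters = set("aābcčdeēfgģhiījkķlļmnņoprsštuūvzž")  # Latvian alphabet, as in clean_latvian
vowels = set("aāiīeēouū")

word_frequency = Counter({"labdien": 120, "xylo": 40, "bcd": 30, "r": 200, "reti": 3})

def misfits(word_frequency):
    # yield every key that fails one of the basic cleaning rules
    for word in word_frequency:
        if not set(word).issubset(letters):              # invalid characters
            yield word
        elif vowels.isdisjoint(word):                    # no vowel
            yield word
        elif len(word) == 1:                             # single letter
            yield word
        elif word_frequency[word] <= MINIMUM_FREQUENCY:  # too infrequent
            yield word

for word in list(misfits(word_frequency)):
    word_frequency.pop(word)

print(word_frequency)  # Counter({'labdien': 120})

Judging by the argparse options above, the Latvian dictionary itself would presumably be rebuilt with something like python scripts/build_dictionary.py -l lv -f <downloaded Latvian text>, with the resulting lv.json written next to the script.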
11 changes: 11 additions & 0 deletions scripts/data/lv_exclude.txt
@@ -0,0 +1,11 @@
aa
aan
ab
abd
ganrīz
ij
iz
ledusgabals
nava
šā
tuše
Binary file added scripts/data/lv_full.json.gz
Empty file added scripts/data/lv_include.txt
Binary file added spellchecker/resources/lv.json.gz
4 changes: 2 additions & 2 deletions spellchecker/spellchecker.py
@@ -18,7 +18,7 @@ class SpellChecker:
Args:
language (str): The language of the dictionary to load or None for no dictionary. Supported languages are \
- `en`, `es`, `de`, `fr`, `pt` and `ru`. Defaults to `en`. A list of languages may be provided and all \
+ `en`, `es`, `de`, `fr`, `pt`, `ru` and `lv`. Defaults to `en`. A list of languages may be provided and all \
languages will be loaded.
local_dictionary (str): The path to a locally stored word frequency dictionary; if provided, no language \
will be loaded
@@ -82,7 +82,7 @@ def __iter__(self) -> typing.Generator[str, None, None]:
@classmethod
def languages(cls) -> typing.Iterable[str]:
"""list: A list of all official languages supported by the library"""
return ["de", "en", "es", "fr", "pt", "ru", "ar"]
return ["de", "en", "es", "fr", "pt", "ru", "ar", "lv"]

@property
def word_frequency(self) -> "WordFrequency":
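Not part of the diff: per the updated docstring earlier in this file, language also accepts a list, so a combined English and Latvian checker could presumably be built as below. Whether a particular Latvian word is recognised depends on the contents of the shipped lv.json.gz.

from spellchecker import SpellChecker

print(SpellChecker.languages())  # now includes "lv"

spell = SpellChecker(language=["en", "lv"])  # load both dictionaries into one checker
print(spell.known(["hello", "labdien"]))     # words found in either loaded dictionary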
