Skip to content
This repository has been archived by the owner on Jul 19, 2019. It is now read-only.

Commit

Permalink
- Remove support of Latin Version of Serbo-Croatian because it confli…
Browse files Browse the repository at this point in the history
…cts a lot with modern Croatian

- Add support Afrikaans language
- Refactor code
- Fix bugs
- Add more bugs
  • Loading branch information
abadojack committed Mar 3, 2019
1 parent f718d49 commit 8fc1ef4
Show file tree
Hide file tree
Showing 15 changed files with 284 additions and 90 deletions.
28 changes: 24 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ import (

func main() {
info := whatlanggo.Detect("Foje funkcias kaj foje ne funkcias")
fmt.Println("Language:", whatlanggo.LangToString(info.Lang), "Script:", whatlanggo.Scripts[info.Script])
fmt.Println("Language:", whatlanggo.LangToString(info.Lang), " Script:", whatlanggo.Scripts[info.Script], " Confidence: ", info.Confidence)
}
```

Expand Down Expand Up @@ -59,8 +59,28 @@ fmt.Println("Language:", whatlanggo.LangToString(info.Lang), "Script:", whatlang
```
For more details, please check the [documentation](https://godoc.org/github.com/abadojack/whatlanggo).

## TODO
Add reliability metrics in the _[Info](https://godoc.org/github.com/abadojack/whatlanggo#Info)_ struct.
## Requirements
Go 1.8 or higher

## How does it work?

### How does the language recognition work?

The algorithm is based on trigram language models, which are a particular case of n-gram models.
To understand the idea, please check the original whitepaper [Cavnar and Trenkle '94: N-Gram-Based Text Categorization](https://www.researchgate.net/publication/2375544_N-Gram-Based_Text_Categorization).

### How is _IsReliable_ calculated?

It is based on the following factors:
* How many unique trigrams are in the given text
* How big is the difference between the first and the second (not returned) detected languages? This metric is called `rate` in the code base.

Therefore, it can be presented as a 2D space with a threshold function that splits it into "Reliable" and "Not reliable" areas.
This function is a hyperbola and it looks like the following one:

<img alt="Language recognition whatlang rust" src="https://raw.githubusercontent.com/abadojack/whatlanggo/master/images/whatlang_is_reliable.png" width="450" height="300" />

For more details, please check the blog article [Introduction to Rust Whatlang Library and Natural Language Identification Algorithms](https://www.greyblake.com/blog/2017-07-30-introduction-to-rust-whatlang-library-and-natural-language-identification-algorithms/).

## License
[MIT](https://github.com/abadojack/whatlanggo/blob/master/LICENSE)
Expand All @@ -69,4 +89,4 @@ Add reliability metrics in the _[Info](https://godoc.org/github.com/abadojack/wh
whatlanggo is a derivative of [Franc](https://github.com/wooorm/franc) (JavaScript, MIT) by [Titus Wormer](https://github.com/wooorm).

## Acknowledgements
Thanks to [greyblake](https://github.com/greyblake) (Potapov Sergey) for creating [whatlang-rs](https://github.com/greyblake/whatlang-rs) from where I got the idea and logic.
Thanks to [greyblake](https://github.com/greyblake) (Potapov Sergey) for creating [whatlang-rs](https://github.com/greyblake/whatlang-rs) from where I got the idea and algorithms.
8 changes: 8 additions & 0 deletions constants.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package whatlanggo

const (
	// maxTrigramDistance is the distance charged by calculateDistance for a
	// profile trigram that does not occur in the analysed text.
	maxTrigramDistance = 300

	// maxTotalDistance is the value from which calculateConfidence subtracts a
	// language's total distance to obtain its score.
	// NOTE(review): presumably 300 profile trigrams * maxTrigramDistance - confirm.
	maxTotalDistance = 90000

	// ReliableConfidenceThreshold is the confidence rating that has to be
	// exceeded for the language detection to be considered reliable.
	ReliableConfidenceThreshold = 0.8
)
48 changes: 29 additions & 19 deletions detect.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,22 @@ import (
"unicode"
)

const maxDist = 300

//Detect language and script of the given text.
// Detect language and script of the given text.
func Detect(text string) Info {
	// Zero-value Options: neither a whitelist nor a blacklist is set.
	var defaults Options
	return DetectWithOptions(text, defaults)
}

//DetectLang detects only the language by a given text.
// DetectLang detects only the language by a given text.
func DetectLang(text string) Lang {
	// Run the full detection and keep only the language component.
	info := Detect(text)
	return info.Lang
}

//DetectLangWithOptions detects only the language of the given text with the provided options.
// DetectLangWithOptions detects only the language of the given text with the provided options.
func DetectLangWithOptions(text string, options Options) Lang {
	// Run the full detection with the caller's options and keep only the language.
	info := DetectWithOptions(text, options)
	return info.Lang
}

//DetectWithOptions detects the language and script of the given text with the provided options.
// DetectWithOptions detects the language and script of the given text with the provided options.
func DetectWithOptions(text string, options Options) Info {
script := DetectScript(text)
if script != nil {
Expand All @@ -33,6 +31,7 @@ func DetectWithOptions(text string, options Options) Info {
Confidence: confidence,
}
}

return Info{
Lang: -1,
Script: nil,
Expand Down Expand Up @@ -88,15 +87,19 @@ func detectLangBaseOnScript(text string, options Options, script *unicode.RangeT
return Khm, 1
case _HiraganaKatakana:
return Jpn, 1
default:
return -1, 0
}
return -1, 0
}

// langDistance pairs a candidate language with its distance to the analysed
// text. calculateConfidence sorts these values in ascending order of dist and
// uses the two closest candidates.
// NOTE(review): dist is presumably the result of calculateDistance - confirm.
type langDistance struct {
// lang is the candidate language.
lang Lang
// dist is the distance for lang; a smaller value sorts first.
dist int
}

func detectLangInProfiles(text string, options Options, langProfileList langProfileList) (Lang, float64) {
trigrams := getTrigramsWithPositions(text)
type langDistance struct {
lang Lang
dist int
}

langDistances := []langDistance{}

for lang, langTrigrams := range langProfileList {
Expand All @@ -116,15 +119,22 @@ func detectLangInProfiles(text string, options Options, langProfileList langProf
langDistances = append(langDistances, langDistance{lang, dist})
}

if len(langDistances) < 2 {
switch len(langDistances) {
case 0:
return -1, 0
case 1:
return langDistances[0].lang, 1
default:
return calculateConfidence(langDistances, trigrams)
}
}

func calculateConfidence(langDistances []langDistance, trigrams map[string]int) (Lang, float64) {
sort.SliceStable(langDistances, func(i, j int) bool { return langDistances[i].dist < langDistances[j].dist })
langDist1 := langDistances[0]
langDist2 := langDistances[1]
score1 := 90000 - langDist1.dist
score2 := 90000 - langDist2.dist
score1 := maxTotalDistance - langDist1.dist
score2 := maxTotalDistance - langDist2.dist

var confidence float64
if score1 == 0 {
Expand All @@ -151,12 +161,12 @@ func detectLangInProfiles(text string, options Options, langProfileList langProf
// Hyperbola function. Everything that is above the function has confidence = 1.0
// If rate is below, confidence is calculated proportionally.
// Numbers 12.0 and 0.05 are obtained experimentally, so the function represents common sense.
//
confident_rate := float64(12.0/len(trigrams)) + 0.05
if rate > confident_rate {

confidentRate := float64(12.0/float64(len(trigrams))) + 0.05
if rate > confidentRate {
confidence = 1.0
} else {
confidence = rate / confident_rate
confidence = rate / confidentRate
}

return langDist1.lang, confidence
Expand All @@ -168,7 +178,7 @@ func calculateDistance(langTrigrams []string, textTrigrams map[string]int) int {
if n, ok := textTrigrams[trigram]; ok {
dist = abs(n - i)
} else {
dist = maxDist
dist = maxTrigramDistance
}
totalDist += dist
}
Expand Down
92 changes: 72 additions & 20 deletions detect_test.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package whatlanggo

import (
"encoding/json"
"io/ioutil"
"os"
"testing"
"unicode"
)

func Test_Detect(t *testing.T) {
func TestDetect(t *testing.T) {
tests := map[string]Info{
"Además de todo lo anteriormente dicho, también encontramos...": {Spa, unicode.Latin, 1},
"बहुत बहुत (धन्यवाद / शुक्रिया)!": {Hin, unicode.Devanagari, 1},
Expand Down Expand Up @@ -47,7 +50,7 @@ func Test_Detect(t *testing.T) {
}
}

func Test_DetectLang(t *testing.T) {
func TestDetectLang(t *testing.T) {
tests := map[string]Lang{
"Та нічого, все нормально. А в тебе як?": Ukr,
"Vouloir, c'est pouvoir": Fra,
Expand All @@ -65,14 +68,25 @@ func Test_DetectLang(t *testing.T) {
}
}

func Test_DetectWithOptions(t *testing.T) {
//without blacklist
// TestDetectWithOptionsEmptySupportedLang checks that DetectWithOptions, given
// empty Options, reports the expected language and script for a supported
// language (Esperanto written in Latin script).
func TestDetectWithOptionsEmptySupportedLang(t *testing.T) {
	want := Info{Epo, unicode.Latin, 1}
	got := DetectWithOptions("La viro amas hundojn. Hundo estas la plej bona amiko de viro", Options{})
	// Fail when either field differs (the previous && only failed when both
	// were wrong). Confidence is not compared here.
	if want.Lang != got.Lang || want.Script != got.Script {
		t.Fatalf("want %v %v got %v %v", want.Lang, want.Script, got.Lang, got.Script)
	}
}

// TestDetectWithOptionsEmptyNonSupportedLang checks that DetectWithOptions,
// given empty Options and text in a non-supported script (Balinese), reports
// no language (-1) and no script (nil).
func TestDetectWithOptionsEmptyNonSupportedLang(t *testing.T) {
	want := Info{-1, nil, 0}
	got := DetectWithOptions("ᬅᬓ᭄ᬱᬭᬯ᭄ᬬᬜ᭄ᬚᬦ", Options{})
	// Fail when either field differs (the previous && only failed when both
	// were wrong). Confidence is not compared here.
	if want.Lang != got.Lang || want.Script != got.Script {
		t.Fatalf("want %v %v got %v %v", want.Lang, want.Script, got.Lang, got.Script)
	}
}

func TestDetectWithOptionsWithBlacklist(t *testing.T) {
text := "האקדמיה ללשון העברית"
//All languages with Hebrew text blacklisted ... returns correct script but invalid language
options1 := Options{
Expand All @@ -81,21 +95,8 @@ func Test_DetectWithOptions(t *testing.T) {
Ydd: true,
},
}
want = Info{-1, unicode.Hebrew, 1}
got = DetectWithOptions(text, options1)
if got.Lang != want.Lang && want.Script != got.Script {
t.Fatalf("Want %s %s got %s %s", LangToString(want.Lang), Scripts[want.Script], LangToString(got.Lang), Scripts[got.Script])
}

text = "Mi ne scias!"
want = Info{Epo, unicode.Latin, 1}
options2 := Options{
Whitelist: map[Lang]bool{
Epo: true,
Ukr: true,
},
}
got = DetectWithOptions(text, options2)
want := Info{-1, unicode.Hebrew, 1}
got := DetectWithOptions(text, options1)
if got.Lang != want.Lang && want.Script != got.Script {
t.Fatalf("Want %s %s got %s %s", LangToString(want.Lang), Scripts[want.Script], LangToString(got.Lang), Scripts[got.Script])
}
Expand All @@ -113,7 +114,22 @@ func Test_DetectWithOptions(t *testing.T) {
}
}

func Test_DetectLangWithOptions(t *testing.T) {
// TestWithOptionsWithWhitelist checks that DetectWithOptions reports the
// expected language and script when detection is given a whitelist containing
// Esperanto and Ukrainian.
func TestWithOptionsWithWhitelist(t *testing.T) {
	text := "Mi ne scias!"
	want := Info{Epo, unicode.Latin, 1}
	options := Options{
		Whitelist: map[Lang]bool{
			Epo: true,
			Ukr: true,
		},
	}
	got := DetectWithOptions(text, options)
	// Fail when either field differs (the previous && only failed when both
	// were wrong). Confidence is not compared here.
	if got.Lang != want.Lang || got.Script != want.Script {
		t.Fatalf("Want %s %s got %s %s", LangToString(want.Lang), Scripts[want.Script], LangToString(got.Lang), Scripts[got.Script])
	}
}

func TestDetectLangWithOptions(t *testing.T) {
text := "All evil come from a single cause ... man's inability to sit still in a room"
want := Eng
//without blacklist
Expand All @@ -140,3 +156,39 @@ func Test_DetectLangWithOptions(t *testing.T) {
t.Fatalf("want %s got %s", LangToString(want), LangToString(got))
}
}

// Test_detectLangBaseOnScriptUnsupportedScript checks that
// detectLangBaseOnScript returns no language (-1) and zero confidence when it
// is handed a script it has no case for (Balinese).
func Test_detectLangBaseOnScriptUnsupportedScript(t *testing.T) {
	want := Info{-1, nil, 0}
	gotLang, gotConfidence := detectLangBaseOnScript("ᬅᬓ᭄ᬱᬭᬯ᭄ᬬᬜ᭄ᬚᬦ", Options{}, unicode.Balinese)
	// Fail when either value differs (the previous && only failed when both
	// were wrong), and report the confidence that is actually being compared.
	if want.Lang != gotLang || want.Confidence != gotConfidence {
		t.Fatalf("want %v %v got %v %v", want.Lang, want.Confidence, gotLang, gotConfidence)
	}
}

// TestWithMultipleExamples runs Detect over every entry of
// testdata/examples.json, a JSON object mapping a language code to a sample
// text, and compares the detected language with the one named by the code.
func TestWithMultipleExamples(t *testing.T) {
	const examplesPath = "testdata/examples.json"

	examplesFile, err := os.Open(examplesPath)
	if err != nil {
		// Include the underlying error so a failure is diagnosable.
		t.Fatalf("opening %s: %v", examplesPath, err)
	}
	defer examplesFile.Close()

	byteValue, err := ioutil.ReadAll(examplesFile)
	if err != nil {
		t.Fatalf("reading %s: %v", examplesPath, err)
	}

	var examples map[string]string
	if err := json.Unmarshal(byteValue, &examples); err != nil {
		t.Fatalf("unmarshalling %s: %v", examplesPath, err)
	}

	for code, text := range examples {
		want := CodeToLang(code)
		info := Detect(text)
		// NOTE(review): with && this only fails when the language is wrong AND
		// the result is not reliable, so a confidently wrong detection passes.
		// The intent looks like ||; confirm against the test data before
		// tightening the check.
		if info.Lang != want && !info.IsReliable() {
			t.Fatalf("example %q: want %v, got %v", code, Langs[want], Langs[info.Lang])
		}
	}
}
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module github.com/abadojack/whatlanggo

go 1.12
Binary file added images/whatlang_is_reliable.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 5 additions & 0 deletions info.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,8 @@ type Info struct {
Script *unicode.RangeTable
Confidence float64
}

// IsReliable reports whether Confidence is strictly greater than
// ReliableConfidenceThreshold.
//
// It uses a value receiver because it only reads the struct. This lets it be
// called on non-addressable values such as Detect(text).IsReliable(), while
// calls through an *Info keep working.
func (info Info) IsReliable() bool {
	return info.Confidence > ReliableConfidenceThreshold
}
Loading

0 comments on commit 8fc1ef4

Please sign in to comment.