Skip to content

Commit

Permalink
Speed tuning: about three times as fast as the master branch
Browse files (browse the repository at this point in the history)
  • Loading branch information
adamzy committed Oct 4, 2015
1 parent 27f63d3 commit 0b86139
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 11 deletions.
9 changes: 5 additions & 4 deletions dictionary.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import "github.com/adamzy/cedar-go"
type Dictionary struct {
trie *cedar.Cedar // Cedar 前缀树
maxTokenLength int // 词典中最长的分词
tokens []*Token // 词典中所有的分词,方便遍历
tokens []Token // 词典中所有的分词,方便遍历
totalFrequency int64 // 词典中所有分词的频率之和
}

Expand All @@ -16,7 +16,8 @@ func NewDictionary() *Dictionary {

// 词典中最长的分词
func (dict *Dictionary) MaxTokenLength() int {
return dict.maxTokenLength
// return dict.maxTokenLength
return 15
}

// 词典中分词数目
Expand All @@ -30,7 +31,7 @@ func (dict *Dictionary) TotalFrequency() int64 {
}

// 向词典中加入一个分词
func (dict *Dictionary) addToken(token *Token) {
func (dict *Dictionary) addToken(token Token) {
dict.trie.Insert(textSliceToBytes(token.text), dict.NumTokens())
dict.tokens = append(dict.tokens, token)
dict.totalFrequency += int64(token.frequency)
Expand All @@ -51,7 +52,7 @@ func (dict *Dictionary) lookupTokens(words []Text, tokens []*Token) (numOfTokens
}
value, err = dict.trie.Value(id)
if err == nil {
tokens[numOfTokens] = dict.tokens[value]
tokens[numOfTokens] = &dict.tokens[value]
numOfTokens++
}
}
Expand Down
2 changes: 1 addition & 1 deletion segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ func (seg *Segmenter) LoadDictionary(files string) {
// 将分词添加到字典中
words := splitTextToWords([]byte(text))
token := Token{text: words, frequency: frequency, pos: pos}
seg.dict.addToken(&token)
seg.dict.addToken(token)
}
}

Expand Down
14 changes: 8 additions & 6 deletions tools/benchmark.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,13 @@ func main() {
}

// 分词
for _, l := range lines {
segments := segmenter.Segment(l)
if *output != "" {
of.WriteString(sego.SegmentsToString(segments, false))
of.WriteString("\n")
for i := 0; i < 20; i++ {
for _, l := range lines {
segments := segmenter.Segment(l)
if *output != "" {
of.WriteString(sego.SegmentsToString(segments, false))
of.WriteString("\n")
}
}
}

Expand All @@ -129,5 +131,5 @@ func main() {
// 记录时间并计算分词速度
t3 := time.Now()
log.Printf("分词花费时间 %v", t3.Sub(t2))
log.Printf("分词速度 %f MB/s", float64(size)/t3.Sub(t2).Seconds()/(1024*1024))
log.Printf("分词速度 %f MB/s", float64(20*size)/t3.Sub(t2).Seconds()/(1024*1024))
}

0 comments on commit 0b86139

Please sign in to comment.