-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDictSegment.go
238 lines (219 loc) · 6.39 KB
/
DictSegment.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
package ikgo
import (
"sort"
)
var (
charMap map[rune]rune = make(map[rune]rune)
ARRAY_LENGTH_LIMIT int = 3
)
type DictSegment struct {
childrenMap map[rune]*DictSegment //Map存储结构
childrenArray []*DictSegment //数组方式存储结构
nodeChar rune //当前节点上存储的字符
storeSize int //当前节点存储的Segment数目 ==> storeSize <=ARRAY_LENGTH_LIMIT ,使用数组存储, storeSize >ARRAY_LENGTH_LIMIT ,则使用Map存储
nodeState int //当前DictSegment状态 ,默认 0 , 1表示从根节点到当前节点的路径表示一个词
}
func NewDictSegment(nodeChar rune) *DictSegment {
return &DictSegment{nodeChar: nodeChar}
}
/*
* 判断是否有下一个节点
*/
func (ds *DictSegment) hasNextNode() bool {
return ds.storeSize > 0
}
/**
* 匹配词段
* @param charArray
* @param begin
* @param length
* @param searchHit
* @return Hit
*/
func (ds *DictSegment) matchSegSearch(charArray []rune, begin, length int, searchHit *Hit) *Hit {
if searchHit == nil {
//如果hit为空,新建
searchHit = &Hit{}
searchHit.beg = begin
} else {
//否则要将HIT状态重置
searchHit.setUnmatch()
}
//设置hit的当前处理位置
searchHit.end = begin
keyChar := charArray[begin]
//引用实例变量为本地变量,避免查询时遇到更新的同步问题
segmentArray := ds.childrenArray
segmentMap := ds.childrenMap
var nds *DictSegment = nil
//STEP1 在节点中查找keyChar对应的DictSegment
if len(segmentArray) > 0 {
keySegment := NewDictSegment(keyChar)
position := sort.Search(ds.storeSize, func(i int) bool {
return keySegment.nodeChar == segmentArray[i].nodeChar
})
if position != ds.storeSize {
nds = segmentArray[position]
}
} else if len(segmentMap) > 0 {
nds, _ = segmentMap[keyChar]
}
//STEP2 找到DictSegment,判断词的匹配状态,是否继续递归,还是返回结果
if nds != nil {
if length > 1 {
//词未匹配完,继续往下搜索
return nds.matchSegSearch(charArray, begin+1, length-1, searchHit)
}
if length == 1 {
//搜索最后一个char
if nds.nodeState == 1 {
//添加HIT状态为完全匹配
searchHit.setMatch()
}
if nds.hasNextNode() {
//添加HIT状态为前缀匹配
searchHit.setPrefix()
//记录当前位置的DictSegment
searchHit.matchedDictSegment = nds
}
return searchHit
}
}
//STEP3 没有找到DictSegment, 将HIT设置为不匹配
return searchHit
}
/**
* 匹配词段
* @param charArray
* @param begin
* @param length
* @return Hit
*/
func (ds *DictSegment) matchSeg(charArray []rune, begin, length int) *Hit {
return ds.matchSegSearch(charArray, begin, length, nil)
}
/**
* 匹配词段
* @param charArray
* @return Hit
*/
func (ds *DictSegment) match(charArray []rune) *Hit {
return ds.matchSeg(charArray, 0, len(charArray))
}
func (ds *DictSegment) getChildrenArray() []*DictSegment {
if ds.childrenArray == nil {
ds.childrenArray = make([]*DictSegment, ARRAY_LENGTH_LIMIT)
}
return ds.childrenArray
}
func (ds *DictSegment) getChildrenMap() map[rune]*DictSegment {
if ds.childrenMap == nil {
ds.childrenMap = make(map[rune]*DictSegment, ARRAY_LENGTH_LIMIT*2)
}
return ds.childrenMap
}
/**
* 查找本节点下对应的keyChar的segment *
* @param keyChar
* @param create =1如果没有找到,则创建新的segment ; =0如果没有找到,不创建,返回null
* @return
*/
func (ds *DictSegment) lookforSegment(keyChar rune, create int) *DictSegment {
var nds *DictSegment = nil
if ds.storeSize <= ARRAY_LENGTH_LIMIT {
//获取数组容器,如果数组未创建则创建数组
segmentArray := ds.getChildrenArray()
//搜寻数组
keySegment := NewDictSegment(keyChar)
position := sort.Search(ds.storeSize, func(i int) bool {
return keySegment == segmentArray[i]
})
if position != len(segmentArray) {
nds = segmentArray[position]
}
//遍历数组后没有找到对应的segment
if nds == nil && create == 1 {
nds = keySegment
if ds.storeSize < ARRAY_LENGTH_LIMIT {
//数组容量未满,使用数组存储,插入
idx := ds.storeSize
for i := ds.storeSize; i > 0; i-- {
if segmentArray[i-1].nodeChar < nds.nodeChar {
idx = i
break
} else {
segmentArray[i] = segmentArray[i-1]
}
}
segmentArray[idx] = nds
//segment数目+1
ds.storeSize++
} else {
//数组容量已满,切换Map存储
//获取Map容器,如果Map未创建,则创建Map
segmentMap := ds.getChildrenMap()
//将数组中的segment迁移到Map中
for i := 0; i < ds.storeSize; i++ {
segmentMap[segmentArray[i].nodeChar] = segmentArray[i]
}
//存储新的segment
segmentMap[keyChar] = nds
//segment数目+1 , 必须在释放数组前执行storeSize++ , 确保极端情况下,不会取到空的数组
ds.storeSize++
//释放当前的数组引用
ds.childrenArray = nil
}
}
} else {
//获取Map容器,如果Map未创建,则创建Map
segmentMap := ds.getChildrenMap()
//搜索Map
var exists bool
nds, exists = segmentMap[keyChar]
if !exists && create == 1 {
//构造新的segment
nds = NewDictSegment(keyChar)
segmentMap[keyChar] = nds
//当前节点存储segment数目+1
ds.storeSize++
}
}
return nds
}
/**
* 加载填充词典片段
* @param charArray
* @param begin
* @param length
* @param enabled
*/
func (ds *DictSegment) fillSegmentSeg(charArray []rune, begin, length, enabled int) {
//获取字典表中的汉字对象
beginChar := charArray[begin]
keyChar, exists := charMap[beginChar]
//字典中没有该字,则将其添加入字典
if !exists {
charMap[beginChar] = beginChar
keyChar = beginChar
}
//搜索当前节点的存储,查询对应keyChar的keyChar,如果没有则创建
nds := ds.lookforSegment(keyChar, enabled)
if nds != nil {
//处理keyChar对应的segment
if length > 1 {
//词元还没有完全加入词典树
nds.fillSegmentSeg(charArray, begin+1, length-1, enabled)
} else if length == 1 {
//已经是词元的最后一个char,设置当前节点状态为enabled,
//enabled=1表明一个完整的词,enabled=0表示从词典中屏蔽当前词
nds.nodeState = enabled
}
}
}
/**
* 加载填充词典片段
* @param charArray
*/
func (ds *DictSegment) fillSegment(charArray []rune) {
ds.fillSegmentSeg(charArray, 0, len(charArray), 1)
}