Skip to content

Commit 9f5bd84

Browse files
authored
add utf8 string support for aho corasick (TheAlgorithms#379)
1 parent a59a423 commit 9f5bd84

File tree

1 file changed

+37
-2
lines changed

1 file changed

+37
-2
lines changed

src/string/aho_corasick.rs

Lines changed: 37 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,8 @@ impl AhoCorasick {
6464
pub fn search<'a>(&self, s: &'a str) -> Vec<&'a str> {
6565
let mut ans = vec![];
6666
let mut cur = Rc::clone(&self.root);
67-
for (i, c) in s.chars().enumerate() {
67+
let mut position: usize = 0;
68+
for (_, c) in s.chars().enumerate() {
6869
loop {
6970
if let Some(child) = Rc::clone(&cur).borrow().trans.get(&c) {
7071
cur = Rc::clone(child);
@@ -76,8 +77,9 @@ impl AhoCorasick {
7677
None => break,
7778
}
7879
}
80+
position += c.len_utf8();
7981
for &len in &cur.borrow().lengths {
80-
ans.push(&s[i + 1 - len..=i]);
82+
ans.push(&s[position - len..position]);
8183
}
8284
}
8385
ans
@@ -95,4 +97,37 @@ mod tests {
9597
let res = ac.search("ababcxyzacxy12678acxy6543");
9698
assert_eq!(res, ["abc", "xyz", "acxy", "678", "acxy", "6543",]);
9799
}
100+
101+
#[test]
102+
fn test_aho_corasick_with_utf8() {
103+
let dict = [
104+
"abc",
105+
"中文",
106+
"abc中",
107+
"abcd",
108+
"xyz",
109+
"acxy",
110+
"efg",
111+
"123",
112+
"678",
113+
"6543",
114+
"ハンバーガー",
115+
];
116+
let ac = AhoCorasick::new(&dict);
117+
let res = ac.search("ababc中xyzacxy12678acxyハンバーガー6543中文");
118+
assert_eq!(
119+
res,
120+
[
121+
"abc",
122+
"abc中",
123+
"xyz",
124+
"acxy",
125+
"678",
126+
"acxy",
127+
"ハンバーガー",
128+
"6543",
129+
"中文"
130+
]
131+
);
132+
}
98133
}

0 commit comments

Comments
 (0)