Skip to content

Commit

Permalink
1. 优化分词性能
Browse files Browse the repository at this point in the history
2. 优化拼音搜索
  • Loading branch information
ldqk committed Dec 8, 2022
1 parent bf77922 commit b9bae0d
Show file tree
Hide file tree
Showing 5 changed files with 394 additions and 335 deletions.
32 changes: 31 additions & 1 deletion Masuit.LuceneEFCore.SearchEngine/Extensions/StringHelpers.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System.Collections.Generic;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;

Expand Down Expand Up @@ -28,5 +29,34 @@ internal static string RemoveHtmlTag(this string html)
strText = Regex.Replace(strText, "&[^;]+;", "");
return strText;
}

/// <summary>
/// Adds every element of <paramref name="values"/> to the collection, in enumeration order.
/// </summary>
/// <typeparam name="T">Element type of the collection.</typeparam>
/// <param name="this">Collection that receives the elements.</param>
/// <param name="values">Elements to add; enumerated exactly once.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="this"/> or <paramref name="values"/> is <c>null</c>.
/// </exception>
public static void AddRange<T>(this ICollection<T> @this, IEnumerable<T> values)
{
    // Fail fast with a named argument instead of a bare NullReferenceException
    // raised from inside the loop.
    if (@this is null)
    {
        throw new ArgumentNullException(nameof(@this));
    }

    if (values is null)
    {
        throw new ArgumentNullException(nameof(values));
    }

    foreach (var obj in values)
    {
        @this.Add(obj);
    }
}

/// <summary>
/// Removes from the collection every element that satisfies <paramref name="where"/>.
/// </summary>
/// <typeparam name="T">Element type of the collection.</typeparam>
/// <param name="this">Collection to remove elements from.</param>
/// <param name="where">Predicate selecting the elements to remove.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="this"/> or <paramref name="where"/> is <c>null</c>.
/// </exception>
public static void RemoveWhere<T>(this ICollection<T> @this, Func<T, bool> @where)
{
    if (@this is null)
    {
        throw new ArgumentNullException(nameof(@this));
    }

    if (@where is null)
    {
        throw new ArgumentNullException(nameof(@where));
    }

    // List<T>.Remove is a linear scan, so removing matches one by one is O(n^2);
    // RemoveAll compacts the list in a single pass.
    if (@this is List<T> list)
    {
        list.RemoveAll(item => @where(item));
        return;
    }

    // Snapshot the matches first: removing while enumerating the live
    // collection would invalidate its enumerator.
    foreach (var obj in @this.Where(@where).ToList())
    {
        @this.Remove(obj);
    }
}

}
}
223 changes: 121 additions & 102 deletions Masuit.LuceneEFCore.SearchEngine/KeywordsManager.cs
Original file line number Diff line number Diff line change
@@ -1,120 +1,139 @@
using JiebaNet.Segmenter;
using Lucene.Net.Analysis.JieBa;
using Lucene.Net.Analysis;
using Microsoft.EntityFrameworkCore.Metadata.Internal;
using System.Collections.Generic;
using System.Linq;
using System.Text.RegularExpressions;
using TinyPinyin;

namespace Masuit.LuceneEFCore.SearchEngine
{
public static class KeywordsManager
{
/// <summary>
/// 近义词组
/// </summary>
internal static HashSet<(string key, string value)> SynonymWords { get; set; } = new();
public static class KeywordsManager
{
/// <summary>
/// 近义词组
/// </summary>
internal static HashSet<(string key, string value)> SynonymWords { get; set; } = new();

internal static HashSet<(string key, string value)> Pinyins { get; set; } = new();
private static HashSet<(string key, string value)> Pinyins { get; set; } = new();
private static ILookup<string, string> _pinyinsLookup;

private static readonly JiebaSegmenter JiebaSegmenter = new();
internal static ILookup<string, string> PinyinsLookup => _pinyinsLookup ??= Pinyins.ToLookup(t => t.key, t => t.value);

/// <summary>
/// 添加近义词
/// </summary>
/// <param name="pair"></param>
public static void AddSynonyms(KeyValuePair<string, string> pair)
{
SynonymWords.Add((pair.Key, pair.Value));
AddWords(pair.Key, pair.Value);
}
private static readonly JiebaSegmenter JiebaSegmenter = new();

/// <summary>
/// 添加近义词
/// </summary>
/// <param name="pair"></param>
public static void AddSynonyms((string, string) pair)
{
SynonymWords.Add((pair.Item1, pair.Item2));
AddWords(pair.Item1, pair.Item2);
}
/// <summary>
/// Registers one synonym pair and adds both of its words as keywords.
/// </summary>
/// <param name="pair">Pair whose key and value are recorded as synonyms of each other.</param>
public static void AddSynonyms(KeyValuePair<string, string> pair)
{
    var key = pair.Key;
    var synonym = pair.Value;

    SynonymWords.Add((key, synonym));
    AddWords(key, synonym);
}

/// <summary>
/// 添加近义词
/// </summary>
public static void AddSynonyms(string key, string value, params string[] values)
{
SynonymWords.Add((key, value));
AddWords(key, value);
foreach (var s in values)
{
SynonymWords.Add((key, s));
AddWords(s);
}
}
/// <summary>
/// Registers one synonym pair and adds both of its words as keywords.
/// </summary>
/// <param name="pair">Tuple holding the two words that are recorded as synonyms.</param>
public static void AddSynonyms((string, string) pair)
{
    var (key, synonym) = pair;

    SynonymWords.Add((key, synonym));
    AddWords(key, synonym);
}

/// <summary>
/// 添加近义词
/// </summary>
/// <param name="pairs"></param>
public static void AddSynonyms(IEnumerable<(string key, string value)> pairs)
{
foreach (var t in pairs)
{
SynonymWords.Add(t);
AddWords(t.key, t.value);
}
}
/// <summary>
/// Registers <paramref name="value"/> and every entry of <paramref name="values"/>
/// as synonyms of <paramref name="key"/>, adding each word as a keyword.
/// </summary>
/// <param name="key">Word the synonyms are attached to.</param>
/// <param name="value">First synonym of <paramref name="key"/>.</param>
/// <param name="values">Further synonyms of <paramref name="key"/>.</param>
public static void AddSynonyms(string key, string value, params string[] values)
{
    SynonymWords.Add((key, value));
    AddWords(key, value);

    for (var i = 0; i < values.Length; i++)
    {
        var extra = values[i];
        SynonymWords.Add((key, extra));
        AddWords(extra);
    }
}

/// <summary>
/// 添加近义词
/// </summary>
/// <param name="pairs"></param>
public static void AddSynonyms(IEnumerable<KeyValuePair<string, string>> pairs)
{
foreach (var pair in pairs)
{
AddWords(pair.Key, pair.Value);
SynonymWords.Add((pair.Key, pair.Value));
}
}
/// <summary>
/// Registers a sequence of synonym pairs and adds the words of each pair as keywords.
/// </summary>
/// <param name="pairs">Synonym pairs to register.</param>
public static void AddSynonyms(IEnumerable<(string key, string value)> pairs)
{
    foreach (var entry in pairs)
    {
        var (key, synonym) = entry;
        SynonymWords.Add(entry);
        AddWords(key, synonym);
    }
}

/// <summary>
/// 添加关键词
/// </summary>
/// <param name="word"></param>
public static void AddWords(string word)
{
JiebaSegmenter.AddWord(word);
Pinyins.Add((word, PinyinHelper.GetPinyin(Regex.Replace(word, @"\p{P}|\p{S}", ""))));
}
/// <summary>
/// Registers a sequence of synonym pairs; for each pair the words are added
/// as keywords first and the pair is recorded afterwards.
/// </summary>
/// <param name="pairs">Synonym pairs to register.</param>
public static void AddSynonyms(IEnumerable<KeyValuePair<string, string>> pairs)
{
    foreach (var entry in pairs)
    {
        var key = entry.Key;
        var synonym = entry.Value;

        AddWords(key, synonym);
        SynonymWords.Add((key, synonym));
    }
}

/// <summary>
/// 添加关键词
/// </summary>
/// <param name="words"></param>
public static void AddWords(IEnumerable<string> words)
{
foreach (var s in words)
{
JiebaSegmenter.AddWord(s);
Pinyins.Add((s, PinyinHelper.GetPinyin(Regex.Replace(s, @"\p{P}|\p{S}", ""))));
}
}
/// <summary>
/// Adds a keyword to the segmenter and indexes it under two pinyin keys:
/// the full pinyin with spaces removed, and the initial letters of each syllable.
/// Only characters in the range U+4E00..U+9FA5 contribute to the pinyin keys.
/// </summary>
/// <param name="word">Keyword to add.</param>
public static void AddWords(string word)
{
    JiebaSegmenter.AddWord(word);

    var pinyin = PinyinHelper.GetPinyin(Regex.Replace(word, @"[^\u4e00-\u9fa5]", ""));
    if (string.IsNullOrEmpty(pinyin))
    {
        return;
    }

    // Culture-invariant lowering: ToLower() would turn 'I' into dotless 'ı'
    // under Turkish-style cultures and silently break the lookup keys.
    // Empty segments are dropped so the initial-letter projection cannot index past the end.
    var syllables = pinyin.ToLowerInvariant().Split(' ').Where(s => s.Length > 0).ToArray();
    if (syllables.Length == 0)
    {
        return;
    }

    Pinyins.Add((string.Concat(syllables), word));
    Pinyins.Add((new string(syllables.Select(s => s[0]).ToArray()), word));

    // PinyinsLookup is a cached snapshot of Pinyins; drop it so the next read
    // rebuilds it and includes the entries just added.
    _pinyinsLookup = null;
}

/// <summary>
/// 添加关键词
/// </summary>
/// <param name="word"></param>
/// <param name="words"></param>
public static void AddWords(string word, params string[] words)
{
JiebaSegmenter.AddWord(word);
foreach (var s in words)
{
JiebaSegmenter.AddWord(s);
Pinyins.Add((s, PinyinHelper.GetPinyin(Regex.Replace(s, @"\p{P}|\p{S}", ""))));
}
}
}
/// <summary>
/// Adds each keyword to the segmenter and indexes it under two pinyin keys:
/// the full pinyin with spaces removed, and the initial letters of each syllable.
/// Only characters in the range U+4E00..U+9FA5 contribute to the pinyin keys.
/// </summary>
/// <param name="words">Keywords to add.</param>
public static void AddWords(IEnumerable<string> words)
{
    foreach (var s in words)
    {
        JiebaSegmenter.AddWord(s);

        var pinyin = PinyinHelper.GetPinyin(Regex.Replace(s, @"[^\u4e00-\u9fa5]", ""));
        if (string.IsNullOrEmpty(pinyin))
        {
            continue;
        }

        // Culture-invariant lowering: ToLower() would turn 'I' into dotless 'ı'
        // under Turkish-style cultures and silently break the lookup keys.
        // Empty segments are dropped so the initial-letter projection cannot index past the end.
        var syllables = pinyin.ToLowerInvariant().Split(' ').Where(ss => ss.Length > 0).ToArray();
        if (syllables.Length == 0)
        {
            continue;
        }

        Pinyins.Add((string.Concat(syllables), s));
        Pinyins.Add((new string(syllables.Select(ss => ss[0]).ToArray()), s));

        // PinyinsLookup is a cached snapshot of Pinyins; drop it so the next read
        // rebuilds it and includes the entries just added.
        _pinyinsLookup = null;
    }
}

/// <summary>
/// Adds <paramref name="word"/> and every entry of <paramref name="words"/> to the
/// segmenter and indexes each of them under two pinyin keys: the full pinyin with
/// spaces removed, and the initial letters of each syllable.
/// Only characters in the range U+4E00..U+9FA5 contribute to the pinyin keys.
/// </summary>
/// <param name="word">First keyword to add.</param>
/// <param name="words">Further keywords to add.</param>
public static void AddWords(string word, params string[] words)
{
    // The leading word is indexed as well, so it is searchable by pinyin
    // exactly like a word added through the single-argument overload.
    Register(word);
    foreach (var s in words)
    {
        Register(s);
    }

    // Adds one keyword to the segmenter and to the pinyin index.
    void Register(string item)
    {
        JiebaSegmenter.AddWord(item);

        var pinyin = PinyinHelper.GetPinyin(Regex.Replace(item, @"[^\u4e00-\u9fa5]", ""));
        if (string.IsNullOrEmpty(pinyin))
        {
            return;
        }

        // Culture-invariant lowering: ToLower() would turn 'I' into dotless 'ı'
        // under Turkish-style cultures and silently break the lookup keys.
        // Empty segments are dropped so the initial-letter projection cannot index past the end.
        var syllables = pinyin.ToLowerInvariant().Split(' ').Where(ss => ss.Length > 0).ToArray();
        if (syllables.Length == 0)
        {
            return;
        }

        Pinyins.Add((string.Concat(syllables), item));
        Pinyins.Add((new string(syllables.Select(ss => ss[0]).ToArray()), item));

        // PinyinsLookup is a cached snapshot of Pinyins; drop it so the next read
        // rebuilds it and includes the entries just added.
        _pinyinsLookup = null;
    }
}
}
}
Loading

0 comments on commit b9bae0d

Please sign in to comment.