加入暗黑模式

HotSec · May 12, 2022 · d8c9949 · d8c9949
1 parent e0d1f76
commit d8c9949
Show file tree

Hide file tree

Showing 15 changed files with 428 additions and 245 deletions.
diff --git a/main.go b/main.go
@@ -73,10 +73,8 @@ func initContainer(args Args, tokenizer *words.Tokenizer) *searcher.Container {
 		Tokenizer: tokenizer,
 		Shard:     args.Shard,
 	}
-	err := container.Init()
-	if err != nil {
-		panic(err)
-	}
+	go container.Init()
+
 	return container
 }
 

diff --git a/searcher/engine.go b/searcher/engine.go
@@ -316,6 +316,7 @@ func (e *Engine) MultiSearch(request *model.SearchRequest) *model.SearchResult {
 
 	fastSort := &sorts.FastSort{
 		IsDebug: e.IsDebug,
+		Order:   request.Order,
 	}
 
 	_time := utils.ExecTime(func() {
@@ -336,6 +337,14 @@ func (e *Engine) MultiSearch(request *model.SearchRequest) *model.SearchResult {
 	// 处理分页
 	request = request.GetAndSetDefault()
 
+	//计算交集得分和去重
+	fastSort.Process()
+
+	wordMap := make(map[string]bool)
+	for _, word := range words {
+		wordMap[word] = true
+	}
+
 	//读取文档
 	var result = &model.SearchResult{
 		Total: fastSort.Count(),
@@ -347,14 +356,6 @@ func (e *Engine) MultiSearch(request *model.SearchRequest) *model.SearchResult {
 	_time += utils.ExecTime(func() {
 
 		pager := new(pagination.Pagination)
-		var resultItems []model.SliceItem
-		_tt := utils.ExecTime(func() {
-			resultItems = fastSort.GetAll(request.Order)
-		})
-
-		if e.IsDebug {
-			log.Println("处理排序耗时", _tt, "ms")
-		}
 
 		pager.Init(request.Limit, fastSort.Count())
 		//设置总页数
@@ -364,15 +365,18 @@ func (e *Engine) MultiSearch(request *model.SearchRequest) *model.SearchResult {
 		if pager.PageCount != 0 {
 
 			start, end := pager.GetPage(request.Page)
-			items := resultItems[start:end]
-			count := len(items)
+
+			var resultItems = make([]model.SliceItem, 0)
+			fastSort.GetAll(&resultItems, start, end)
+
+			count := len(resultItems)
+
 			result.Documents = make([]model.ResponseDoc, count)
 			//只读取前面100个
 			wg := new(sync.WaitGroup)
 			wg.Add(count)
-			for index, item := range items {
-
-				go e.getDocument(item, &result.Documents[index], request, &words, wg)
+			for index, item := range resultItems {
+				go e.getDocument(item, &result.Documents[index], request, &wordMap, wg)
 			}
 			wg.Wait()
 		}
@@ -385,7 +389,7 @@ func (e *Engine) MultiSearch(request *model.SearchRequest) *model.SearchResult {
 	return result
 }
 
-func (e *Engine) getDocument(item model.SliceItem, doc *model.ResponseDoc, request *model.SearchRequest, words *[]string, wg *sync.WaitGroup) {
+func (e *Engine) getDocument(item model.SliceItem, doc *model.ResponseDoc, request *model.SearchRequest, wordMap *map[string]bool, wg *sync.WaitGroup) {
 	buf := e.GetDocById(item.Id)
 	defer wg.Done()
 	doc.Score = item.Score
@@ -395,15 +399,18 @@ func (e *Engine) getDocument(item model.SliceItem, doc *model.ResponseDoc, reque
 		storageDoc := new(model.StorageIndexDoc)
 		utils.Decoder(buf, &storageDoc)
 		doc.Document = storageDoc.Document
+		doc.Keys = storageDoc.Keys
 		text := storageDoc.Text
 		//处理关键词高亮
 		highlight := request.Highlight
 		if highlight != nil {
 			//全部小写
 			text = strings.ToLower(text)
 			//还可以优化，只替换击中的词
-			for _, word := range *words {
-				text = strings.ReplaceAll(text, word, fmt.Sprintf("%s%s%s", highlight.PreTag, word, highlight.PostTag))
+			for _, key := range storageDoc.Keys {
+				if ok := (*wordMap)[key]; ok {
+					text = strings.ReplaceAll(text, key, fmt.Sprintf("%s%s%s", highlight.PreTag, key, highlight.PostTag))
+				}
 			}
 		}
 		doc.Text = text
@@ -426,9 +433,7 @@ func (e *Engine) processKeySearch(word string, fastSort *sorts.FastSort, wg *syn
 		ids := make([]uint32, 0)
 		//解码
 		utils.Decoder(buf, &ids)
-		//ids越多，说明这个词频越高，这个词越重要
-		frequency := (len(ids) % base) + 1
-		fastSort.Add(ids, frequency)
+		fastSort.Add(&ids)
 	}
 
 }
@@ -523,10 +528,11 @@ func (e *Engine) Drop() error {
 		return err
 	}
 	for _, d := range dir {
-		err := os.RemoveAll(path.Join([]string{e.IndexPath, d.Name()}...))
+		err := os.RemoveAll(path.Join([]string{d.Name()}...))
 		if err != nil {
 			return err
 		}
+		os.Remove(e.IndexPath)
 	}
 
 	//清空内存

diff --git a/searcher/model/doc.go b/searcher/model/doc.go
@@ -15,7 +15,8 @@ type StorageIndexDoc struct {
 
 type ResponseDoc struct {
 	IndexDoc
-	Score int `json:"score,omitempty"` //得分
+	Score int      `json:"score,omitempty"` //得分
+	Keys  []string `json:"keys,omitempty"`
 }
 
 type RemoveIndexModel struct {

diff --git a/searcher/sorts/fast.go b/searcher/sorts/fast.go
@@ -3,6 +3,7 @@ package sorts
 import (
 	"gofound/searcher/model"
 	"sort"
+	"strings"
 	"sync"
 )
 
@@ -22,13 +23,13 @@ func (x ScoreSlice) Swap(i, j int) {
 	x[i], x[j] = x[j], x[i]
 }
 
-type SortSlice []model.SliceItem
+type SortSlice []uint32
 
 func (x SortSlice) Len() int {
 	return len(x)
 }
 func (x SortSlice) Less(i, j int) bool {
-	return x[i].Id < x[j].Id
+	return x[i] < x[j]
 }
 func (x SortSlice) Swap(i, j int) {
 	x[i], x[j] = x[j], x[i]
@@ -48,58 +49,93 @@ type FastSort struct {
 
 	data []model.SliceItem
 
-	count int //总数
-}
-
-func (f *FastSort) Add(ids []uint32, frequency int) {
-	f.Lock()
-	defer f.Unlock()
+	temps []uint32
 
-	for _, id := range ids {
+	count int //总数
 
-		found, index := find(f.data, id)
-		if found {
-			f.data[index].Score += 1
-		} else {
+	Order string //排序方式
+}
 
-			f.data = append(f.data, model.SliceItem{
-				Id:    id,
-				Score: 1,
-			})
-		}
-	}
-	f.count = len(f.data)
+func (f *FastSort) Add(ids *[]uint32) {
+	//f.Lock()
+	//defer f.Unlock()
+
+	//for _, id := range *ids {
+	//
+	//	found, index := f.find(&id)
+	//	if found {
+	//		f.data[index].Score += 1
+	//	} else {
+	//
+	//		f.data = append(f.data, model.SliceItem{
+	//			Id:    id,
+	//			Score: 1,
+	//		})
+	//		f.Sort()
+	//	}
+	//}
+	//f.count = len(f.data)
+	f.temps = append(f.temps, *ids...)
 }
 
 // 二分法查找
-func find(data []model.SliceItem, target uint32) (bool, int) {
+func (f *FastSort) find(target *uint32) (bool, int) {
+
 	low := 0
-	high := len(data) - 1
+	high := f.count - 1
 	for low <= high {
 		mid := (low + high) / 2
-		if data[mid].Id == target {
+		if f.data[mid].Id == *target {
 			return true, mid
-		} else if data[mid].Id < target {
+		} else if f.data[mid].Id < *target {
 			high = mid - 1
 		} else {
 			low = mid + 1
 		}
 	}
 	return false, -1
+	//for index, item := range f.data {
+	//	if item.Id == *target {
+	//		return true, index
+	//	}
+	//}
+	//return false, -1
 }
 
 // Count 获取数量
 func (f *FastSort) Count() int {
 	return f.count
 }
 
-func (f *FastSort) GetAll(order string) []model.SliceItem {
+// Sort 排序
+func (f *FastSort) Sort() {
+	if strings.ToLower(f.Order) == DESC {
+		sort.Sort(sort.Reverse(SortSlice(f.temps)))
+	} else {
+		sort.Sort(SortSlice(f.temps))
+	}
+}
+
+// Process 处理数据
+func (f *FastSort) Process() {
+	//计算重复
+	f.Sort()
 
-	if order == DESC {
-		sort.Sort(sort.Reverse(SortSlice(f.data)))
+	for _, temp := range f.temps {
+		if found, index := f.find(&temp); found {
+			f.data[index].Score += 1
+		} else {
+			f.data = append(f.data, model.SliceItem{
+				Id:    temp,
+				Score: 1,
+			})
+			f.count++
+		}
 	}
 	//对分数进行排序
 	sort.Sort(sort.Reverse(ScoreSlice(f.data)))
+}
+func (f *FastSort) GetAll(result *[]model.SliceItem, start int, end int) {
 
-	return f.data
+	*result = f.data[start:end]
 }
diff --git a/searcher/utils/utils.go b/searcher/utils/utils.go
@@ -11,6 +11,8 @@ import (
 	"log"
 	"os"
 	"path/filepath"
+	"regexp"
+	"strings"
 	"time"
 )
 
@@ -277,7 +279,7 @@ func ReleaseAssets(file fs.File, out string) {
 
 }
 
-// DirSizeMB getFileSize get file size by path(B)
+// DirSizeB walks path and returns its size in bytes.
 func DirSizeB(path string) int64 {
 	var size int64
 	filepath.Walk(path, func(_ string, info os.FileInfo, err error) error {
@@ -307,3 +309,36 @@ func exists(path string) bool {
 	_, err := os.Stat(path)
 	return err == nil || os.IsExist(err)
 }
+
+// RemovePunctuation 移除所有的标点符号
+func RemovePunctuation(str string) string {
+	reg := regexp.MustCompile(`\p{P}+`)
+	return reg.ReplaceAllString(str, "")
+}
+
+// RemoveSpace 移除所有的空格
+func RemoveSpace(str string) string {
+	reg := regexp.MustCompile(`\s+`)
+	return reg.ReplaceAllString(str, "")
+}
+
+func contains(s *[]string, e string, skipIndex int) bool {
+	for index, a := range *s {
+		if index != skipIndex && strings.Contains(a, e) {
+			return true
+		}
+	}
+	return false
+}
+
+// GetLongWords 获取长词
+func GetLongWords(words *[]string) []string {
+
+	var newWords = make([]string, 0)
+	for index, w := range *words {
+		if !contains(words, w, index) {
+			newWords = append(newWords, w)
+		}
+	}
+	return newWords
+}
diff --git a/searcher/words/tokenizer.go b/searcher/words/tokenizer.go
@@ -36,6 +36,10 @@ func NewTokenizer(dictionaryPath string) *Tokenizer {
 func (t *Tokenizer) Cut(text string) []string {
 	//不区分大小写
 	text = strings.ToLower(text)
+	//移除所有的标点符号
+	text = utils.RemovePunctuation(text)
+	//移除所有的空格
+	text = utils.RemoveSpace(text)
 
 	var wordMap = make(map[string]int)
 

diff --git a/tests/http/index.http b/tests/http/index.http
@@ -65,9 +65,3 @@ Content-Type: application/json
     "number": 223
   }
 }
-
-###
-GET localhost:5678/api/dump
-Accept: application/json
-
-###