move from private repo to github
Maxul committed May 14, 2022
1 parent 607b60e commit d4e4ade
Showing 1,169 changed files with 542,229 additions and 2 deletions.
19 changes: 19 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
default: all

all:
	cd client ; make all -j8 ; cd -
	cd manager ; make all -j8 ; cd -
	cd executor ; make all -j8 ; cd -

	cd kanban/deps ; make hdr_histogram ; make linenoise
	cd kanban ; make -j8
	cd kanban ; ./src/redis-server ./redis.conf &
	sleep 1
	cd kanban ; ./src/redis-cli FLUSHALL
	pkill redis-server

clean:
	cd client ; make clean ; cd -
	cd manager ; make clean ; cd -
	cd executor ; make clean ; cd -
	cd kanban ; make clean ; cd -
114 changes: 112 additions & 2 deletions README.md
@@ -1,3 +1,113 @@
# Desearch

Desearch is an experimental system for building a decentralized search engine.
To achieve good scalability and fault tolerance, desearch decouples computation from storage, pairing a stateless trusted network with a stateful, blockchain-regulated cloud store.

The current desearch implementation relies on Intel SGX as the trusted hardware and Redis as the store.

**Warning: This repo hosts an academic proof-of-concept prototype and has not received a careful code review.**

## Getting Started

Hardware Requirement: SGX-capable desktops or SGX-capable cloud machines. To check whether your machine supports SGX, please refer to [Intel SGX Hardware](https://github.com/ayeks/SGX-hardware).
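As a quick local heuristic (an assumption on my part, not a substitute for the link above), the `sgx` flag in `/proc/cpuinfo` indicates kernel-visible SGX support on Linux:

```shell
# Heuristic only: the flag can be absent even on SGX-capable CPUs
# when SGX is disabled in the BIOS.
if grep -qw sgx /proc/cpuinfo 2>/dev/null; then
  echo "SGX flag present"
else
  echo "SGX flag absent (check BIOS, or use the non-SGX executor)"
fi
```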

Note that if you wish to run several SGX nodes on one local machine without Scalable SGX support, bootstrapping the whole system may take longer because encrypted memory is scarce (usually 128 MB or 256 MB on SGXv1).

This repo provides a non-SGX version in the `executor` folder and an SGX version in the `sgx-executor` folder. The non-SGX version mirrors the SGX version's folder structure and makes debugging easier if you want to build extensions to desearch. Note that only the SGX version contains ORAM-based queriers.

### Prerequisite

Under Ubuntu 20.04, building desearch executors requires the following dependencies to be installed:

- [SGX SDK + SGX PSW](https://01.org/intel-software-guard-extensions/downloads)
- [Boost C++ Libraries](https://www.boost.org/)
- [Hiredis](https://github.com/redis/hiredis)
- [Redis++](https://github.com/sewenew/redis-plus-plus)

You can refer to the `deps` folder for the correct versions of Redis, Hiredis, and Redis++.

```shell
apt install -y libboost-all-dev

cd deps
tar zxf redis-6.2.6.tar.gz
tar zxf redis-plus-plus-1.2.3.tar.gz

pushd redis-6.2.6/deps/hiredis/
make -j$(nproc) && make install
popd

pushd redis-plus-plus-1.2.3
mkdir build && cd build && cmake ..
make -j$(nproc) && make install
popd
```
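After installing, a quick sanity check (assuming the default `/usr/local` install prefix) is to confirm the dynamic loader can see both libraries before building the executors:

```shell
# Prints matching entries, or a warning if neither library is registered.
ldconfig -p | grep -E 'libhiredis|libredis\+\+' \
  || echo "warning: hiredis/redis++ not found by the loader (try running ldconfig)"
```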

### Kanban

Kanban is an unmodified [Redis](https://redis.io/) instance that serves as the relatively cheap cloud store.

Simply start the Redis server:
```shell
cd kanban
cd deps && make hdr_histogram && make linenoise
cd - && make all
./src/redis-server ./redis.conf
```
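The exact contents of `redis.conf` are repo-specific; as an illustration only, these are standard Redis directives you would most likely tune for a Kanban deployment (the values shown are assumptions, not the repo's defaults):

```conf
# Illustrative settings, not taken from this repo's redis.conf:
bind 127.0.0.1      # listen address; use a public IP for a WAN setup
port 6379           # Redis default port
daemonize no        # keep in the foreground so logs stay visible
```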

To clear all state from Kanban, issue `./src/redis-cli FLUSHALL`.

### Manager

Manager is a special executor that makes Kanban resistant to tampering.

```shell
cd manager
make all
./manager
```

### Executor

The executor implements the whole search pipeline. This release consolidates all search roles within one executable. You are free to modify `executor/start.sh` to launch as many executors as you like.

```shell
cd executor
make all
bash ./start.sh
```

### Client

The client is a Web server that serves as the entry point for desearch.

```sh
cd client
make all
./client
```
Then open `client/index.html` in a browser, or use the fancier Web entry `client/WebUI/index.html`.

![demo](img/demo.png)
![fancy](img/fancy.png)

### How to build a distributed setup

To extend to a WAN setup, you need to modify a few network configurations:
- `config.hpp`: change `KANBAN_ADDR` to a global public IP
- `executor/start.sh`: change querier IP address to a public one that clients can reach
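As a sketch of the first change, assuming `KANBAN_ADDR` is a `#define` in `config.hpp` (the actual form in the repo may differ), a one-liner like this rewrites it:

```shell
# Hypothetical helper; 203.0.113.10 is a documentation-only example address.
PUBLIC_IP="203.0.113.10"
sed -i.bak "s/^#define KANBAN_ADDR.*/#define KANBAN_ADDR \"${PUBLIC_IP}\"/" config.hpp
```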

### Limitations and FAQs

See [executor/README.md](executor/README.md)

## Contributors

- Mingyu Li
- Jinhao Zhu
- Tianxu Zhang
- Sajin Sasy: [ZeroTrace](https://github.com/sshsshy/ZeroTrace)

## License

MulanPSL-2.0 (see [here](https://opensource.org/licenses/MulanPSL-2.0))
16 changes: 16 additions & 0 deletions bench/README.md
@@ -0,0 +1,16 @@
# Benchmark

This folder hosts a candidate keyword list named `top-10k-words.txt` that we sampled from the Steemit dataset.

`bench.go` shows how we benchmarked desearch queriers.
Note that the dependency [Tachymeter](https://github.com/jamiealquiza/tachymeter) is required.

You can also use `ApacheBench`:
```shell
ab -n 10000 -c 1 http://127.0.0.1:12000/query/steemit/page/1
```

To increase the concurrency level of the client, you can update the thread pool size in `client/contrib/SimpleWeb/server_http.hpp`:
```cpp
346 std::size_t thread_pool_size = 1;
```
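Assuming the declaration looks exactly as quoted above, a scripted way to raise the pool size to 8 (a hypothetical value) is:

```shell
# Rewrites the single-threaded default in place; adjust 8 to taste.
sed -i 's/std::size_t thread_pool_size = 1;/std::size_t thread_pool_size = 8;/' \
  client/contrib/SimpleWeb/server_http.hpp
```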
101 changes: 101 additions & 0 deletions bench/bench.go
@@ -0,0 +1,101 @@
// Fetchall fetches URLs in parallel and reports their times and sizes.
package main

import (
	"flag"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"math/rand"
	"net/http"
	"strings"
	"time"

	"github.com/jamiealquiza/tachymeter"
)

var (
	request_number = flag.Int("requests", 1000, "number of requests")
	keyword_number = flag.Int("keywords", 1, "number of keywords")
)

func main() {
	flag.Parse()

	config := tachymeter.New(&tachymeter.Config{Size: 50})

	// read the entire keyword list into memory
	content, err := ioutil.ReadFile("./top-10k-words.txt")
	if err != nil {
		log.Fatal(err)
	}
	keywords := strings.Split(string(content), "\n")

	// Fisher–Yates shuffle
	rand.Seed(time.Now().UnixNano())
	for i := len(keywords) - 1; i > 0; i-- {
		j := rand.Intn(i + 1)
		keywords[i], keywords[j] = keywords[j], keywords[i]
	}

	// NOTE: add the list of all the queriers below,
	// otherwise you are just benchmarking the local web server.
	urls := []string{
		// "http://localhost:12000" + "/query/",
	}
	if len(urls) == 0 {
		log.Fatal("no querier URLs configured")
	}

	benchmark_start := time.Now()
	ch := make(chan string)
	for i := 0; i < *request_number; i++ {
		// join keyword_number consecutive keywords with '+'
		keyword_list := ""
		for j := 0; j < *keyword_number-1; j++ {
			keyword_list += keywords[(i+j)%10000] + "+"
		}
		keyword_list += keywords[(i+*keyword_number-1)%10000]

		url := urls[i%len(urls)] + keyword_list + "/page/1"
		go func(url string, ch chan<- string) { // one goroutine per request
			start := time.Now()
			resp, err := http.Get(url)
			if err != nil {
				ch <- fmt.Sprint(err) // send the error to channel ch
				return
			}

			nbytes, err := io.Copy(ioutil.Discard, resp.Body)
			resp.Body.Close() // don't leak resources
			if err != nil {
				ch <- fmt.Sprintf("while reading %s: %v", url, err)
				return
			}
			config.AddTime(time.Since(start))
			secs := time.Since(start).Seconds()
			ch <- fmt.Sprintf("%.2fs %7d %s", secs, nbytes, url)
		}(url, ch)
	}
	for i := 0; i < *request_number; i++ {
		fmt.Println(<-ch) // receive from channel ch
	}
	elapsed := time.Since(benchmark_start)
	fmt.Printf("%.2fs elapsed\n", elapsed.Seconds())
	fmt.Printf("Throughput: %f ops/sec\n", float64(*request_number)/elapsed.Seconds())

	// Calculate the latency summary.
	results := config.Calc()

	// Print JSON format to console.
	fmt.Printf("%s\n\n", results.JSON())

	// Print pre-formatted console output.
	fmt.Printf("%s\n\n", results)

	// Print a text histogram.
	fmt.Println(results.Histogram.String(15))
}