From 7fbc8775086274c27aee3292274db330139ab68a Mon Sep 17 00:00:00 2001
From: yanlifeng <1362520198@qq.com>
Date: Fri, 16 Dec 2022 09:32:02 +0800
Subject: [PATCH] tgs sample slave, somtimes RE ?

---
 Makefile              |  38 ++++--
 run.sh                |   1 +
 slave/Reference.h     |  39 ++++++
 slave/cmdinfo.h       | 112 +++++++++++++++++
 slave/dma_funcs.hpp   |  49 ++++++++
 slave/slave.cpp       | 112 +++++++++++++++++
 slave/state.h         | 161 +++++++++++++++++++++++++
 slave/tgsstate.h      |  97 +++++++++++++++
 slave/threadinfo.h    |  27 +++++
 slave/tuple_spawn.hpp |  71 +++++++++++
 src/Globals.h         |   1 +
 src/adapter.cpp       |  11 +-
 src/main.cpp          | 165 ++++++++++++++-----------
 src/seqc.cpp          | 256 +++++++++++++++++++++------------------
 src/seqc.h            |   2 +-
 src/state.cpp         |   4 +-
 src/tgsstate.cpp      |  99 ++++-----------
 src/tgsstate.h        |  15 ++-
 src/tuple_spawn.hpp   |  71 +++++++++++
 xlink.py              | 272 ++++++++++++++++++++++++++++++++++++++++++
 20 files changed, 1308 insertions(+), 295 deletions(-)
 create mode 100644 run.sh
 create mode 100644 slave/Reference.h
 create mode 100644 slave/cmdinfo.h
 create mode 100644 slave/dma_funcs.hpp
 create mode 100644 slave/slave.cpp
 create mode 100644 slave/state.h
 create mode 100644 slave/tgsstate.h
 create mode 100644 slave/threadinfo.h
 create mode 100644 slave/tuple_spawn.hpp
 create mode 100644 src/tuple_spawn.hpp
 create mode 100644 xlink.py

diff --git a/Makefile b/Makefile
index 1c5a657..04c7fa4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,14 @@
 PRINTDEBUG := 0
+InstructSet :=
 
 $(info            )
 
 DIR_INC := ./inc
 DIR_SRC := ./src
+SLAVE_DIR_SRC := ./slave
 DIR_OBJ := ./obj
 
+
 PREFIX ?= /usr/local
 BINDIR ?= $(PREFIX)/bin
 INCLUDE_DIRS ?= 
@@ -19,6 +22,10 @@ SRC2 := $(wildcard ${DIR_SRC}/*.c)
 OBJ += $(patsubst %.c,${DIR_OBJ}/%.o,$(notdir ${SRC2}))
 
 
+SRC3 := $(wildcard ${SLAVE_DIR_SRC}/*.cpp)
+OBJ += $(patsubst %.cpp,${DIR_OBJ}/%.o,$(notdir ${SRC3}))
+
+
 TARGET := RabbitQCPlus
 
 BIN_TARGET := ${TARGET}
@@ -28,29 +35,42 @@ CXX = sw5g++
 
 
 CXXFLAGS := $(InstructSet)
-CXXFLAGS += -DNDUBUG -std=c++11 -I./ -I./common -g -O3  -w -fopenmp
+CXXFLAGS += -DVerbose -std=c++11 -I./ -I./common -g -O3 -w -fopenmp
 
 
 CXX2 = sw5gcc
 CXXFLAGS2 := -g -O3 -w -Wextra -Wno-unknown-pragmas -Wcast-qual
 
-LIBS := -static -lz -Wl,-z,muldefs -lpthread  -fopenmp  -lrt 
+LIBS := -static -lz -Wl,-z,muldefs -lpthread -fopenmp -lrt
 
 
 LD_FLAGS := $(foreach librarydir,$(LIBRARY_DIRS),-L$(librarydir)) $(LIBS)
 
-
-
-
 ${BIN_TARGET}:${OBJ}
-	$(CXX) $(OBJ) -o $@ $(LD_FLAGS)
+		python3 xlink.py $(CXX) -mhybrid $^ -o $@ $(LD_FLAGS)
 
 ${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp
-	$(CXX) -c $< -o $@ $(CXXFLAGS)
+		$(CXX) -mhost -msimd -c $< -o $@ $(CXXFLAGS)
+${DIR_OBJ}/%.o:${SLAVE_DIR_SRC}/%.cpp
+		$(CXX) -mslave -msimd -c $< -o $@ $(CXXFLAGS)
+${DIR_OBJ}/%.o:${DIR_SRC}/%.c
+		$(CXX2) -mhost -msimd -c $< -o $@ $(CXXFLAGS2) 
 
 
-${DIR_OBJ}/%.o:${DIR_SRC}/%.c
-	$(CXX2) $(CXXFLAGS2) -c $< -o $@
+
+#${BIN_TARGET}:${OBJ}
+#		python3 xlink.py $(CXX) -mhybrid $^ -o $@ $(LD_FLAGS)
+#
+#${DIR_OBJ}/%.o:${DIR_SRC}/%.cpp
+#		$(CXX) -mhost -msimd -c $< -o $@ $(CXXFLAGS)
+#
+#${DIR_OBJ}/%.o:${SLAVE_DIR_SRC}/%.cpp
+#		$(CXX) -mslave -msimd -c $< -o $@ $(CXXFLAGS)
+#
+#${DIR_OBJ}/%.o:${DIR_SRC}/%.c
+#		$(CXX2) -mhost -msimd -c $< -o $@ $(CXXFLAGS2)
+#
+
 
 .PHONY:clean
 clean:
diff --git a/run.sh b/run.sh
new file mode 100644
index 0000000..9bafcf4
--- /dev/null
+++ b/run.sh
@@ -0,0 +1 @@
+bsub -b -I -q q_sw_expr -n 1  -cgsp 64 -sw3run swrun-5a -share_size 4096 ./RabbitQCPlus --help
diff --git a/slave/Reference.h b/slave/Reference.h
new file mode 100644
index 0000000..4bc7b9d
--- /dev/null
+++ b/slave/Reference.h
@@ -0,0 +1,39 @@
+#ifndef __REFERENCE_H_
+#define __REFERENCE_H_
+
+/**
+ * @brief Reference struct that store the FASTA and FASTQ infomation
+ */
+struct Reference {
+    std::string name;
+    std::string comment;
+    std::string seq;
+    std::string quality;
+    std::string strand;
+    uint64_t length;
+    uint64_t gid;
+};
+
+/**
+ * @brief Reference struct that store the FASTA and FASTQ infomation, different from Reference
+ *        neoReference only record the start position and length of name, sequence, strand and quality
+ *        base on certain chunk data `base`
+ */
+struct neoReference {
+    uint64_t pname;  /// name offset form base
+    uint64_t pcom;   /// comment offset form base
+    uint64_t pseq;   /// sequence offset form base
+    uint64_t pqual;  /// quality offset form base
+    uint64_t pstrand;///strand offset form base
+
+    uint64_t lname;    /// length of name
+    uint64_t lcom;     /// length of comment
+    uint64_t lseq;     /// length of sequence
+    uint64_t lqual;    /// length of quality
+    uint64_t lstrand;  /// length of strand
+    uint64_t gid;      /// global id
+    unsigned char *base;/// base data pointer
+};
+
+
+#endif
diff --git a/slave/cmdinfo.h b/slave/cmdinfo.h
new file mode 100644
index 0000000..eed2542
--- /dev/null
+++ b/slave/cmdinfo.h
@@ -0,0 +1,112 @@
+//
+// Created by ylf9811 on 2021/7/6.
+//
+
+#ifndef RERABBITQC_CMDINFO_H
+#define RERABBITQC_CMDINFO_H
+
+class CmdInfo {
+public:
+    CmdInfo();
+
+public:
+    std::string in_file_name1_;
+    std::string in_file_name2_;
+    std::string out_file_name1_;
+    std::string out_file_name2_;
+    int thread_number_;
+    std::string command_;
+    bool isPhred64_;
+    bool isStdin_;
+    bool isStdout_;
+
+    bool overWrite_;
+    bool write_data_;
+    int64_t out_block_size_;
+    int64_t in_file_size1_;
+    int64_t in_file_size2_;
+    int seq_len_;
+    int qul_range_;
+
+    //param for filter
+    int length_required_;
+    int length_limit_;
+    int n_number_limit_;
+    int low_qual_perc_limit_;
+    bool trim_5end_;
+    bool trim_3end_;
+    int cut_window_size_;
+    int cut_mean_quality_;
+    int trim_front1_;
+    int trim_tail1_;
+    int trim_front2_;
+    int trim_tail2_;
+
+
+    //param for adapter
+    bool trim_adapter_;
+    bool no_trim_adapter_;
+    bool se_auto_detect_adapter_;
+    bool pe_auto_detect_adapter_;
+    bool detect_adapter1_;
+    bool detect_adapter2_;
+    std::string adapter_seq1_;
+    std::string adapter_seq2_;
+    int overlap_diff_limit_;
+    int overlap_require_;
+    bool correct_data_;
+    bool analyze_overlap_;
+    int adapter_len_lim_;
+    bool print_what_trimmed_;
+    std::vector<std::string> adapter_from_fasta_;
+    std::string adapter_fasta_file_;
+
+    //duplicate
+    bool state_duplicate_;
+
+    //polyx
+    bool trim_polyx_;
+    bool trim_polyg_;
+    int trim_poly_len_;
+
+    //umi
+    bool add_umi_;
+    int umi_loc_;
+    int umi_len_;
+    std::string umi_prefix_;
+    int umi_skip_;
+
+    //TGS
+    bool is_TGS_;
+    int TGS_min_len_;
+
+    //overrepresentation_analysis
+    bool do_overrepresentation_;
+    int overrepresentation_sampling_;
+    std::vector<std::string> hot_seqs_;
+    std::vector<std::string> hot_seqs2_;
+    int eva_len_;
+    int eva_len2_;
+    bool print_ORP_seqs_;
+
+    //insert size
+    bool no_insert_size_;
+    int max_insert_size_;
+
+    //gz
+    int compression_level_;
+
+    //interleaved
+    bool interleaved_in_;
+    bool interleaved_out_;
+
+
+    //parallel gz
+    bool use_pugz_;
+    bool use_pigz_;
+    int pugz_threads_;
+    int pigz_threads_;
+};
+
+
+#endif//RERABBITQC_CMDINFO_H
diff --git a/slave/dma_funcs.hpp b/slave/dma_funcs.hpp
new file mode 100644
index 0000000..4d993bf
--- /dev/null
+++ b/slave/dma_funcs.hpp
@@ -0,0 +1,49 @@
+#pragma once
+#include <sys/cdefs.h>
+#define __sw_256bit_simd__
+#ifdef __sw_512bit_simd__
+typedef int desc_t __attribute__ ((__mode__(__V1XI__)));
+#elif defined(__sw_256bit_simd__)
+typedef int desc_t __attribute__ ((__mode__(__V1OI__)));
+#endif
+template <typename TM, typename TL>
+__always_inline void dma_getn(TM *mem, TL *ldm, int count) {
+  //long dma_desc[4];
+  desc_t dma_desc;
+  asm volatile(
+      "beq %[SIZE], 2f\n\t"
+      "memb\n\t"
+      "stl $31, 0(%[RPL])\n\t"
+      "vinsw %[RPL], $31, 2, %[DESC]\n\t"
+      "ldi %[DESC], 1($31)\n\t"
+      "sll %[DESC], 56, %[DESC]\n\t"
+      "vinsw %[SIZE], %[DESC], 0, %[DESC]\n\t"
+      "dma %[DESC], %[MEM], %[LDM]\n\t"
+      "1:\n\t"
+      "ldw %[DESC], 0(%[RPL])\n\t"
+      "beq %[DESC], 1b\n\t"
+      "2:\n\t"
+      "memb\n\t"
+      : [ DESC ] "=&r"(dma_desc)
+      : [ MEM ] "r"(mem), [ LDM ] "r"(ldm), [ RPL ] "r"(((long*)&dma_desc) + 2), [ SIZE ] "r"(count * sizeof(TL))
+      : "memory");
+}
+template <typename TM, typename TL>
+__always_inline __attribute__((__artificial__)) void dma_putn(TM *mem, TL *ldm, int count) {
+  desc_t dma_desc;
+  asm volatile(
+      "beq %[SIZE], 2f\n\t"
+      "memb\n\t"
+      "stl $31, 0(%[RPL])\n\t"
+      "vinsw %[RPL], $31, 2, %[DESC]\n\t"
+      "vinsw %[SIZE], %[DESC], 0, %[DESC]\n\t"
+      "dma %[DESC], %[MEM], %[LDM]\n\t"
+      "1:\n\t"
+      "ldw %[DESC], 0(%[RPL])\n\t"
+      "beq %[DESC], 1b\n\t"
+      "2:\n\t"
+      "memb\n\t"
+      : [ DESC ] "=&r"(dma_desc)
+      : [ MEM ] "r"(mem), [ LDM ] "r"(ldm), [ RPL ] "r"(((long*)&dma_desc) + 2), [ SIZE ] "r"(count * sizeof(TL))
+      : "memory");
+}
diff --git a/slave/slave.cpp b/slave/slave.cpp
new file mode 100644
index 0000000..3149212
--- /dev/null
+++ b/slave/slave.cpp
@@ -0,0 +1,112 @@
+
+#include <cstring>
+#include <vector>
+#include <unordered_map>
+#include <unistd.h>
+
+#include "tuple_spawn.hpp"
+#include "slave.h"
+#include "Reference.h"
+#include "cmdinfo.h"
+#include "state.h"
+#include "tgsstate.h"
+#include "threadinfo.h"
+
+
+struct qc_data {
+    ThreadInfo **thread_info_;
+    CmdInfo *cmd_info_;
+    std::vector <neoReference> *data_;
+    std::vector <neoReference> *pass_data_;
+    int *cnt;
+};
+
+void updateTop5(int rlen, double avgQual, TGSStats *tgs_state) {
+    int pos = 5;
+    for (int i = 0; i < 5; i++) {
+        if (tgs_state->mTop5QualReads[i].first < avgQual) {
+            for (int j = 4; j > i; j--) {
+                tgs_state->mTop5QualReads[j] = tgs_state->mTop5QualReads[j - 1];
+            }
+            tgs_state->mTop5QualReads[i] = {avgQual, rlen};
+            break;
+        }
+    }
+    for (int i = 0; i < 5; i++) {
+        if (tgs_state->mTop5LengReads[i].first < rlen) {
+            for (int j = 4; j > i; j--) {
+                tgs_state->mTop5LengReads[j] = tgs_state->mTop5LengReads[j - 1];
+            }
+            tgs_state->mTop5LengReads[i] = {rlen, avgQual};
+            break;
+        }
+    }
+}
+
+void tgsfunc(qc_data *para) {
+    //if(_PEN != 0) return;
+    std::vector <neoReference> *data = para->data_;
+    ThreadInfo *thread_info = para->thread_info_[_PEN];
+    CmdInfo *cmd_info = para->cmd_info_;
+    int data_num = data->size();
+    int base_num = 0;
+    for (int id = _PEN; id < data_num; id += 64) {
+    //for (int id = 0; id < data_num; id++) {
+        para->cnt[_PEN]++;
+        auto ref = (*data)[id];
+        //base_num += ref.lqual;
+        const int rlen = ref.lseq;
+        const char *seq = reinterpret_cast<const char *>(ref.base + ref.pseq);
+        const char *quality = reinterpret_cast<const char *>(ref.base + ref.pqual);
+        int size_range = thread_info->TGS_state_->mMinlen >> 1;
+        if(rlen < size_range) size_range = rlen;
+        if(thread_info->TGS_state_->countLenNum < (1 << 20))
+            thread_info->TGS_state_->mTotalReadsLen[thread_info->TGS_state_->countLenNum++] = rlen;
+        thread_info->TGS_state_->mReadsNum++;
+        thread_info->TGS_state_->mBasesNum += rlen;
+        char c1, c2;
+        int phredSub = 33;
+        if (cmd_info->isPhred64_) phredSub = 64;
+        int sumQual = 0;
+        for (int i = 0; i < rlen; i++) {
+            int qual = (quality[i] - phredSub);
+            sumQual += qual;
+            if (qual >= 5) thread_info->TGS_state_->mBases51015Num[0]++;
+            if (qual >= 10) thread_info->TGS_state_->mBases51015Num[1]++;
+            if (qual >= 15) thread_info->TGS_state_->mBases51015Num[2]++;
+        }
+        updateTop5(rlen, 1.0 * sumQual / rlen, thread_info->TGS_state_);
+        for (int i = 0; i < size_range; ++i) {
+            c1 = seq[i];
+            thread_info->TGS_state_->head_qual_sum[i] += (quality[i] - phredSub);
+            if (c1 == 'A') {
+                thread_info->TGS_state_->head_seq_pos_count[0][i]++;
+            } else if (c1 == 'T') {
+                thread_info->TGS_state_->head_seq_pos_count[1][i]++;
+            } else if (c1 == 'C') {
+                thread_info->TGS_state_->head_seq_pos_count[2][i]++;
+            } else if (c1 == 'G') {
+                thread_info->TGS_state_->head_seq_pos_count[3][i]++;
+            }
+        }
+        for (int i = rlen - 1; i >= rlen - size_range; --i) {
+            c2 = seq[i];
+            thread_info->TGS_state_->tail_qual_sum[i - (rlen - size_range)] += (quality[i] - phredSub);
+
+            if (c2 == 'A') {
+                thread_info->TGS_state_->tail_seq_pos_count[0][i - (rlen - size_range)]++;
+            } else if (c2 == 'T') {
+                thread_info->TGS_state_->tail_seq_pos_count[1][i - (rlen - size_range)]++;
+            } else if (c2 == 'C') {
+                thread_info->TGS_state_->tail_seq_pos_count[2][i - (rlen - size_range)]++;
+            } else if (c2 == 'G') {
+                thread_info->TGS_state_->tail_seq_pos_count[3][i - (rlen - size_range)]++;
+            }
+        }
+
+    }
+    //printf("=== %d base number %d\n", _PEN, base_num);
+}
+
+// template void tmpl_entrance(decltype(test__)*);
+ATHREAD_VISIBLE(tgsfunc);
diff --git a/slave/state.h b/slave/state.h
new file mode 100644
index 0000000..5f34e1a
--- /dev/null
+++ b/slave/state.h
@@ -0,0 +1,161 @@
+//
+// Created by ylf9811 on 2021/7/8.
+//
+
+#ifndef RERABBITQC_STATE_H
+#define RERABBITQC_STATE_H
+
+
+struct node {
+    int pre, cnt;
+    uint64_t v;
+    int slen;
+    std::string seq;
+    int64_t *dist;
+};
+
+class State {
+public:
+    State(CmdInfo *cmd_info, int seq_len, int qul_range, bool is_reed2);
+
+    ~State();
+
+    void StateInfo(neoReference &ref);
+
+    void StateORP(neoReference &ref);
+
+    void ExtendBuffer(int old_len, int new_len);
+
+    void Summarize();
+
+    static State *MergeStates(const std::vector<State *> &states);
+
+    static void PrintStates(const State *state);
+
+    static void PrintFilterResults(const State *state);
+
+    static void PrintAdapterToFile(const State *state);
+
+    static std::string list2string(int64_t *list, int size);
+
+    static std::string list2string(double *list, int size);
+
+    void HashInsert(const char *seq, int len, int eva_len);
+
+    void HashState();
+
+    //void HashQueryAndAdd(uint64_t now, int offset, int len, int eva_len);
+
+    bool HashQueryAndAdd(uint64_t now, int offset, int len, int eva_len);
+
+    int *GetHeadHashGraph() const;
+
+    node *GetHashGraph() const;
+
+    int GetHashNum() const;
+
+    int64_t GetQ20Bases() const;
+
+    int64_t GetQ30Bases() const;
+
+    int64_t GetLines() const;
+
+    int GetMallocSeqLen() const;
+
+    int GetQulRange() const;
+
+    int GetRealSeqLen() const;
+
+    int GetKmerBufLen() const;
+
+    int64_t *GetPosQul() const;
+
+    int64_t *GetPosCnt() const;
+
+    int64_t *GetLenCnt() const;
+
+    int64_t *GetGcCnt() const;
+
+    int64_t *GetQulCnt() const;
+
+    int64_t *GetKmer() const;
+
+    int64_t GetKmerMin() const;
+
+    int64_t GetKmerMax() const;
+
+    bool IsHasSummarize() const;
+
+    int64_t GetTotBases() const;
+
+    int64_t GetGcBases() const;
+
+    double GetOrpCost() const;
+
+    double GetAvgLen() const;
+
+    void AddPassReads();
+
+    void AddFailShort();
+
+    void AddFailLong();
+
+    void AddFailN();
+
+    void AddFailLowq();
+
+    void AddTrimAdapter();
+
+    void AddTrimAdapterBase(int cnt);
+
+
+    std::unordered_map<std::string, int> GetAdapterMap();
+
+    CmdInfo *GetCmdInfo() const;
+
+private:
+    CmdInfo *cmd_info_;
+
+    int64_t q20bases_;
+    int64_t q30bases_;
+    int64_t lines_;
+    int malloc_seq_len_;
+    int qul_range_;
+    int real_seq_len_;
+    int kmer_buf_len_;
+    int64_t *pos_qul_;
+    int64_t *pos_cnt_;
+    int64_t *len_cnt_;
+    int64_t *gc_cnt_;
+    int64_t *qul_cnt_;
+    int64_t *kmer_;
+    int64_t kmer_min_;
+    int64_t kmer_max_;
+    int64_t tot_bases_;
+    int64_t gc_bases_;
+    double avg_len;
+    bool has_summarize_;
+    int *head_hash_graph_;
+    node *hash_graph_;
+    int hash_num_;
+    bool is_read2_;
+    bool do_over_represent_analyze_;
+    int over_representation_sampling_;
+    int64_t over_representation_qcnt_;
+    int64_t over_representation_pcnt_;
+    int64_t *bf_zone_;
+    double orpCost;
+
+public:
+    int64_t pass_reads_;
+    int64_t fail_short_;
+    int64_t fail_long_;
+    int64_t fail_N_;
+    int64_t fail_lowq_;
+    int64_t trim_adapter_;
+    int64_t trim_adapter_bases_;
+    std::unordered_map<std::string, int> adapter_map_;
+};
+
+
+#endif//RERABBITQC_STATE_H
diff --git a/slave/tgsstate.h b/slave/tgsstate.h
new file mode 100644
index 0000000..d94b5b5
--- /dev/null
+++ b/slave/tgsstate.h
@@ -0,0 +1,97 @@
+//
+// Created by ylf9811 on 2021/8/11.
+//
+
+#ifndef RERABBITQC_TGSSTATE_H
+#define RERABBITQC_TGSSTATE_H
+
+
+class TGSStats {
+public:
+    TGSStats(int minLen, bool isMerge = false);
+
+    ~TGSStats();
+
+    static TGSStats *merge(std::vector<TGSStats *> &list);
+
+    void print();
+
+    void CalReadsLens();
+
+    bool isLongRead();
+
+    static std::string list2string(double *list, int size);
+
+    static std::string list2string(double *list, int size, int64_t *coords);
+
+    static std::string list2string(int64_t *list, int size);
+
+    static std::string list2stringReversedOrder(int64_t *list, int size);
+
+    int base2num(std::string base);
+
+//private:
+public:
+    int mMinlen;
+
+    int mHalfMinlen;
+
+    //std::vector<int> mTotalReadsLen;
+    int *mTotalReadsLen;
+
+    int countLenNum; 
+    
+    int *readsLens;
+
+    int64_t *head_seq_pos_count[4];
+
+    int64_t *tail_seq_pos_count[4];
+
+    int64_t *head_qual_sum;
+
+    int64_t *tail_qual_sum;
+
+    int mMaxReadsLen;
+
+    double mAvgReadsLen;
+
+    int64_t mReadsNum;
+
+    int64_t mBasesNum;
+
+    int64_t *mBases51015Num;
+
+    std::pair<double, int> *mTop5QualReads;
+
+    std::pair<int, double> *mTop5LengReads;
+
+    void updateTop5(int rlen, double avgQual);
+
+public:
+    int64_t *const *GetHeadSeqPosCount() const;
+
+    int64_t *const *GetTailSeqPosCount() const;
+
+    const int64_t *GetHeadQualSum() const;
+
+    const int64_t *GetTailQualSum() const;
+
+    const int GetMaxReadsLen() const;
+
+    const double GetAvgReadsLen() const;
+
+    const int64_t GetReadsNum() const;
+
+    const int64_t GetBasesNum() const;
+
+    const int64_t *GetBases51015Num() const;
+
+    const std::pair<double, int> *GetTop5QualReads() const;
+
+    const std::pair<int, double> *GetTop5LengReads() const;
+
+    int *GetReadsLens() const;
+};
+
+
+#endif//RERABBITQC_TGSSTATE_H
diff --git a/slave/threadinfo.h b/slave/threadinfo.h
new file mode 100644
index 0000000..245226a
--- /dev/null
+++ b/slave/threadinfo.h
@@ -0,0 +1,27 @@
+//
+// Created by ylf9811 on 2021/7/7.
+//
+
+#ifndef RERABBITQC_THREADINFO_H
+#define RERABBITQC_THREADINFO_H
+
+
+class ThreadInfo {
+public:
+    ThreadInfo(CmdInfo *cmd_info, bool is_pre);
+
+    ~ThreadInfo();
+
+public:
+    bool is_pe_;
+    CmdInfo *cmd_info_;
+    TGSStats *TGS_state_;
+    State *pre_state1_;
+    State *pre_state2_;
+    State *aft_state1_;
+    State *aft_state2_;
+    int64_t *insert_size_dist_;
+};
+
+
+#endif//RERABBITQC_THREADINFO_H
diff --git a/slave/tuple_spawn.hpp b/slave/tuple_spawn.hpp
new file mode 100644
index 0000000..e348ac9
--- /dev/null
+++ b/slave/tuple_spawn.hpp
@@ -0,0 +1,71 @@
+#pragma once
+#include <tuple>
+#ifdef __sw_slave__
+#include "dma_funcs.hpp"
+namespace cxx11_apply
+{
+    template <int... Is>
+    struct integer_sequence
+    {
+    };
+    template <int N, int... Is>
+    struct GenSeq : GenSeq<N - 1, N - 1, Is...>
+    {
+    };
+    template <int... Is>
+    struct GenSeq<0, Is...>
+    {
+        typedef integer_sequence<Is...> type;
+    };
+    template <typename Tt, typename Tf, int... Is>
+    void __apply(Tt t, Tf f, integer_sequence<Is...> seq)
+    {
+        f(std::get<Is>(t)...);
+    }
+    template <typename... Ts>
+    void apply(void (*F)(Ts...), std::tuple<Ts...> tuple)
+    {
+        typename GenSeq<sizeof...(Ts)>::type a;
+        __apply(tuple, F, a);
+    }
+}
+
+template <typename ...Ts>
+__always_inline void call_tuple(void (*f)(Ts...), Ts ...arg){
+  f(arg...);
+}
+template <typename F, typename ...Ts>
+__always_inline void __tuple_spawn_proxy(std::tuple<F, Ts...> *arg) {
+  cxx11_apply::apply(call_tuple<Ts...>, *arg);
+}
+template <typename F, typename ...Ts>
+__attribute__((noinline)) void tuple_spawn_proxy(std::tuple<F, Ts...> *arg) {
+  std::tuple<F, Ts...> ldm_arg;
+  dma_getn(arg, &ldm_arg, 1);
+  cxx11_apply::apply(call_tuple<Ts...>, ldm_arg);
+}
+template <typename ...Ts>
+void tmpl_entrance(void (*f)(Ts...)){
+  __asm__ __volatile__ (""
+    :
+    :"r"(tuple_spawn_proxy<void (*)(Ts...), Ts...>)
+    : "memory");
+}
+#define ATHREAD_VISIBLE(x) template void tmpl_entrance(decltype(x)*);
+
+#endif
+#ifdef __sw_host__
+extern "C"{
+#include <athread.h>
+}
+#include <tuple>
+
+template <typename F, typename ...Ts>
+extern void slave_tuple_spawn_proxy(std::tuple<F, Ts...>*);
+template <typename ...Ts>
+void athread_spawn_tupled(void (*f)(Ts...), Ts ...args) {
+  auto arg = std::tuple<decltype(f), Ts...>(f, args...);
+  __real_athread_spawn((void*)slave_tuple_spawn_proxy<decltype(f), Ts...>, &arg);
+}
+
+#endif
\ No newline at end of file
diff --git a/src/Globals.h b/src/Globals.h
index fd3efe8..dc6116c 100644
--- a/src/Globals.h
+++ b/src/Globals.h
@@ -51,6 +51,7 @@ Version: 2.00
 #include <string>
 #include <sys/time.h>
 
+#define slave_num 64
 
 inline double GetTime() {
     struct timeval tv;
diff --git a/src/adapter.cpp b/src/adapter.cpp
index 15b4696..adbcd8f 100644
--- a/src/adapter.cpp
+++ b/src/adapter.cpp
@@ -128,7 +128,7 @@ void Adapter::PreOverAnalyze(string file_name, vector<string> &hot_seqs, int &ev
 #ifdef Verbose
     double t0 = GetTime();
 #endif
-    auto *fastq_data_pool = new rabbit::fq::FastqDataPool(128, 1 << 22);
+    auto *fastq_data_pool = new rabbit::fq::FastqDataPool(4, 1 << 22);
     auto fqFileReader = new rabbit::fq::FastqFileReader(file_name, *fastq_data_pool, "",
             file_name.find(".gz") != string::npos);
     int64_t n_chunks = 0;
@@ -306,12 +306,12 @@ vector<string> Adapter::LoadAdaptersFromFasta(string file_name) {
 }
 
 int Adapter::EvalMaxLen(string file_name) {
-    auto *fastq_data_pool = new rabbit::fq::FastqDataPool(128, 1 << 22);
+    auto *fastq_data_pool = new rabbit::fq::FastqDataPool(4, 1 << 22);
     auto fqFileReader = new rabbit::fq::FastqFileReader(file_name, *fastq_data_pool, "",
             file_name.find(".gz") != string::npos);
     int64_t n_chunks = 0;
     // stat up to 256K reads
-    const long READ_LIMIT = 256 * 1024;
+    const long READ_LIMIT = 4 * 1024;
     const long BASE_LIMIT = 151 * READ_LIMIT;
     long records = 0;
     long bases = 0;
@@ -342,12 +342,11 @@ int Adapter::EvalMaxLen(string file_name) {
 }
 
 string Adapter::AutoDetect(string file_name, int trim_tail) {
-    auto *fastq_data_pool = new rabbit::fq::FastqDataPool(128, 1 << 22);
+    auto *fastq_data_pool = new rabbit::fq::FastqDataPool(4, 1 << 22);
     auto fqFileReader = new rabbit::fq::FastqFileReader(file_name, *fastq_data_pool, "",
             file_name.find(".gz") != string::npos);
     int64_t n_chunks = 0;
-    // stat up to 256K reads
-    const long READ_LIMIT = 256 * 1024;
+    const long READ_LIMIT = 4 * 1024;
     const long BASE_LIMIT = 151 * READ_LIMIT;
     long records = 0;
     long bases = 0;
diff --git a/src/main.cpp b/src/main.cpp
index 588558f..0ec962c 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -6,9 +6,10 @@
 #include "th_ass.h"
 #include <iostream>
 #include <fstream>
+
 using namespace std;
 
-inline bool exists_file (const std::string& name) {
+inline bool exists_file(const std::string &name) {
     ifstream f(name.c_str());
     return f.good();
 }
@@ -23,34 +24,41 @@ int main(int argc, char **argv) {
     app.add_option("-O,--outFile2", cmd_info.out_file_name2_, "output fastq name 2");
 
 
-    app.add_option("--compressLevel", cmd_info.compression_level_, "output file compression level (1 - 9), default is 4");
+    app.add_option("--compressLevel", cmd_info.compression_level_,
+                   "output file compression level (1 - 9), default is 4");
 
     app.add_flag("--overWrite", cmd_info.overWrite_, "overwrite out file if already exists.");
     app.add_flag("--phred64", cmd_info.isPhred64_, "input is using phred64 scoring, default is phred33");
     app.add_flag("--stdin", cmd_info.isStdin_,
-            "input from stdin, or -i /dev/stdin, only for se data or interleaved pe data(which means use --interleavedIn)");
+                 "input from stdin, or -i /dev/stdin, only for se data or interleaved pe data(which means use --interleavedIn)");
     app.add_flag("--stdout", cmd_info.isStdout_,
-            "output to stdout, or -o /dev/stdout, only for se data or interleaved pe data(which means use --interleavedOut)");
+                 "output to stdout, or -o /dev/stdout, only for se data or interleaved pe data(which means use --interleavedOut)");
 
 
+    int tmp_no_dup_ = 0;
+    app.add_flag("--noDuplicate", tmp_no_dup_, "don't do duplicate state, default is off");
     app.add_flag("-a,--noTrimAdapter", cmd_info.no_trim_adapter_, "don't trim adapter, default is off");
     app.add_flag("--decAdaForSe", cmd_info.se_auto_detect_adapter_, "detect adapter for se data, default is on");
     app.add_flag("--decAdaForPe", cmd_info.pe_auto_detect_adapter_,
-            "detect adapter for pe data, default is off, tool prefers to use overlap to find adapter");
+                 "detect adapter for pe data, default is off, tool prefers to use overlap to find adapter");
     app.add_flag("--printWhatTrimmed", cmd_info.print_what_trimmed_,
-            "if print what trimmed to *_trimmed_adapters.txt or not, default is not");
+                 "if print what trimmed to *_trimmed_adapters.txt or not, default is not");
     app.add_option("--adapterSeq1", cmd_info.adapter_seq1_, "specify adapter sequence for read1");
     app.add_option("--adapterSeq2", cmd_info.adapter_seq2_, "specify adapter sequence for read2");
     app.add_option("--adapterFastaFile", cmd_info.adapter_fasta_file_, "specify adapter sequences use fasta file");
     //app.add_option("--adapterLengthLimit", cmd_info.adapter_len_lim_, "minimum adapter length when trimming, default is 0");
 
-    app.add_flag("-c,--correctData", cmd_info.correct_data_, "correcting low quality bases using information from overlap, default is off");
+    app.add_flag("-c,--correctData", cmd_info.correct_data_,
+                 "correcting low quality bases using information from overlap, default is off");
 
-    app.add_option("-w,--threadNum", cmd_info.thread_number_, "number of thread used to do QC, including (de)compression for compressed data, default is 8");
+    app.add_option("-w,--threadNum", cmd_info.thread_number_,
+                   "number of thread used to do QC, including (de)compression for compressed data, default is 8");
 
     //filter
-    app.add_flag("-5,--trim5End", cmd_info.trim_5end_, "do sliding window from 5end to 3end to trim low quality bases, default is off");
-    app.add_flag("-3,--trim3End", cmd_info.trim_3end_, "do sliding window from 5end to 3end to trim low quality bases, default is off");
+    app.add_flag("-5,--trim5End", cmd_info.trim_5end_,
+                 "do sliding window from 5end to 3end to trim low quality bases, default is off");
+    app.add_flag("-3,--trim3End", cmd_info.trim_3end_,
+                 "do sliding window from 5end to 3end to trim low quality bases, default is off");
     app.add_option("--trimFront1", cmd_info.trim_front1_, "number of bases to trim from front in read1, default is 0");
     app.add_option("--trimFront2", cmd_info.trim_front2_, "number of bases to trim from front in read2, default is 0");
     app.add_option("--trimTail1", cmd_info.trim_tail1_, "number of bases to trim from tail in read1, default is 0");
@@ -64,34 +72,38 @@ int main(int argc, char **argv) {
     app.add_flag("-u,--addUmi", cmd_info.add_umi_, "do unique molecular identifier (umi) processing, default is off");
     app.add_option("--umiLen", cmd_info.umi_len_, "umi length if it is in read1/read2, default is 0");
     string umiLoc = "";
-    app.add_option("--umiLoc", umiLoc, "specify the location of umi, can be (index1/index2/read1/read2/per_index/per_read), default is 0");
-    app.add_option("--umiPrefix", cmd_info.umi_prefix_, "identification to be added in front of umi, default is no prefix");
+    app.add_option("--umiLoc", umiLoc,
+                   "specify the location of umi, can be (index1/index2/read1/read2/per_index/per_read), default is 0");
+    app.add_option("--umiPrefix", cmd_info.umi_prefix_,
+                   "identification to be added in front of umi, default is no prefix");
     app.add_option("--umiSkip", cmd_info.umi_skip_, "the number bases to skip if umi exists, default is 0");
 
     //app.add_option("--seqLen", cmd_info.seq_len_, "max sequence length, default is 200");
 
 
     //TGS
-    app.add_flag("--TGS", cmd_info.is_TGS_, "process third generation sequencing (TGS) data (only for se data, does not support trimming and will not produce output files), default is off");
+    app.add_flag("--TGS", cmd_info.is_TGS_,
+                 "process third generation sequencing (TGS) data (only for se data, does not support trimming and will not produce output files), default is off");
 
 
     //overrepresentation
     app.add_flag("-p,--doOverrepresentation", cmd_info.do_overrepresentation_,
-            "do over-representation sequence analysis, default is off");
+                 "do over-representation sequence analysis, default is off");
     app.add_option("-P,--overrepresentationSampling", cmd_info.overrepresentation_sampling_,
-            "do overrepresentation every [] reads, default is 20");
+                   "do overrepresentation every [] reads, default is 20");
 
     app.add_flag("--printORPSeqs", cmd_info.print_ORP_seqs_,
-            "if print overrepresentation sequences to *ORP_sequences.txt or not, default is not");
+                 "if print overrepresentation sequences to *ORP_sequences.txt or not, default is not");
     //insert size analyze
-    app.add_flag("--noInsertSize", cmd_info.no_insert_size_, "no insert size analysis (only for pe data), default is to do insert size analysis");
+    app.add_flag("--noInsertSize", cmd_info.no_insert_size_,
+                 "no insert size analysis (only for pe data), default is to do insert size analysis");
 
 
     //interleaved
     app.add_flag("--interleavedIn", cmd_info.interleaved_in_,
-            "use interleaved input (only for pe data), default is off");
+                 "use interleaved input (only for pe data), default is off");
     app.add_flag("--interleavedOut", cmd_info.interleaved_out_,
-            "use interleaved output (only for pe data), default is off");
+                 "use interleaved output (only for pe data), default is off");
 
 
     //parallel gz
@@ -114,7 +126,7 @@ int main(int argc, char **argv) {
     CLI11_PARSE(app, argc, argv);
 
 
-    if(cmd_info.compression_level_ < 1 || cmd_info.compression_level_ > 9){
+    if (cmd_info.compression_level_ < 1 || cmd_info.compression_level_ > 9) {
         printf("error : compression level should in [1, 9]!\n");
         return 0;
     }
@@ -124,24 +136,28 @@ int main(int argc, char **argv) {
         return 0;
     }
 
-    if(cmd_info.is_TGS_){
-        if(cmd_info.in_file_name2_.length() > 0){
+    if (tmp_no_dup_)cmd_info.state_duplicate_ = false;
+
+    if (cmd_info.is_TGS_) {
+        if (cmd_info.in_file_name2_.length() > 0) {
             printf("WARNING : the TGS module does not support pe inputs, so ignore the -I parameter.\n");
             cmd_info.in_file_name2_ = "";
         }
-        if(cmd_info.out_file_name1_.length() != 0 || cmd_info.out_file_name2_.length() != 0){
+        if (cmd_info.out_file_name1_.length() != 0 || cmd_info.out_file_name2_.length() != 0) {
             printf("WARNING : the TGS module does not support trimming and will not produce output files, so ignore the -o and -O parameter.\n");
             cmd_info.out_file_name1_ = "";
             cmd_info.out_file_name2_ = "";
         }
     }
 
-    if(cmd_info.in_file_name2_.length() > 0 && cmd_info.out_file_name2_.length() == 0 && cmd_info.interleaved_out_ == 0){
+    if (cmd_info.in_file_name2_.length() > 0 && cmd_info.out_file_name2_.length() == 0 &&
+        cmd_info.interleaved_out_ == 0) {
         error_exit("when interleaved_out module is off, two input files can not correspond to one output.\n");
     }
 
 
-    if(cmd_info.out_file_name2_.length() > 0 && cmd_info.in_file_name2_.length() == 0 && cmd_info.interleaved_in_ == 0){
+    if (cmd_info.out_file_name2_.length() > 0 && cmd_info.in_file_name2_.length() == 0 &&
+        cmd_info.interleaved_in_ == 0) {
         error_exit("when interleaved_in module is off, one input file can not correspond to two outputs.\n");
     }
 
@@ -151,16 +167,16 @@ int main(int argc, char **argv) {
     //check if files is ok
     int files_ok = 1;
     if (is_pe) {
-        if (!cmd_info.interleaved_in_){
-            if(ends_with(cmd_info.in_file_name1_, ".gz") != ends_with(cmd_info.in_file_name2_, ".gz"))
+        if (!cmd_info.interleaved_in_) {
+            if (ends_with(cmd_info.in_file_name1_, ".gz") != ends_with(cmd_info.in_file_name2_, ".gz"))
                 files_ok = 0;
         }
-        if (!cmd_info.interleaved_out_){
-            if(ends_with(cmd_info.out_file_name1_, ".gz") != ends_with(cmd_info.out_file_name2_, ".gz"))
+        if (!cmd_info.interleaved_out_) {
+            if (ends_with(cmd_info.out_file_name1_, ".gz") != ends_with(cmd_info.out_file_name2_, ".gz"))
                 files_ok = 0;
         }
     }
-    if(files_ok == 0){
+    if (files_ok == 0) {
         printf("error : for PE data, both files must be of the same type, i.e. cannot be one compressed and one uncompressed!");
         return 0;
     }
@@ -173,22 +189,22 @@ int main(int argc, char **argv) {
     if (in_gz) in_module |= 1;
     if (out_gz) in_module |= 2;
 
-    if(cmd_info.in_file_name2_.length() == 0)cmd_info.thread_number_ = min(32, cmd_info.thread_number_);
+    if (cmd_info.in_file_name2_.length() == 0)cmd_info.thread_number_ = min(32, cmd_info.thread_number_);
     else cmd_info.thread_number_ = min(cmd_info.thread_number_, 64);
 
     int t1, t2, t3;
     if (is_pe) {
         //PE
         int tot_threads = cmd_info.thread_number_;
-        if (in_module == 3){
+        if (in_module == 3) {
             t1 = thread_assignment_pe3[tot_threads].pugz_t;
             t2 = thread_assignment_pe3[tot_threads].w_t;
             t3 = thread_assignment_pe3[tot_threads].pigz_t;
-        } else if (in_module == 2){
+        } else if (in_module == 2) {
             t1 = thread_assignment_pe2[tot_threads].pugz_t;
             t2 = thread_assignment_pe2[tot_threads].w_t;
             t3 = thread_assignment_pe2[tot_threads].pigz_t;
-        } else if (in_module == 1){
+        } else if (in_module == 1) {
             t1 = thread_assignment_pe1[tot_threads].pugz_t;
             t2 = thread_assignment_pe1[tot_threads].w_t;
             t3 = thread_assignment_pe1[tot_threads].pigz_t;
@@ -197,18 +213,18 @@ int main(int argc, char **argv) {
             t2 = thread_assignment_pe0[tot_threads].w_t;
             t3 = thread_assignment_pe0[tot_threads].pigz_t;
         }
-    }else {
+    } else {
         //SE
         int tot_threads = cmd_info.thread_number_;
-        if (in_module == 3){
+        if (in_module == 3) {
             t1 = thread_assignment_se3[tot_threads].pugz_t;
             t2 = thread_assignment_se3[tot_threads].w_t;
             t3 = thread_assignment_se3[tot_threads].pigz_t;
-        } else if (in_module == 2){
+        } else if (in_module == 2) {
             t1 = thread_assignment_se2[tot_threads].pugz_t;
             t2 = thread_assignment_se2[tot_threads].w_t;
             t3 = thread_assignment_se2[tot_threads].pigz_t;
-        } else if (in_module == 1){
+        } else if (in_module == 1) {
             t1 = thread_assignment_se1[tot_threads].pugz_t;
             t2 = thread_assignment_se1[tot_threads].w_t;
             t3 = thread_assignment_se1[tot_threads].pigz_t;
@@ -220,20 +236,20 @@ int main(int argc, char **argv) {
     }
 
     cmd_info.thread_number_ = t2;
-    if (t1 == 0){
+    if (t1 == 0) {
         cmd_info.use_pugz_ = 0;
     } else {
         cmd_info.use_pugz_ = 1;
         cmd_info.pugz_threads_ = t1;
     }
-    if (t3 <= 1){
+    if (t3 <= 1) {
         cmd_info.use_pigz_ = 0;
     } else {
         cmd_info.use_pigz_ = 1;
         cmd_info.pigz_threads_ = t3;
     }
 
-    if(cmd_info.interleaved_in_ || cmd_info.interleaved_out_){
+    if (cmd_info.interleaved_in_ || cmd_info.interleaved_out_) {
         cmd_info.use_pugz_ = 0;
         cmd_info.use_pigz_ = 0;
     }
@@ -262,27 +278,27 @@ int main(int argc, char **argv) {
     if (cmd_info.in_file_name2_.length()) printf("inFile2 is %s\n", cmd_info.in_file_name2_.c_str());
     if (cmd_info.out_file_name1_.length()) {
         bool res = exists_file(cmd_info.out_file_name1_);
-        if(res){
+        if (res) {
             string tmps;
-            if(cmd_info.overWrite_){
+            if (cmd_info.overWrite_) {
                 tmps = "y";
-            }else{
+            } else {
                 char tmp[100];
                 printf("\n");
                 printf("%s already exists, overwrite it or not ? (y/n)\n", cmd_info.out_file_name1_.c_str());
                 scanf("%s", tmp);
                 tmps = string(tmp);
-                while(tmps != "y" && tmps != "n"){
+                while (tmps != "y" && tmps != "n") {
                     printf("please input y or n\n");
                     scanf("%s", tmp);
                     tmps = string(tmp);
                 }
             }
-            if(tmps == "y"){
+            if (tmps == "y") {
                 remove(cmd_info.out_file_name1_.c_str());
-            } else if(tmps == "n"){
+            } else if (tmps == "n") {
                 return 0;
-            } else{
+            } else {
                 assert(0);
             }
         }
@@ -290,27 +306,27 @@ int main(int argc, char **argv) {
     }
     if (cmd_info.out_file_name2_.length()) {
         bool res = exists_file(cmd_info.out_file_name2_);
-        if(res){
+        if (res) {
             string tmps;
-            if(cmd_info.overWrite_){
+            if (cmd_info.overWrite_) {
                 tmps = "y";
-            }else{
+            } else {
                 char tmp[100];
                 printf("\n");
                 printf("%s already exists, overwrite it or not ? (y/n)\n", cmd_info.out_file_name2_.c_str());
                 scanf("%s", tmp);
                 tmps = string(tmp);
-                while(tmps != "y" && tmps != "n"){
+                while (tmps != "y" && tmps != "n") {
                     printf("please input y or n\n");
                     scanf("%s", tmp);
                     tmps = string(tmp);
                 }
             }
-            if(tmps == "y"){
+            if (tmps == "y") {
                 remove(cmd_info.out_file_name2_.c_str());
-            } else if(tmps == "n"){
+            } else if (tmps == "n") {
                 return 0;
-            } else{
+            } else {
                 assert(0);
             }
         }
@@ -322,15 +338,15 @@ int main(int argc, char **argv) {
     string suffix_name1;
     string suffix_name2;
 
-    if(out_gz){
+    if (out_gz) {
         //printf("===out1 is %s\n", cmd_info.out_file_name1_.c_str());
-        if(cmd_info.out_file_name2_.length() > 0){
+        if (cmd_info.out_file_name2_.length() > 0) {
             //printf("===out2 is %s\n", cmd_info.out_file_name2_.c_str());
         }
         //printf("now change out file name...\n");
         int real_begin1 = 0;
-        for(int i = cmd_info.out_file_name1_.length() - 1; i >= 0; i--){
-            if(cmd_info.out_file_name1_[i] == '/'){
+        for (int i = cmd_info.out_file_name1_.length() - 1; i >= 0; i--) {
+            if (cmd_info.out_file_name1_[i] == '/') {
                 real_begin1 = i + 1;
                 break;
             }
@@ -340,22 +356,23 @@ int main(int argc, char **argv) {
         //printf("prefix_name1 %s\n", prefix_name1.c_str());
         //printf("suffix_name1 %s\n", suffix_name1.c_str());
         cmd_info.out_file_name1_ = prefix_name1 + "tmp_" + to_string(rand()) + suffix_name1;
-        if(cmd_info.out_file_name2_.length() > 0){
+        if (cmd_info.out_file_name2_.length() > 0) {
             int real_begin2 = 0;
-            for(int i = cmd_info.out_file_name2_.length() - 1; i >= 0; i--){
-                if(cmd_info.out_file_name2_[i] == '/'){
+            for (int i = cmd_info.out_file_name2_.length() - 1; i >= 0; i--) {
+                if (cmd_info.out_file_name2_[i] == '/') {
                     real_begin2 = i + 1;
                     break;
                 }
             }
             prefix_name2 = cmd_info.out_file_name2_.substr(0, real_begin2);
-            suffix_name2 = cmd_info.out_file_name2_.substr(real_begin2, cmd_info.out_file_name2_.length() - real_begin2);
+            suffix_name2 = cmd_info.out_file_name2_.substr(real_begin2,
+                                                           cmd_info.out_file_name2_.length() - real_begin2);
             //printf("prefix_name2 %s\n", prefix_name2.c_str());
             //printf("suffix_name2 %s\n", suffix_name2.c_str());
             cmd_info.out_file_name2_ = prefix_name2 + "tmp_" + to_string(rand()) + suffix_name2;
         }
         //printf("===out1 is %s\n", cmd_info.out_file_name1_.c_str());
-        if(cmd_info.out_file_name2_.length() > 0){
+        if (cmd_info.out_file_name2_.length() > 0) {
             //printf("===out2 is %s\n", cmd_info.out_file_name2_.c_str());
         }
     }
@@ -395,7 +412,7 @@ int main(int argc, char **argv) {
         if (umiLoc.empty())
             error_exit("You've enabled UMI by (--addUmi), you should specify the UMI location by (--umiLoc)");
         if (umiLoc != "index1" && umiLoc != "index2" && umiLoc != "read1" && umiLoc != "read2" &&
-                umiLoc != "per_index" && umiLoc != "per_read") {
+            umiLoc != "per_index" && umiLoc != "per_read") {
             error_exit("UMI location can only be index1/index2/read1/read2/per_index/per_read");
         }
         if (cmd_info.in_file_name2_.length() == 0 && (umiLoc == "index2" || umiLoc == "read2"))
@@ -429,7 +446,7 @@ int main(int argc, char **argv) {
 
 
     if (cmd_info.use_pugz_) {
-        if(cmd_info.pugz_threads_ > 8){
+        if (cmd_info.pugz_threads_ > 8) {
             //printf("pugz thread number must <= 8, now set pugz thread number == 8.\n");
             cmd_info.pugz_threads_ = 8;
         }
@@ -442,21 +459,23 @@ int main(int argc, char **argv) {
     //    printf("now use %d thread to do QC operations\n", cmd_info.thread_number_);
     //else
     //    printf("now use %d threads to do QC operations\n", cmd_info.thread_number_);
+    printf("eval len\n");
     int mx_len = Adapter::EvalMaxLen(cmd_info.in_file_name1_);
+    printf("eval done\n");
     //printf("auto detect max seqs len is %d\n", mx_len);
     cmd_info.seq_len_ = mx_len;
-    if(cmd_info.adapter_fasta_file_.length() > 0){
+    if (cmd_info.adapter_fasta_file_.length() > 0) {
         printf("loading adatper from %s\n", cmd_info.adapter_fasta_file_.c_str());
         cmd_info.adapter_from_fasta_ = Adapter::LoadAdaptersFromFasta(cmd_info.adapter_fasta_file_);
         sort(cmd_info.adapter_from_fasta_.begin(), cmd_info.adapter_from_fasta_.end());
-        for(auto item : cmd_info.adapter_from_fasta_)printf(" --- %s ---\n", item.c_str());
+        for (auto item: cmd_info.adapter_from_fasta_)printf(" --- %s ---\n", item.c_str());
     }
     double t_start = GetTime();
 
     if (cmd_info.in_file_name2_.length() || cmd_info.interleaved_in_) {
         //PE
         if ((cmd_info.out_file_name1_.length() > 0 && cmd_info.out_file_name2_.length() > 0) ||
-                cmd_info.interleaved_out_) {
+            cmd_info.interleaved_out_) {
             cmd_info.write_data_ = true;
         }
 
@@ -586,7 +605,9 @@ int main(int argc, char **argv) {
         }
         if (cmd_info.adapter_seq1_.length() > 0)
             cmd_info.adapter_len_lim_ = min(cmd_info.adapter_len_lim_, int(cmd_info.adapter_seq1_.length()));
+        printf("000\n");
         SeQc *se_qc = new SeQc(&cmd_info);
+        printf("111\n");
         if (cmd_info.is_TGS_) {
             se_qc->ProcessSeTGS();
         } else {
@@ -594,18 +615,18 @@ int main(int argc, char **argv) {
         }
         delete se_qc;
     }
-    if(out_gz){
+    if (out_gz) {
         string out_name1 = cmd_info.out_file_name1_;
         out_name1 = out_name1.substr(0, out_name1.find(".gz"));
         remove(out_name1.c_str());
-        if(cmd_info.out_file_name2_.length() > 0){
+        if (cmd_info.out_file_name2_.length() > 0) {
             string out_name2 = cmd_info.out_file_name2_;
             out_name2 = out_name2.substr(0, out_name2.find(".gz"));
             remove(out_name2.c_str());
         }
         string init_name1 = prefix_name1 + suffix_name1;
         rename(cmd_info.out_file_name1_.c_str(), init_name1.c_str());
-        if(cmd_info.out_file_name2_.length() > 0){
+        if (cmd_info.out_file_name2_.length() > 0) {
             string init_name2 = prefix_name2 + suffix_name2;
             rename(cmd_info.out_file_name2_.c_str(), init_name2.c_str());
         }
diff --git a/src/seqc.cpp b/src/seqc.cpp
index 4c4642f..695b4d6 100644
--- a/src/seqc.cpp
+++ b/src/seqc.cpp
@@ -2,8 +2,20 @@
 // Created by ylf9811 on 2021/7/6.
 //
 #include "seqc.h"
+#include "tuple_spawn.hpp"
 using namespace std;
 
+struct qc_data {
+    ThreadInfo **thread_info_;
+    CmdInfo *cmd_info_;
+    vector <neoReference> *data_;
+    vector <neoReference> *pass_data_;
+    int *cnt;
+};
+
+
+extern void slave_tgsfunc(qc_data *para);
+
 /**
  * @brief Construct function
  * @param cmd_info1 : cmd information
@@ -18,7 +30,7 @@ SeQc::SeQc(CmdInfo *cmd_info1) {
     in_is_zip_ = cmd_info1->in_file_name1_.find(".gz") != string::npos;
     out_is_zip_ = cmd_info1->out_file_name1_.find(".gz") != string::npos;
     if (cmd_info1->write_data_) {
-        out_queue_ = new moodycamel::ConcurrentQueue<pair<char *, int>>;
+        out_queue_ = new moodycamel::ConcurrentQueue<pair < char * , int>>;
         queueNumNow = 0;
         queueSizeLim = 1 << 5;
         if (out_is_zip_) {
@@ -56,10 +68,11 @@ SeQc::SeQc(CmdInfo *cmd_info1) {
         umier_ = new Umier(cmd_info1);
     }
     if (cmd_info1->use_pugz_) {
-        pugzQueue = new moodycamel::ReaderWriterQueue<pair<char *, int>>(1 << 10);
+        pugzQueue = new moodycamel::ReaderWriterQueue<pair < char * , int>>
+        (1 << 10);
     }
     if (cmd_info1->use_pigz_) {
-        pigzQueue = new moodycamel::ReaderWriterQueue<pair<char *, int>>;
+        pigzQueue = new moodycamel::ReaderWriterQueue<pair < char * , int>>;
         pigzLast.first = new char[1 << 24];
         pigzLast.second = 0;
     }
@@ -149,99 +162,117 @@ void SeQc::Read2Chars(neoReference &ref, char *out_data, int &pos) {
     out_data[pos++] = '\n';
 }
 
-/**
- * @brief get fastq data chunks from the data queue and do QC for them
- * @param thread_info : thread information
- * @param fastq_data_pool :a fastq data pool, it will be used to release data chunk
- * @param dq : data queue
- */
-void SeQc::ConsumerSeFastqTask(ThreadInfo *thread_info, rabbit::fq::FastqDataPool *fastq_data_pool,
+void SeQc::ConsumerSeFastqTask(ThreadInfo **thread_infos, rabbit::fq::FastqDataPool *fastq_data_pool,
                                rabbit::core::TDataQueue<rabbit::fq::FastqDataChunk> &dq) {
     rabbit::int64 id = 0;
     rabbit::fq::FastqDataChunk *fqdatachunk;
-
+    qc_data para;
+    para.cnt = new int[slave_num];
+    for(int i = 0; i < slave_num; i++) para.cnt[i] = 0;
+    para.cmd_info_ = cmd_info_;
+    para.thread_info_ = thread_infos;
     if (cmd_info_->is_TGS_) {
+        double t0 = GetTime();
+        double tsum = 0;
+        athread_init();
         while (dq.Pop(id, fqdatachunk)) {
-            vector<neoReference> data;
+            double tt0 = GetTime();
+            vector <neoReference> data;
             rabbit::fq::chunkFormat(fqdatachunk, data, true);
-            for (auto item: data) {
-                thread_info->TGS_state_->tgsStatRead(item, cmd_info_->isPhred64_);
-            }
+            //printf("format cost %lf\n", GetTime() - tt0);
+            tt0 = GetTime();
+            para.data_ = &data;
+            athread_spawn_tupled(slave_tgsfunc, &para);
+            athread_join();
+            //printf("slave cost %lf\n", GetTime() - tt0);
+            tsum += GetTime() - tt0;
             fastq_data_pool->Release(fqdatachunk);
         }
+        athread_halt();
+        //
+        int ttsum = 0;
+        for(int i = 0; i < slave_num; i++) ttsum += para.cnt[i];
+        printf("ttsum %d\n", ttsum);
+        printf("TGS tot cost %lf\n", GetTime() - t0);
+        printf("TGS slave cost %lf\n", tsum);
     } else {
+        int sum_ = 0;
         while (dq.Pop(id, fqdatachunk)) {
-            vector<neoReference> data;
-            vector<neoReference> pass_data;
+            vector <neoReference> data;
+            vector <neoReference> pass_data;
             rabbit::fq::chunkFormat(fqdatachunk, data, true);
             int out_len = 0;
+            sum_ += data.size();
+            printf("=== %d\n", sum_);
             for (auto item: data) {
-                thread_info->pre_state1_->StateInfo(item);
-                if (cmd_info_->state_duplicate_) {
-                    duplicate_->statRead(item);
-                }
-
-                if (cmd_info_->add_umi_) {
-                    umier_->ProcessSe(item);
-                }
-                bool trim_res = filter_->TrimSeq(item, cmd_info_->trim_front1_, cmd_info_->trim_tail1_);
-
-                if (trim_res && cmd_info_->trim_polyg_) {
-                    PolyX::trimPolyG(item, cmd_info_->trim_poly_len_);
-                }
-
-                if (trim_res && cmd_info_->trim_polyx_) {
-                    PolyX::trimPolyX(item, cmd_info_->trim_poly_len_);
-                }
-
-                if (trim_res && cmd_info_->trim_adapter_) {
-                    int res = 0;
-                    bool is_trimmed = false;
-                    if(cmd_info_->detect_adapter1_) {
-                        if (cmd_info_->print_what_trimmed_) {
-                            res = Adapter::TrimAdapter(item, cmd_info_->adapter_seq1_,
-                                    thread_info->aft_state1_->adapter_map_, cmd_info_->adapter_len_lim_, false);
-                        } else {
-                            res = Adapter::TrimAdapter(item, cmd_info_->adapter_seq1_, false);
-                        }
-                        if (res) {
-                            is_trimmed = true;
-                            thread_info->aft_state1_->AddTrimAdapter();
-                            thread_info->aft_state1_->AddTrimAdapterBase(res);
-                        }
-                    }
-                    if(cmd_info_->adapter_from_fasta_.size() > 0) {
-                        if (cmd_info_->print_what_trimmed_) {
-                            res = Adapter::TrimAdapters(item, cmd_info_->adapter_from_fasta_,
-                                    thread_info->aft_state1_->adapter_map_, cmd_info_->adapter_len_lim_, false);
-                        } else {
-                            res = Adapter::TrimAdapters(item, cmd_info_->adapter_from_fasta_, false);
-                        }
-                        if (res) {
-                            if(!is_trimmed) thread_info->aft_state1_->AddTrimAdapter();
-                            thread_info->aft_state1_->AddTrimAdapterBase(res);
-                        }
-
-                    }
-                }
-                
-                int filter_res = filter_->ReadFiltering(item, trim_res, cmd_info_->isPhred64_);
-                if (filter_res == 0) {
-                    thread_info->aft_state1_->StateInfo(item);
-                    thread_info->aft_state1_->AddPassReads();
-                    if (cmd_info_->write_data_) {
-                        pass_data.push_back(item);
-                        out_len += item.lname + item.lseq + item.lstrand + item.lqual + 4;
-                    }
-                } else if (filter_res == 1) {
-                    thread_info->aft_state1_->AddFailN();
-                } else if (filter_res == 2) {
-                    thread_info->aft_state1_->AddFailShort();
-                } else if (filter_res == 3) {
-                    thread_info->aft_state1_->AddFailLong();
-                } else if (filter_res == 4) {
-                    thread_info->aft_state1_->AddFailLowq();
-                }
+                //thread_info->pre_state1_->StateInfo(item);
+                //if (cmd_info_->state_duplicate_) {
+                //    duplicate_->statRead(item);
+                //}
+
+                //if (cmd_info_->add_umi_) {
+                //    umier_->ProcessSe(item);
+                //}
+                //bool trim_res = filter_->TrimSeq(item, cmd_info_->trim_front1_, cmd_info_->trim_tail1_);
+
+                //if (trim_res && cmd_info_->trim_polyg_) {
+                //    PolyX::trimPolyG(item, cmd_info_->trim_poly_len_);
+                //}
+
+                //if (trim_res && cmd_info_->trim_polyx_) {
+                //    PolyX::trimPolyX(item, cmd_info_->trim_poly_len_);
+                //}
+
+                //if (trim_res && cmd_info_->trim_adapter_) {
+                //    int res = 0;
+                //    bool is_trimmed = false;
+                //    if (cmd_info_->detect_adapter1_) {
+                //        if (cmd_info_->print_what_trimmed_) {
+                //            res = Adapter::TrimAdapter(item, cmd_info_->adapter_seq1_,
+                //                                       thread_info->aft_state1_->adapter_map_,
+                //                                       cmd_info_->adapter_len_lim_, false);
+                //        } else {
+                //            res = Adapter::TrimAdapter(item, cmd_info_->adapter_seq1_, false);
+                //        }
+                //        if (res) {
+                //            is_trimmed = true;
+                //            thread_info->aft_state1_->AddTrimAdapter();
+                //            thread_info->aft_state1_->AddTrimAdapterBase(res);
+                //        }
+                //    }
+                //    if (cmd_info_->adapter_from_fasta_.size() > 0) {
+                //        if (cmd_info_->print_what_trimmed_) {
+                //            res = Adapter::TrimAdapters(item, cmd_info_->adapter_from_fasta_,
+                //                                        thread_info->aft_state1_->adapter_map_,
+                //                                        cmd_info_->adapter_len_lim_, false);
+                //        } else {
+                //            res = Adapter::TrimAdapters(item, cmd_info_->adapter_from_fasta_, false);
+                //        }
+                //        if (res) {
+                //            if (!is_trimmed) thread_info->aft_state1_->AddTrimAdapter();
+                //            thread_info->aft_state1_->AddTrimAdapterBase(res);
+                //        }
+
+                //    }
+                //}
+
+                //int filter_res = filter_->ReadFiltering(item, trim_res, cmd_info_->isPhred64_);
+                //if (filter_res == 0) {
+                //    thread_info->aft_state1_->StateInfo(item);
+                //    thread_info->aft_state1_->AddPassReads();
+                //    if (cmd_info_->write_data_) {
+                //        pass_data.push_back(item);
+                //        out_len += item.lname + item.lseq + item.lstrand + item.lqual + 4;
+                //    }
+                //} else if (filter_res == 1) {
+                //    thread_info->aft_state1_->AddFailN();
+                //} else if (filter_res == 2) {
+                //    thread_info->aft_state1_->AddFailShort();
+                //} else if (filter_res == 3) {
+                //    thread_info->aft_state1_->AddFailLong();
+                //} else if (filter_res == 4) {
+                //    thread_info->aft_state1_->AddFailLowq();
+                //}
             }
 
             if (cmd_info_->write_data_) {
@@ -259,7 +290,7 @@ void SeQc::ConsumerSeFastqTask(ThreadInfo *thread_info, rabbit::fq::FastqDataPoo
 #endif
                         usleep(100);
                     }
-                    
+
                     out_queue_->enqueue(make_pair(out_data, out_len));
                     queueNumNow++;
                     //mylock.unlock();
@@ -392,7 +423,6 @@ void SeQc::PigzTask() {
     infos[4][tmp_level.length()] = '\0';
 
 
-
     infos[5] = "-f";
     infos[6] = "-b";
     infos[7] = "4096";
@@ -428,8 +458,8 @@ void SeQc::ProcessSeFastq() {
     auto *fastqPool = new rabbit::fq::FastqDataPool(32, 1 << 22);
     rabbit::core::TDataQueue<rabbit::fq::FastqDataChunk> queue1(32, 1);
 
-    auto **p_thread_info = new ThreadInfo *[cmd_info_->thread_number_];
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
+    auto **p_thread_info = new ThreadInfo *[slave_num];
+    for (int t = 0; t < slave_num; t++) {
         p_thread_info[t] = new ThreadInfo(cmd_info_, false);
     }
     thread *write_thread;
@@ -442,12 +472,9 @@ void SeQc::ProcessSeFastq() {
     }
 
     thread producer(
-            bind(&SeQc::ProducerSeFastqTask, this, cmd_info_->in_file_name1_, fastqPool, ref(queue1)));
-    auto **threads = new thread *[cmd_info_->thread_number_];
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
-        threads[t] = new thread(
-                bind(&SeQc::ConsumerSeFastqTask, this, p_thread_info[t], fastqPool, ref(queue1)));
-    }
+    bind(&SeQc::ProducerSeFastqTask, this, cmd_info_->in_file_name1_, fastqPool, ref(queue1)));
+    thread consumer(
+    bind(&SeQc::ConsumerSeFastqTask, this, p_thread_info, fastqPool, ref(queue1)));
     if (cmd_info_->use_pugz_) {
         pugzer->join();
     }
@@ -456,9 +483,7 @@ void SeQc::ProcessSeFastq() {
 #ifdef Verbose
     printf("producer cost %.4f\n", GetTime() - t0);
 #endif
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
-        threads[t]->join();
-    }
+    consumer.join();
 #ifdef Verbose
     printf("consumer cost %.4f\n", GetTime() - t0);
 #endif
@@ -473,10 +498,10 @@ void SeQc::ProcessSeFastq() {
     printf("all thrad done\n");
     printf("now merge thread info\n");
 #endif
-    vector<State *> pre_vec_state;
-    vector<State *> aft_vec_state;
+    vector < State * > pre_vec_state;
+    vector < State * > aft_vec_state;
 
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
+    for (int t = 0; t < slave_num; t++) {
         pre_vec_state.push_back(p_thread_info[t]->pre_state1_);
         aft_vec_state.push_back(p_thread_info[t]->aft_state1_);
     }
@@ -569,12 +594,10 @@ void SeQc::ProcessSeFastq() {
     delete fastqPool;
 
 
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
-        delete threads[t];
+    for (int t = 0; t < slave_num; t++) {
         delete p_thread_info[t];
     }
 
-    delete[] threads;
     delete[] p_thread_info;
     if (cmd_info_->write_data_) {
         delete write_thread;
@@ -592,33 +615,28 @@ void SeQc::ProcessSeTGS() {
     auto *fastqPool = new rabbit::fq::FastqDataPool(32, 1 << 22);
     rabbit::core::TDataQueue<rabbit::fq::FastqDataChunk> queue1(32, 1);
 
-    auto **p_thread_info = new ThreadInfo *[cmd_info_->thread_number_];
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
+    auto **p_thread_info = new ThreadInfo *[slave_num];
+    for (int t = 0; t < slave_num; t++) {
         p_thread_info[t] = new ThreadInfo(cmd_info_, false);
     }
     thread producer(
-            bind(&SeQc::ProducerSeFastqTask, this, cmd_info_->in_file_name1_, fastqPool, ref(queue1)));
-    auto **threads = new thread *[cmd_info_->thread_number_];
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
-        threads[t] = new thread(
-                bind(&SeQc::ConsumerSeFastqTask, this, p_thread_info[t], fastqPool, ref(queue1)));
-    }
+    bind(&SeQc::ProducerSeFastqTask, this, cmd_info_->in_file_name1_, fastqPool, ref(queue1)));
+    thread consumer(
+    bind(&SeQc::ConsumerSeFastqTask, this, p_thread_info, fastqPool, ref(queue1)));
 
     if (cmd_info_->use_pugz_) {
         pugzer->join();
     }
 
     producer.join();
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
-        threads[t]->join();
-    }
+    consumer.join();
 #ifdef Verbose
     printf("all thrad done\n");
     printf("now merge thread info\n");
 #endif
-    vector<TGSStats *> vec_state;
+    vector <TGSStats*> vec_state;
 
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
+    for (int t = 0; t < slave_num; t++) {
         vec_state.push_back(p_thread_info[t]->TGS_state_);
     }
     auto mer_state = TGSStats::merge(vec_state);
@@ -637,11 +655,11 @@ void SeQc::ProcessSeTGS() {
     Repoter::ReportHtmlTGS(srr_name + "_RabbitQCPlus.html", command, mer_state, cmd_info_->in_file_name1_);
 
     delete fastqPool;
-    for (int t = 0; t < cmd_info_->thread_number_; t++) {
-        delete threads[t];
+    //printf("111\n");
+    for (int t = 0; t < slave_num; t++) {
+        //printf("11 %d\n", t);
         delete p_thread_info[t];
     }
-
-    delete[] threads;
+    //printf("111\n");
     delete[] p_thread_info;
 }
diff --git a/src/seqc.h b/src/seqc.h
index c7c464f..82cb268 100644
--- a/src/seqc.h
+++ b/src/seqc.h
@@ -48,7 +48,7 @@ class SeQc {
     void ProducerSeFastqTask(std::string file, rabbit::fq::FastqDataPool *fastq_data_pool,
                              rabbit::core::TDataQueue<rabbit::fq::FastqDataChunk> &dq);
 
-    void ConsumerSeFastqTask(ThreadInfo *thread_info, rabbit::fq::FastqDataPool *fastq_data_pool,
+    void ConsumerSeFastqTask(ThreadInfo **thread_infos, rabbit::fq::FastqDataPool *fastq_data_pool,
                              rabbit::core::TDataQueue<rabbit::fq::FastqDataChunk> &dq);
 
     void WriteSeFastqTask();
diff --git a/src/state.cpp b/src/state.cpp
index 9f79fcc..00864ca 100644
--- a/src/state.cpp
+++ b/src/state.cpp
@@ -16,7 +16,7 @@
 #endif
 
 #define MODB 20
-#define gcMax 100000
+#define gcMax 100
 using namespace std;
 
 
@@ -105,8 +105,6 @@ State::~State() {
     }
 }
 
-#define BB 233ll
-#define MM 1000000000000000003ll
 
 void State::HashInsert(const char *seq, int len, int eva_len) {
     uint64_t now = NT64(seq, len);
diff --git a/src/tgsstate.cpp b/src/tgsstate.cpp
index b57d668..8bfc8aa 100644
--- a/src/tgsstate.cpp
+++ b/src/tgsstate.cpp
@@ -5,10 +5,12 @@
 #include "tgsstate.h"
 using namespace std;
 
-TGSStats::TGSStats(int minLen) {
+TGSStats::TGSStats(int minLen, bool isMerge) {
+    if(isMerge) mTotalReadsLen = new int[64 << 20];
+    else mTotalReadsLen = new int[1 << 20];
+    countLenNum = 0;
     mMinlen = minLen;
     mHalfMinlen = mMinlen >> 1;
-    ///mLengths;
     int size_range = mMinlen >> 1;
     head_qual_sum = new int64_t[size_range];
     tail_qual_sum = new int64_t[size_range];
@@ -71,65 +73,6 @@ void TGSStats::updateTop5(int rlen, double avgQual) {
     }
 }
 
-void TGSStats::tgsStatRead(neoReference &ref, bool isPhred64) {
-    const int rlen = ref.lseq;
-    const char *seq = reinterpret_cast<const char *>(ref.base + ref.pseq);
-    const char *quality = reinterpret_cast<const char *>(ref.base + ref.pqual);
-    int size_range = mMinlen >> 1;
-    size_range = min(size_range, rlen);
-    int i;
-    mReadsNum++;
-    mBasesNum += rlen;
-    char c1, c2;
-    mTotalReadsLen.push_back(rlen);
-    int phredSub = 33;
-    if (isPhred64) phredSub = 64;
-    int sumQual = 0;
-    for (int i = 0; i < rlen; i++) {
-        int qual = (quality[i] - phredSub);
-        sumQual += qual;
-        if (qual >= 5) mBases51015Num[0]++;
-        if (qual >= 10) mBases51015Num[1]++;
-        if (qual >= 15) mBases51015Num[2]++;
-    }
-    updateTop5(rlen, 1.0 * sumQual / rlen);
-    //if (rlen > mMinlen) {
-    //[1] stats lengths
-    mLengths.push_back(rlen);
-    //--head
-    for (i = 0; i < size_range; ++i) {
-        c1 = seq[i];
-        //[2] quality sum
-        head_qual_sum[i] += (quality[i] - phredSub);
-        //[3] sequence count
-        if (c1 == 'A') {
-            head_seq_pos_count[0][i]++;
-        } else if (c1 == 'T') {
-            head_seq_pos_count[1][i]++;
-        } else if (c1 == 'C') {
-            head_seq_pos_count[2][i]++;
-        } else if (c1 == 'G') {
-            head_seq_pos_count[3][i]++;
-        }
-    }
-    //--tail
-    for (i = rlen - 1; i >= rlen - size_range; --i) {
-        c2 = seq[i];
-        tail_qual_sum[i - (rlen - size_range)] += (quality[i] - phredSub);
-
-        if (c2 == 'A') {
-            tail_seq_pos_count[0][i - (rlen - size_range)]++;
-        } else if (c2 == 'T') {
-            tail_seq_pos_count[1][i - (rlen - size_range)]++;
-        } else if (c2 == 'C') {
-            tail_seq_pos_count[2][i - (rlen - size_range)]++;
-        } else if (c2 == 'G') {
-            tail_seq_pos_count[3][i - (rlen - size_range)]++;
-        }
-    }
-    //}
-}
-
 
 void TGSStats::print() {
 
@@ -171,16 +114,15 @@ void TGSStats::print() {
 }
 
 TGSStats *TGSStats::merge(vector<TGSStats *> &list) {
-    int i;
     const int minLen = list[0]->mMinlen;
-    TGSStats *s = new TGSStats(minLen);
+    TGSStats *s = new TGSStats(minLen, true);
     for (TGSStats *ds: list) {
-        //mLengths
-        //s->mLengths.push_back(ds->mLengths);
-        //ds->print();
-        s->mLengths.insert(s->mLengths.end(), ds->mLengths.begin(), ds->mLengths.end());
-        s->mTotalReadsLen.insert(s->mTotalReadsLen.end(), ds->mTotalReadsLen.begin(), ds->mTotalReadsLen.end());
-        for (i = 0; i < (minLen >> 1); ++i) {
+        //printf("ooo1\n");
+        for(int i = 0; i < ds->countLenNum; i++){
+          s->mTotalReadsLen[s->countLenNum++] = ds->mTotalReadsLen[i];
+        }
+        //printf("ooo2\n");
+        for (int i = 0; i < (minLen >> 1); ++i) {
             //head_seq
             s->head_seq_pos_count[0][i] += ds->head_seq_pos_count[0][i];
             s->head_seq_pos_count[1][i] += ds->head_seq_pos_count[1][i];
@@ -192,25 +134,28 @@ TGSStats *TGSStats::merge(vector<TGSStats *> &list) {
             s->tail_seq_pos_count[2][i] += ds->tail_seq_pos_count[2][i];
             s->tail_seq_pos_count[3][i] += ds->tail_seq_pos_count[3][i];
         }
-        for (i = 0; i < (minLen >> 1); ++i) {
+        //printf("ooo3\n");
+        for (int i = 0; i < (minLen >> 1); ++i) {
             //head_qual
             s->head_qual_sum[i] += ds->head_qual_sum[i];
             //tail_qual
             s->tail_qual_sum[i] += ds->tail_qual_sum[i];
         }
+        //printf("ooo4\n");
         s->mReadsNum += ds->mReadsNum;
         s->mBasesNum += ds->mBasesNum;
         s->mBases51015Num[0] += ds->mBases51015Num[0];
         s->mBases51015Num[1] += ds->mBases51015Num[1];
         s->mBases51015Num[2] += ds->mBases51015Num[2];
+        //printf("ooo5\n");
         for (int i = 0; i < 5; i++) {
             s->updateTop5(ds->mTop5QualReads[i].second, ds->mTop5QualReads[i].first);
             s->updateTop5(ds->mTop5LengReads[i].first, ds->mTop5LengReads[i].second);
         }
+        //printf("ooo6\n");
     }
 
 
-    //s->head_qual_mean = s->head_qual_sum/s->mLengths.size()
     return s;
 }
 
@@ -218,17 +163,17 @@ void TGSStats::CalReadsLens() {
     int Ppre = 10;
     mMaxReadsLen = 0;
     int64_t readLenSum = 0;
-    for (auto item: mTotalReadsLen) {
-        mMaxReadsLen = max(mMaxReadsLen, item);
-        readLenSum += item;
+    for (int i = 0; i < countLenNum; i++) {
+        mMaxReadsLen = max(mMaxReadsLen, mTotalReadsLen[i]);
+        readLenSum += mTotalReadsLen[i];
     }
-    mAvgReadsLen = 1.0 * readLenSum / mTotalReadsLen.size();
+    mAvgReadsLen = 1.0 * readLenSum / countLenNum;
     mMaxReadsLen /= Ppre;
     mMaxReadsLen += Ppre;
     readsLens = new int[mMaxReadsLen];
     for (int i = 0; i < mMaxReadsLen; i++) readsLens[i] = 0;
-    for (auto item: mTotalReadsLen) {
-        readsLens[(item + Ppre - 1) / Ppre]++;
+    for (int i = 0; i < countLenNum; i++) {
+        readsLens[(mTotalReadsLen[i] + Ppre - 1) / Ppre]++;
     }
 }
 
diff --git a/src/tgsstate.h b/src/tgsstate.h
index d926e3a..3564cea 100644
--- a/src/tgsstate.h
+++ b/src/tgsstate.h
@@ -16,7 +16,7 @@
 
 class TGSStats {
 public:
-    TGSStats(int minLen);
+    TGSStats(int minLen, bool isMerge = false);
 
     ~TGSStats();
 
@@ -24,8 +24,6 @@ class TGSStats {
 
     void print();
 
-    void tgsStatRead(neoReference &ref, bool isPhred64);
-
     void CalReadsLens();
 
     bool isLongRead();
@@ -40,14 +38,17 @@ class TGSStats {
 
     int base2num(std::string base);
 
-private:
+public:
     int mMinlen;
 
     int mHalfMinlen;
 
-    std::vector<int> mLengths;
+    //std::vector<int> mTotalReadsLen;
+    int *mTotalReadsLen;
 
-    std::vector<int> mTotalReadsLen;
+    int countLenNum; 
+    
+    int *readsLens;
 
     int64_t *head_seq_pos_count[4];
 
@@ -57,8 +58,6 @@ class TGSStats {
 
     int64_t *tail_qual_sum;
 
-    int *readsLens;
-
     int mMaxReadsLen;
 
     double mAvgReadsLen;
diff --git a/src/tuple_spawn.hpp b/src/tuple_spawn.hpp
new file mode 100644
index 0000000..e348ac9
--- /dev/null
+++ b/src/tuple_spawn.hpp
@@ -0,0 +1,71 @@
+#pragma once
+#include <tuple>
+#ifdef __sw_slave__
+#include "dma_funcs.hpp"
+namespace cxx11_apply
+{
+    template <int... Is>
+    struct integer_sequence
+    {
+    };
+    template <int N, int... Is>
+    struct GenSeq : GenSeq<N - 1, N - 1, Is...>
+    {
+    };
+    template <int... Is>
+    struct GenSeq<0, Is...>
+    {
+        typedef integer_sequence<Is...> type;
+    };
+    template <typename Tt, typename Tf, int... Is>
+    void __apply(Tt t, Tf f, integer_sequence<Is...> seq)
+    {
+        f(std::get<Is>(t)...);
+    }
+    template <typename... Ts>
+    void apply(void (*F)(Ts...), std::tuple<Ts...> tuple)
+    {
+        typename GenSeq<sizeof...(Ts)>::type a;
+        __apply(tuple, F, a);
+    }
+}
+
+template <typename ...Ts>
+__always_inline void call_tuple(void (*f)(Ts...), Ts ...arg){
+  f(arg...);
+}
+template <typename F, typename ...Ts>
+__always_inline void __tuple_spawn_proxy(std::tuple<F, Ts...> *arg) {
+  cxx11_apply::apply(call_tuple<Ts...>, *arg);
+}
+template <typename F, typename ...Ts>
+__attribute__((noinline)) void tuple_spawn_proxy(std::tuple<F, Ts...> *arg) {
+  std::tuple<F, Ts...> ldm_arg;
+  dma_getn(arg, &ldm_arg, 1);
+  cxx11_apply::apply(call_tuple<Ts...>, ldm_arg);
+}
+template <typename ...Ts>
+void tmpl_entrance(void (*f)(Ts...)){
+  __asm__ __volatile__ (""
+    :
+    :"r"(tuple_spawn_proxy<void (*)(Ts...), Ts...>)
+    : "memory");
+}
+#define ATHREAD_VISIBLE(x) template void tmpl_entrance(decltype(x)*);
+
+#endif
+#ifdef __sw_host__
+extern "C"{
+#include <athread.h>
+}
+#include <tuple>
+
+template <typename F, typename ...Ts>
+extern void slave_tuple_spawn_proxy(std::tuple<F, Ts...>*);
+template <typename ...Ts>
+void athread_spawn_tupled(void (*f)(Ts...), Ts ...args) {
+  auto arg = std::tuple<decltype(f), Ts...>(f, args...);
+  __real_athread_spawn((void*)slave_tuple_spawn_proxy<decltype(f), Ts...>, &arg);
+}
+
+#endif
\ No newline at end of file
diff --git a/xlink.py b/xlink.py
new file mode 100644
index 0000000..bb3b862
--- /dev/null
+++ b/xlink.py
@@ -0,0 +1,272 @@
+import traceback
+import struct
+import sys
+import re
+import collections
+
+ELF64_HDR_FMT = "10sHHIQQQIHHHHHH"
+ELF64_Hdr = collections.namedtuple('ELF64_Hdr', ['e_ident', 'e_type', 'e_machine', 'e_version',
+                                                 'e_entry', 'e_phoff', 'e_shoff', 'e_flags',
+                                                 'e_ehsize', 'e_phentsize', 'e_phnum', 'e_shentsize',
+                                                  'e_shnum', 'e_shstrndx'])
+ELF64_SHDR_FMT = "IIQQQQIIQQ"
+ELF64_Shdr = collections.namedtuple('ELF64_Shdr', ['sh_name', 'sh_type', 'sh_flags', 'sh_addr',
+                                                   'sh_offset', 'sh_size', 'sh_link', 'sh_info',
+                                                   'sh_addralign', 'sh_entsize'])
+ELF64_ST_BIND = lambda x: (x & 0xff) >> 4
+ELF64_ST_TYPE = lambda x: x & 0xf
+ELF64_ST_INFO = lambda bind, type: bind << 4 | type & 0xf
+
+STB_LOCAL      = 0               # Local symbol */
+STB_GLOBAL     = 1               # Global symbol */
+STB_WEAK       = 2               # Weak symbol */
+STB_NUM        = 3               # Number of defined types.  */
+STB_LOOS       = 10              # Start of OS-specific */
+STB_GNU_UNIQUE = 10              # Unique symbol.  */
+STB_HIOS       = 12              # End of OS-specific */
+STB_LOPROC     = 13              # Start of processor-specific */
+STB_HIPROC     = 15              # End of processor-specific */
+
+STT_NOTYPE     = 0               # Symbol type is unspecified */
+STT_OBJECT     = 1               # Symbol is a data object */
+STT_FUNC       = 2               # Symbol is a code object */
+STT_SECTION    = 3               # Symbol associated with a section */
+STT_FILE       = 4               # Symbol's name is file name */
+STT_COMMON     = 5               # Symbol is a common data object */
+STT_TLS        = 6               # Symbol is thread-local data object*/
+STT_NUM        = 7               # Number of defined types.  */
+STT_LOOS       = 10              # Start of OS-specific */
+STT_GNU_IFUNC  = 10              # Symbol is indirect code object */
+STT_HIOS       = 12              # End of OS-specific */
+STT_LOPROC     = 13              # Start of processor-specific */
+STT_HIPROC     = 15		# End of processor-specific */
+
+
+#define STN_UNDEF	0		/* End of a chain.  */
+
+ELF64_SYM_FMT = "IBBHQQ"
+ELF64_Sym = collections.namedtuple('ELF64_Sym', ['st_name', 'st_info', 'st_other', 'st_shndx', 'st_value', 'st_size'])
+
+class ELF64:
+    def __init__(self, bin):
+        bin = bytearray(bin)
+        ehdr = ELF64_Hdr(*struct.unpack(ELF64_HDR_FMT, bin[0:64]))
+        self.bin = bin
+        self.ehsize    = ehdr.e_ehsize
+        self.phoff     = ehdr.e_phoff
+        self.phnum     = ehdr.e_phnum
+        self.phentsize = ehdr.e_phentsize
+        self.shoff     = ehdr.e_shoff
+        self.shnum     = ehdr.e_shnum
+        self.shentsize = ehdr.e_shentsize
+
+        shdr_end = self.shoff + self.shnum * self.shentsize
+        shdr = list(map(lambda x: ELF64_Shdr(*x), struct.iter_unpack(ELF64_SHDR_FMT, bin[self.shoff : shdr_end])))
+        shstrtabhdr = shdr[ehdr.e_shstrndx]
+        #shstrdict = ELF64.split_strtab(
+        shstrtab = ELF64.get_section_bin(bin, shstrtabhdr)
+        self.shstrtab = shstrtab
+        #print(shstrtab[341:351], len(shstrtab))
+        #print("TTT:", shstrtab[341:20])
+        self.shdrs = {}
+        self.ishdrs = []
+        for hdr in shdr:
+            # print(hdr, hdr.sh_name, ELF64.get_str_from_tab(shstrtab, hdr.sh_name))
+            self.shdrs[ELF64.get_str_from_tab(shstrtab, hdr.sh_name)] = hdr
+            self.ishdrs.append(hdr)
+        symtabhdr = self.shdrs['.symtab']
+        symtabbin = ELF64.get_section_bin(bin, symtabhdr)
+        syms = list(map(lambda x: ELF64_Sym(*x), struct.iter_unpack(ELF64_SYM_FMT, symtabbin)))
+        strtabhdr = self.shdrs['.strtab']
+        #strdict = ELF64.split_strtab(
+        
+        strtab = ELF64.get_section_bin(bin, strtabhdr)
+        self.strtab = strtab
+        #self.sym_name = {}
+        self.syms = list(map(lambda sym: sym._replace(st_name = ELF64.get_str_from_tab(strtab, sym.st_name)), syms))
+        self.global_syms = {}
+        for sym in self.syms:
+            if ELF64_ST_BIND(sym.st_info) in [STB_GLOBAL, STB_WEAK]:
+                self.global_syms[sym.st_name] = sym
+    @staticmethod
+    def get_str_from_tab(tab, index):
+        return tab[index:].split(b"\0", 1)[0].decode()
+
+    @staticmethod
+    def get_section_bin(elf_bin, sec_hdr):
+        return elf_bin[sec_hdr.sh_offset : sec_hdr.sh_offset + sec_hdr.sh_size]
+import enum
+class SW5Rel(enum.Enum):
+    NONE       = 0  # /* No reloc */
+    REFLONG    = 1  # /* Direct 32 bit */
+    REFQUAD    = 2  # /* Direct 64 bit */
+    GPREL32    = 3  # /* GP relative 32 bit */
+    LITERAL    = 4  # /* GP relative 16 bit w/optimization */
+    LITUSE     = 5  # /* Optimization hint for LITERAL */
+    GPDISP     = 6  # /* Add displacement to GP */
+    BRADDR     = 7  # /* PC+4 relative 23 bit shifted */
+    HINT       = 8  # /* PC+4 relative 16 bit shifted */
+    SREL16     = 9  # /* PC relative 16 bit */
+    SREL32     = 10 # /* PC relative 32 bit */
+    SREL64     = 11 # /* PC relative 64 bit */
+    GPRELHIGH  = 17 # /* GP relative 32 bit, high 16 bits */
+    GPRELLOW   = 18 # /* GP relative 32 bit, low 16 bits */
+    GPREL16    = 19 # /* GP relative 16 bit */
+    COPY       = 24 # /* Copy symbol at runtime */
+    GLOB_DAT   = 25 # /* Create GOT entry */
+    JMP_SLOT   = 26 # /* Create PLT entry */
+    RELATIVE   = 27 # /* Adjust by program base */
+    TLS_GD_HI  = 28
+    TLSGD      = 29
+    TLS_LDM    = 30
+    DTPMOD64   = 31
+    GOTDTPREL  = 32
+    DTPREL64   = 33
+    DTPRELHI   = 34
+    DTPRELLO   = 35
+    DTPREL16   = 36
+    GOTTPREL   = 37
+    TPREL64    = 38
+    TPRELHI    = 39
+    TPRELLO    = 40
+    TPREL16    = 41
+    LDMLO      = 42
+    LDMHI      = 43
+RA = 26
+T12 = 27
+GP = 29
+LDI = 0x3e
+LDIH = 0x3f
+LDL = 0x23
+MANGLE_RE = re.compile(r"_Z(?P<NS>N?)(?P<len>\d+)slave_(?P<rest>.*)")
+def remangle_slave(mangled_name):
+    m = MANGLE_RE.match(mangled_name)
+    if m:
+        remangle = "slave__Z%s%d%s" % (m["NS"], int(m["len"]) - 6, m["rest"])
+        return remangle
+    else:
+        return None
+import subprocess
+import os
+verbose = False
+def process_text1_reloc(elf, ipass):
+  text1_offset = elf.shdrs['.text1'].sh_offset
+  text1_addr = elf.shdrs['.text1'].sh_addr
+  text1_size = elf.shdrs['.text1'].sh_size
+  got_offset = elf.shdrs['.got'].sh_offset
+  got_addr = elf.shdrs['.got'].sh_addr
+  nonliterals = set([])
+  if '.rela.text1' in elf.shdrs:
+      relatext1_offset = elf.shdrs[".rela.text1"].sh_offset
+      relatext1_size = elf.shdrs[".rela.text1"].sh_size
+      for i in range(relatext1_offset, relatext1_offset + relatext1_size, 24):
+          addr, info, off = struct.unpack("<QQQ", elf.bin[i:i+24])
+          target = info >> 32
+          reltype = SW5Rel(info & 63)
+          if reltype != SW5Rel.LITERAL:
+              nonliterals.add(addr)
+  if ipass == 2:
+      got1_offset = elf.global_syms['__got1'].st_value - elf.shdrs['.ldm'].sh_addr + elf.shdrs['.ldm'].sh_offset
+      got1_addr = elf.global_syms['__got1'].st_value - 0x5000004000
+      got1_size = elf.global_syms['__got1'].st_size
+  def get_inst(i):
+      return struct.unpack('I', elf.bin[i:i+4])[0]
+  got1_map = {}
+  for i in range(0, text1_size, 4):
+      # inst = sw5insts.parse_inst(get_inst(i))
+      inst = get_inst(text1_offset + i)
+      op = inst >> 26
+      ra = inst >> 21 & 31
+      rb = inst >> 16 & 31
+      disp = inst & 65535
+      if disp > 32767:
+          disp -= 65536
+      if op == LDIH and ra == GP and rb in (T12, RA, GP):
+          gpval = i + text1_addr + disp * 65536
+      if op == LDI and ra == GP and rb == GP:
+          gpval = gpval + disp
+      if op == LDL and rb == GP and i + text1_addr not in nonliterals:
+          target_addr = gpval + disp
+          target_offset = target_addr - got_addr + got_offset
+          val = struct.unpack('Q', elf.bin[target_offset:target_offset + 8])[0]
+          if val not in got1_map:
+              got1_map[val] = len(got1_map)
+              idx = got1_map[val]
+              if ipass == 2:
+                  elf.bin[got1_offset + idx * 8: got1_offset + idx * 8 + 8] = struct.pack('Q', val)
+          if ipass == 2:
+              ldm_disp = got1_addr + got1_map[val] * 8
+              inst_new = LDL << 26 | ra << 21 | 31 << 16 | ldm_disp & 65535
+              elf.bin[text1_offset + i:text1_offset+i+4] = struct.pack('I', inst_new)
+  return len(got1_map)
+def remangle_defs(elf):
+  ret = []
+  if '.rela.text' in elf.shdrs:
+    relatext1_offset = elf.shdrs[".rela.text"].sh_offset
+    relatext1_size = elf.shdrs[".rela.text"].sh_size
+    for i in range(relatext1_offset, relatext1_offset + relatext1_size, 24):
+        addr, info, off = struct.unpack("<QQQ", elf.bin[i:i+24])
+        target = info >> 32
+        reltype = SW5Rel(info & 63)
+        target_sym = elf.syms[target]
+        if target_sym.st_shndx < len(elf.ishdrs) and elf.ishdrs[target_sym.st_shndx].sh_type == 0:
+            if remangle_slave(elf.syms[target].st_name):
+                ret.append('--defsym=%s=%s' % (elf.syms[target].st_name, remangle_slave(elf.syms[target].st_name)))
+  return ret
+def run(argv_link, add_args=[], **kwargs):
+  if os.path.basename(argv_link[0]) != 'sw5ld':
+    add_args = list(map(lambda x: "-Wl,"+x, add_args))
+  if (verbose):
+    print(argv_link + add_args)
+  retcode = subprocess.run(argv_link + add_args, **kwargs).returncode
+  if retcode != 0:
+    print('\033[91mexecution of %s failed\033[0m' % str(argv_link + add_args))
+    sys.exit(retcode)
+def get_output(argv):
+  for i, arg in enumerate(argv):
+    if arg == '-o':
+      return argv[i+1]
+  return "./a.out"
+def write_elf(path, binary):
+  open(path, "wb").write(binary)
+  import os
+  import stat
+  st = os.stat(path)
+  os.chmod(path, st.st_mode | stat.S_IEXEC)
+if __name__ == '__main__':
+  import sys
+  import tempfile
+  if len(sys.argv) == 1 or sys.argv[1] == '-h':
+    print("xlink.py [-v] <original link command>", file=sys.stderr)
+    sys.exit(1)
+  verbose = False
+  arg_offset = 1
+  if sys.argv[1] == '-v':
+    verbose = True
+    arg_offset = 2
+  tempdir = tempfile.mkdtemp(prefix='xlink-')
+  try:
+    linker = sys.argv[arg_offset]
+    argv_link = sys.argv[arg_offset:].copy()
+    dest = get_output(argv_link)
+    if os.path.exists(dest):
+      os.remove(dest)
+    pass1_elf = os.path.join(tempdir, 'pass1_elf')
+    add_args_pass0 = ['-q', '--unresolved-symbols=ignore-all', '-o', pass1_elf]
+    run(argv_link, add_args_pass0)
+    elf1 = ELF64(open(pass1_elf, "rb").read())
+    nliterals = process_text1_reloc(elf1, ipass=1)
+
+    got1_c = os.path.join(tempdir, 'got1.c')
+    open(got1_c, "w").write('__attribute__((section(".ldm"))) long __got1[%d];\n' % nliterals)
+    run(['sw5gcc', '-mslave', '-c', 'got1.c'], cwd=tempdir)
+    print("\033[93mGenerating a subset of global offset table in LDM of %dB\033[0m" % (nliterals * 8))
+    pass2_elf = os.path.join(tempdir, 'pass2_elf')
+    add_args_pass2 = [os.path.join(tempdir, 'got1.o'), '-o', pass2_elf, '-q'] + remangle_defs(elf1)
+    run(argv_link, add_args_pass2)
+    elf2 = ELF64(open(pass2_elf, "rb").read())
+    process_text1_reloc(elf2, ipass=2)
+    write_elf(dest, elf2.bin)  # print(nliterals)
+  finally:
+    import shutil
+    shutil.rmtree(tempdir)