-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
yanlifeng
committed
Dec 16, 2022
1 parent
81f84d4
commit 7fbc877
Showing
20 changed files
with
1,308 additions
and
295 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
bsub -b -I -q q_sw_expr -n 1 -cgsp 64 -sw3run swrun-5a -share_size 4096 ./RabbitQCPlus --help |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#ifndef __REFERENCE_H_ | ||
#define __REFERENCE_H_ | ||
|
||
/** | ||
* @brief Reference struct that store the FASTA and FASTQ infomation | ||
*/ | ||
struct Reference { | ||
std::string name; | ||
std::string comment; | ||
std::string seq; | ||
std::string quality; | ||
std::string strand; | ||
uint64_t length; | ||
uint64_t gid; | ||
}; | ||
|
||
/** | ||
* @brief Reference struct that store the FASTA and FASTQ infomation, different from Reference | ||
* neoReference only record the start position and length of name, sequence, strand and quality | ||
* base on certain chunk data `base` | ||
*/ | ||
struct neoReference { | ||
uint64_t pname; /// name offset form base | ||
uint64_t pcom; /// comment offset form base | ||
uint64_t pseq; /// sequence offset form base | ||
uint64_t pqual; /// quality offset form base | ||
uint64_t pstrand;///strand offset form base | ||
|
||
uint64_t lname; /// length of name | ||
uint64_t lcom; /// length of comment | ||
uint64_t lseq; /// length of sequence | ||
uint64_t lqual; /// length of quality | ||
uint64_t lstrand; /// length of strand | ||
uint64_t gid; /// global id | ||
unsigned char *base;/// base data pointer | ||
}; | ||
|
||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
// | ||
// Created by ylf9811 on 2021/7/6. | ||
// | ||
|
||
#ifndef RERABBITQC_CMDINFO_H | ||
#define RERABBITQC_CMDINFO_H | ||
|
||
class CmdInfo { | ||
public: | ||
CmdInfo(); | ||
|
||
public: | ||
std::string in_file_name1_; | ||
std::string in_file_name2_; | ||
std::string out_file_name1_; | ||
std::string out_file_name2_; | ||
int thread_number_; | ||
std::string command_; | ||
bool isPhred64_; | ||
bool isStdin_; | ||
bool isStdout_; | ||
|
||
bool overWrite_; | ||
bool write_data_; | ||
int64_t out_block_size_; | ||
int64_t in_file_size1_; | ||
int64_t in_file_size2_; | ||
int seq_len_; | ||
int qul_range_; | ||
|
||
//param for filter | ||
int length_required_; | ||
int length_limit_; | ||
int n_number_limit_; | ||
int low_qual_perc_limit_; | ||
bool trim_5end_; | ||
bool trim_3end_; | ||
int cut_window_size_; | ||
int cut_mean_quality_; | ||
int trim_front1_; | ||
int trim_tail1_; | ||
int trim_front2_; | ||
int trim_tail2_; | ||
|
||
|
||
//param for adapter | ||
bool trim_adapter_; | ||
bool no_trim_adapter_; | ||
bool se_auto_detect_adapter_; | ||
bool pe_auto_detect_adapter_; | ||
bool detect_adapter1_; | ||
bool detect_adapter2_; | ||
std::string adapter_seq1_; | ||
std::string adapter_seq2_; | ||
int overlap_diff_limit_; | ||
int overlap_require_; | ||
bool correct_data_; | ||
bool analyze_overlap_; | ||
int adapter_len_lim_; | ||
bool print_what_trimmed_; | ||
std::vector<std::string> adapter_from_fasta_; | ||
std::string adapter_fasta_file_; | ||
|
||
//duplicate | ||
bool state_duplicate_; | ||
|
||
//polyx | ||
bool trim_polyx_; | ||
bool trim_polyg_; | ||
int trim_poly_len_; | ||
|
||
//umi | ||
bool add_umi_; | ||
int umi_loc_; | ||
int umi_len_; | ||
std::string umi_prefix_; | ||
int umi_skip_; | ||
|
||
//TGS | ||
bool is_TGS_; | ||
int TGS_min_len_; | ||
|
||
//overrepresentation_analysis | ||
bool do_overrepresentation_; | ||
int overrepresentation_sampling_; | ||
std::vector<std::string> hot_seqs_; | ||
std::vector<std::string> hot_seqs2_; | ||
int eva_len_; | ||
int eva_len2_; | ||
bool print_ORP_seqs_; | ||
|
||
//insert size | ||
bool no_insert_size_; | ||
int max_insert_size_; | ||
|
||
//gz | ||
int compression_level_; | ||
|
||
//interleaved | ||
bool interleaved_in_; | ||
bool interleaved_out_; | ||
|
||
|
||
//parallel gz | ||
bool use_pugz_; | ||
bool use_pigz_; | ||
int pugz_threads_; | ||
int pigz_threads_; | ||
}; | ||
|
||
|
||
#endif//RERABBITQC_CMDINFO_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
#pragma once | ||
#include <sys/cdefs.h> | ||
#define __sw_256bit_simd__ | ||
#ifdef __sw_512bit_simd__ | ||
typedef int desc_t __attribute__ ((__mode__(__V1XI__))); | ||
#elif defined(__sw_256bit_simd__) | ||
typedef int desc_t __attribute__ ((__mode__(__V1OI__))); | ||
#endif | ||
template <typename TM, typename TL> | ||
__always_inline void dma_getn(TM *mem, TL *ldm, int count) { | ||
//long dma_desc[4]; | ||
desc_t dma_desc; | ||
asm volatile( | ||
"beq %[SIZE], 2f\n\t" | ||
"memb\n\t" | ||
"stl $31, 0(%[RPL])\n\t" | ||
"vinsw %[RPL], $31, 2, %[DESC]\n\t" | ||
"ldi %[DESC], 1($31)\n\t" | ||
"sll %[DESC], 56, %[DESC]\n\t" | ||
"vinsw %[SIZE], %[DESC], 0, %[DESC]\n\t" | ||
"dma %[DESC], %[MEM], %[LDM]\n\t" | ||
"1:\n\t" | ||
"ldw %[DESC], 0(%[RPL])\n\t" | ||
"beq %[DESC], 1b\n\t" | ||
"2:\n\t" | ||
"memb\n\t" | ||
: [ DESC ] "=&r"(dma_desc) | ||
: [ MEM ] "r"(mem), [ LDM ] "r"(ldm), [ RPL ] "r"(((long*)&dma_desc) + 2), [ SIZE ] "r"(count * sizeof(TL)) | ||
: "memory"); | ||
} | ||
template <typename TM, typename TL> | ||
__always_inline __attribute__((__artificial__)) void dma_putn(TM *mem, TL *ldm, int count) { | ||
desc_t dma_desc; | ||
asm volatile( | ||
"beq %[SIZE], 2f\n\t" | ||
"memb\n\t" | ||
"stl $31, 0(%[RPL])\n\t" | ||
"vinsw %[RPL], $31, 2, %[DESC]\n\t" | ||
"vinsw %[SIZE], %[DESC], 0, %[DESC]\n\t" | ||
"dma %[DESC], %[MEM], %[LDM]\n\t" | ||
"1:\n\t" | ||
"ldw %[DESC], 0(%[RPL])\n\t" | ||
"beq %[DESC], 1b\n\t" | ||
"2:\n\t" | ||
"memb\n\t" | ||
: [ DESC ] "=&r"(dma_desc) | ||
: [ MEM ] "r"(mem), [ LDM ] "r"(ldm), [ RPL ] "r"(((long*)&dma_desc) + 2), [ SIZE ] "r"(count * sizeof(TL)) | ||
: "memory"); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
|
||
#include <cstring> | ||
#include <vector> | ||
#include <unordered_map> | ||
#include <unistd.h> | ||
|
||
#include "tuple_spawn.hpp" | ||
#include "slave.h" | ||
#include "Reference.h" | ||
#include "cmdinfo.h" | ||
#include "state.h" | ||
#include "tgsstate.h" | ||
#include "threadinfo.h" | ||
|
||
|
||
struct qc_data { | ||
ThreadInfo **thread_info_; | ||
CmdInfo *cmd_info_; | ||
std::vector <neoReference> *data_; | ||
std::vector <neoReference> *pass_data_; | ||
int *cnt; | ||
}; | ||
|
||
void updateTop5(int rlen, double avgQual, TGSStats *tgs_state) { | ||
int pos = 5; | ||
for (int i = 0; i < 5; i++) { | ||
if (tgs_state->mTop5QualReads[i].first < avgQual) { | ||
for (int j = 4; j > i; j--) { | ||
tgs_state->mTop5QualReads[j] = tgs_state->mTop5QualReads[j - 1]; | ||
} | ||
tgs_state->mTop5QualReads[i] = {avgQual, rlen}; | ||
break; | ||
} | ||
} | ||
for (int i = 0; i < 5; i++) { | ||
if (tgs_state->mTop5LengReads[i].first < rlen) { | ||
for (int j = 4; j > i; j--) { | ||
tgs_state->mTop5LengReads[j] = tgs_state->mTop5LengReads[j - 1]; | ||
} | ||
tgs_state->mTop5LengReads[i] = {rlen, avgQual}; | ||
break; | ||
} | ||
} | ||
} | ||
|
||
void tgsfunc(qc_data *para) { | ||
//if(_PEN != 0) return; | ||
std::vector <neoReference> *data = para->data_; | ||
ThreadInfo *thread_info = para->thread_info_[_PEN]; | ||
CmdInfo *cmd_info = para->cmd_info_; | ||
int data_num = data->size(); | ||
int base_num = 0; | ||
for (int id = _PEN; id < data_num; id += 64) { | ||
//for (int id = 0; id < data_num; id++) { | ||
para->cnt[_PEN]++; | ||
auto ref = (*data)[id]; | ||
//base_num += ref.lqual; | ||
const int rlen = ref.lseq; | ||
const char *seq = reinterpret_cast<const char *>(ref.base + ref.pseq); | ||
const char *quality = reinterpret_cast<const char *>(ref.base + ref.pqual); | ||
int size_range = thread_info->TGS_state_->mMinlen >> 1; | ||
if(rlen < size_range) size_range = rlen; | ||
if(thread_info->TGS_state_->countLenNum < (1 << 20)) | ||
thread_info->TGS_state_->mTotalReadsLen[thread_info->TGS_state_->countLenNum++] = rlen; | ||
thread_info->TGS_state_->mReadsNum++; | ||
thread_info->TGS_state_->mBasesNum += rlen; | ||
char c1, c2; | ||
int phredSub = 33; | ||
if (cmd_info->isPhred64_) phredSub = 64; | ||
int sumQual = 0; | ||
for (int i = 0; i < rlen; i++) { | ||
int qual = (quality[i] - phredSub); | ||
sumQual += qual; | ||
if (qual >= 5) thread_info->TGS_state_->mBases51015Num[0]++; | ||
if (qual >= 10) thread_info->TGS_state_->mBases51015Num[1]++; | ||
if (qual >= 15) thread_info->TGS_state_->mBases51015Num[2]++; | ||
} | ||
updateTop5(rlen, 1.0 * sumQual / rlen, thread_info->TGS_state_); | ||
for (int i = 0; i < size_range; ++i) { | ||
c1 = seq[i]; | ||
thread_info->TGS_state_->head_qual_sum[i] += (quality[i] - phredSub); | ||
if (c1 == 'A') { | ||
thread_info->TGS_state_->head_seq_pos_count[0][i]++; | ||
} else if (c1 == 'T') { | ||
thread_info->TGS_state_->head_seq_pos_count[1][i]++; | ||
} else if (c1 == 'C') { | ||
thread_info->TGS_state_->head_seq_pos_count[2][i]++; | ||
} else if (c1 == 'G') { | ||
thread_info->TGS_state_->head_seq_pos_count[3][i]++; | ||
} | ||
} | ||
for (int i = rlen - 1; i >= rlen - size_range; --i) { | ||
c2 = seq[i]; | ||
thread_info->TGS_state_->tail_qual_sum[i - (rlen - size_range)] += (quality[i] - phredSub); | ||
|
||
if (c2 == 'A') { | ||
thread_info->TGS_state_->tail_seq_pos_count[0][i - (rlen - size_range)]++; | ||
} else if (c2 == 'T') { | ||
thread_info->TGS_state_->tail_seq_pos_count[1][i - (rlen - size_range)]++; | ||
} else if (c2 == 'C') { | ||
thread_info->TGS_state_->tail_seq_pos_count[2][i - (rlen - size_range)]++; | ||
} else if (c2 == 'G') { | ||
thread_info->TGS_state_->tail_seq_pos_count[3][i - (rlen - size_range)]++; | ||
} | ||
} | ||
|
||
} | ||
//printf("=== %d base number %d\n", _PEN, base_num); | ||
} | ||
|
||
// template void tmpl_entrance(decltype(test__)*); | ||
ATHREAD_VISIBLE(tgsfunc); |
Oops, something went wrong.