forked from chhylp123/hifiasm
-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathProcess_Read.h
228 lines (188 loc) · 6.04 KB
/
Process_Read.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
#ifndef __READ__
#define __READ__
#include<stdint.h>
#include <string.h>
#include <stdlib.h>
#include <zlib.h>
#include "Overlaps.h"
#include "CommandLines.h"
///#include "Hash_Table.h"
/// Sizing constants for the read store / read buffers.
/// NOTE(review): their exact use is not visible in this header — confirm at the call sites.
#define READ_INIT_NUMBER 1000
#define READ_BLOCK_SIZE 64
#define READ_BLOCK_NUM_PRE_THR 100

/// Evaluate to 1/0 depending on whether a buffer with `num` and `size` members is full / empty.
/// The argument is parenthesized so that expressions such as `*ptr` or `arr[i]` can be passed.
#define IS_FULL(buffer) (((buffer).num >= (buffer).size)?1:0)
#define IS_EMPTY(buffer) (((buffer).num == 0)?1:0)

/// Accessors for an All_reads-like object R_INF (passed by value/lvalue, not by pointer)
/// and a read index ID. Every expansion is fully parenthesized; the READ_LENGTH and READ
/// accessors remain usable as lvalues.
///#define Get_READ_LENGTH(R_INF, ID) (R_INF.index[ID+1] - R_INF.index[ID])
#define Get_READ_LENGTH(R_INF, ID) ((R_INF).read_length[(ID)])
/// Name length is the distance between two consecutive name offsets,
/// so name_index[(ID)+1] must be a valid entry.
#define Get_NAME_LENGTH(R_INF, ID) ((R_INF).name_index[(ID)+1] - (R_INF).name_index[(ID)])
///#define Get_READ(R_INF, ID) R_INF.read + (R_INF.index[ID]>>2) + ID
#define Get_READ(R_INF, ID) ((R_INF).read_sperate[(ID)])
/// Pointer to the first character of the name of read ID inside the shared name buffer.
/// The stored name is addressed by offset and length only (see CHECK_BY_NAME), so do not
/// rely on a terminating '\0' after it.
#define Get_NAME(R_INF, ID) ((R_INF).name + (R_INF).name_index[(ID)])
/// Non-zero when the NUL-terminated string NAME equals the stored name of read ID.
/// NAME and ID are evaluated more than once: do not pass expressions with side effects.
#define CHECK_BY_NAME(R_INF, NAME, ID) (Get_NAME_LENGTH((R_INF),(ID))==strlen((NAME)) && \
memcmp((NAME), Get_NAME((R_INF), (ID)), Get_NAME_LENGTH((R_INF),(ID))) == 0)
/// Lookup tables shared across translation units; they are only declared here.
/// NOTE(review): presumably filled in by init_aux_table() — confirm in the defining source file.
extern uint8_t seq_nt6_table[256];
extern char bit_t_seq_table[256][4];
extern char bit_t_seq_table_rc[256][4];
extern char s_H[5];
extern char rc_Table[5];
/// Maps character x through seq_nt6_table and then through rc_Table.
/// The whole argument is converted to uint8_t (not just its first operand), so compound
/// expressions such as `cond ? a : b` behave as expected and the index is never negative.
/// rc_Table has 5 entries, so this assumes every seq_nt6_table value is < 5 — TODO confirm.
#define RC_CHAR(x) (rc_Table[seq_nt6_table[(uint8_t)(x)]])
/// Explicit (void): an empty parameter list is not a prototype in C.
void init_aux_table(void);
/**
 * One overlap record between a read x and a read y.
 * NOTE(review): field meanings are inferred from their names (id, start/end position,
 * strand, matched length, total length) — confirm against the code that fills them.
 */
typedef struct
{
    uint64_t x_id;
    uint64_t x_pos_s;
    uint64_t x_pos_e;
    uint8_t x_pos_strand;
    uint64_t y_id;
    uint64_t y_pos_s;
    uint64_t y_pos_e;
    uint8_t y_pos_strand;
    uint64_t matchLen;
    uint64_t totalLen;
} PAF;

/**
 * Growable array of PAF records.
 *   list   - heap buffer owned by this object (release with free())
 *   size   - capacity of `list`, in elements
 *   length - number of elements currently stored
 */
typedef struct
{
    PAF* list;
    uint64_t size;
    uint64_t length;
} PAF_alloc;

/**
 * Initialise an empty list with room for 15 records.
 *
 * Declared `static inline` so that the header works in every translation unit: a plain
 * `inline` definition in C provides no external definition and fails to link whenever
 * the call is not actually inlined.
 *
 * If the allocation fails, `list->list` is NULL and `list->size` is 0, so the recorded
 * capacity always matches the buffer; callers can test `list->list`.
 */
static inline void init_PAF_alloc(PAF_alloc* list)
{
    list->size = 15;
    list->length = 0;
    /* casts are kept so the header also compiles as C++ */
    list->list = (PAF*)malloc(sizeof(PAF)*list->size);
    if(list->list == NULL)
    {
        list->size = 0;
    }
}

/**
 * Append a copy of *e, doubling the capacity when the list is full.
 *
 * A capacity of 0 (failed init or zero-initialised object) grows to 15 instead of
 * staying at 0. The function has no way to report an error, so running out of memory
 * terminates the program via abort() rather than writing through a NULL pointer.
 */
static inline void append_PAF_alloc(PAF_alloc* list, PAF* e)
{
    if(list->length+1 > list->size)
    {
        uint64_t new_size = list->size? list->size * 2 : 15;
        /* grow through a temporary so the old buffer is not lost on failure */
        PAF* grown = (PAF*)realloc(list->list, sizeof(PAF)*new_size);
        if(grown == NULL)
        {
            abort();
        }
        list->list = grown;
        list->size = new_size;
    }
    list->list[list->length] = (*e);
    list->length++;
}
/**
 * Compressed CIGAR record: an array of packed 32-bit entries plus a separate
 * character buffer (`lost_base`).
 * NOTE(review): the *_length / *_size pairs look like "used / capacity" counters and
 * `new_length` like a resulting sequence length — confirm against the code that fills them.
 */
typedef struct
{
    /** Packed entries: bits [0-1] are the type, bits [2-31] are the length. */
    uint32_t* record;
    uint32_t length;
    uint32_t size;

    char* lost_base;
    uint32_t lost_base_length;
    uint32_t lost_base_size;

    uint32_t new_length;
} Compressed_Cigar_record;
/// Per-read trio labels, small integers that fit in a uint8_t.
/// NOTE(review): presumably these are the values stored in All_reads.trio_flag
/// (parental origin of a read in trio mode) — confirm against the code that writes it.
#define AMBIGU 0
#define FATHER 1
#define MOTHER 2
#define MIX_TRIO 3
#define NON_TRIO 4
#define DROP 5
/**
 * hamt: two parallel arrays, `tn` and `is_match`, that share one pair of counters.
 * NOTE(review): `n` / `m` look like the usual "used / capacity" pair, and the entries
 * presumably describe the targets seen for one query read — confirm at the call sites.
 */
typedef struct {
    // uint32_t qn;
    uint32_t *tn;
    uint8_t *is_match;
    int n;  // n and m apply to both tn and is_match
    int m;
} ovecinfo_t;

/** Array of ovecinfo_t entries (`a`) with its own n / m counters. */
typedef struct {
    uint64_t n;
    uint64_t m;
    ovecinfo_t *a;
} ovecinfo_v;
// #define hamt_ov_eq(a, b) ((a) == (b))
// #define hamt_ov_hash(a) ((a))
// KHASHL_SET_INIT(static klib_unused, hamt_ov_t, hamt_ov, uint64_t, hamt_ov_hash, hamt_ov_eq)
/**
 * A trail of subgraph IDs for unitigs: `a` holds the IDs.
 * NOTE(review): `n` / `m` look like the usual "used / capacity" pair — confirm.
 */
typedef struct {
    int n;
    int m;
    // uint32_t readID1; // without the direction bit
    // uint32_t readID2;
    int *a;
} ma_utg_subglabels_t;

/** Array of subgraph-label trails (`a`) with its own n / m counters. */
typedef struct {
    int n;
    int m;
    ma_utg_subglabels_t *a;
} ma_utg_subg_labels_v;
/// Central store for all reads: sequences, names, per-read metadata and overlap lists.
/// The accessor macros in this header (Get_READ, Get_READ_LENGTH, Get_NAME, Get_NAME_LENGTH)
/// index read_sperate, read_length, name and name_index by read ID.
/// NOTE(review): the per-read arrays are presumably total_reads long — confirm in the
/// allocation code; only read2ugID and coasm_sampleID state their length explicitly.
typedef struct
{
uint64_t** N_site;
///uint8_t* read;
/// all read names in one buffer; a read's name starts at name + name_index[ID]
char* name;
/// one sequence buffer per read (returned by Get_READ)
uint8_t** read_sperate;
/// per-read length (returned by Get_READ_LENGTH)
uint64_t* read_length;
/// NOTE(review): presumably the allocated size of each read_sperate[ID] buffer — confirm
uint64_t* read_size;
/// NOTE(review): presumably one of AMBIGU/FATHER/MOTHER/MIX_TRIO/NON_TRIO/DROP per read — confirm
uint8_t* trio_flag;
///seq start pos in uint8_t* read
///do not need it
///uint64_t* index;
uint64_t index_size;
///name start pos in char* name
/// Get_NAME_LENGTH reads name_index[ID+1], so one entry past the last read is needed
uint64_t* name_index;
uint64_t name_index_size;
uint64_t total_reads;
uint64_t total_reads_bases;
uint64_t total_name_length;
Compressed_Cigar_record* cigars;
Compressed_Cigar_record* second_round_cigar;
/// overlap lists; ma_hit_t_alloc is declared outside this header
ma_hit_t_alloc* paf;
ma_hit_t_alloc* reverse_paf;
///meta
uint64_t hamt_stat_buf_size;
double* mean;
double* std;
uint16_t* median;
uint16_t* lowq; // lower 10 quantile
uint8_t* mask_readnorm; // bit flag, whether the read is discarded
uint8_t* mask_readtype; // bit flag
uint64_t *statpack; // pack stat with readID for sorting
uint64_t *nb_target_reads; // experimental! guessed number of candidates, for triggering read selection
int is_has_nothing, is_has_lengths, is_all_in_mem;
uint16_t *nb_error_corrected; // collect number of errors corrected during ovec
ovecinfo_v OVEC_INF;
int *read2ugID; // an array of length total_reads (not 2*total_reads); -1 means the read is not start/end of a unitig; otherwise the value is unitig's stable ID
ma_utg_subg_labels_v *subg_label_trail; // size of nb_unitig
uint16_t *coasm_sampleID; // an array of length total_reads
///kvec_t_u64_warp* pb_regions;
} All_reads;
/// The global read store; defined in a source file, only declared here.
extern All_reads R_INF;
// extern ovecinfo_v OVEC_INF;
/**
 * A single read held as a plain character buffer.
 * NOTE(review): `length` / `size` look like "used / allocated" byte counts for `seq`,
 * and `RID` like the read's ID in All_reads — confirm against recover_UC_Read().
 */
typedef struct
{
    char* seq;
    long long length;
    long long size;
    long long RID;
} UC_Read;
/// State used with init_Debug_reads() / destory_Debug_reads().
/// NOTE(review): presumably a list of read names to trace (`read_name`, `query_num` entries),
/// an output stream (`fp`) and a mutex serialising writes to it — confirm in the source file.
/// NOTE(review): FILE and pthread_mutex_t need <stdio.h> and <pthread.h>, and
/// kvec_t_u64_warp is a project type; none is declared or included directly by this
/// header, so they must arrive through "Overlaps.h" / "CommandLines.h" — confirm.
typedef struct
{
char** read_name;
uint64_t query_num;
kvec_t_u64_warp* candidate_count;
FILE* fp;
pthread_mutex_t OutputMutex;
} Debug_reads;
/// ---- All_reads lifecycle and filling ----
/// NOTE(review): the definitions live in a source file not shown here; the behaviour
/// of every function below has to be confirmed there.
void init_All_reads(All_reads* r);
void reset_All_reads(All_reads *r); // hamt
void malloc_All_reads(All_reads* r);
void ha_insert_read_len(All_reads *r, int read_len, int name_len);
void ha_compress_base(uint8_t* dest, char* src, uint64_t src_l, uint64_t** N_site_lis, uint64_t N_site_occ);
/// ---- UC_Read helpers ----
/// The pointer parameter named R_INF in the functions below shadows the global
/// `All_reads R_INF` declared in this header. Only recover_UC_Read takes it as const.
void init_UC_Read(UC_Read* r);
void recover_UC_Read(UC_Read* r, const All_reads *R_INF, uint64_t ID);
void recover_UC_Read_RC(UC_Read* r, All_reads* R_INF, uint64_t ID);
void recover_UC_Read_sub_region(char* r, long long start_pos, long long length, uint8_t strand, All_reads* R_INF, long long ID);
/// "destory" (sic) is the spelling used by the public names; do not "fix" it here alone.
void destory_UC_Read(UC_Read* r);
void reverse_complement(char* pattern, uint64_t length);
/// ---- Saving / loading the read store ----
/// NOTE(review): the meaning of the int return values is not visible here — confirm.
void write_All_reads(All_reads* r, char* read_file_name);
int load_All_reads(All_reads* r, char* read_file_name);
void destory_All_reads(All_reads* r);
int destory_read_bin(All_reads* r);
/// ---- hamt ovecinfo ----
/// NOTE(review): the empty parameter list below is not a prototype in C (arguments are
/// unchecked); consider `(void)` together with the definition.
void hamt_ovecinfo_init();
void hamt_ovecinfo_destroy(ovecinfo_v *v);
void hamt_ovecinfo_debugdump(hifiasm_opt_t *opt);
void hamt_ovecinfo_write_to_disk(hifiasm_opt_t *opt);
void hamt_ovecinfo_load_from_disk(hifiasm_opt_t *opt);
/// ---- Debug helpers ----
void init_Debug_reads(Debug_reads* x, const char* file);
void destory_Debug_reads(Debug_reads* x);
void recover_UC_sub_Read(UC_Read* i_r, long long start_pos, long long length, uint8_t strand, All_reads* R_INF, long long ID);
#endif