4
4
#include <stdlib.h>
5
5
#include <zlib.h>
6
6
#define SHOW_USAGE () {\
7
- printf( "\nSHort-read Iterative Trimmer (SHI7) v0.92d , by Gabe. Usage:\n");\
7
+ printf( "\nSHort-read Iterative Trimmer (SHI7) v0.92f , by Gabe. Usage:\n");\
8
8
printf( "shi7_trimmer in_seqs.fastq out_prefix MINLEN QUAL <QUAL_RIGHT> ...\n");\
9
9
printf( "Choose one of the following optional trimming methods:\n");\
10
10
printf( " [ROLLING X [PREROLL/POSTROLL]] For rolling average over X bases\n" );\
@@ -105,28 +105,29 @@ int main(int argc, char *argv[]) {
105
105
setvbuf (out2 ,0 ,_IOFBF ,1 <<20 );
106
106
if (!in || !out || (pair && (!in2 || !out2 ))) {
107
107
puts ("Can't open input or output file(s)!" ); exit (1 );}
108
- char * Head = malloc (UINT16_MAX ), * Seq = malloc (UINT16_MAX ),
109
- * Shi7 = malloc (UINT16_MAX ), * Qual = malloc (UINT16_MAX ),
108
+ long maxLen = 16e6 ; // 16MB should be enough for anyone.
109
+ char * Head = malloc (maxLen ), * Seq = malloc (maxLen ),
110
+ * Shi7 = malloc (maxLen ), * Qual = malloc (maxLen ),
110
111
* Head2 = 0 , * Seq2 = 0 , * Shi72 = 0 , * Qual2 = 0 ;
111
- if (pair ) Head2 = malloc (UINT16_MAX ), Seq2 = malloc (UINT16_MAX ),
112
- Shi72 = malloc (UINT16_MAX ), Qual2 = malloc (UINT16_MAX );
112
+ if (pair ) Head2 = malloc (maxLen ), Seq2 = malloc (maxLen ),
113
+ Shi72 = malloc (maxLen ), Qual2 = malloc (maxLen );
113
114
if (palincut && !pair ) puts ("WARNING: Can't do palincut on SE reads!" );
114
115
long crap = 0 , decency = 0 , totI = 0 , totJ = 0 , totLen = 0 , length = 0 , length2 = LONG_MAX ;
115
116
int * Scores = calloc (doRoll ,sizeof (* Scores ));
116
- while (Head = gzgets (in ,Head ,UINT16_MAX )) {
117
- Seq = gzgets (in ,Seq , UINT16_MAX );
118
- Shi7 = gzgets (in ,Shi7 , UINT16_MAX );
119
- Qual = gzgets (in ,Qual , UINT16_MAX );
117
+ while (Head = gzgets (in ,Head ,maxLen )) {
118
+ Seq = gzgets (in ,Seq , maxLen );
119
+ Shi7 = gzgets (in ,Shi7 , maxLen );
120
+ Qual = gzgets (in ,Qual , maxLen );
120
121
length = strlen (Qual );
121
122
//printf("Seq: %s qual: %s len %d\n",Seq,Qual,length);
122
123
if (Qual [length - 1 ]== '\n' ) -- length ;
123
124
if (Qual [length - 1 ]== '\r' ) -- length ;
124
125
//printf("--> Length is now %d\n",length);
125
126
if (pair ) {
126
- Head2 = gzgets (in2 ,Head2 , UINT16_MAX );
127
- Seq2 = gzgets (in2 ,Seq2 , UINT16_MAX );
128
- Shi72 = gzgets (in2 ,Shi72 , UINT16_MAX );
129
- Qual2 = gzgets (in2 ,Qual2 , UINT16_MAX );
127
+ Head2 = gzgets (in2 ,Head2 , maxLen );
128
+ Seq2 = gzgets (in2 ,Seq2 , maxLen );
129
+ Shi72 = gzgets (in2 ,Shi72 , maxLen );
130
+ Qual2 = gzgets (in2 ,Qual2 , maxLen );
130
131
length2 = strlen (Qual2 );
131
132
if (Qual2 [length2 - 1 ]== '\n' ) -- length2 ;
132
133
if (Qual2 [length2 - 1 ]== '\r' ) -- length2 ;
@@ -144,7 +145,7 @@ int main(int argc, char *argv[]) {
144
145
if (pair && doAdap2 ) {
145
146
adMin = adMin < endpt ? adMin : endpt ; // truncate both reads where first adap found
146
147
// Truncate at partial matches up to 2 bases near the end
147
- if (!r && !adap ) // Only run on R1; second read is redundant due to equality
148
+ if (!r && !adap ) // Only run on R1; second read is redundant due to assumed equality of length (!)
148
149
for (int w = * TLen - (AdapLens [z ] < * TLen ? AdapLens [z ] : 0 ) + 1 ; w < * TLen - 1 ; ++ w ) {
149
150
if (!memcmp (Adaps [z ],TSeq + w ,* TLen - w ) && !memcmp (Seq + w ,Seq2 + w ,* TLen - w ))
150
151
{adMin = adMin < w ? adMin : w ; break ;}
@@ -249,22 +250,21 @@ int main(int argc, char *argv[]) {
249
250
// Regurgitate the sequence(s). They're (prolly) OK enough if they made it this far down.
250
251
for (int z = L ; z < R ; z ++ ) Seq [z ] = Qual [z ] - 33 > nify ? Seq [z ] : 'N' ;
251
252
Seq [R ]= 0 ; Qual [R ]= '\n' , Qual [R + 1 ] = 0 ;
252
- //printf(" ------> seq [%d] R1 '%s', L = %d, R = %d, len %d\n",crap+decency,Seq,L,R, R-L);
253
- if (fasta_out ) Head [L ]= '>' , Shi7 [0 ]= 0 , Qual [L ]= 0 ;
253
+ if (fasta_out ) Head [0 ]= '>' , Shi7 [0 ]= 0 , Qual [L ]= 0 ;
254
254
if (strip ) fprintf (out ,"%c%d\n%s\n%s%s" ,fasta_out ? '>' :'@' ,crap + decency ,
255
255
Seq + L ,fasta_out ? "" :"+\n" ,Qual + L );
256
256
else fprintf (out ,"%s%s\n%s%s" ,Head ,Seq + L ,Shi7 ,Qual + L );
257
257
if (pair ) {
258
258
for (int z = L2 ; z < R2 ; z ++ ) Seq2 [z ] = Qual2 [z ] - 33 > nify ? Seq2 [z ] : 'N' ;
259
259
Seq2 [R2 ]= 0 ; Qual2 [R2 ]= '\n' ; Qual2 [R2 + 1 ]= 0 ;
260
- if (fasta_out ) Head2 [L2 ]= '>' , Shi72 [0 ]= 0 , Qual2 [L2 ]= 0 ;
260
+ if (fasta_out ) Head2 [0 ]= '>' , Shi72 [0 ]= 0 , Qual2 [L2 ]= 0 ;
261
261
if (strip ) fprintf (out2 ,"%c%d\n%s\n%s%s" ,fasta_out ? '>' :'@' ,crap + decency ,
262
262
Seq2 + L2 ,fasta_out ? "" :"+\n" ,Qual2 + L2 );
263
263
else fprintf (out2 ,"%s%s\n%s%s" ,Head2 ,Seq2 + L2 ,Shi72 ,Qual2 + L2 );
264
264
}
265
265
++ decency ;
266
266
totLen += (R - L ) + (R2 - L2 );
267
- totI += L + L2 , totJ += (length - R ) + (length2 - R2 );
267
+ totI += L + L2 , totJ += (length - R ) + (pair ? ( length2 - R2 ) : 0 );
268
268
269
269
HELL :NULL ; // HEREIN LIE DEVILS
270
270
if (decency >= head ) break ;
0 commit comments