-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfq-guess-encoding.pl
executable file
·128 lines (112 loc) · 3.1 KB
/
fq-guess-encoding.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env perl
use strict;
use warnings;
use Fatal;
use List::Util qw(sum min max);
use Data::Dumper;
use IO::File;
my(@Options, $verbose, $numreads, $mode);
setOptions();
my $maxL = 0;
my @sum;
my @cnt;
FILE:
for my $file (@ARGV) {
my $nseq=0;
my $fh = open_maybe_compressed($file);
while (my $id = $fh->getline) {
last FILE if ++$nseq > $numreads;
if ($id =~ m/^\@/) {
$fh->getline; # skip dna
$fh->getline; # skip id2
my $qs = $fh->getline;
my @qv = split m//, $qs;
@qv = map { ord } @qv;
$maxL = max($maxL, scalar @qv);
for my $i (0 .. $#qv) {
$sum[$i] += $qv[$i];
$cnt[$i] += 1;
}
}
else {
die "File '$file' does not look like FASTQ";
}
}
}
# convert to average pseudo-Q values (need to subtract offset still)
for my $i (0 .. $maxL-1) {
$sum[$i] ||= 0;
$sum[$i] /= $cnt[$i];
}
# Guess ASCII encoding offset
# Sanger = 33 # Solexa = 59 # Illumina = 64
my %name_of = (33=>'Sanger', 59=>'Solexa', 64=>'Illumina');
my $minq = min(@sum);
my $offset = $minq < 59 ? 33 : ($minq < 65 ? 59 : 64);
print STDERR "Guessed phred offset = $offset ($name_of{$offset})\n" if $verbose;
my $stdout = '';
if ($mode eq 'offset') {
$stdout = $offset;
}
elsif ($mode eq 'bowtie2') {
$stdout = "--phred-$offset";
}
elsif ($mode eq 'nesoni') {
$stdout = "--qv-offset $offset";
}
elsif ($mode eq 'fqatools') {
$stdout = "-o $offset";
}
elsif ($mode eq 'fastx') {
$stdout = "-Q $offset";
}
else {
die "unknown mode '$mode'";
}
print STDOUT "$stdout\n";
#----------------------------------------------------------------------
sub open_maybe_compressed {
my($fname) = @_;
my @try = ("pbzip2 -dc", "pigz -dc", "bzip2 -dc", "gzip -dc", "cat");
my $fh;
for my $exe (@try) {
print STDERR "Trying: $exe $fname\n" if $verbose;
my $io = IO::File->new("$exe \Q$fname\E 2> /dev/null |");
my $c = $io->getc;
if (defined $c) {
$io->ungetc(ord $c); # is ord() needed here?
print STDERR "open_maybe_compressed: using $exe to read $fname\n" if $verbose;
return $io;
}
}
die "could not open $fname";
}
#----------------------------------------------------------------------
# Option setting routines
sub setOptions {
use Getopt::Long;
@Options = (
{OPT=>"help", VAR=>\&usage, DESC=>"This help"},
{OPT=>"verbose!", VAR=>\$verbose, DEFAULT=>0, DESC=>"Verbose output"},
{OPT=>"numreads=i", VAR=>\$numreads, DEFAULT=>10_000, DESC=>"Sample this many reads"},
{OPT=>"mode=s", VAR=>\$mode, DEFAULT=>'offset', DESC=>"Mode: offset fastx bowtie2 nesoni fqatools"},
);
(!@ARGV) && (usage());
&GetOptions(map {$_->{OPT}, $_->{VAR}} @Options) || usage();
# Now setup default values.
foreach (@Options) {
if (defined($_->{DEFAULT}) && !defined(${$_->{VAR}})) {
${$_->{VAR}} = $_->{DEFAULT};
}
}
}
sub usage {
my $exe = $0;
$exe =~ s{^.*/}{};
print STDERR "Usage: $exe <readfile[.gz][.bz2] ...>\n";
foreach (@Options) {
printf STDERR " --%-13s %s%s.\n",$_->{OPT},$_->{DESC},
defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
}
exit(1);
}