-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
00729a2
commit ed061be
Showing
2 changed files
with
121 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
#!/usr/bin/env perl | ||
# https://github.com/shenwei356/bio_scripts | ||
use strict; | ||
use Getopt::Long; | ||
use File::Temp qw/ tempfile/; | ||
use BioUtil::Seq; | ||
use BioUtil::Util; | ||
|
||
local $| = 1; | ||
|
||
my @GAPS = ( '-', '.' ); | ||
|
||
my $usage = <<USAGE; | ||
Remove common gaps in aligned fasta sequences. | ||
Reading from STDIN is supported. But in this case, sequences will be saved to | ||
disk temporarily. Because this script reads sequences twice to reduce memery | ||
usage. | ||
Usage: $0 [options] [aligned fasta file...] | ||
Options: | ||
-h, --help Show this help information | ||
-g, --gaps Gap symbols [-.] | ||
-l, --linelength Line length | ||
https://github.com/shenwei356/bio_scripts | ||
USAGE | ||
|
||
my $para = {}; | ||
$$para{linelength} = 70; | ||
|
||
GetOptions( | ||
'help|h' => \$$para{help}, | ||
'gaps|g=s' => \$$para{gaps}, | ||
'linelength|l=i' => \$$para{linelength}, | ||
) or die $usage; | ||
|
||
die $usage if $$para{help}; | ||
|
||
# gap symbols | ||
my %GAPSMAP = (); | ||
if ( $$para{gaps} ) { | ||
@GAPS = split //, $$para{gaps}; | ||
} | ||
$GAPSMAP{$_} = 1 for @GAPS; | ||
|
||
my $use_stdin = 0; | ||
my ( $tmp_file_fh, $tmp_file ) = (undef) x 2; | ||
|
||
my @files = (); | ||
for my $file (@ARGV) { | ||
for my $f ( glob $file ) { | ||
push @files, $f; | ||
} | ||
} | ||
if ( @files == 0 ) { | ||
push @files, 'STDIN'; | ||
( $tmp_file_fh, $tmp_file ) = tempfile(); | ||
$use_stdin = 1; | ||
} | ||
|
||
print STDERR "sequences from STDIN is saved in $tmp_file\n" if $use_stdin; | ||
print STDERR "check...\n"; | ||
|
||
my $gaploc = {}; # store the gap location | ||
my $do_once = 1; | ||
my ( $header, $seq, $len, $i, $base ) = (undef) x 5; | ||
my ($sum, $n) = (0) x 2; | ||
for my $file (@files) { | ||
my $next_seq = FastaReader($file); | ||
while ( my $fa = &$next_seq() ) { | ||
( $header, $seq ) = @$fa; | ||
$sum++; | ||
print STDERR "\r$sum"; | ||
if ($do_once) { | ||
$len = length $seq; | ||
$$gaploc{$_} = 1 for 0 .. ( $len - 1 ); | ||
$do_once = 0; | ||
} | ||
|
||
for $i ( 0 .. ( $len - 1 ) ) { | ||
$base = substr $seq, $i, 1; | ||
if ( $GAPSMAP{$base} != 1 ) { # it's not a gap! | ||
delete $$gaploc{$i}; | ||
} | ||
} | ||
|
||
die "no gap to trim.\n" if scalar keys %$gaploc == 0; | ||
|
||
print $tmp_file_fh ">$header\n$seq\n" if $use_stdin; | ||
} | ||
} | ||
|
||
close $tmp_file_fh if $use_stdin; | ||
|
||
print STDERR "\nextract sequences...\n"; | ||
|
||
@files = ($tmp_file) if $use_stdin; | ||
|
||
my @index = keys %$gaploc; | ||
for my $file (@files) { | ||
my $next_seq = FastaReader($file); | ||
while ( my $fa = &$next_seq() ) { | ||
( $header, $seq ) = @$fa; | ||
$n++; | ||
print STDERR "\r$n / $sum"; | ||
print ">$header\n", | ||
format_seq( delete_string_elements_by_indexes( \$seq, \@index ), | ||
$$para{linelength} ); | ||
} | ||
} | ||
|
||
print STDERR "\n"; | ||
|
||
if ($use_stdin) { | ||
print STDERR "remove $tmp_file\n"; | ||
unlink $tmp_file or die "fail to remove $tmp_file\n"; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters