-
Notifications
You must be signed in to change notification settings - Fork 65
/
Copy pathfasta_locate_motif.pl
executable file
·124 lines (98 loc) · 3.14 KB
/
fasta_locate_motif.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env perl
# https://github.com/shenwei356/bio_scripts
use strict;
use Getopt::Long;
use File::Basename;
use BioUtil::Seq;
$0 = basename($0);
my $usage = <<USAGE;
$0 - locating motif in genomes.
Motifs could be EITHER plain sequence containing "ACTGN" OR regular
expression like "A[TU]G(?:.{3})+?[TU](?:AG|AA|GA)" for ORFs.
Degenerate bases like "RYMM.." are also supported by option -d/
Usage: $0 <motif fasta file> <subject fasta file>
Options:
-d, --degenerate Motif contains egenerate base
-h, --help Show this help information
Attention: In default, motifs are treated as regular expression.
When option -d given, regular expression may be wrong.
For example: "\\w" -> "\\[AT]". In this case you can use "\\.+?"
USAGE
my $args = {};
GetOptions(
'help|h' => \$$args{help},
'degenerate|d' => \$$args{degenerate},
) or die $usage;
die $usage if $$args{help};
die $usage unless @ARGV == 2;
my $queries = read_sequence_from_fasta_file( shift @ARGV );
my $next_seq = FastaReader( shift @ARGV );
print "subject\tquery\tstart\tend\tstrand\tmatched\n";
while ( my $fa = &$next_seq() ) {
my ( $header, $seq ) = @$fa;
for my $qname ( sort keys %$queries ) {
my $qseq = $$queries{$qname};
my $qseq_r = $qseq;
$qseq_r = degenerate_seq_to_regexp($qseq_r) if $$args{degenerate};
my $matches = match_regexp( $qseq_r, $seq );
for my $match (@$matches) {
my ( $start, $end, $matched ) = @$match;
$start += 1;
$end += 1;
print "$header\t$qname\t$start\t$end\t+\t$matched\n";
}
my $qseq_r = revcom($qseq);
$qseq_r = degenerate_seq_to_regexp($qseq_r) if $$args{degenerate};
my $matches = match_regexp( $qseq_r, $seq );
for my $match (@$matches) {
my ( $start, $end, $matched ) = @$match;
$start += 1;
$end += 1;
print "$header\t$qname\t$start\t$end\t-\t"
. revcom($matched) . "\n";
}
}
}
=head2 degenerate_seq_to_regexp
Translate degenerate sequence to regular expression.
=cut
sub degenerate_seq_to_regexp {
my ($seq) = @_;
my %bases = (
'A' => 'A',
'T' => 'T',
'U' => 'U',
'C' => 'C',
'G' => 'G',
'R' => '[AG]',
'Y' => '[CT]',
'M' => '[AC]',
'K' => '[GT]',
'S' => '[CG]',
'W' => '[AT]',
'H' => '[ACT]',
'B' => '[CGT]',
'V' => '[ACG]',
'D' => '[AGT]',
'N' => '[ACGT]',
);
return join '', map { exists $bases{$_} ? $bases{$_} : $_ }
split //, uc $seq;
}
=head2 match_regexp
Find all sites matching the regular expression.
See https://github.com/shenwei356/bio_scripts/blob/master/sequence/fasta_locate_motif.pl
=cut
sub match_regexp {
my ( $r, $s ) = @_;
my @matched = ();
my $pos = -1;
while ( $s =~ /($r)/ig ) {
$pos = pos $s;
# return start, end, matched string
# start and end are 0-based
push @matched, [ $pos - length($1), $pos - 1, $1 ];
pos $s = $pos - length($1) + 1;
}
return \@matched;
}