-
Notifications
You must be signed in to change notification settings - Fork 65
/
Copy pathadd_annotations_to_myva.pl
executable file
·169 lines (148 loc) · 4.74 KB
/
add_annotations_to_myva.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/perl
# Fuction : To add annotations to COG aa file
# Author : Wei Shen
# Email : [email protected]
# Date : 2011-04-08, cost 2 hour.
# Last Update : 2011-04-08
# Annotations are in following files downloaded from
# ftp://ftp.ncbi.nlm.nih.gov/pub/COG/COG
#
# FILE DATA(* means important)
# *fun.txt function_id(one letter) -> function
# *myva protein_id*, aa sequence*
# *myva=gb protein_id -> GI numbers*
# *whog Organism(three letters)*, protein_id,
# function_id, cog_id, protein*
# *org.txt Organism(three letters) -> detail
#
# Output format
# > <protein_id>_<protein>_<function>_<Organism>_<GI numbers>
# aa sequence
#
# Therefore, five files with sign * will be used
#
# Attention:
# 1. NOT all protein_ids from file myva=gb could be found in file whog
# 2. NOT all protein_ids from file myva could be found in file whog
use strict;
# parse file fun
my $fun = &parse_file_fun('fun.txt');
# parse file myva=gb
my $pro_gi = &parse_file_myva_gb('myva=gb');
# parse file org
my $org = &parse_file_org('org.txt');
# parse file whog
my $whog = &parse_file_whog('whog');
# my @keys = keys %$whog;
# print "$_\n" unless $_ ~~ @keys for keys %$pro_gi;
# the result showed that not all protein_ids from file myva=gb
# are in file whog
# parse file myva and add annotation
my $file = 'myva';
my $out_file = "$file.full_annotation.txt";
my ($head, $seq, $pro_id_trim);
open IN, $file or die "File $file failed to open sequence.\n";
$/ = '>';<IN>;
open OUT, ">", $out_file or die "File $out_file failed to open sequence.\n";
while ( <IN> ) {
s/\r?\n>//;
( $head, $seq ) = split /\r?\n/, $_, 2;
## > <protein_id>_<protein>_<function>_<Organism>_<GI numbers>
$pro_id_trim = $head;
$pro_id_trim = $1 if $head =~ /(.+)\_\d+/; # for gi
$head = $head
. " __pro__". $$whog{$head}{protein}
. "__fun_". $$whog{$head}{fun_id}. "_". $$fun{$$whog{$head}{fun_id}}
. "__org__". $$org{$$whog{$head}{org_id}}{organism}
. "__gi__". $$pro_gi{$pro_id_trim};
print OUT ">$head\n$seq\n";
}
close IN;
close OUT;
#====================================================================
# out put data structure:
# $hash_ref = {fun_id => function}
sub parse_file_fun($){
my ($file) = @_;
my $fun = {};
open IN, $file or die "File $file failed to open\n";
while (<IN>) {
next unless /\[(\w)\] (.+) $/;
$$fun{$1} = $2;
}
close IN;
# print scalar keys %$fun;
return $fun;
}
# out put data structure:
# $hash_ref = {protein_id => gi}
sub parse_file_myva_gb($){
my ($file) = @_;
my $pro_gi = {};
open IN, $file or die "File $file failed to open\n";
while (<IN>) {
next unless /^(.+)\s+(.+)$/;
$$pro_gi{$1} = $2;
}
close IN;
# print scalar keys %$pro_gi;
return $pro_gi;
}
# out put data structure:
# $hash_ref = {org_id => {kindom => kindom, organism => organism} }
sub parse_file_org($){
my ($file) = @_;
my $org = {};
open IN, $file or die "File $file failed to open\n";
while (<IN>) {
next unless /^(\w{3})\s+\d+\s+(.+?)\s+(.+)$/;
$$org{$1}{kindom} = $2;
$$org{$1}{organism} = $3;
}
close IN;
# print scalar keys %$org;
return $org;
}
# out put data structure:
# $hash_ref = {protein_id => {org_id => org_id, cog_id => cog_id,
# fun_id => fun_id, protein => protein} }
sub parse_file_whog($){
my ($file) = @_;
my $whog = {};
my ($fun_id, $cog_id, $protein, $org_id, $protein_id, @protein_ids);
open IN, $file or die "File $file failed to open\n";
while (<IN>) {
if (/^\[(\w)\] (\w+) (.+)$/) {
$fun_id = $1;
$cog_id = $2;
$protein = $3;
}
elsif (/^\s+(\w{3})\:\s+(.+)$/) {
$org_id = $1;
@protein_ids = split /\s+/, $2;
for $protein_id (@protein_ids) {
$$whog{$protein_id}{fun_id} = $fun_id;
$$whog{$protein_id}{cog_id} = $cog_id;
$$whog{$protein_id}{protein} = $protein;
$$whog{$protein_id}{org_id} = $org_id;
}
}
elsif (/ (.+)/) {
@protein_ids = split /\s+/, $1;
for $protein_id (@protein_ids) {
$$whog{$protein_id}{fun_id} = $fun_id;
$$whog{$protein_id}{cog_id} = $cog_id;
$$whog{$protein_id}{protein} = $protein;
$$whog{$protein_id}{org_id} = $org_id;
}
}
elsif (/_______/) {
}
else {
}
}
close IN;
# print scalar keys %$whog;
# print "$_\t". ($$whog{'PH0109_1'}{$_}). "\n" for keys %{$$whog{'PH0109_1'}};
return $whog;
}