-
Notifications
You must be signed in to change notification settings - Fork 0
/
tbl-from-gff-complete-genome.sh
70 lines (53 loc) · 1.96 KB
/
tbl-from-gff-complete-genome.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#! /bin/sh
#Francesca Bottacini
#
# Build a tab-delimited feature table (<input>.tbl) and a copy of the
# nucleotide sequence (<input>.fsa) from <input>.gff and <input>.nucleotide.
#
# Usage:   tbl-from-gff-complete-genome.sh <input-prefix> <proteinID>
#   $1  input prefix; <prefix>.gff and <prefix>.nucleotide are read from the
#       current directory
#   $2  tag inserted into CDS protein ids as gnl|<proteinID>|<ID value>
#       (not enforced, so existing callers that omit it keep working)
# Output:  <prefix>.tbl and <prefix>.fsa written to the current directory.
# Needs:   GNU sed (-i, \t and \n escapes, \+) and awk.
#
# NOTE: $proteinID and $input are interpolated into sed scripts below, so
# they must not contain '/', '&', '\' or newlines.

#specify input
input="$1"
proteinID="$2"

# Without a prefix, "$input"* would match every file and the script would
# operate on a file literally named ".gff".
if [ -z "$input" ]; then
  printf 'Usage: %s <input-prefix> <proteinID>\n' "${0##*/}" >&2
  exit 2
fi

# The table cannot be built without the GFF; fail early instead of emitting
# an empty .tbl.
if [ ! -r "$input.gff" ]; then
  printf '%s: cannot read %s\n' "${0##*/}" "$input.gff" >&2
  exit 1
fi

#Prepare directory
# Refuse to reuse an existing ./tmp: it is removed with rm -fr at the end,
# which would destroy someone else's data.
mkdir tmp || {
  printf '%s: cannot create working directory ./tmp\n' "${0##*/}" >&2
  exit 1
}
cp -- "$input"* tmp/
# Everything below writes scratch files and finally moves results to ../,
# so it must not run if the directory change failed.
cd tmp || {
  printf '%s: cannot enter working directory ./tmp\n' "${0##*/}" >&2
  rm -fr tmp
  exit 1
}
cp -- "$input.nucleotide" "$input.fsa"

#Extract features without sequence from gff file
# Keep only lines that contain "ID".
grep "ID" "$input.gff" > sequintmp
# Turn spaces into ':' so that awk's whitespace field splitting keeps each
# attribute column in one piece.
sed -i "/ID/ s/ /:/g" sequintmp

#Process forward and reverse strand ORFs
awk '$7 == "+"' sequintmp > sequinfw
awk '$7 == "-"' sequintmp > sequinrv
# Columns emitted: start, end, feature type, attributes.
# For the reverse strand the two coordinates are swapped.
awk '{print $4,$5,$3,$9}' sequinfw > sequintblfw
awk '{print $5,$4,$3,$9}' sequinrv > sequintblrv
cat sequintblfw sequintblrv > sequintbl
# Each pass converts the first remaining space on a line into a tab; three
# passes cover the three separators awk printed between the four fields.
sed -i "/ / s/ /\t/" sequintbl
sed -i "/ / s/ /\t/" sequintbl
sed -i "/ / s/ /\t/" sequintbl
sed -i "/ / s/ \t/\t/g" sequintbl

#Process products and insert proteinID
# Restore the spaces that were protected as ':' above.
# NOTE(review): this replaces every ':' on the line, including colons that
# were already present in the GFF attributes -- confirm this is intended.
sed -i "/ID/ s/:/ /g" sequintbl
# For CDS rows the ID attribute becomes protein_id=gnl|<proteinID>|<ID value>.
sed -i "/CDS/ s/CDS\tID=/CDS\tprotein_id=gnl\|${proteinID}\|/g" sequintbl
# Put every ';'-separated attribute on its own line, indented by five tabs.
sed -i "/;/ s/;/\n\t\t\t\t\t/g" sequintbl
# Move the ID / protein_id attribute off the coordinate line as well.
sed -i "/ID=/ s/ID\=/\n\t\t\t\t\tID\=/g" sequintbl
sed -i "/protein_id\=/ s/protein_id\=/\n\t\t\t\t\tprotein_id\=/g" sequintbl
# key=value -> key<TAB>value
sed -i "/=/ s/=/\t/g" sequintbl

#Format tbl file with tabs and returns
mv sequintbl "$input.sequin"
# Join the whole file into a single line, using ';' in place of newlines.
sed ':a;{N;s/\n/;/};ba' "$input.sequin" > "$input.tmptbl"
# NOTE(review): the pattern below contains \n, but the input has just been
# joined into one line, so this substitution looks like it can never match
# and the command only copies .tmptbl to .tbl -- confirm before removing.
sed -e "s/CDS\n\t\t\t\t\tID\t/CDS\t;\t\t\t\t\tprotein_id\tgnl\|${proteinID}\|/g" "$input.tmptbl" > "$input.tbl"
# Split the joined line back into one record per line.
sed -i "/;/ s/;/\n/g" "$input.tbl"
# Rename the ID qualifier (every occurrence of the text "ID") to locus_tag.
sed -i "/ID/ s/ID/locus_tag/g" "$input.tbl"
# Qualifier lines are indented with three tabs instead of five.
sed -i "/\t/ s/\t\t\t\t\t/\t\t\t/g" "$input.tbl"
# Drop everything from a '>' to the end of the line.
sed -i "/>/ s/>.*//g" "$input.tbl"
# Insert the feature-table header as the first line.
sed -i "1i >Feature ${input}" "$input.tbl"

#Add hypotheticals to non predicted proteins (although there should be none)
sed -i "/product/ s/product\t$/product\thypothetical protein/g" "$input.tbl"
# Strip the trailing tab left after a feature key that ends its line.
sed -i "/CDS/ s/CDS\t$/CDS/g" "$input.tbl"
sed -i "/gene/ s/gene\t$/gene/g" "$input.tbl"
sed -i "/tRNA/ s/tRNA\t$/tRNA/g" "$input.tbl"
sed -i "/rRNA/ s/rRNA\t$/rRNA/g" "$input.tbl"
# Collapse runs of spaces to one and remove a space directly before a tab.
sed -i "s/ \+/ /g" "$input.tbl"
sed -i "s/ \t/\t/g" "$input.tbl"

#Move files and cleanup
mv -- "$input.tbl" ../
mv -- "$input.fsa" ../
# Only remove tmp when we are certain we are back in its parent directory.
cd .. || exit 1
rm -fr tmp