Skip to content

Commit

Permalink
Create tbl-from-gff-complete-genome.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
frbot authored Jun 23, 2020
1 parent 5a90688 commit d6bc0e7
Showing 1 changed file with 70 additions and 0 deletions.
70 changes: 70 additions & 0 deletions tbl-from-gff-complete-genome.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@

#! /bin/sh
#Francesca Bottacini
#[email protected]
#
# Build an NCBI-style feature table and a FASTA copy from a GFF annotation.
#
# Usage:   tbl-from-gff-complete-genome.sh <input> [proteinID]
#
#   <input>      file prefix; <input>.gff and <input>.nucleotide are read
#   [proteinID]  database name placed in CDS protein_id values
#                (gnl|<proteinID>|<ID>)
#
# Outputs: <input>.tbl  feature table, first line ">Feature <input>"
#          <input>.fsa  copy of <input>.nucleotide
#
# Exit status: 0 on success, 1 on I/O failure, 2 on usage error.
#
# Requires GNU sed (\t and \n escapes, \+ and \|, one-line "1i text").

# Print a usage summary on stderr.
usage() {
  printf 'Usage: %s <input> [proteinID]\n' "${0##*/}" >&2
}

# Write the feature table for one GFF file to stdout.
#   $1  path of the GFF file
#   $2  name used in the ">Feature" header line
#   $3  proteinID (may be empty)
build_feature_table() {
  # Escape the characters that are special in a sed replacement so the
  # proteinID is always inserted literally.
  pid=$(printf '%s' "$3" | sed 's,[\\/&],\\&,g')

  # 1. keep only records mentioning "ID" (as the original grep did);
  # 2. split on tabs so attribute values keep their spaces and colons,
  #    emit "start end type attributes" for "+" records first, then
  #    "end start type attributes" for "-" records;
  # 3. turn the attribute list into one qualifier per line: a CDS whose
  #    attributes start with ID= gets protein_id instead, every other ID=
  #    key becomes locus_tag, and key=value becomes key<TAB>value;
  # 4. per output line: drop anything from ">" on, prepend the header,
  #    default empty products, strip the trailing tab left after the
  #    feature key, and tidy spaces.
  grep -e 'ID' -- "$1" |
    awk -F '\t' -v OFS='\t' '
      $7 == "+" { print $4, $5, $3, $9 }
      $7 == "-" { rev[++n] = $5 OFS $4 OFS $3 OFS $9 }
      END { for (i = 1; i <= n; i++) print rev[i] }
    ' |
    sed -e "s/CDS\tID=/CDS\tprotein_id=gnl|${pid}|/g" \
        -e 's/;/\n\t\t\t/g' \
        -e 's/ID=/\n\t\t\tlocus_tag=/g' \
        -e 's/protein_id=/\n\t\t\tprotein_id=/g' \
        -e 's/=/\t/g' |
    sed -e 's/>.*//' \
        -e "1i >Feature $2" \
        -e 's/product\t$/product\thypothetical protein/' \
        -e 's/\(CDS\|gene\|tRNA\|rRNA\)\t$/\1/' \
        -e 's/ \+/ /g' \
        -e 's/ \t/\t/g'
}

# Entry point: validate arguments, then create <input>.tbl and <input>.fsa.
main() {
  if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    usage
    return 2
  fi

  #specify input
  input=$1
  proteinID=${2-}
  gff="${input}.gff"
  fasta="${input}.nucleotide"

  for required in "$gff" "$fasta"; do
    if [ ! -r "$required" ]; then
      printf '%s: cannot read %s\n' "${0##*/}" "$required" >&2
      return 1
    fi
  done

  if [ -z "$proteinID" ]; then
    printf '%s: warning: no proteinID given, protein_id will be gnl||<ID>\n' \
      "${0##*/}" >&2
  fi

  # Build the table under a private name and rename it only on success, so
  # a failed run never leaves a truncated <input>.tbl behind.
  tbl_partial="${input}.tbl.$$"
  if build_feature_table "$gff" "$input" "$proteinID" > "$tbl_partial" &&
     cp -- "$fasta" "${input}.fsa" &&
     mv -- "$tbl_partial" "${input}.tbl"; then
    return 0
  fi

  printf '%s: failed to create %s.tbl / %s.fsa\n' \
    "${0##*/}" "$input" "$input" >&2
  rm -f -- "$tbl_partial"
  return 1
}

main "$@"

0 comments on commit d6bc0e7

Please sign in to comment.