forked from alexfields/ORF-RATER
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_orf_bed.py
executable file
·52 lines (45 loc) · 3.02 KB
/
make_orf_bed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#! /usr/bin/env python
import argparse
import os
import pandas as pd
parser = argparse.ArgumentParser(description='Output a BED file of the final ORF ratings generated by rate_regression_output.py')
parser.add_argument('--inbed', default='transcripts.bed', help='Transcriptome BED-file (Default: transcripts.bed)')
parser.add_argument('--outbed', default='ratedorfs.bed',
help='Output BED-file with translated ORF identifications. IDs (fourth column) are based on the available gene name info, genome '
'coordinate, and protein length; scores (fifth column) are the assigned ORF ratings. (Default: ratedorfs.bed)')
parser.add_argument('--ratingsfile', default='orfratings.h5',
help='Path to pandas HDF store containing ORF ratings; generated by rate_regression_output.py (Default: orfratings.h5)')
parser.add_argument('--minrating', type=float, default=0.8, help='Minimum ORF rating to be included in the BED file (Default: 0.8)')
parser.add_argument('--minlen', type=int, default=0, help='Minimum ORF length (in amino acids) to be included in the BED file (Default: 0)')
parser.add_argument('-c', '--color', type=str, default='', nargs='?', const='Blues',
help='Color ORFs by their rating (requires brewer2mpl). May be followed by the name of the "Sequential" ColorBrewer library to '
'use (e.g. "Greens", "Greys"; default "Blues"). Colors assigned by interpolating nine colors between 0.1 and 1.')
parser.add_argument('-f', '--force', action='store_true', help='Force file overwrite')
opts = parser.parse_args()
if not opts.force and os.path.exists(opts.outbed):
raise IOError('%s exists; use --force to overwrite' % opts.outbed)
if opts.color:
import brewer2mpl
mycolors = brewer2mpl.get_map(opts.color.title(), 'Sequential', 9).colors
# hash transcripts by ID for easy reference later
with open(opts.inbed, 'rU') as inbed:
bedlinedict = {line.split()[3]: line for line in inbed}
ratedorfs = pd.read_hdf(opts.ratingsfile, 'orfratings', mode='r', columns=['orfname', 'tid', 'gcoord', 'gstop', 'strand', 'orfrating'],
where="orfrating >= %f and AAlen >= %d" % (opts.minrating, opts.minlen))
if ratedorfs.empty:
raise IOError('No ORFs of minimum rating %f and length %d identified!' % (opts.minrating, opts.minlen))
with open(opts.outbed, 'w') as outbed:
for (orfname, tid, gcoord, gstop, strand, orfrating) in ratedorfs[['orfname', 'tid', 'gcoord', 'gstop', 'strand', 'orfrating']].itertuples(False):
ls = bedlinedict[tid].split()
ls[3] = orfname
ls[4] = str(int(round(orfrating*1000)))
if strand == '+':
ls[6] = str(gcoord)
ls[7] = str(gstop)
else:
ls[6] = str(gstop+1)
ls[7] = str(gcoord+1)
if opts.color:
ls[8] = ','.join([str(x) for x in mycolors[max(min(int(orfrating*10)-1, 8), 0)]])
# int() acts as floor, so this splits ratings into 0.1-0.1999, 0.2-0.2999, etc
outbed.write('\t'.join(ls)+'\n')