make_orf_bed.py

#! /usr/bin/env python

import argparse
import os
import pandas as pd

parser = argparse.ArgumentParser(description='Output a BED file of the final ORF ratings generated by rate_regression_output.py')
parser.add_argument('--inbed', default='transcripts.bed', help='Transcriptome BED-file (Default: transcripts.bed)')
parser.add_argument('--outbed', default='ratedorfs.bed',
                    help='Output BED-file with translated ORF identifications. IDs (fourth column) are based on the available gene name info, genome '
                         'coordinate, and protein length; scores (fifth column) are the assigned ORF ratings. (Default: ratedorfs.bed)')
parser.add_argument('--ratingsfile', default='orfratings.h5',
                    help='Path to pandas HDF store containing ORF ratings; generated by rate_regression_output.py (Default: orfratings.h5)')
parser.add_argument('--minrating', type=float, default=0.8, help='Minimum ORF rating to be included in the BED file (Default: 0.8)')
parser.add_argument('--minlen', type=int, default=0, help='Minimum ORF length (in amino acids) to be included in the BED file (Default: 0)')
parser.add_argument('-c', '--color', type=str, default='', nargs='?', const='Blues',
                    help='Color ORFs by their rating (requires brewer2mpl). May be followed by the name of the "Sequential" ColorBrewer library to '
                         'use (e.g. "Greens", "Greys"; default "Blues"). Colors assigned by interpolating nine colors between 0.1 and 1.')
parser.add_argument('-f', '--force', action='store_true', help='Force file overwrite')
opts = parser.parse_args()

if not opts.force and os.path.exists(opts.outbed):
    raise IOError('%s exists; use --force to overwrite' % opts.outbed)

if opts.color:
    import brewer2mpl
    mycolors = brewer2mpl.get_map(opts.color.title(), 'Sequential', 9).colors

# hash transcripts by ID for easy reference later
with open(opts.inbed, 'rU') as inbed:
    bedlinedict = {line.split()[3]: line for line in inbed}

ratedorfs = pd.read_hdf(opts.ratingsfile, 'orfratings', mode='r', columns=['orfname', 'tid', 'gcoord', 'gstop', 'strand', 'orfrating'],
                        where="orfrating >= %f and AAlen >= %d" % (opts.minrating, opts.minlen))
if ratedorfs.empty:
    raise IOError('No ORFs of minimum rating %f and length %d identified!' % (opts.minrating, opts.minlen))

with open(opts.outbed, 'w') as outbed:
    for (orfname, tid, gcoord, gstop, strand, orfrating) in ratedorfs[['orfname', 'tid', 'gcoord', 'gstop', 'strand', 'orfrating']].itertuples(False):
        ls = bedlinedict[tid].split()
        ls[3] = orfname
        ls[4] = str(int(round(orfrating*1000)))
        if strand == '+':
            ls[6] = str(gcoord)
            ls[7] = str(gstop)
        else:
            ls[6] = str(gstop+1)
            ls[7] = str(gcoord+1)
        if opts.color:
            ls[8] = ','.join([str(x) for x in mycolors[max(min(int(orfrating*10)-1, 8), 0)]])
            # int() acts as floor, so this splits ratings into 0.1-0.1999, 0.2-0.2999, etc
        outbed.write('\t'.join(ls)+'\n')