Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merged with v0.1.9 and made some changes to work with other aligners #8

Merged
merged 13 commits into from
Feb 23, 2017
31 changes: 14 additions & 17 deletions src/main/python/MLE_regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,30 +259,27 @@ def main():
str_print += (isoform_names[i].ljust(len_ljust))
output_file.write(str_print + ('Total').ljust(20) + '\n')

if any(X_opt < 0):
N = sum(N)
L = sum(L)
#[X_opt] = [N / ((L) * 1e-3) / (NN * 1e-6)]
X_opt = N / (L) / NN * 1e9
if (num_isoforms == 1):
X_opt_list = [[X_opt]]
else:
X_opt_list = ['NA'] * num_isoforms
total_exp_level = X_opt
if any(X_opt < 0):
X_opt = 0
total_exp_level_print = 'NA'
X_opt_list_print = ['NA'] * num_isoforms
if (len(L) > 0):
#[X_opt] = [N / ((L) * 1e-3) / (NN * 1e-6)]
X_opt = sum(N) / sum(L) / (NN * 1e-9)
total_exp_level_print = str(round(X_opt, 4))
if (num_isoforms == 1):
X_opt_list_print = [str(round(X_opt, 4))]
#print 'Total Expression: ' + str(X_opt)
else:
#print('MLE computation: num_regions=%d, num_isoforms=%d, num_mapped_reads=%d' % (len(L), num_isoforms, int(sum(N))))
X_opt_list = X_opt.tolist()
total_exp_level = sum(X_opt_list)
X_opt_list_print = [str(round(exp[0], 4)) for exp in X_opt.tolist()]
total_exp_level_print = str(round(sum(X_opt.tolist()), 4))
str_print = ""

for i in range(num_isoforms):
len_ljust = max((len(isoform_names[i])/10 + 1) * 10, 20)
if (X_opt_list[i] == 'NA'):
str_print += (str(X_opt_list[i]).ljust(len_ljust))
else:
str_print += (str(round(X_opt_list[i][0], 4)).ljust(len_ljust))
output_file.write(str_print + str(round(total_exp_level, 4)).ljust(20))
str_print += (X_opt_list_print[i]).ljust(len_ljust)
output_file.write(str_print + total_exp_level_print.ljust(20))
output_file.write('\n')

input_file.close()
Expand Down
4 changes: 4 additions & 0 deletions src/main/python/blat_threading.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,18 @@ def GetPathAndName(pathfilename):
cat_bestpsl_cmd = "cat "
rm_SR_cmd = "rm "
rm_SRpsl_cmd = "rm "
rm_SRbestpsl_cmd = "rm "
for ext in ext_ls:
cat_bestpsl_cmd = cat_bestpsl_cmd + output_path + SR_filename + ext + ".bestpsl "
rm_SR_cmd = rm_SR_cmd + output_path + SR_filename + ext + ' '
rm_SRpsl_cmd = rm_SRpsl_cmd + output_path + SR_filename + ext + '.psl '
rm_SRbestpsl_cmd = rm_SRpsl_cmd + output_path + SR_filename + ext + '.bestpsl '
cat_bestpsl_cmd = cat_bestpsl_cmd + " > " + output_pathfilename
print cat_bestpsl_cmd
print rm_SR_cmd
print rm_SRpsl_cmd
print rm_SRbestpsl_cmd
log_command(cat_bestpsl_cmd)
log_command(rm_SR_cmd)
log_command(rm_SRpsl_cmd)
log_command(rm_SRbestpsl_cmd)
14 changes: 7 additions & 7 deletions src/main/python/exon_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,14 @@ def print_jun(chr_name,jun_start, jun_end):
leftpos=int(ls[1])+int(thickness[0])
rightpos=int(ls[2])-int(thickness[1])

name = ls[3]
num_indep=int(name.split('_')[1].split(']')[0])
num_uniq=int(name.split('](')[1].split('/')[0])
name = ls[3]
if ('[' in name):
num_indep=int(name.split('[')[1].split(']')[0].split('_')[1])
num_uniq=int(name.split('](')[1].split('/')[0])
#print leftpos, rightpos, num_indep, num_uniq

# print leftpos, rightpos, num_indep, num_uniq

if not (num_indep > 1 and num_uniq > 0):
continue
if not (num_indep > 1 and num_uniq > 0):
continue

locus = chr_name + ":" + str(leftpos) + "-" + str(rightpos)
if not ref_jun_dt.has_key(locus):
Expand Down
124 changes: 124 additions & 0 deletions src/main/python/filter_MLE_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/python

import sys, os, argparse

# Description: Filter an input file of expression values based
# on the negative data
# Pre: (Optional)
# --FPR
# an (FPR?) (float)
# Well, Its a float.
#
# We go this far into the sorted list of negative fractions.
# and set that as a threshold.
#
# So if you have a ton of negative examples with 1 expression
# your threshold will simply be 1
#
# --min_isoform_fraction (only one of FPR or min_isoform_fraction can be set)
# This is the minimum fraction of a gene's expression an isoform can be to get reported
# --min_isoform_rpkm
# lowest raw expression of an isoform required to output it.
# 1. a negative filename
# The format looks something like
# chr20:17580-19316.1 <tab> 29.9449 <tab> 32.2812
# I'd guess this a locus name followed by an isoform RPKM then a gene RPKM
#
# It seems values in this
# list include many zero values for the middle column, and sometimes
# higher values for gene locus.
#
# This list is converted into a list of fractions where we know
# what the fraction of the gene locus expression is composed of
# the isoform expression.
#
# 2. an input filename
# Another file with the same format as the first two
# locus <tab> isoform RPKM <tab> gene RPKM
# This file is currently generated by MLEout2tab.py which is downstream
# of MLE_MT.py
#
# 3. an output filename
# If the gene expression is greater than zero and the
# isoform expression/gene expression is higher than the threshold
# write the input file's line to this output file.
# So format is the same on the output as on the input.
#
# Post: Writes the output file as a filterd version of the input file
# It also prints to STDOUT some summary statistics about the threshold
# Modifies:
# Writes to output file, and STDOUT


def main():
parser = argparse.ArgumentParser(description="Determine and FPR and apply filtering thresholds for considered isoforms where relevant.")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--FPR',type=float,help="minimum FPR to output results for when specified")
#,help="minimum FPR to output results for.")
group.add_argument('--min_isoform_fraction',type=float,help="minimum isforom fraction of a gene to report.")
#,help="threshold for the minimum isoform fraction")
parser.add_argument('--min_isoform_rpkm',type=float,default=0,help="Threshold for an acceptable RPKM of isoform expression.")
parser.add_argument('neg_filename',help="File containing the isoform and gene rpkms for negative hits in the format of locus <tab> isoform rpkm <tab> gene rpkm")
parser.add_argument('input_filename',help="input in the same format of locus <tab> isoform FKMP <tab> gene RPKM.")
parser.add_argument('output_filename',help="output in the same format of locus <tab> isoform RPKM <tab> gene RPKM.")
args = parser.parse_args()

threshold = False

#Handle the special case of FPR = 1
# output all cases with isoform expression greater than zero and exit
if args.FPR:
if args.FPR >= 1:
# handles pecial case for ignoring FPR
sys.stderr.write("FPR of 1, so consider all isoforms with expression > 0")
of = open(args.output_filename,'w')
with open(args.input_filename) as inf:
for line in inf:
f = line.rstrip().split()
if float(f[1]) >= args.min_isoform_rpkm and float(f[1]) > 0:
of.write(line)
of.close()
return # finished
# Read through the negative file and calculate the FPR threshold
neg_perc_ls = []
neg_file = open(args.neg_filename,'r')
for line in neg_file:
ls = line.strip().split("\t")
ID = ls[0]
iso_exp = float(ls[1])
gene_exp = float(ls[2])
if gene_exp > 0:
neg_perc_ls.append(iso_exp/gene_exp)
else:
neg_perc_ls.append(0)

neg_file.close()

neg_perc_ls.sort()
neg_perc_ls.reverse()
n = int(args.FPR*len(neg_perc_ls))
threshold = neg_perc_ls[n]

if not threshold: threshold = args.min_isoform_fraction
output = open(args.output_filename,'w')
input_file = open(args.input_filename,'r')
i=0
I=0
for line in input_file:
I+=1
ls = line.strip().split("\t")
ID = ls[0]
iso_exp = float(ls[1])
gene_exp = float(ls[2])
if gene_exp > 0 and float(iso_exp/gene_exp) > threshold and iso_exp > args.min_isoform_rpkm:
output.write(line)
i+=1
input_file.close()
if args.FPR:
print "negative:",n,len(neg_perc_ls),float(n)/len(neg_perc_ls)
print "threshold:",threshold
print "input:",i,I,float(i)/I
output.close()

if __name__ == "__main__":
main()
7 changes: 3 additions & 4 deletions src/main/python/gmap_threading.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def GetPathAndName(pathfilename):
bestblat_SR_cmd = python_path + ' ' + bin_path2 + "blat_best.py " + output_path + SR_filename + '.psl 0 > ' + output_pathfilename
log_command(bestblat_SR_cmd)

rm_SRpsl_cmd = "rm " + output_path + SR_filename + '.psl '

print rm_SRpsl_cmd
log_command(rm_SRpsl_cmd)
#rm_SRpsl_cmd = "rm " + output_path + SR_filename + '.psl '
#print rm_SRpsl_cmd
#log_command(rm_SRpsl_cmd)
Binary file added src/main/python/idpcommon.pyc
Binary file not shown.
39 changes: 31 additions & 8 deletions src/main/python/isoform_construction_polyA3end_5cap.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def check_3end(readname):
#gene_name n
#a1-b1 a2-b2 a3-b3
#n1 n2 n3

"""
def parse_5cap_file(CAGE_filename):
if CAGE_filename == "kinfai":
return {}
Expand Down Expand Up @@ -83,6 +83,25 @@ def parse_5cap_file(CAGE_filename):
i +=1
CAGE_file.close()
return result
"""
def parse_5cap_file(CAGE_filename):
if CAGE_filename == "kinfai":
return {}
result = {}
CAGE_file = open(CAGE_filename,'r')
for line in CAGE_file:
ls = line.strip().split('\t')
gene_name = ls[-1]
chr_name = ls[0]
start = int(ls[1])
end = int(ls[2])
if not result.has_key(gene_name):
result[gene_name] = {}
Npeak = 1
result[gene_name][start]=[end, Npeak]

CAGE_file.close()
return result

def define_5end(cap_dt,Nthreshold):
fwd_5end_dt = {}
Expand All @@ -93,6 +112,7 @@ def define_5end(cap_dt,Nthreshold):
for start in cap_dt[gene_name]:
if cap_dt[gene_name][start][1] < Nthreshold:
continue
#print str(gene_name) + "\t" + str(fwd_5end_dt[gene_name])
fwd_5end_dt[gene_name].append(start)
rev_5end_dt[gene_name].append(cap_dt[gene_name][start][0])
return fwd_5end_dt,rev_5end_dt
Expand All @@ -115,16 +135,18 @@ def define_5end(cap_dt,Nthreshold):
leftpos=int(ls[1])+int(thickness[0])
rightpos=int(ls[2])-int(thickness[1])

name = ls[3]
num_indep=int(name.split('_')[1].split(']')[0])
num_uniq=int(name.split('](')[1].split('/')[0])
name = ls[3]
if ('[' in name):
num_indep=int(name.split('_')[-1].split(']')[0])
num_uniq=int(name.split('](')[1].split('/')[0])

if num_indep < 2 or num_uniq == 0:
continue

locus = chr_name + ':' + str(leftpos) + '-' +str(rightpos)
if not jun_strand_dt.has_key(locus):
jun_strand_dt[locus] = strand

if num_indep < 2 or num_uniq == 0:
continue

if not jun_start_dt.has_key(chr_name):
jun_start_dt[chr_name] = {}
Expand Down Expand Up @@ -405,8 +427,9 @@ def nonredundant_right_threeend(exon_boundary_ls,threeend_ls,gene_start,gene_end

cap_dt = parse_5cap_file(CAGE_filename)
fwd_5end_dt,rev_5end_dt = define_5end(cap_dt,1)
print "yue", cap_dt
print "kinfai", fwd_5end_dt,rev_5end_dt
#print "yue", cap_dt
#print "kinfai", fwd_5end_dt,rev_5end_dt

for chr_name in gene_region_dt:
if not ref_fiveend_dt.has_key(chr_name):
continue
Expand Down
57 changes: 57 additions & 0 deletions src/main/python/mapCageToExons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/python
import sys
from operator import itemgetter, attrgetter
import bisect
import re
import os
from binaidp import log_command

CAGE_margin = 100


### Main
########
def main():
# Read input parameters
tss_file_str = sys.argv[1]
exon_file_str = sys.argv[2]
ref_gpd_str = sys.argv[3]
I_ref5end_isoformconstruction = int(sys.argv[4])
output_file_str = sys.argv[5]


exon_file = open(exon_file_str, 'r')
exon_bed_file = open(exon_file_str + ".map2cage.bed", 'w')
for line in exon_file:
fields = line.strip().split("\t")
chrname = fields[0].split(":")[0]
for idx in range(1, len(fields)):
exon_bed_file.write('\t'.join([chrname, str(int(fields[idx]) - CAGE_margin), str(int(fields[idx]) + CAGE_margin), fields[0]]) + '\n')
exon_bed_file.close()
exon_file.close()

bedtools_cmnd = "bedtools intersect -wb -a " + tss_file_str + " -b " + exon_file_str + ".map2cage.bed > " + output_file_str + ".all"
os.system(bedtools_cmnd)

# Remove redundancy
if (I_ref5end_isoformconstruction):
ref_file = open(ref_gpd_str, 'r')
ref_bed_file = open(ref_gpd_str + ".map2cage.bed", 'w')
for line in ref_file:
fields = line.strip().split("\t")
if (fields[3] == "+"):
idx = 4
else:
idx = 5
chrname = fields[2]
ref_bed_file.write('\t'.join([chrname, str(int(fields[idx]) - CAGE_margin), str(int(fields[idx]) + CAGE_margin), fields[0]]) + '\n')
ref_bed_file.close()
ref_file.close()

bedtools_cmnd = "bedtools intersect -v -a " + output_file_str + ".all" + " -b " + ref_gpd_str + ".map2cage.bed > " + output_file_str

os.system(bedtools_cmnd)


if __name__ == '__main__':
main()
3 changes: 1 addition & 2 deletions src/main/python/mapEncodeTSStoRegions.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,7 @@ def main():
# Read input parameters
tss_file_str = sys.argv[1]
region_file_str = sys.argv[2]

output_file_str = 'encodeTSS_mapped_regions.txt'
output_file_str = sys.argv[3]

tss_file = open(tss_file_str, 'r')
[tss_dict] = parse_tss_file(tss_file)
Expand Down
6 changes: 3 additions & 3 deletions src/main/python/parseSAM.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,10 @@ def parse_read_line(line, READ_LEN):
seg_len = 0
read_len += int(cigar_list[2 * idx])
read_len_list.append(seg_len)
if (abs(read_len - READ_LEN) > read_len_margin):
read_len_list = []
else:
read_len_list = []

if (abs(read_len - READ_LEN) > read_len_margin):
read_len_list = []
return [read_name, read_start_pos, rname, read_len_list]
##########

Expand Down Expand Up @@ -379,6 +378,7 @@ def main():
end_time = datetime.now()
print 'Parsing reads started at ' + str(start_time)
print 'Parsing reads ended at ' + str(end_time)
print 'Number of map_read reads: (%d/%d = %0.2f)'%(num_reads,line_num,100*num_reads/float(line_num+0.00001))

print 'Generating output file.'

Expand Down
Loading