Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merged with v0.1.9 and made some changes to work with other aligners #8

Merged
merged 13 commits into from
Feb 23, 2017
31 changes: 14 additions & 17 deletions src/main/python/MLE_regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,30 +259,27 @@ def main():
str_print += (isoform_names[i].ljust(len_ljust))
output_file.write(str_print + ('Total').ljust(20) + '\n')

if any(X_opt < 0):
N = sum(N)
L = sum(L)
#[X_opt] = [N / ((L) * 1e-3) / (NN * 1e-6)]
X_opt = N / (L) / NN * 1e9
if (num_isoforms == 1):
X_opt_list = [[X_opt]]
else:
X_opt_list = ['NA'] * num_isoforms
total_exp_level = X_opt
if any(X_opt < 0):
X_opt = 0
total_exp_level_print = 'NA'
X_opt_list_print = ['NA'] * num_isoforms
if (len(L) > 0):
#[X_opt] = [N / ((L) * 1e-3) / (NN * 1e-6)]
X_opt = sum(N) / sum(L) / (NN * 1e-9)
total_exp_level_print = str(round(X_opt, 4))
if (num_isoforms == 1):
X_opt_list_print = [str(round(X_opt, 4))]
#print 'Total Expression: ' + str(X_opt)
else:
#print('MLE computation: num_regions=%d, num_isoforms=%d, num_mapped_reads=%d' % (len(L), num_isoforms, int(sum(N))))
X_opt_list = X_opt.tolist()
total_exp_level = sum(X_opt_list)
X_opt_list_print = [str(round(exp[0], 4)) for exp in X_opt.tolist()]
total_exp_level_print = str(round(sum(X_opt.tolist()), 4))
str_print = ""

for i in range(num_isoforms):
len_ljust = max((len(isoform_names[i])/10 + 1) * 10, 20)
if (X_opt_list[i] == 'NA'):
str_print += (str(X_opt_list[i]).ljust(len_ljust))
else:
str_print += (str(round(X_opt_list[i][0], 4)).ljust(len_ljust))
output_file.write(str_print + str(round(total_exp_level, 4)).ljust(20))
str_print += (X_opt_list_print[i]).ljust(len_ljust)
output_file.write(str_print + total_exp_level_print.ljust(20))
output_file.write('\n')

input_file.close()
Expand Down
Binary file removed src/main/python/binaidp.pyc
Binary file not shown.
4 changes: 4 additions & 0 deletions src/main/python/blat_threading.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,18 @@ def GetPathAndName(pathfilename):
cat_bestpsl_cmd = "cat "
rm_SR_cmd = "rm "
rm_SRpsl_cmd = "rm "
rm_SRbestpsl_cmd = "rm "
for ext in ext_ls:
cat_bestpsl_cmd = cat_bestpsl_cmd + output_path + SR_filename + ext + ".bestpsl "
rm_SR_cmd = rm_SR_cmd + output_path + SR_filename + ext + ' '
rm_SRpsl_cmd = rm_SRpsl_cmd + output_path + SR_filename + ext + '.psl '
rm_SRbestpsl_cmd = rm_SRpsl_cmd + output_path + SR_filename + ext + '.bestpsl '
cat_bestpsl_cmd = cat_bestpsl_cmd + " > " + output_pathfilename
print cat_bestpsl_cmd
print rm_SR_cmd
print rm_SRpsl_cmd
print rm_SRbestpsl_cmd
log_command(cat_bestpsl_cmd)
log_command(rm_SR_cmd)
log_command(rm_SRpsl_cmd)
log_command(rm_SRbestpsl_cmd)
14 changes: 7 additions & 7 deletions src/main/python/exon_construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,14 @@ def print_jun(chr_name,jun_start, jun_end):
leftpos=int(ls[1])+int(thickness[0])
rightpos=int(ls[2])-int(thickness[1])

name = ls[3]
num_indep=int(name.split('_')[1].split(']')[0])
num_uniq=int(name.split('](')[1].split('/')[0])
name = ls[3]
if ('[' in name):
num_indep=int(name.split('[')[1].split(']')[0].split('_')[1])
num_uniq=int(name.split('](')[1].split('/')[0])
#print leftpos, rightpos, num_indep, num_uniq

# print leftpos, rightpos, num_indep, num_uniq

if not (num_indep > 1 and num_uniq > 0):
continue
if not (num_indep > 1 and num_uniq > 0):
continue

locus = chr_name + ":" + str(leftpos) + "-" + str(rightpos)
if not ref_jun_dt.has_key(locus):
Expand Down
7 changes: 3 additions & 4 deletions src/main/python/gmap_threading.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def GetPathAndName(pathfilename):
bestblat_SR_cmd = python_path + ' ' + bin_path2 + "blat_best.py " + output_path + SR_filename + '.psl 0 > ' + output_pathfilename
log_command(bestblat_SR_cmd)

rm_SRpsl_cmd = "rm " + output_path + SR_filename + '.psl '

print rm_SRpsl_cmd
log_command(rm_SRpsl_cmd)
#rm_SRpsl_cmd = "rm " + output_path + SR_filename + '.psl '
#print rm_SRpsl_cmd
#log_command(rm_SRpsl_cmd)
Binary file added src/main/python/idpcommon.pyc
Binary file not shown.
39 changes: 31 additions & 8 deletions src/main/python/isoform_construction_polyA3end_5cap.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def check_3end(readname):
#gene_name n
#a1-b1 a2-b2 a3-b3
#n1 n2 n3

"""
def parse_5cap_file(CAGE_filename):
if CAGE_filename == "kinfai":
return {}
Expand Down Expand Up @@ -83,6 +83,25 @@ def parse_5cap_file(CAGE_filename):
i +=1
CAGE_file.close()
return result
"""
def parse_5cap_file(CAGE_filename):
if CAGE_filename == "kinfai":
return {}
result = {}
CAGE_file = open(CAGE_filename,'r')
for line in CAGE_file:
ls = line.strip().split('\t')
gene_name = ls[-1]
chr_name = ls[0]
start = int(ls[1])
end = int(ls[2])
if not result.has_key(gene_name):
result[gene_name] = {}
Npeak = 1
result[gene_name][start]=[end, Npeak]

CAGE_file.close()
return result

def define_5end(cap_dt,Nthreshold):
fwd_5end_dt = {}
Expand All @@ -93,6 +112,7 @@ def define_5end(cap_dt,Nthreshold):
for start in cap_dt[gene_name]:
if cap_dt[gene_name][start][1] < Nthreshold:
continue
#print str(gene_name) + "\t" + str(fwd_5end_dt[gene_name])
fwd_5end_dt[gene_name].append(start)
rev_5end_dt[gene_name].append(cap_dt[gene_name][start][0])
return fwd_5end_dt,rev_5end_dt
Expand All @@ -115,16 +135,18 @@ def define_5end(cap_dt,Nthreshold):
leftpos=int(ls[1])+int(thickness[0])
rightpos=int(ls[2])-int(thickness[1])

name = ls[3]
num_indep=int(name.split('_')[1].split(']')[0])
num_uniq=int(name.split('](')[1].split('/')[0])
name = ls[3]
if ('[' in name):
num_indep=int(name.split('_')[-1].split(']')[0])
num_uniq=int(name.split('](')[1].split('/')[0])

if num_indep < 2 or num_uniq == 0:
continue

locus = chr_name + ':' + str(leftpos) + '-' +str(rightpos)
if not jun_strand_dt.has_key(locus):
jun_strand_dt[locus] = strand

if num_indep < 2 or num_uniq == 0:
continue

if not jun_start_dt.has_key(chr_name):
jun_start_dt[chr_name] = {}
Expand Down Expand Up @@ -405,8 +427,9 @@ def nonredundant_right_threeend(exon_boundary_ls,threeend_ls,gene_start,gene_end

cap_dt = parse_5cap_file(CAGE_filename)
fwd_5end_dt,rev_5end_dt = define_5end(cap_dt,1)
print "yue", cap_dt
print "kinfai", fwd_5end_dt,rev_5end_dt
#print "yue", cap_dt
#print "kinfai", fwd_5end_dt,rev_5end_dt

for chr_name in gene_region_dt:
if not ref_fiveend_dt.has_key(chr_name):
continue
Expand Down
57 changes: 57 additions & 0 deletions src/main/python/mapCageToExons.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/python
import sys
from operator import itemgetter, attrgetter
import bisect
import re
import os
from binaidp import log_command

CAGE_margin = 100


### Main
########
def main():
# Read input parameters
tss_file_str = sys.argv[1]
exon_file_str = sys.argv[2]
ref_gpd_str = sys.argv[3]
I_ref5end_isoformconstruction = int(sys.argv[4])
output_file_str = sys.argv[5]


exon_file = open(exon_file_str, 'r')
exon_bed_file = open(exon_file_str + ".map2cage.bed", 'w')
for line in exon_file:
fields = line.strip().split("\t")
chrname = fields[0].split(":")[0]
for idx in range(1, len(fields)):
exon_bed_file.write('\t'.join([chrname, str(int(fields[idx]) - CAGE_margin), str(int(fields[idx]) + CAGE_margin), fields[0]]) + '\n')
exon_bed_file.close()
exon_file.close()

bedtools_cmnd = "bedtools intersect -wb -a " + tss_file_str + " -b " + exon_file_str + ".map2cage.bed > " + output_file_str + ".all"
os.system(bedtools_cmnd)

# Remove redundancy
if (I_ref5end_isoformconstruction):
ref_file = open(ref_gpd_str, 'r')
ref_bed_file = open(ref_gpd_str + ".map2cage.bed", 'w')
for line in ref_file:
fields = line.strip().split("\t")
if (fields[3] == "+"):
idx = 4
else:
idx = 5
chrname = fields[2]
ref_bed_file.write('\t'.join([chrname, str(int(fields[idx]) - CAGE_margin), str(int(fields[idx]) + CAGE_margin), fields[0]]) + '\n')
ref_bed_file.close()
ref_file.close()

bedtools_cmnd = "bedtools intersect -v -a " + output_file_str + ".all" + " -b " + ref_gpd_str + ".map2cage.bed > " + output_file_str

os.system(bedtools_cmnd)


if __name__ == '__main__':
main()
3 changes: 1 addition & 2 deletions src/main/python/mapEncodeTSStoRegions.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,7 @@ def main():
# Read input parameters
tss_file_str = sys.argv[1]
region_file_str = sys.argv[2]

output_file_str = 'encodeTSS_mapped_regions.txt'
output_file_str = sys.argv[3]

tss_file = open(tss_file_str, 'r')
[tss_dict] = parse_tss_file(tss_file)
Expand Down
1 change: 0 additions & 1 deletion src/main/python/parseSAM.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ def parse_read_line(line, READ_LEN):
read_len_list = []
else:
read_len_list = []

return [read_name, read_start_pos, rname, read_len_list]
##########

Expand Down
Loading