bioinform · marghoob · Feb 23, 2017 · Aug 26, 2014 · Aug 26, 2014 · Feb 3, 2016
diff --git a/src/main/python/MLE_regions.py b/src/main/python/MLE_regions.py
@@ -259,30 +259,27 @@ def main():
             str_print += (isoform_names[i].ljust(len_ljust))
         output_file.write(str_print + ('Total').ljust(20) + '\n')
 
-        if any(X_opt < 0):    
-            N = sum(N)
-            L = sum(L)
-            #[X_opt] = [N / ((L) * 1e-3) / (NN * 1e-6)]
-            X_opt = N / (L) / NN * 1e9
-            if (num_isoforms == 1):
-                X_opt_list = [[X_opt]]
-            else:
-                X_opt_list = ['NA'] * num_isoforms
-            total_exp_level = X_opt
+        if any(X_opt < 0):  
+            X_opt = 0
+            total_exp_level_print = 'NA'
+            X_opt_list_print = ['NA'] * num_isoforms
+            if (len(L) > 0):
+                #[X_opt] = [N / ((L) * 1e-3) / (NN * 1e-6)]
+                X_opt = sum(N) / sum(L) / (NN * 1e-9)
+                total_exp_level_print = str(round(X_opt, 4))
+                if (num_isoforms == 1):
+                    X_opt_list_print = [str(round(X_opt, 4))]
             #print 'Total Expression: ' + str(X_opt)
         else:
             #print('MLE computation: num_regions=%d, num_isoforms=%d, num_mapped_reads=%d' % (len(L), num_isoforms, int(sum(N))))
-            X_opt_list = X_opt.tolist() 
-            total_exp_level = sum(X_opt_list)
+            X_opt_list_print = [str(round(exp[0], 4)) for exp in X_opt.tolist()]
+            total_exp_level_print = str(round(sum(X_opt.tolist()), 4))
         str_print = ""
 
         for i in range(num_isoforms):
             len_ljust = max((len(isoform_names[i])/10 + 1) * 10, 20)
-            if (X_opt_list[i] == 'NA'):
-                str_print += (str(X_opt_list[i]).ljust(len_ljust))
-            else:
-                str_print += (str(round(X_opt_list[i][0], 4)).ljust(len_ljust))
-        output_file.write(str_print + str(round(total_exp_level, 4)).ljust(20))
+            str_print += (X_opt_list_print[i]).ljust(len_ljust)
+        output_file.write(str_print + total_exp_level_print.ljust(20))
         output_file.write('\n')
 
     input_file.close()

diff --git a/src/main/python/blat_threading.py b/src/main/python/blat_threading.py
@@ -91,14 +91,18 @@ def GetPathAndName(pathfilename):
 cat_bestpsl_cmd = "cat "
 rm_SR_cmd = "rm "
 rm_SRpsl_cmd = "rm "
+rm_SRbestpsl_cmd = "rm "
 for ext in ext_ls:
     cat_bestpsl_cmd = cat_bestpsl_cmd + output_path + SR_filename + ext + ".bestpsl "
     rm_SR_cmd = rm_SR_cmd + output_path + SR_filename + ext + ' '
     rm_SRpsl_cmd = rm_SRpsl_cmd + output_path + SR_filename + ext + '.psl '
+    rm_SRbestpsl_cmd = rm_SRpsl_cmd + output_path + SR_filename + ext + '.bestpsl '
 cat_bestpsl_cmd = cat_bestpsl_cmd + " > " + output_pathfilename    
 print cat_bestpsl_cmd
 print rm_SR_cmd
 print rm_SRpsl_cmd
+print rm_SRbestpsl_cmd
 log_command(cat_bestpsl_cmd)
 log_command(rm_SR_cmd)
 log_command(rm_SRpsl_cmd)
+log_command(rm_SRbestpsl_cmd)
diff --git a/src/main/python/exon_construction.py b/src/main/python/exon_construction.py
@@ -125,14 +125,14 @@ def print_jun(chr_name,jun_start, jun_end):
     leftpos=int(ls[1])+int(thickness[0])
     rightpos=int(ls[2])-int(thickness[1])
 
-    name = ls[3]  
-    num_indep=int(name.split('_')[1].split(']')[0])
-    num_uniq=int(name.split('](')[1].split('/')[0])
+    name = ls[3]
+    if ('[' in name):
+        num_indep=int(name.split('[')[1].split(']')[0].split('_')[1])
+        num_uniq=int(name.split('](')[1].split('/')[0])
+        #print leftpos, rightpos, num_indep, num_uniq
 
-#    print leftpos, rightpos, num_indep, num_uniq
-
-    if not (num_indep > 1 and num_uniq > 0):
-        continue
+        if not (num_indep > 1 and num_uniq > 0):
+            continue
 
     locus = chr_name + ":" + str(leftpos) + "-" + str(rightpos)
     if not ref_jun_dt.has_key(locus):

diff --git a/src/main/python/filter_MLE_table.py b/src/main/python/filter_MLE_table.py
@@ -0,0 +1,124 @@
+#!/usr/bin/python
+
+import sys, os, argparse
+
+# Description: Filter an input file of expression values based 
+#              on the negative data
+# Pre: (Optional)
+#      --FPR
+#         an (FPR?) (float)
+#         Well, Its a float.
+#         
+#         We go this far into the sorted list of negative fractions.
+#         and set that as a threshold.  
+#
+#         So if you have a ton of negative examples with 1 expression
+#         your threshold will simply be 1
+#
+#      --min_isoform_fraction (only one of FPR or min_isoform_fraction can be set)
+#          This is the minimum fraction of a gene's expression an isoform can be to get reported
+#      --min_isoform_rpkm 
+#          lowest raw expression of an isoform required to output it.
+#     1. a negative filename
+#        The format looks something like 
+#        chr20:17580-19316.1 <tab> 29.9449 <tab> 32.2812
+#        I'd guess this a locus name followed by an isoform RPKM then a gene RPKM
+#
+#        It seems values in this
+#        list include many zero values for the middle column, and sometimes
+#        higher values for gene locus. 
+#
+#        This list is converted into a list of fractions where we know 
+#        what the fraction of the gene locus expression is composed of
+#        the isoform expression. 
+#
+#      2. an input filename
+#        Another file with the same format as the first two
+#        locus <tab> isoform RPKM <tab> gene RPKM
+#        This file is currently generated by MLEout2tab.py which is downstream
+#        of MLE_MT.py
+#         
+#      3. an output filename
+#        If the gene expression is greater than zero and the 
+#           isoform expression/gene expression is higher than the threshold
+#           write the input file's line to this output file.
+#        So format is the same on the output as on the input.
+#
+# Post:  Writes the output file as a filterd version of the input file
+#        It also prints to STDOUT some summary statistics about the threshold
+# Modifies:
+#        Writes to output file, and STDOUT
+
+
+def main():
+  parser = argparse.ArgumentParser(description="Determine and FPR and apply filtering thresholds for considered isoforms where relevant.")
+  group = parser.add_mutually_exclusive_group(required=True)
+  group.add_argument('--FPR',type=float,help="minimum FPR to output results for when specified")
+  #,help="minimum FPR to output results for.") 
+  group.add_argument('--min_isoform_fraction',type=float,help="minimum isforom fraction of a gene to report.")
+  #,help="threshold for the minimum isoform fraction")
+  parser.add_argument('--min_isoform_rpkm',type=float,default=0,help="Threshold for an acceptable RPKM of isoform expression.") 
+  parser.add_argument('neg_filename',help="File containing the isoform and gene rpkms for negative hits in the format of locus <tab> isoform rpkm <tab> gene rpkm")
+  parser.add_argument('input_filename',help="input in the same format of locus <tab> isoform FKMP <tab> gene RPKM.")
+  parser.add_argument('output_filename',help="output in the same format of locus <tab> isoform RPKM <tab> gene RPKM.")
+  args = parser.parse_args()
+
+  threshold = False
+
+  #Handle the special case of FPR = 1
+  #  output all cases with isoform expression greater than zero and exit
+  if args.FPR:
+    if args.FPR >= 1:
+      # handles pecial case for ignoring FPR
+      sys.stderr.write("FPR of 1, so consider all isoforms with expression > 0")
+      of = open(args.output_filename,'w')
+      with open(args.input_filename) as inf:
+        for line in inf:
+          f = line.rstrip().split() 
+          if float(f[1]) >= args.min_isoform_rpkm and float(f[1]) > 0: 
+            of.write(line)
+      of.close()
+      return # finished
+    # Read through the negative file and calculate the FPR threshold
+    neg_perc_ls = []
+    neg_file = open(args.neg_filename,'r')
+    for line in neg_file:
+        ls = line.strip().split("\t")
+        ID = ls[0]
+        iso_exp = float(ls[1])
+        gene_exp = float(ls[2])
+        if gene_exp > 0:
+          neg_perc_ls.append(iso_exp/gene_exp)
+        else:
+          neg_perc_ls.append(0)
+
+    neg_file.close()
+
+    neg_perc_ls.sort()
+    neg_perc_ls.reverse()
+    n = int(args.FPR*len(neg_perc_ls))
+    threshold = neg_perc_ls[n]
+
+  if not threshold: threshold = args.min_isoform_fraction
+  output = open(args.output_filename,'w')
+  input_file = open(args.input_filename,'r')
+  i=0
+  I=0
+  for line in input_file:
+      I+=1
+      ls = line.strip().split("\t")
+      ID = ls[0]
+      iso_exp = float(ls[1])
+      gene_exp = float(ls[2])
+      if gene_exp > 0 and float(iso_exp/gene_exp) > threshold and iso_exp > args.min_isoform_rpkm:
+          output.write(line)
+          i+=1
+  input_file.close()
+  if args.FPR:
+    print "negative:",n,len(neg_perc_ls),float(n)/len(neg_perc_ls)
+    print "threshold:",threshold
+    print "input:",i,I,float(i)/I
+  output.close()
+
+if __name__ == "__main__":
+  main()
diff --git a/src/main/python/gmap_threading.py b/src/main/python/gmap_threading.py
@@ -51,7 +51,6 @@ def GetPathAndName(pathfilename):
 bestblat_SR_cmd = python_path + ' ' + bin_path2 + "blat_best.py " + output_path + SR_filename + '.psl 0 > ' + output_pathfilename
 log_command(bestblat_SR_cmd)
 
-rm_SRpsl_cmd = "rm " + output_path + SR_filename + '.psl '
-
-print rm_SRpsl_cmd
-log_command(rm_SRpsl_cmd)
+#rm_SRpsl_cmd = "rm " + output_path + SR_filename + '.psl '
+#print rm_SRpsl_cmd
+#log_command(rm_SRpsl_cmd)
diff --git a/src/main/python/idpcommon.pyc b/src/main/python/idpcommon.pyc
diff --git a/src/main/python/isoform_construction_polyA3end_5cap.py b/src/main/python/isoform_construction_polyA3end_5cap.py
@@ -50,7 +50,7 @@ def check_3end(readname):
 #gene_name	n
 #a1-b1	a2-b2	a3-b3
 #n1	n2	n3
-
+"""
 def parse_5cap_file(CAGE_filename):
     if CAGE_filename == "kinfai":
         return {}
@@ -83,6 +83,25 @@ def parse_5cap_file(CAGE_filename):
             i +=1
     CAGE_file.close()
     return result
+"""
+def parse_5cap_file(CAGE_filename):
+    if CAGE_filename == "kinfai":
+        return {}
+    result = {}
+    CAGE_file = open(CAGE_filename,'r')
+    for line in CAGE_file:
+        ls = line.strip().split('\t')
+        gene_name = ls[-1]
+        chr_name = ls[0]
+        start = int(ls[1])
+        end = int(ls[2])
+        if not result.has_key(gene_name):
+            result[gene_name] = {}
+        Npeak = 1
+        result[gene_name][start]=[end, Npeak]
+
+    CAGE_file.close()
+    return result
 
 def define_5end(cap_dt,Nthreshold):
     fwd_5end_dt = {}
@@ -93,6 +112,7 @@ def define_5end(cap_dt,Nthreshold):
         for start in cap_dt[gene_name]:
             if cap_dt[gene_name][start][1] < Nthreshold:
                 continue
+            #print str(gene_name) + "\t" + str(fwd_5end_dt[gene_name])
             fwd_5end_dt[gene_name].append(start)
             rev_5end_dt[gene_name].append(cap_dt[gene_name][start][0])
     return fwd_5end_dt,rev_5end_dt
@@ -115,16 +135,18 @@ def define_5end(cap_dt,Nthreshold):
     leftpos=int(ls[1])+int(thickness[0])
     rightpos=int(ls[2])-int(thickness[1])
 
-    name = ls[3]  
-    num_indep=int(name.split('_')[1].split(']')[0])
-    num_uniq=int(name.split('](')[1].split('/')[0])
+    name = ls[3]
+    if ('[' in name):
+        num_indep=int(name.split('_')[-1].split(']')[0])
+        num_uniq=int(name.split('](')[1].split('/')[0])
+
+        if num_indep < 2 or num_uniq == 0:
+            continue
 
     locus = chr_name + ':' + str(leftpos) + '-' +str(rightpos)
     if not jun_strand_dt.has_key(locus):
         jun_strand_dt[locus] = strand
 
-    if num_indep < 2 or num_uniq == 0:
-        continue
 
     if not jun_start_dt.has_key(chr_name):
         jun_start_dt[chr_name] = {}
@@ -405,8 +427,9 @@ def nonredundant_right_threeend(exon_boundary_ls,threeend_ls,gene_start,gene_end
 
 cap_dt = parse_5cap_file(CAGE_filename)
 fwd_5end_dt,rev_5end_dt = define_5end(cap_dt,1)
-print "yue", cap_dt
-print "kinfai",  fwd_5end_dt,rev_5end_dt
+#print "yue", cap_dt
+#print "kinfai",  fwd_5end_dt,rev_5end_dt
+
 for chr_name in gene_region_dt:
     if not ref_fiveend_dt.has_key(chr_name):
         continue

diff --git a/src/main/python/mapCageToExons.py b/src/main/python/mapCageToExons.py
@@ -0,0 +1,57 @@
+#!/usr/bin/python
+import sys
+from operator import itemgetter, attrgetter
+import bisect
+import re
+import os
+from binaidp import log_command
+
+CAGE_margin = 100
+
+
+### Main
+########
+def main():
+    # Read input parameters
+    tss_file_str = sys.argv[1]
+    exon_file_str = sys.argv[2]
+    ref_gpd_str = sys.argv[3]
+    I_ref5end_isoformconstruction = int(sys.argv[4])
+    output_file_str = sys.argv[5]
+
+
+    exon_file = open(exon_file_str, 'r')
+    exon_bed_file = open(exon_file_str + ".map2cage.bed", 'w')
+    for line in exon_file:
+        fields = line.strip().split("\t")
+        chrname = fields[0].split(":")[0]
+        for idx in range(1, len(fields)):
+            exon_bed_file.write('\t'.join([chrname, str(int(fields[idx]) - CAGE_margin), str(int(fields[idx]) + CAGE_margin), fields[0]]) + '\n')
+    exon_bed_file.close()
+    exon_file.close()
+
+    bedtools_cmnd = "bedtools intersect -wb -a " + tss_file_str + " -b " + exon_file_str + ".map2cage.bed > " + output_file_str + ".all"
+    os.system(bedtools_cmnd)
+
+    # Remove redundancy
+    if (I_ref5end_isoformconstruction):
+        ref_file = open(ref_gpd_str, 'r')
+        ref_bed_file = open(ref_gpd_str + ".map2cage.bed", 'w')
+        for line in ref_file:
+            fields = line.strip().split("\t")
+            if (fields[3] == "+"):
+                idx = 4
+            else:
+                idx = 5
+            chrname = fields[2]
+            ref_bed_file.write('\t'.join([chrname, str(int(fields[idx]) - CAGE_margin), str(int(fields[idx]) + CAGE_margin), fields[0]]) + '\n')
+        ref_bed_file.close()
+        ref_file.close()
+
+        bedtools_cmnd = "bedtools intersect -v -a " + output_file_str + ".all" + " -b " + ref_gpd_str + ".map2cage.bed > " + output_file_str 
+
+        os.system(bedtools_cmnd)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/main/python/mapEncodeTSStoRegions.py b/src/main/python/mapEncodeTSStoRegions.py
@@ -189,8 +189,7 @@ def main():
     # Read input parameters
     tss_file_str = sys.argv[1]
     region_file_str = sys.argv[2]
-
-    output_file_str = 'encodeTSS_mapped_regions.txt'
+    output_file_str = sys.argv[3]
 
     tss_file = open(tss_file_str, 'r')
     [tss_dict] = parse_tss_file(tss_file)

diff --git a/src/main/python/parseSAM.py b/src/main/python/parseSAM.py
@@ -113,11 +113,10 @@ def parse_read_line(line, READ_LEN):
                     seg_len = 0
                 read_len +=  int(cigar_list[2 * idx])  
         read_len_list.append(seg_len)                
+        if (abs(read_len - READ_LEN) > read_len_margin):
+            read_len_list = []
     else:
         read_len_list = []
-
-    if (abs(read_len - READ_LEN) > read_len_margin):
-        read_len_list = []
     return [read_name, read_start_pos, rname, read_len_list]
 ##########
 
@@ -379,6 +378,7 @@ def main():
     end_time = datetime.now()
     print 'Parsing reads started at ' + str(start_time)
     print 'Parsing reads ended at   ' + str(end_time)
+    print 'Number of map_read reads: (%d/%d = %0.2f)'%(num_reads,line_num,100*num_reads/float(line_num+0.00001))
 
     print 'Generating output file.'