Merge remote-tracking branch 'origin/new_from_ID_0.1.9' into merge_wi…

…th_pagah_latest_changes Conflicts: src/main/python/runIDP.py src/main/python/select_FPR.py
bioinform · Feb 12, 2016 · 55eb135 · 55eb135
2 parents bbd15c8 + 53703e0
commit 55eb135
Show file tree

Hide file tree

Showing 11 changed files with 376 additions and 109 deletions.
diff --git a/pom.xml b/pom.xml
@@ -0,0 +1,9 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+
+	<artifactId>idp</artifactId>
+	<groupId>com.github.bioinform.idp</groupId>
+	<name>idp</name>
+	<version>0.1.1-SNAPSHOT</version>
+</project>
diff --git a/src/main/python/MLE_MT.py b/src/main/python/MLE_MT.py
@@ -3,7 +3,7 @@
 import sys
 import os
 import threading
-from binaidp import log_command
+from idpcommon import log_command
 
 def GetPathAndName(pathfilename):
     ls=pathfilename.split('/')
@@ -22,7 +22,7 @@ def main():
     penalty_filename = ''
     if (len(sys.argv) > 5):
         penalty_filename = sys.argv[5]
-    
+
     input_file = open(input_filename, 'r' )
     header = input_file.readline()
     input_files = []
@@ -31,28 +31,27 @@ def main():
         input_files.append(open(input_filename + '.' + str(thread_idx), 'w'))
         input_files[-1].write(header)
 
-
     thread_idx = 0
     while True:
         line = input_file.readline()
         if line == "": 
             break
         num_isoforms = int(line.split()[1])
         input_files[thread_idx].write(line)
-        
+
         for i in range(6):
             input_files[thread_idx].write(input_file.readline())
         for i in range(num_isoforms):
             input_files[thread_idx].write(input_file.readline())
         for i in range(2):
             input_files[thread_idx].write(input_file.readline())
-    
+
         thread_idx = (thread_idx + 1) % num_threads
-        
+
     for thread_idx in range(num_threads):
         input_files[thread_idx].close()
     input_file.close()
-    
+
     ##############################
     threads_list = []
     for thread_idx in range(num_threads):
@@ -65,18 +64,16 @@ def main():
 
     for thread in threads_list:
         thread.join()
-        
+
     cat_cmnd = 'cat '
     rm_cmnd = "rm "
     for thread_idx in range(num_threads):
         cat_cmnd += output_filename + '.' + str(thread_idx) + " "
         rm_cmnd += output_filename + '.' + str(thread_idx)  + " " + input_filename + '.' + str(thread_idx) + " "
-        
+
     cat_cmnd += ' > ' + output_filename
     log_command(cat_cmnd)
     log_command(rm_cmnd)
-
-
 
 if __name__ == '__main__':
     main()
diff --git a/src/main/python/binaidp.pyc b/src/main/python/binaidp.pyc
diff --git a/src/main/python/blat_threading.py b/src/main/python/blat_threading.py
@@ -4,7 +4,7 @@
 import os
 import threading
 import string
-from binaidp import log_command
+from idpcommon import log_command
 
 ################################################################################
 
@@ -16,7 +16,7 @@
     blat_option = ' '.join(sys.argv[4:-2])
     SR_pathfilename = sys.argv[-2]
     output_pathfilename = sys.argv[-1]
-    
+
 else:
     print("usage: python2.6 blat_threading.py p -t=DNA -q=DNA ~/annotations/hg19/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa /usr/bin/python intact_SM.fa intact_SM.fa.psl")
     print("or ./blat_threading.py p -t=DNA -q=DNA ~/annotations/hg19/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa /usr/bin/python intact_SM.fa intact_SM.fa.psl")

diff --git a/src/main/python/filter_MLE_table.py b/src/main/python/filter_MLE_table.py
@@ -0,0 +1,124 @@
+#!/usr/bin/python
+
+import sys, os, argparse
+
+# Description: Filter an input file of expression values based 
+#              on the negative data
+# Pre: (Optional)
+#      --FPR
+#         an (FPR?) (float)
+#         Well, Its a float.
+#         
+#         We go this far into the sorted list of negative fractions.
+#         and set that as a threshold.  
+#
+#         So if you have a ton of negative examples with 1 expression
+#         your threshold will simply be 1
+#
+#      --min_isoform_fraction (only one of FPR or min_isoform_fraction can be set)
+#          This is the minimum fraction of a gene's expression an isoform can be to get reported
+#      --min_isoform_rpkm 
+#          lowest raw expression of an isoform required to output it.
+#     1. a negative filename
+#        The format looks something like 
+#        chr20:17580-19316.1 <tab> 29.9449 <tab> 32.2812
+#        I'd guess this a locus name followed by an isoform RPKM then a gene RPKM
+#
+#        It seems values in this
+#        list include many zero values for the middle column, and sometimes
+#        higher values for gene locus. 
+#
+#        This list is converted into a list of fractions where we know 
+#        what the fraction of the gene locus expression is composed of
+#        the isoform expression. 
+#
+#      2. an input filename
+#        Another file with the same format as the first two
+#        locus <tab> isoform RPKM <tab> gene RPKM
+#        This file is currently generated by MLEout2tab.py which is downstream
+#        of MLE_MT.py
+#         
+#      3. an output filename
+#        If the gene expression is greater than zero and the 
+#           isoform expression/gene expression is higher than the threshold
+#           write the input file's line to this output file.
+#        So format is the same on the output as on the input.
+#
+# Post:  Writes the output file as a filterd version of the input file
+#        It also prints to STDOUT some summary statistics about the threshold
+# Modifies:
+#        Writes to output file, and STDOUT
+
+
+def main():
+  parser = argparse.ArgumentParser(description="Determine and FPR and apply filtering thresholds for considered isoforms where relevant.")
+  group = parser.add_mutually_exclusive_group(required=True)
+  group.add_argument('--FPR',type=float,help="minimum FPR to output results for when specified")
+  #,help="minimum FPR to output results for.") 
+  group.add_argument('--min_isoform_fraction',type=float,help="minimum isforom fraction of a gene to report.")
+  #,help="threshold for the minimum isoform fraction")
+  parser.add_argument('--min_isoform_rpkm',type=float,default=0,help="Threshold for an acceptable RPKM of isoform expression.") 
+  parser.add_argument('neg_filename',help="File containing the isoform and gene rpkms for negative hits in the format of locus <tab> isoform rpkm <tab> gene rpkm")
+  parser.add_argument('input_filename',help="input in the same format of locus <tab> isoform FKMP <tab> gene RPKM.")
+  parser.add_argument('output_filename',help="output in the same format of locus <tab> isoform RPKM <tab> gene RPKM.")
+  args = parser.parse_args()
+
+  threshold = False
+
+  #Handle the special case of FPR = 1
+  #  output all cases with isoform expression greater than zero and exit
+  if args.FPR:
+    if args.FPR >= 1:
+      # handles pecial case for ignoring FPR
+      sys.stderr.write("FPR of 1, so consider all isoforms with expression > 0")
+      of = open(args.output_filename,'w')
+      with open(args.input_filename) as inf:
+        for line in inf:
+          f = line.rstrip().split() 
+          if float(f[1]) >= args.min_isoform_rpkm and float(f[1]) > 0: 
+            of.write(line)
+      of.close()
+      return # finished
+    # Read through the negative file and calculate the FPR threshold
+    neg_perc_ls = []
+    neg_file = open(args.neg_filename,'r')
+    for line in neg_file:
+        ls = line.strip().split("\t")
+        ID = ls[0]
+        iso_exp = float(ls[1])
+        gene_exp = float(ls[2])
+        if gene_exp > 0:
+          neg_perc_ls.append(iso_exp/gene_exp)
+        else:
+          neg_perc_ls.append(0)
+
+    neg_file.close()
+
+    neg_perc_ls.sort()
+    neg_perc_ls.reverse()
+    n = int(args.FPR*len(neg_perc_ls))
+    threshold = neg_perc_ls[n]
+
+  if not threshold: threshold = args.min_isoform_fraction
+  output = open(args.output_filename,'w')
+  input_file = open(args.input_filename,'r')
+  i=0
+  I=0
+  for line in input_file:
+      I+=1
+      ls = line.strip().split("\t")
+      ID = ls[0]
+      iso_exp = float(ls[1])
+      gene_exp = float(ls[2])
+      if gene_exp > 0 and float(iso_exp/gene_exp) > threshold and iso_exp > args.min_isoform_rpkm:
+          output.write(line)
+          i+=1
+  input_file.close()
+  if args.FPR:
+    print "negative:",n,len(neg_perc_ls),float(n)/len(neg_perc_ls)
+    print "threshold:",threshold
+    print "input:",i,I,float(i)/I
+  output.close()
+
+if __name__ == "__main__":
+  main()
diff --git a/src/main/python/gmap_threading.py b/src/main/python/gmap_threading.py
@@ -3,7 +3,7 @@
 import struct
 import os
 import string
-from binaidp import log_command
+from idpcommon import log_command
 
 ################################################################################
 

diff --git a/src/main/python/binaidp.py → src/main/python/idpcommon.py b/src/main/python/binaidp.py → src/main/python/idpcommon.py
@@ -1,5 +1,4 @@
 #!/usr/bin/python
-# Filename: binaidp.py
 
 import os;
 
@@ -14,5 +13,3 @@ def log_command(command, ignorefail=False):
 
 def log_print(print_str):
     os.system("echo " + str(print_str))
-
-# End of binaidp.py
diff --git a/src/main/python/parseSAM_MT.py b/src/main/python/parseSAM_MT.py
@@ -3,7 +3,7 @@
 import sys
 import os
 import threading
-from binaidp import log_command
+from idpcommon import log_command
 
 def GetPathAndName(pathfilename):
     ls=pathfilename.split('/')