Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/new_from_ID_0.1.9' into merge_wi…
Browse files Browse the repository at this point in the history
…th_pagah_latest_changes

Conflicts:
	src/main/python/runIDP.py
	src/main/python/select_FPR.py
  • Loading branch information
Mohammad Sahraeian committed Feb 12, 2016
2 parents bbd15c8 + 53703e0 commit 55eb135
Show file tree
Hide file tree
Showing 11 changed files with 376 additions and 109 deletions.
9 changes: 9 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<artifactId>idp</artifactId>
<groupId>com.github.bioinform.idp</groupId>
<name>idp</name>
<version>0.1.1-SNAPSHOT</version>
</project>
19 changes: 8 additions & 11 deletions src/main/python/MLE_MT.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sys
import os
import threading
from binaidp import log_command
from idpcommon import log_command

def GetPathAndName(pathfilename):
ls=pathfilename.split('/')
Expand All @@ -22,7 +22,7 @@ def main():
penalty_filename = ''
if (len(sys.argv) > 5):
penalty_filename = sys.argv[5]

input_file = open(input_filename, 'r' )
header = input_file.readline()
input_files = []
Expand All @@ -31,28 +31,27 @@ def main():
input_files.append(open(input_filename + '.' + str(thread_idx), 'w'))
input_files[-1].write(header)


thread_idx = 0
while True:
line = input_file.readline()
if line == "":
break
num_isoforms = int(line.split()[1])
input_files[thread_idx].write(line)

for i in range(6):
input_files[thread_idx].write(input_file.readline())
for i in range(num_isoforms):
input_files[thread_idx].write(input_file.readline())
for i in range(2):
input_files[thread_idx].write(input_file.readline())

thread_idx = (thread_idx + 1) % num_threads

for thread_idx in range(num_threads):
input_files[thread_idx].close()
input_file.close()

##############################
threads_list = []
for thread_idx in range(num_threads):
Expand All @@ -65,18 +64,16 @@ def main():

for thread in threads_list:
thread.join()

cat_cmnd = 'cat '
rm_cmnd = "rm "
for thread_idx in range(num_threads):
cat_cmnd += output_filename + '.' + str(thread_idx) + " "
rm_cmnd += output_filename + '.' + str(thread_idx) + " " + input_filename + '.' + str(thread_idx) + " "

cat_cmnd += ' > ' + output_filename
log_command(cat_cmnd)
log_command(rm_cmnd)



if __name__ == '__main__':
main()
Binary file added src/main/python/binaidp.pyc
Binary file not shown.
4 changes: 2 additions & 2 deletions src/main/python/blat_threading.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import threading
import string
from binaidp import log_command
from idpcommon import log_command

################################################################################

Expand All @@ -16,7 +16,7 @@
blat_option = ' '.join(sys.argv[4:-2])
SR_pathfilename = sys.argv[-2]
output_pathfilename = sys.argv[-1]

else:
print("usage: python2.6 blat_threading.py p -t=DNA -q=DNA ~/annotations/hg19/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa /usr/bin/python intact_SM.fa intact_SM.fa.psl")
print("or ./blat_threading.py p -t=DNA -q=DNA ~/annotations/hg19/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa /usr/bin/python intact_SM.fa intact_SM.fa.psl")
Expand Down
124 changes: 124 additions & 0 deletions src/main/python/filter_MLE_table.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#!/usr/bin/python

import sys, os, argparse

# Description: Filter an input file of expression values based
# on the negative data
# Pre: (Optional)
# --FPR
# an (FPR?) (float)
# Well, Its a float.
#
# We go this far into the sorted list of negative fractions.
# and set that as a threshold.
#
# So if you have a ton of negative examples with 1 expression
# your threshold will simply be 1
#
# --min_isoform_fraction (only one of FPR or min_isoform_fraction can be set)
# This is the minimum fraction of a gene's expression an isoform can be to get reported
# --min_isoform_rpkm
# lowest raw expression of an isoform required to output it.
# 1. a negative filename
# The format looks something like
# chr20:17580-19316.1 <tab> 29.9449 <tab> 32.2812
# I'd guess this a locus name followed by an isoform RPKM then a gene RPKM
#
# It seems values in this
# list include many zero values for the middle column, and sometimes
# higher values for gene locus.
#
# This list is converted into a list of fractions where we know
# what the fraction of the gene locus expression is composed of
# the isoform expression.
#
# 2. an input filename
# Another file with the same format as the first two
# locus <tab> isoform RPKM <tab> gene RPKM
# This file is currently generated by MLEout2tab.py which is downstream
# of MLE_MT.py
#
# 3. an output filename
# If the gene expression is greater than zero and the
# isoform expression/gene expression is higher than the threshold
# write the input file's line to this output file.
# So format is the same on the output as on the input.
#
# Post: Writes the output file as a filterd version of the input file
# It also prints to STDOUT some summary statistics about the threshold
# Modifies:
# Writes to output file, and STDOUT


def main():
parser = argparse.ArgumentParser(description="Determine and FPR and apply filtering thresholds for considered isoforms where relevant.")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--FPR',type=float,help="minimum FPR to output results for when specified")
#,help="minimum FPR to output results for.")
group.add_argument('--min_isoform_fraction',type=float,help="minimum isforom fraction of a gene to report.")
#,help="threshold for the minimum isoform fraction")
parser.add_argument('--min_isoform_rpkm',type=float,default=0,help="Threshold for an acceptable RPKM of isoform expression.")
parser.add_argument('neg_filename',help="File containing the isoform and gene rpkms for negative hits in the format of locus <tab> isoform rpkm <tab> gene rpkm")
parser.add_argument('input_filename',help="input in the same format of locus <tab> isoform FKMP <tab> gene RPKM.")
parser.add_argument('output_filename',help="output in the same format of locus <tab> isoform RPKM <tab> gene RPKM.")
args = parser.parse_args()

threshold = False

#Handle the special case of FPR = 1
# output all cases with isoform expression greater than zero and exit
if args.FPR:
if args.FPR >= 1:
# handles pecial case for ignoring FPR
sys.stderr.write("FPR of 1, so consider all isoforms with expression > 0")
of = open(args.output_filename,'w')
with open(args.input_filename) as inf:
for line in inf:
f = line.rstrip().split()
if float(f[1]) >= args.min_isoform_rpkm and float(f[1]) > 0:
of.write(line)
of.close()
return # finished
# Read through the negative file and calculate the FPR threshold
neg_perc_ls = []
neg_file = open(args.neg_filename,'r')
for line in neg_file:
ls = line.strip().split("\t")
ID = ls[0]
iso_exp = float(ls[1])
gene_exp = float(ls[2])
if gene_exp > 0:
neg_perc_ls.append(iso_exp/gene_exp)
else:
neg_perc_ls.append(0)

neg_file.close()

neg_perc_ls.sort()
neg_perc_ls.reverse()
n = int(args.FPR*len(neg_perc_ls))
threshold = neg_perc_ls[n]

if not threshold: threshold = args.min_isoform_fraction
output = open(args.output_filename,'w')
input_file = open(args.input_filename,'r')
i=0
I=0
for line in input_file:
I+=1
ls = line.strip().split("\t")
ID = ls[0]
iso_exp = float(ls[1])
gene_exp = float(ls[2])
if gene_exp > 0 and float(iso_exp/gene_exp) > threshold and iso_exp > args.min_isoform_rpkm:
output.write(line)
i+=1
input_file.close()
if args.FPR:
print "negative:",n,len(neg_perc_ls),float(n)/len(neg_perc_ls)
print "threshold:",threshold
print "input:",i,I,float(i)/I
output.close()

if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion src/main/python/gmap_threading.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import struct
import os
import string
from binaidp import log_command
from idpcommon import log_command

################################################################################

Expand Down
3 changes: 0 additions & 3 deletions src/main/python/binaidp.py → src/main/python/idpcommon.py
100755 → 100644
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/usr/bin/python
# Filename: binaidp.py

import os;

Expand All @@ -14,5 +13,3 @@ def log_command(command, ignorefail=False):

def log_print(print_str):
os.system("echo " + str(print_str))

# End of binaidp.py
2 changes: 1 addition & 1 deletion src/main/python/parseSAM_MT.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import sys
import os
import threading
from binaidp import log_command
from idpcommon import log_command

def GetPathAndName(pathfilename):
ls=pathfilename.split('/')
Expand Down
Loading

0 comments on commit 55eb135

Please sign in to comment.