-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge remote-tracking branch 'origin/new_from_ID_0.1.9' into merge_wi…
…th_pagah_latest_changes Conflicts: src/main/python/runIDP.py src/main/python/select_FPR.py
- Loading branch information
Showing
11 changed files
with
376 additions
and
109 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
<!-- Minimal POM: declares only the model version, the project coordinates
     (groupId / artifactId / version) and a display name. No packaging,
     dependencies, or build configuration are specified here. -->
<modelVersion>4.0.0</modelVersion> | ||
|
||
<artifactId>idp</artifactId> | ||
<groupId>com.github.bioinform.idp</groupId> | ||
<name>idp</name> | ||
<version>0.1.1-SNAPSHOT</version> | ||
</project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#!/usr/bin/python | ||
|
||
import sys, os, argparse | ||
|
||
# Description: Filter an input file of expression values based | ||
# on the negative data | ||
# Pre: (Optional) | ||
# --FPR | ||
# an (FPR?) (float) | ||
# Well, it's a float. | ||
# | ||
# We go this far into the sorted list of negative fractions. | ||
# and set that as a threshold. | ||
# | ||
# So if you have a ton of negative examples with 1 expression | ||
# your threshold will simply be 1 | ||
# | ||
# --min_isoform_fraction (only one of FPR or min_isoform_fraction can be set) | ||
# This is the minimum fraction of a gene's expression an isoform can be to get reported | ||
# --min_isoform_rpkm | ||
# lowest raw expression of an isoform required to output it. | ||
# 1. a negative filename | ||
# The format looks something like | ||
# chr20:17580-19316.1 <tab> 29.9449 <tab> 32.2812 | ||
# I'd guess this is a locus name followed by an isoform RPKM then a gene RPKM | ||
# | ||
# It seems values in this | ||
# list include many zero values for the middle column, and sometimes | ||
# higher values for gene locus. | ||
# | ||
# This list is converted into a list of fractions where we know | ||
# what the fraction of the gene locus expression is composed of | ||
# the isoform expression. | ||
# | ||
# 2. an input filename | ||
# Another file with the same format as the negative file | ||
# locus <tab> isoform RPKM <tab> gene RPKM | ||
# This file is currently generated by MLEout2tab.py which is downstream | ||
# of MLE_MT.py | ||
# | ||
# 3. an output filename | ||
# If the gene expression is greater than zero and the | ||
# isoform expression/gene expression is higher than the threshold | ||
# write the input file's line to this output file. | ||
# So format is the same on the output as on the input. | ||
# | ||
# Post: Writes the output file as a filtered version of the input file | ||
# It also prints to STDOUT some summary statistics about the threshold | ||
# Modifies: | ||
# Writes to output file, and STDOUT | ||
|
||
|
||
def main():
    """Filter an isoform expression table by isoform/gene expression fraction.

    Arguments are read from ``sys.argv`` via argparse:

    * ``--FPR`` or ``--min_isoform_fraction`` (exactly one is required).
      With ``--FPR`` the threshold is taken from the negative file: the
      isoform/gene fractions of all negative rows are sorted in descending
      order and the value at index ``int(FPR * count)`` becomes the threshold.
      With ``--min_isoform_fraction`` the given value is the threshold.
    * ``--min_isoform_rpkm`` (default 0): minimum isoform expression.
    * ``neg_filename``, ``input_filename``, ``output_filename``: files with
      rows of ``locus <tab> isoform expression <tab> gene expression``.

    An input row is copied verbatim to the output when its gene expression is
    greater than zero, its isoform/gene fraction is strictly greater than the
    threshold, and its isoform expression is strictly greater than
    ``--min_isoform_rpkm``.

    When ``--FPR`` is 1 or more the negative file is not read at all: every
    input row whose isoform expression is > 0 and >= ``--min_isoform_rpkm``
    is written.  (This shortcut splits rows on any whitespace and uses ``>=``
    for the RPKM test, unlike the general path; that difference is kept
    as it was.)

    Summary statistics are printed to STDOUT.

    Raises:
        ValueError: if ``--FPR`` (< 1) is used and the negative file has no rows.
    """
    parser = argparse.ArgumentParser(description="Determine and FPR and apply filtering thresholds for considered isoforms where relevant.")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--FPR', type=float, help="minimum FPR to output results for when specified")
    group.add_argument('--min_isoform_fraction', type=float, help="minimum isforom fraction of a gene to report.")
    parser.add_argument('--min_isoform_rpkm', type=float, default=0, help="Threshold for an acceptable RPKM of isoform expression.")
    parser.add_argument('neg_filename', help="File containing the isoform and gene rpkms for negative hits in the format of locus <tab> isoform rpkm <tab> gene rpkm")
    parser.add_argument('input_filename', help="input in the same format of locus <tab> isoform FKMP <tab> gene RPKM.")
    parser.add_argument('output_filename', help="output in the same format of locus <tab> isoform RPKM <tab> gene RPKM.")
    args = parser.parse_args()

    # Test against None rather than truthiness: an FPR of 0.0 is a real value
    # and must not be mistaken for "option not given".
    use_fpr = args.FPR is not None

    if use_fpr and args.FPR < 0:
        # A negative rate would silently index from the end of the sorted list.
        parser.error("--FPR must not be negative")

    # Special case of FPR >= 1: FPR filtering is effectively disabled, so
    # output every isoform with expression greater than zero and finish.
    if use_fpr and args.FPR >= 1:
        sys.stderr.write("FPR of 1, so consider all isoforms with expression > 0")
        with open(args.output_filename, 'w') as of, open(args.input_filename) as inf:
            for line in inf:
                f = line.rstrip().split()
                if float(f[1]) >= args.min_isoform_rpkm and float(f[1]) > 0:
                    of.write(line)
        return

    neg_fractions = []
    n = 0
    if use_fpr:
        # Read the negative file and turn each row into the fraction of the
        # gene expression contributed by the isoform (0 when the gene is 0).
        with open(args.neg_filename, 'r') as neg_file:
            for line in neg_file:
                fields = line.strip().split("\t")
                iso_exp = float(fields[1])
                gene_exp = float(fields[2])
                neg_fractions.append(iso_exp / gene_exp if gene_exp > 0 else 0.0)

        if not neg_fractions:
            raise ValueError(
                "negative file %r contains no rows; cannot derive an FPR threshold"
                % args.neg_filename)

        # Highest fractions first; walk FPR of the way into the list.
        neg_fractions.sort(reverse=True)
        n = int(args.FPR * len(neg_fractions))
        threshold = neg_fractions[n]
    else:
        threshold = args.min_isoform_fraction

    kept = 0
    total = 0
    with open(args.output_filename, 'w') as output, open(args.input_filename, 'r') as input_file:
        for line in input_file:
            total += 1
            fields = line.strip().split("\t")
            iso_exp = float(fields[1])
            gene_exp = float(fields[2])
            if gene_exp > 0 and iso_exp / gene_exp > threshold and iso_exp > args.min_isoform_rpkm:
                output.write(line)
                kept += 1

    # Single-string print calls produce the same text on Python 2 and 3.
    if use_fpr:
        print("negative: %s %s %s" % (n, len(neg_fractions), float(n) / len(neg_fractions)))
    print("threshold: %s" % threshold)
    # Guard the ratio so an empty input file does not raise ZeroDivisionError.
    kept_fraction = float(kept) / total if total else 0.0
    print("input: %s %s %s" % (kept, total, kept_fraction))


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.