src/main/python/run.cfg

###################################################
#
# This configuration file contains all settings for a run
# of IDP (Isoform Detection and Prediction).
#
# lines beginning with '#' are comments
# lists begin with '> tag' and end with '<' on separate lines
#
###################################################

##
# Long reads files
# You can input one of three types of long-read data: 
	# 1. Long-read alignments on reference genome in GPD format. 
	# If you have already run IDP on the same data, you can reuse LR.gpd from the previous temp folder.
	# For more info of GPD format, please check http://www.stanford.edu/~kinfai/IDP/hESC_gpd_format.html

LR_gpd_pathfilename =  data/LR.gpd

    # or
	# 2. Long-read alignments on reference genome in PSL format (BLAT output format). 
	# If the PSL file contains only a unique alignment for each long read, set psl_type = 1. Otherwise, set psl_type = 0.

LR_psl_pathfilename = 
psl_type = 1
     
	# or
	# 3. Long reads in FASTA format (LSC corrected data is preferred) and reference genome in FASTA format. 
	# IDP will run BLAT to align long reads to reference genome.
	# (Optional) If primer sequence at 5' end (five_primer) or 3' end (three_primer) is input, IDP can trim the primer sequences off.
	# Primer sequences must be homopolymer-compressed. E.g. if the original sequence is AACCCTTGGGG, you should input ACTG.
	
LR_pathfilename = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/longreads.fa
genome_pathfilename = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/genome.fa

five_primer = AGTACTCTG
three_primer = CGCAGAGTAC

##
# Short reads files
# You can input two short-read data files:

SR_jun_pathfilename = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/junction_color.bed 
SR_sam_pathfilename = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/good_hits.sam

# aligner can be gmap or blat 
aligner_choice = gmap
gmap_index_pathfoldername = ./data/gmap_hg19/

read_length = 50
min_junction_overlap_len = 10

##
# Executables: Python interpreter and the external tools (blat, gmap, seqmap)
# (single value each)

python_path = /usr/bin/python

blat_executable_pathfilename = blat
gmap_executable_pathfilename = gmap
seqmap_executable_pathfilename = seqmap

##
# Temp folder
# (single value)

temp_foldername = temp

##
# Output folder
# (single value)

output_foldername = output

##
# Number of threads
# (single value)

Nthread = 10

## 
# IDP outputs isoform identifications under this False Positive Rate threshold

FPR = 0.05

##
# The combination of reference annotations and EST library used to generate the Two-Consecutive-Junction-Linkage (TCJL) library.
# TCJL is used to define false isoforms and compute FPR.
# You can download this library from IDP homepage: http://www.stanford.edu/~kinfai/IDP/IDP.html

allref_annotation_pathfilename = /home/kinfai/3seq/IDP_0.1/test_data/hg19.all.gene_est.refFlat.txt

##
# Setting of isoform candidate construction
# 

	# Reference annotation library for gene locus and isoform candidate construction.
	
ref_annotation_pathfilename = /home/kinfai/3seq/IDP_0.1/test_data/normal_ref.gpd

	# Use reference annotated junctions or not. 1: yes; 0: no

I_refjun_isoformconstruction =  1

	# Use reference 5'end or not. 1: yes; 0: no

I_ref5end_isoformconstruction = 1

	# Use reference 3'end or not. 1: yes; 0: no
	
I_ref3end_isoformconstruction = 1

	# (Optional) CAGE data to help IDP define 5'end.
	# The format is like: "chr12   49525109        49525244        chr12:49525109:49525244:-:0.14:1.000000 0       -       14566.7126      1.78e-10    0"
	# (a) Chromosome (b) Start (c) End (d) (coordinates):(paraclu cluster strength):(TSS prediction strength) (e) empty (f) Strand (g) level - expression level in tpm (h) signif - currently empty - will be IDR (i) score2 - raw number of reads
	# You can find the format info in http://genome.crg.es/encode_RNA_dashboard/READMEs/README_TssHmm.pdf
	
CAGE_data_filename = /home/kinfai/3seq/H1_CAGE/encodeTssHmm.bedRnaElements

	# The max number of reference genes overlapped with single junctions.
	# These junctions are used for gene locus and exon boundary definition. 
	# For human, it is recommended to set 1. For genomes that have dense gene distribution, this value can be set higher.

exon_construction_junction_span = 1

	# The max number of junctions on 3' or 5'end to use for construction.
	# The higher this number is, the longer IDP would run.
	
Njun_limit = 10

	# The max number of isoform candidates allowed in each gene locus. 
	# If there are more isoform candidates than this limit, IDP will ignore this gene locus.
	# The higher this number is, the longer IDP would run.
	
Niso_limit = 50

	# The longest exon allowed.
	# The higher this number is, the longer IDP would run.
	
L_exon_limit = 1700

	# The shortest intron (junction) allowed.
	# The higher this number is, the longer IDP would run.
	# For human, it is recommended to set 68.
	
L_min_intron = 68

##
# Empirical prior for Maximum A Posteriori (MAP) estimation

	# (Optional) Abundance and length of direct-detection of annotated gene isoforms
	# Each line represents an isoform: gene	isoform	abundance	length
	# Example: UBL5    NM_024292       519     23.1125 
	# If you leave it empty, IDP can calculate based on your long-read data
	
detected_exp_len = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/multiexon_refFlat.txt_positive_exp_len

	# Number of data points for prior estimation
	
Bfile_Npt = 500

	# Number of data bins for prior estimation
	
Bfile_Nbin = 5

# FPR and min_isoform_fraction are mutually exclusive; please set only one.
# If both are set, only FPR will be used to filter outputs.
# FPR is already assigned once earlier in this file (under the
# "False Positive Rate threshold" comment). It is intentionally not repeated
# here so the key has a single definition and the effective value does not
# depend on how the parser resolves duplicate keys.
min_isoform_fraction =

min_isoform_rpkm = 

#########################