-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathrun.cfg
174 lines (118 loc) · 5.51 KB
/
run.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
###################################################
#
# This configuration file contains all settings for a run
# of IDP (Isoform Detection and Prediction).
#
# lines beginning with '#' are comments
# lists begin with '> tag' and end with '<' on separate lines
#
###################################################
##
# Long-read input files
# Provide ONE of the following three types of long-read data.
# NOTE(review): both LR_gpd_pathfilename (option 1) and LR_pathfilename (option 3)
# are filled in below; which one takes precedence is not stated in this file - confirm,
# or leave the unused options empty.
#
# 1. Long-read alignments to the reference genome in GPD format.
# If you have already run IDP on the same data, you can reuse LR.gpd from the previous temp folder.
# For more information on the GPD format, see http://www.stanford.edu/~kinfai/IDP/hESC_gpd_format.html
LR_gpd_pathfilename = data/LR.gpd
# or
# 2. Long-read alignments to the reference genome in PSL format (BLAT output format).
# If the PSL file contains only one (unique) alignment per long read, set psl_type = 1. Otherwise, set psl_type = 0.
LR_psl_pathfilename =
psl_type = 1
# or
# 3. Long reads in FASTA format (LSC-corrected data is preferred) and the reference genome in FASTA format.
# IDP will run BLAT to align the long reads to the reference genome.
# (Optional) If a primer sequence at the 5' end (five_primer) or 3' end (three_primer) is given, IDP can trim the primer sequences off.
# Primer sequences must be homopolymer-compressed, e.g. if the original sequence is AACCCTTGGGG, you should input ACTG.
LR_pathfilename = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/longreads.fa
genome_pathfilename = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/genome.fa
five_primer = AGTACTCTG
three_primer = CGCAGAGTAC
##
# Short-read input files
# Two short-read inputs are set here: a junction file (SR_jun_pathfilename)
# and an alignment file (SR_sam_pathfilename).
# The example values are a .bed file and a .sam file respectively.
SR_jun_pathfilename = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/junction_color.bed
SR_sam_pathfilename = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/good_hits.sam
# Aligner to use; can be gmap or blat
aligner_choice = gmap
# Folder holding the GMAP index.
# NOTE(review): presumably only needed when aligner_choice is gmap - confirm.
gmap_index_pathfoldername = ./data/gmap_hg19/
# NOTE(review): presumably the short-read length in bases - confirm.
read_length = 50
# NOTE(review): presumably the minimum overlap (in bases) of a read with a junction - confirm.
min_junction_overlap_len = 10
##
# Executables
# (single value each)
# Python interpreter
python_path = /usr/bin/python
# External tools: BLAT, GMAP and SeqMap.
# NOTE(review): the values below are bare command names, so they presumably
# must be resolvable on PATH; use a full path otherwise - confirm.
blat_executable_pathfilename = blat
gmap_executable_pathfilename = gmap
seqmap_executable_pathfilename = seqmap
##
# Temp folder
# (single value)
temp_foldername = temp
##
# Output folder
# (single value)
output_foldername = output
##
# Number of threads
# (single value)
Nthread = 10
##
# False Positive Rate (FPR) threshold
# IDP outputs isoform identifications under this threshold.
# FPR is assigned once, in the output-filter settings at the end of this file
# (next to min_isoform_fraction); it is intentionally not repeated here, because
# the effect of a duplicate key depends on the parser.
##
# Combination of reference annotations and an EST library, used to generate the
# Two-Consecutive-Junction-Linkage (TCJL) library.
# TCJL is used to define false isoforms and compute the FPR.
# You can download this library from the IDP homepage: http://www.stanford.edu/~kinfai/IDP/IDP.html
allref_annotation_pathfilename = /home/kinfai/3seq/IDP_0.1/test_data/hg19.all.gene_est.refFlat.txt
##
# Settings for isoform candidate construction
#
# Reference annotation library for gene locus and isoform candidate construction.
ref_annotation_pathfilename = /home/kinfai/3seq/IDP_0.1/test_data/normal_ref.gpd
# Use reference-annotated junctions or not. 1: yes; 0: no
I_refjun_isoformconstruction = 1
# Use reference 5' ends or not. 1: yes; 0: no
I_ref5end_isoformconstruction = 1
# Use reference 3' ends or not. 1: yes; 0: no
I_ref3end_isoformconstruction = 1
# (Optional) CAGE data to help IDP define 5' ends.
# Each line looks like: "chr12 49525109 49525244 chr12:49525109:49525244:-:0.14:1.000000 0 - 14566.7126 1.78e-10 0"
# Columns:
#   (a) Chromosome
#   (b) Start
#   (c) End
#   (d) (coordinates):(paraclu cluster strength):(TSS prediction strength)
#   (e) empty
#   (f) Strand
#   (g) level - expression level in tpm
#   (h) signif - currently empty - will be IDR
#   (i) score2 - raw number of reads
# The format is described in http://genome.crg.es/encode_RNA_dashboard/READMEs/README_TssHmm.pdf
CAGE_data_filename = /home/kinfai/3seq/H1_CAGE/encodeTssHmm.bedRnaElements
# The max number of reference genes that a single junction may overlap.
# These junctions are used for gene locus and exon boundary definition.
# For human, the recommended value is 1. For genomes with dense gene distribution, this value can be set higher.
exon_construction_junction_span = 1
# The max number of junctions at the 3' or 5' end to use for construction.
# The higher this number is, the longer IDP will run.
Njun_limit = 10
# The max number of isoform candidates allowed in each gene locus.
# If there are more isoform candidates than this limit, IDP will ignore this gene locus.
# The higher this number is, the longer IDP will run.
Niso_limit = 50
# The longest exon allowed.
# The higher this number is, the longer IDP will run.
L_exon_limit = 1700
# The shortest intron (junction) allowed.
# The higher this number is, the longer IDP will run.
# NOTE(review): the run-time remark above is identical to the one on L_exon_limit;
# confirm that it also holds for a minimum length.
# For human, the recommended value is 68.
L_min_intron = 68
##
# Empirical prior for Maximum A Posteriori (MAP) estimation
# (Optional) Abundance and length of directly detected annotated gene isoforms
# Each line represents an isoform: gene isoform abundance length
# Example: UBL5 NM_024292 519 23.1125
# NOTE(review): in the example, 519 looks like a length and 23.1125 like an abundance,
# i.e. the opposite of the column order stated above - confirm which order IDP expects.
# If you leave it empty, IDP can calculate it from your long-read data
detected_exp_len = /hsgs/projects/whwong/pegahta/IDP_0.1/example/data/multiexon_refFlat.txt_positive_exp_len
# Number of data points for prior estimation
Bfile_Npt = 500
# Number of data bins for prior estimation
Bfile_Nbin = 5
# Output filter
# FPR: IDP outputs isoform identifications under this False Positive Rate threshold.
# FPR and min_isoform_fraction are mutually exclusive; please set only one.
# If both are set, only FPR will be used to filter outputs.
FPR = 0.05
# Left empty here, so FPR is the only one of the two that is set.
min_isoform_fraction =
# Left empty here.
min_isoform_rpkm =
#########################