Commit d2e268b: go with source codes
sysu authored and sysu committed Jan 20, 2020
1 parent 25fdcea commit d2e268b
Showing 371 changed files with 62,846 additions and 19 deletions.
84 changes: 65 additions & 19 deletions README.md
# Introduction

NECAT is an error correction and *de novo* assembly tool for Nanopore long noisy reads.

# Installation

We have successfully tested `NECAT` on

* Ubuntu 16.04 (GCC 5.4.0, Perl v5.22.1)
* CentOS 7.3.1611 (GCC 4.8.5, Perl v5.26.2)

If you encounter an error like the following when running `NECAT`:
```shell
Syntax error at NECAT/Linux-amd64/bin/Plgd/Project.pm line 46, near "${cfg{"
```
please update your `perl` to a newer version (such as v5.26).
There are two ways to install `NECAT`.
### Install from executable binaries
```shell
$ wget https://github.com/xiaochuanle/NECAT/releases/download/v0.01/necat_20190307_linux_amd64.tar.gz
$ tar xzvf necat_20190307_linux_amd64.tar.gz
$ export PATH=$PATH:$(pwd)/NECAT/Linux-amd64/bin
```
After decompression, all the executable files can be found in `NECAT/Linux-amd64/bin`. The third command above adds `NECAT/Linux-amd64/bin` to the system `PATH`.
### Build from source codes
```shell
$ git clone https://github.com/xiaochuanle/NECAT.git
$ cd NECAT/src
$ make
$ cd ../Linux-amd64/bin
$ export PATH=$PATH:$(pwd)
```
After building, all the executable files can be found in `NECAT/Linux-amd64/bin`. The last command above adds `NECAT/Linux-amd64/bin` to the system `PATH`.
# Quick Start
```shell
ONT_READ_LIST=
GENOME_SIZE=
THREADS=4
MIN_READ_LENGTH=3000
PREP_OUTPUT_COVERAGE=40
OVLP_FAST_OPTIONS=-n 500 -z 20 -b 2000 -e 0.5 -j 0 -u 1 -a 1000
OVLP_SENSITIVE_OPTIONS=-n 500 -z 10 -e 0.5 -j 0 -u 1 -a 1000
CNS_FAST_OPTIONS=-a 2000 -x 4 -y 12 -l 1000 -e 0.5 -p 0.8 -u 0
CNS_SENSITIVE_OPTIONS=-a 2000 -x 4 -y 12 -l 1000 -e 0.5 -p 0.8 -u 0
TRIM_OVLP_OPTIONS=-n 100 -z 10 -b 2000 -e 0.5 -j 1 -u 1 -a 400
ASM_OVLP_OPTIONS=-n 100 -z 10 -b 2000 -e 0.5 -j 1 -u 0 -a 400
NUM_ITER=2
CNS_OUTPUT_COVERAGE=30
CLEANUP=1
USE_GRID=false
GRID_NODE=0
GRID_OPTIONS=
SMALL_MEMORY=0
FSA_OL_FILTER_OPTIONS=
FSA_ASSEMBLE_OPTIONS=
FSA_CTG_BRIDGE_OPTIONS=
POLISH_CONTIGS=true
```
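The scripts shipped with `NECAT` load this file by `eval`-ing each `KEY=VALUE` line in a shell. The same idea as a rough Python sketch (the keys below are copied from the config above; the parser itself is illustrative, not NECAT code):

```python
# Minimal sketch: parse a KEY=VALUE config like the one above into a dict.
# Values keep everything after the first '='; blank lines are skipped.
def parse_config(text):
    cfg = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        key, _, value = line.partition("=")
        cfg[key.strip()] = value.strip()
    return cfg

example = """THREADS=4
MIN_READ_LENGTH=3000
OVLP_FAST_OPTIONS=-n 500 -z 20 -b 2000 -e 0.5 -j 0 -u 1 -a 1000
POLISH_CONTIGS=true"""

cfg = parse_config(example)
print(cfg["THREADS"])            # 4
print(cfg["OVLP_FAST_OPTIONS"])  # -n 500 -z 20 -b 2000 -e 0.5 -j 0 -u 1 -a 1000
```

Note that unquoted values with spaces (like the `*_OPTIONS` lines) survive because only the first `=` is split on.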
After filling in and modifying the relevant information, we have
```shell
......
```
`read_list.txt` in the second line above contains the ***full paths*** of all read files. It looks like
``` shell
$ cat read_list.txt
/share/home/chuanlex/xiaochuanle/data/testdata/tomato/20161108_Spenn_004_005_all.fastq
```
Please note that files in `read_list.txt` need not be the same format. Each file can independently be either `FASTA` or `FASTQ`, and can further be compressed in [GNU Zip (gzip) format](https://www.gnu.org/software/gzip/manual/gzip.html).
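Since each file can independently be `FASTA`, `FASTQ`, or gzip-compressed, a sketch of how one might detect the per-file format (function names here are illustrative, not part of NECAT):

```python
import gzip
import os
import tempfile

def open_maybe_gzip(path):
    # gzip files start with the two magic bytes 0x1f 0x8b
    with open(path, "rb") as f:
        magic = f.read(2)
    opener = gzip.open if magic == b"\x1f\x8b" else open
    return opener(path, "rt")

def detect_format(path):
    # FASTA records start with '>', FASTQ records with '@'
    with open_maybe_gzip(path) as f:
        first = f.readline()
    if first.startswith(">"):
        return "fasta"
    if first.startswith("@"):
        return "fastq"
    return "unknown"

# Demo with two throwaway files: a plain FASTA and a gzipped FASTQ.
fa = tempfile.NamedTemporaryFile("w", suffix=".fa", delete=False)
fa.write(">read1\nACGT\n"); fa.close()
fq_gz = tempfile.NamedTemporaryFile("wb", suffix=".fastq.gz", delete=False)
fq_gz.write(gzip.compress(b"@read1\nACGT\n+\nIIII\n")); fq_gz.close()
print(detect_format(fa.name), detect_format(fq_gz.name))  # fasta fastq
os.unlink(fa.name); os.unlink(fq_gz.name)
```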
### Step 2: Correct raw reads
Correct the raw noisy reads using the following command:
``` Shell
$ necat.pl correct ecoli_config.txt
```
The pipeline only corrects the longest 40X (`PREP_OUTPUT_COVERAGE`) of the raw reads. The corrected reads are in the files `./ecoli/1-consensus/cns_iter${NUM_ITER}/cns.fasta`.
The longest 30X (`CNS_OUTPUT_COVERAGE`) of the corrected reads are then extracted for assembly, into the file `./ecoli/1-consensus/cns_final.fasta`.
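A sketch of what "longest NX" selection means (illustrative, not NECAT's actual code): sort reads by length descending and keep the longest ones until their total bases reach coverage × genome size:

```python
def longest_nx(read_lengths, coverage, genome_size):
    """Keep the longest reads until total bases >= coverage * genome_size."""
    budget = coverage * genome_size
    kept, total = [], 0
    for length in sorted(read_lengths, reverse=True):
        if total >= budget:
            break
        kept.append(length)
        total += length
    return kept

# Toy example: genome of 100 bp, want the longest reads totalling 2X (200 bp)
print(longest_nx([50, 120, 30, 90, 10], coverage=2, genome_size=100))  # [120, 90]
```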
### Step 3: Assemble contigs
After correcting the raw reads, we assemble the contigs using the following command:
```Shell
$ necat.pl assemble ecoli_config.txt
```
The assembled contigs are in the file `./ecoli/4-fsa/contigs.fasta`.
### Step 4: Bridge contigs
After assembling the contigs, we run the bridging step using the following command:
```Shell
$ necat.pl bridge ecoli_config.txt
```
The bridged contigs are in the file `./ecoli/6-bridge_contigs/bridged_contigs.fasta`.
If `POLISH_CONTIGS` is set to `true`, the pipeline uses the corrected reads to polish the bridged contigs. The polished contigs are in the file `./ecoli/6-bridge_contigs/polished_contigs.fasta`.
# Running with multiple computation nodes
On PBS and SGE systems, users may want to run `NECAT` with multiple computation nodes. This is done by setting the following options in the config file (Step 1 of Quick Start):
```shell
USE_GRID=true
GRID_NODE=4
```
In the above example, `4` computation nodes will be used and each computation node will run with `THREADS` CPU threads.
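So the total parallelism is `GRID_NODE × THREADS`; with the example above and the default `THREADS=4`, that is 16 concurrent threads (a quick sanity check, assuming both values are as shown):

```shell
GRID_NODE=4
THREADS=4
echo $((GRID_NODE * THREADS))   # 16
```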
# Contact
* Chuan-Le Xiao, [email protected]
90 changes: 90 additions & 0 deletions scripts/oc2asmpm.sh
#!/bin/bash

fPrintUsage()
{
echo "USAGE:"
echo "$0 wrk_dir read_list output [map options]"
}


if [ $# -lt 3 ]; then
fPrintUsage;
exit 1;
fi

MKDB_WORKER=oc2mkdb
PM_WORKER=oc2asmpm
WRK_DIR=$1
READ_LIST=$2
OUTPUT=$3
MAP_OPTIONS=""
ASM_OVLP_OPTIONS=${OSA_ASM_OVLP_OPTIONS}

ALL_FINISHED="${WRK_DIR}/all.finished"
if [ -f ${ALL_FINISHED} ]; then
echo "Job ${ALL_FINISHED} has been finished. Exit normally."
exit 0;
fi

PAC_FINISHED="${WRK_DIR}/pac.finished"

# collect every argument after the first three as extra mapping options
i=1;
for opt in $*
do
if [ $i -gt 3 ]; then
MAP_OPTIONS="${MAP_OPTIONS} $opt"
fi
let i=i+1
done
MAP_OPTIONS="${MAP_OPTIONS} ${ASM_OVLP_OPTIONS}"

echo ${WRK_DIR}
echo ${READ_LIST}
echo ${OUTPUT}
echo ${MAP_OPTIONS}

if [ ! -f ${PAC_FINISHED} ]; then
MKDB_CMD="${MKDB_WORKER} ${WRK_DIR} ${READ_LIST}"
oc2cmd.sh ${MKDB_CMD}
if [ $? -ne 0 ]; then
exit 1;
fi
touch ${PAC_FINISHED}
fi

RINFO="${WRK_DIR}/reads_info.txt"
if [ ! -f ${RINFO} ]; then
echo File ${RINFO} does not exist!
exit 1;
fi

NVOLS=`cat ${RINFO} | awk '{print $1}' `
echo Number of volumes: $NVOLS

for((i=0;i<${NVOLS};i=i+1))
do
RESULT="${WRK_DIR}/pm_result_$i"
PM_FINISHED="${RESULT}.finished"
if [ ! -f ${PM_FINISHED} ]; then
PM_CMD="${PM_WORKER} ${MAP_OPTIONS} ${WRK_DIR} $i ${RESULT}"
oc2cmd.sh ${PM_CMD}
if [ $? -ne 0 ]; then
exit 1;
fi
touch ${PM_FINISHED}
fi
done

echo "mapping finished"

if [ -f ${OUTPUT} ]; then
rm -f ${OUTPUT}
fi

for((i=0;i<${NVOLS};i=i+1))
do
RESULT="${WRK_DIR}/pm_result_$i"
cat ${RESULT} >> ${OUTPUT}
done

touch ${ALL_FINISHED}
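The script above makes each stage restartable by touching `*.finished` marker files and skipping work when the marker exists. The pattern in isolation (a generic sketch, not NECAT code):

```shell
#!/bin/bash
# Idempotent-step pattern: do the work once, skip it on re-runs.
WRK_DIR=$(mktemp -d)
STEP_FINISHED="${WRK_DIR}/step.finished"

run_step() {
    if [ -f "${STEP_FINISHED}" ]; then
        echo "skipped"
        return 0
    fi
    echo "ran"                 # real work would go here
    touch "${STEP_FINISHED}"   # mark the step as done only on success
}

run_step   # first call does the work
run_step   # second call sees the marker and skips
rm -rf "${WRK_DIR}"
```

Because the marker is only touched after the worker exits successfully, a crashed run simply redoes the unfinished step on the next invocation.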
25 changes: 25 additions & 0 deletions scripts/oc2cmd.sh
#!/bin/bash

# join all arguments into one command string
CMD="$*"

if [ -z "${CMD}" ]; then
echo "Empty command!"
exit 1;
fi

echo "[$(date)] (${CMD}) STARTS"
${CMD}
if [ $? -ne 0 ]; then
echo "Failed at Running (${CMD})"
exit 1;
fi
echo "[$(date)] (${CMD}) FINISHES"
96 changes: 96 additions & 0 deletions scripts/oc2cns.sh
#!/bin/bash

fPrintUsage()
{
echo "USAGE:"
echo "$0 cns_dir config_file"
}

if [ $# -ne 2 ]; then
fPrintUsage;
exit 1;
fi

CNS_DIR=$1
CONFIG_FILE=$2

### load the KEY=VALUE configuration by evaluating each line
while read LINE ; do
eval "${LINE}"
done < ${CONFIG_FILE}

echo ${THREADS}
echo ${ONT_READ_LIST}
echo ${MIN_READ_LENGTH}
echo ${OVLP_FAST_OPTIONS}
echo ${OVLP_SENSITIVE_OPTIONS}
echo ${CNS_FAST_OPTIONS}
echo ${CNS_SENSITIVE_OPTIONS}
echo ${NUM_ITER}
echo ${CLEANUP}

READ_LIST="-n ${ONT_READ_LIST}"

if [ ! -d ${CNS_DIR} ]; then
mkdir -p ${CNS_DIR}
fi

PPRR_DIR="${CNS_DIR}/raw_reads"
if [ ! -d ${PPRR_DIR} ]; then
mkdir -p ${PPRR_DIR}
fi
TMP_READ_LIST="${PPRR_DIR}/raw_read_list.txt"
PPRR_FINISHED="${PPRR_DIR}/pprr.finished"
if [ -f ${PPRR_FINISHED} ]; then
echo "Job PPRR has been finished, skip it."
else
PPRR_CMD="oc2pprr ${ONT_READ_LIST} ${PPRR_DIR} ${TMP_READ_LIST} ${MIN_READ_LENGTH}"
oc2cmd.sh ${PPRR_CMD}
if [ $? -ne 0 ]; then
exit 1;
fi
touch ${PPRR_FINISHED}
fi
READ_LIST="-n ${TMP_READ_LIST}"


for ((i=1;i<=${NUM_ITER};i++))
do
DIR_NAME="cns_iter${i}"
WRK_DIR="${CNS_DIR}/${DIR_NAME}"
CNS_READS="${WRK_DIR}/cns.fasta"
UNCNS_READS="${WRK_DIR}/raw.fasta"

if [ ! -d ${WRK_DIR} ]; then
mkdir -p ${WRK_DIR}
fi

if [ $i -eq 1 ]; then
ITER_CAN_OPTIONS="-t ${THREADS} ${OVLP_SENSITIVE_OPTIONS}"
ITER_CNS_OPTIONS="-t ${THREADS} ${CNS_SENSITIVE_OPTIONS} -r 0"
else
ITER_CAN_OPTIONS="-t ${THREADS} ${OVLP_FAST_OPTIONS}"
ITER_CNS_OPTIONS="-t ${THREADS} ${CNS_FAST_OPTIONS} -r 1"
fi

if [ $i -eq ${NUM_ITER} ]; then
ITER_CNS_OPTIONS="${ITER_CNS_OPTIONS} -f 0"
else
ITER_CNS_OPTIONS="${ITER_CNS_OPTIONS} -f 1"
fi
export ONTCNS_CAN_OPTIONS=${ITER_CAN_OPTIONS}
export ONTCNS_CNS_OPTIONS=${ITER_CNS_OPTIONS}

CMD="oc2cns_iter.sh -t ${THREADS} -w ${WRK_DIR} -r ${CNS_READS} -u ${UNCNS_READS} ${READ_LIST} -c ${CLEANUP}"
oc2cmd.sh ${CMD}
if [ $? -ne 0 ]; then
exit 1;
fi

if [ $i -ne ${NUM_ITER} ]; then
NEXT_READ_LIST="${WRK_DIR}/ReadList.txt"
echo "${CNS_READS}" > "${NEXT_READ_LIST}"
echo "${UNCNS_READS}" >> "${NEXT_READ_LIST}"
READ_LIST="-n ${NEXT_READ_LIST}"
fi
done