forked from roblanf/euc_qc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenomescope.sh
54 lines (41 loc) · 1.79 KB
/
genomescope.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
inputf="/disks/dacelo/data/raw_data/tree_EM1/Project_SN7001117R_0083_CKulheim_LBronham_Melaleuca/"
outputf="/disks/dacelo/data/QC/Project_SN7001117R_0083_CKulheim_LBronham_Melaleuca/genomescope/"
threads=20 # how many threads do you want to use?
s='100M'
kmer_size=21 # 21 suggested in the docs - longer gives issues with seq. error
read_length=100 # length of reads in your input fastq files
kmer_max=5000 # ignore kmers represented more the kmer_max times (avoids issues with e.g. cpDNA)
mkdir $outputf
cd $outputf
# first we join fwd and reverse reads into single .fastq.gz files
for in1 in $(find $inputf -name "*R1_001.fastq.gz"); do
in2=${in1%%R1_001.fastq.gz}"R2_001.fastq.gz"
id="$(dirname $in1)"
id="$(basename $id)"
readf=$id".fastq.gz"
cat $in1 $in2 > $readf
done
# here we join biological replicates into files, we need this to get sufficient coverage
# to run genomescope
cat Sample_M1* > M1.fastq.gz
cat Sample_M2* > M2.fastq.gz
cat Sample_M3* > M3.fastq.gz
cat Sample_M4* > M4.fastq.gz
cat Sample_M5* > M5.fastq.gz
cat Sample_M6* > M6.fastq.gz
cat Sample_M7* > M7.fastq.gz
cat Sample_M8* > M8.fastq.gz
rm Sample*
# make one fastq with ALL THE READS (since each sample is only ~30x which is potentially a little low)
cat *.fastq.gz > all.fastq.gz
for readf in $(find *.fastq.gz); do
# count kmers in jellyfish, then make a histogram. NB: use zcat because jellyfish needs raw fastq, not .gz
zcat $readf | jellyfish count -C -m $kmer_size -s $s -t $threads -o $outputf$readf'.jf' /dev/fd/0
jellyfish histo -t $threads $outputf$readf'.jf' > $readf'.histo'
# run genomescope and put the output in a new directory
id=`echo "$readf" | cut -d'.' -f1`
genomescope.R $readf'.histo' $kmer_size $read_length $outputf$id $kmer_max verbose
done
# clean up big files
rm *.fastq.gz
rm *.jf