-
Notifications
You must be signed in to change notification settings - Fork 2
/
alignment.py
236 lines (225 loc) · 9.16 KB
/
alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
"""
Alignment and preprocessing via Drop-Seq Tools 2.4.0 and Bowtie2 version 2.4.1
Requires Samtools and pysam.
Note: `<basename>` is inputted using the `--basename BASENAME` argument.
0. Input Data -> Unmapped BAM, starting from any of these substeps depending on the input:
a. Raw BAM (`<basename>`.bam)
b. Single-end FASTQ (`<basename>`.fastq)
c. Pair-end FASTQs (`<basename>`_R1.fastq,`<basename>`_TCR.fastq)
1. Unmapped BAM (`<basename>`_unmapped.bam) -> aligned and tagged BAM
a. Tag cell barcodes (`<basename>_celltagged.bam`)
b. Tag molecular barcodes (`<basename>_idtagged.bam`)
c. Filter out reads with low quality cell/molecular barcodes (`<basename>`_filtered.bam)
d. Trim 5’ primer sequence (`<basename>`_adaptertrimmed.bam)
e. Trim 3’ polyA sequence (`<basename>`_trimmed.bam)
f. SAM -> Fastq (`<basename>`_trimmed.fastq)
g. Bowtie2 alignment (`<basename>`_aligned.sam)
h. Sort STAR alignment in queryname order (`<basename>`_alignedsorted.bam)
i. Merge STAR alignment tagged SAM to recover cell/molecular barcodes (`<basename>`_merged.bam)
j. Add gene/exon and other annotation tags (`<basename>`_exontagged.bam)
k. Barcode Repair
i. Repair substitution errors (`<basename>_synthrepaired.bam`)
ii. Repair indel errors (`<basename>`_repaired.bam)
iii. Sort by coordinates (`<basename>`_repairedsorted.bam)
"""
import tcrgo.dropseq_tools as ds
import tcrgo.io as io
import argparse
from textwrap import dedent, indent
import os
import pysam
from tcrgo import Log
log = Log(name=__name__)
def main(args):
log.init(args.verbosity)
log.info("Verifying that Drop-Seq Tools, Picard, and Bowtie2 are all callable...")
log.info("Testing file version 2")
ds.test_tools(args.dropseq, args.picard, args.aligner)
if not os.path.exists(args.output_path):
os.makedirs(args.output_path)
if args.basename is None:
args.basename = os.path.basename(args.data[0].split('.')[0])
# Files are produced in this order
basename = os.path.join(args.output_path, args.basename)
fastq_singleend = basename + ".fastq"
fastq_barcode = basename + "_R1_out.fastq"
fastq_biological = basename + "_TCR_out.fastq"
bam_unmapped = basename + "_unmapped.bam"
bam_idtagged = basename + "_idtagged.bam"
bam_filtered = basename + "_filtered.bam"
bam_trimmed = basename + "_trimmed.bam"
fastq_trimmed = basename + "_trimmed.fastq"
sam_aligned = basename + "_aligned.sam"
bam_alignedsorted = basename + "_alignedsorted.bam"
bam_merged = basename +"_merged.bam"
bam_exontagged = basename + "_exontagged.bam"
bam_synthrepaired = basename + "_synthrepaired.bam"
bam_repaired = basename + "_repaired.bam"
bam_repairedsorted = basename + "_repairedsorted.bam"
log.info("Checking sequence data and FASTA arguments.")
if not os.path.isfile(args.fasta):
log.error("Please enter a valid path to your FASTA file!")
is_bam = io.is_bam(args.data)
is_fastq_singleend = len(args.data) < 2
if is_bam:
bam_singleend = args.data[0]
elif is_fastq_singleend:
fastq_singleend = args.data[0]
else:
fastq_barcode_in, fastq_biological_in = args.data
if is_bam and not os.path.isfile(fastq_singleend):
log.info("Converting raw BAM to single-end FASTQ...")
ds.bam2fq(bam_singleend, fastq_singleend)
elif is_fastq_singleend and not os.path.isfile(fastq_barcode) and not os.path.isfile(fastq_biological):
log.info("Transforming Read1-index-1 FASTQ to Read1 and Read2 FASTQs")
ds.transform_read_data(fastq_singleend, args.basename, fastq_barcode, fastq_biological, args.output_path)
elif not is_fastq_singleend and not os.path.isfile(fastq_barcode) and not os.path.isfile(fastq_biological) and not args.not_reverse_complement:
log.info("Transforming TCR FASTQ to its reverse complement")
ds.rev_comp_fastqs(fastq_barcode_in, fastq_biological_in,args.basename, fastq_barcode, fastq_biological, args.output_path)
fastq_barcode = fastq_barcode_in
if args.not_reverse_complement:
fastq_barcode = fastq_barcode_in
fastq_biological=fastq_biological_in
if not os.path.isfile(bam_unmapped):
log.info("Converting R1 and R2 FASTQs to a single-end BAM")
ds.fastq_to_bam(args.picard, fastq_barcode, fastq_biological, bam_unmapped, args.basename)
if not os.path.isfile(bam_idtagged):
log.info("Tagging BAM with cell barcode and UMI sequences...")
ds.tag_identifiers_bam(args.dropseq, bam_unmapped, bam_idtagged)
log.info("Filter barcode option = "+str(args.filter_barcodes))
if args.filter_barcodes:
if not os.path.isfile(bam_filtered):
log.info("Filtering reads with low quality bases in cell barcode and UMI.")
ds.filter_bam(args.dropseq, bam_idtagged, bam_filtered)
if not os.path.isfile(bam_trimmed):
log.info("Trimming adapter and poly A sequences...")
#ds.trim_bam(args.dropseq, bam_idtagged, bam_trimmed)
ds.trim_bam(args.dropseq, bam_filtered, bam_trimmed)
else:
if not os.path.isfile(bam_trimmed):
log.info("Trimming adapter and poly A sequences...")
ds.trim_bam(args.dropseq, bam_idtagged, bam_trimmed)
#ds.trim_bam(args.dropseq, bam_filtered, bam_trimmed)
if not os.path.isfile(sam_aligned):
log.info(f"Aligning BAM using {args.aligner}")
if args.aligner == "bowtie2":
ds.bowtie2(bam_trimmed, args.fasta, sam_aligned)
elif args.aligner == "star":
if not os.path.isfile(fastq_trimmed):
ds.bam2fq(bam_trimmed, fastq_trimmed)
ds.star(fastq_trimmed, args.fasta, basename, sam_aligned)
if not os.path.isfile(bam_alignedsorted):
log.info("Sorting aligned SAM by queryname, outputting as BAM.")
ds.sort_sam(args.picard, sam_aligned, bam_alignedsorted)
if not os.path.isfile(bam_merged):
log.info("Creating reference dictionary from FASTA for BAM merging.")
ds.create_sequence_dictionary(args.picard, args.fasta)
log.info("Merging the aligned BAM with the unmapped BAM to recover info for repair.")
ds.merge_bam(args.picard, args.fasta, bam_trimmed, bam_alignedsorted, bam_merged)
if not os.path.isfile(bam_exontagged):
#log.info("Faux-tagging the merged BAM with dummy exon info.")
log.info("Tagging the merged BAM with TRA/TRB exon info.")
ds.fauxtag_exons(bam_merged, bam_exontagged)
if not os.path.isfile(bam_repaired):
log.info("Repairing insertion-deletion errors in the cell barcode sequences.")
#ds.repair_bam(args.dropseq, bam_exontagged, bam_repaired, min_umis_per_cell=1)
ds.bead_synthesis_errors(args.dropseq, bam_exontagged, bam_synthrepaired, min_umis_per_cell=1)
log.info("Repairing substitution errors in the cell barcode sequences and collapsing barcodes. New filter params")
ds.bead_substitution_errors(args.dropseq, bam_synthrepaired, bam_repaired, min_umis_per_cell=1)
log.info("Sorting bam by coordinate, outputting final BAM.")
pysam.sort("-o", bam_repairedsorted, bam_repaired)
log.success("DONE")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog="python -m alignment", # Would be good to make a command-line alias and set that here
usage="",
formatter_class=argparse.RawDescriptionHelpFormatter,
description=dedent(
f"""
+{'='*78}+
|{'{:^78}'.format('SEQ-WELL TCR RECOVERY - TCRGO?')}|
+{'='*78}+
{indent(__doc__, 3*' ')}
"""
),
epilog=dedent(
"""
Developed by James Gatter & Sarah Nyquist of Shalek Lab (2020).
Original Matlab pipeline written by Andy Tu and Tod Gierhann of Love Lab (2019).
"""
)
)
# REQUIRED ARGUMENTS
parser.add_argument(
"data",
type=str,
nargs='+',
metavar="<BAM|FASTQ or FASTQ_R1 FASTQ_Index-1>",
help="Path to single-end BAM/FASTQ or paths (delim. by comma) Read1-index-1 FASTQs"
)
parser.add_argument(
'-d', "--dropseq",
type=str,
help="Path to the Drop-Seq Tools jar file. Recommended you use a version 2.3.0 or 2.4.0."
)
parser.add_argument(
'-p', "--picard",
type=str,
help=
"Path to the Picard jar file. Recommended you use a version of Picard that is "
"bundled with/compatible with your version of Drop-Seq Tools."
)
parser.add_argument(
'-f', "--fasta",
type=str,
required=True,
help="Path to reference FASTA."
)
# OPTIONAL ARGUMENTS
parser.add_argument(
'-v', "--verbosity",
type=str,
default="INFO",
choices=["INFO", "VERBOSE", "WARNING", "SUCCESS", "ERROR"],
help="The verbosity of the log. (default: %(default)s)."
)
parser.add_argument(
'-b', "--basename",
type=str,
default=None,
help=
"The basename to affix to all output files. If not specified, "
"Python's os module will be used to ascertain."
)
parser.add_argument(
'-o', "--output-path",
type=str,
default="./out/alignment/",
help="The path to which the files from this program will output. (default: %(default)s )."
)
parser.add_argument(
'-a', "--aligner",
type=str,
default="bowtie2",
choices=["bowtie2", "star"],
required=False,
help=
"The aligner the program will choose for aligning the sequence data "
"against the reference FASTA. (default: %(default)s )."
)
parser.add_argument(
'-t', "--filter-barcodes",
action="store_true",
required=False,
help=
"Determines if the filter barcode based on quality step should be run. "
)
parser.add_argument(
'-r', "--not-reverse-complement",
action="store_true",
required=False,
help=
"Determines if the reverse complement of the input fastqs is taken prior to alignment or not. Default is to take reverse complement. "
)
args = parser.parse_args()
main(args)