-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathcombine_ends.py
executable file
·143 lines (121 loc) · 4.43 KB
/
combine_ends.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python3
"""
Given an R1 and an R2 bam file that have been filtered to remove the non-5'
ends of chimeric alignments (with remove_chimeras.py), combine them into a
single bam file.
Loosely based on
https://github.com/ArimaGenomics/mapping_pipeline/blob/master/two_read_bam_combiner.pl
"""
import argparse
import sys
import pysam
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
# cannot use type=lambda because template is needed, which comes
# from the infile
parser.add_argument(
"-o", "--outfile", default="-", nargs="?", help="where to put filtered bam"
)
parser.add_argument(
"-q",
"--mapq",
type=int,
default=20,
help="minimum MAPQ for both ends to keep a pair [20]",
)
parser.add_argument(
"r1_bam",
type=lambda f: pysam.AlignmentFile(f, "rb"),
help="bam file containing R1 alignments",
)
parser.add_argument(
"r2_bam",
type=lambda f: pysam.AlignmentFile(f, "rb"),
help="bam file containing R2 alignments",
)
return parser.parse_args()
def pair_reads(r1, r2):
"""
Given bam entries for two ends of the same read (R1/R2) that have
been aligned as if they were single-end reads, set the fields in the
entries so that they are now properly paired.
Args:
r1, r2 (pysam.AlignedSegment): two ends of the same read, both
aligned and having the same read name
Returns:
r1, r2 (pysam.AlignedSegment): the same two reads that were
provided as arguments, but with the FLAG, RNEXT, PNEXT, and
TLEN fields modified to make them a proper pair
"""
# if the two ends map to the same reference, we need to set RNEXT
# to '=' for both reads and also calculate the TLEN
if r1.reference_name == r2.reference_name:
r1.next_reference_name = "="
r2.next_reference_name = "="
if r1.reference_start < r2.reference_start:
tlen = (
r2.reference_start
+ max(r2.get_reference_positions())
- r1.reference_start
)
r1.template_length = tlen
r2.template_length = -1 * tlen
else:
tlen = (
r1.reference_start
+ max(r1.get_reference_positions())
- r2.reference_start
)
r1.template_length = -1 * tlen
r2.template_length = tlen
else: # ends map to different references, so just set RNEXT
r1.next_reference_name = r2.reference_name
r2.next_reference_name = r1.reference_name
# set PNEXT
r1.next_reference_start = r2.reference_start
r2.next_reference_start = r1.reference_start
# set some bits in the FLAG
r1.is_paired, r2.is_paired = True, True
r1.is_proper_pair, r2.is_proper_pair = True, True
r1.mate_is_unmapped, r2.mate_is_unmapped = False, False
r1.mate_is_reverse = r2.is_reverse
r2.mate_is_reverse = r1.is_reverse
r1.is_read1, r2.is_read1 = True, False
r1.is_read2, r2.is_read2 = False, True
r1.is_secondary, r2.is_secondary = False, False
r1.is_supplementary, r2.is_supplementary = False, False
return r1, r2
def next_or_exit(iterator):
"""
Just like next(iterator), except that if there is nothing more in
the iterator, it runs sys.exit() rather than raising a StopIteration
"""
try:
return next(iterator)
except StopIteration:
sys.exit()
def main():
args = parse_args()
outfile = pysam.AlignmentFile(args.outfile, "wb", template=args.r1_bam)
r1, r2 = next_or_exit(args.r1_bam), next_or_exit(args.r2_bam)
while r1 and r2:
# if r1 and r2 iters are not on the same read, advance the
# one that's on the read with the lexographically lesser
# name until they are
while r1.query_name != r2.query_name:
if r1.query_name < r2.query_name:
r1 = next_or_exit(args.r1_bam)
if r1.query_name > r2.query_name:
r2 = next_or_exit(args.r2_bam)
if (
not r1.is_unmapped
and not r2.is_unmapped
and r1.mapping_quality >= args.mapq
and r2.mapping_quality >= args.mapq
):
r1, r2 = pair_reads(r1, r2)
outfile.write(r1)
outfile.write(r2)
r1, r2 = next_or_exit(args.r1_bam), next_or_exit(args.r2_bam)
if __name__ == "__main__":
main()