Skip to content

Commit

Permalink
Minor bugs and cosmetics
Browse files Browse the repository at this point in the history
  • Loading branch information
cstritt committed Sep 13, 2021
1 parent 92b34da commit 2d43ff3
Show file tree
Hide file tree
Showing 4 changed files with 332 additions and 202 deletions.
23 changes: 12 additions & 11 deletions detettore.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@
FACIENDA:
- check DP and GT assignement for TAPs
- run TIP splitreads only if reads are not paired-end
- write bamstats to vcf (including median coverage), for later filtering vcfs
"""

Expand Down Expand Up @@ -150,8 +154,9 @@ def main():
(len(DR_anchors), len(splitreads)))

# Discordant read pairs: find anchor clusters with mates mapping to TEs
tips.discordant_read_pairs(parameters, DR_anchors)
print(str(len(tips.DR_clusters)) + ' clusters')
if parameters.paired_end:
tips.discordant_read_pairs(parameters, DR_anchors)
print(str(len(tips.DR_clusters)) + ' clusters')

# Same for splitreads
tips.splitreads(parameters, splitreads, split_positions)
Expand All @@ -167,7 +172,7 @@ def main():


#%% TAPs
if "taps" in parameters.modus:
if "taps" in parameters.modus and parameters.paired_end:

print('\n\nSEARCHING TE ABSENCE POLYMORPHISMS')
print('\nExtracting read pairs around annotated TEs ...')
Expand All @@ -180,17 +185,14 @@ def main():

#%% Write VCF, stats, and log file

if "tips" in parameters.modus and "taps" in parameters.modus:

if "tips" in parameters.modus and "taps" in parameters.modus and parameters.paired_end:
combined_vcf = tips_vcf + taps_vcf
combined_vcf = sorted(combined_vcf, key=lambda x: (x[0], int(x[1])))

elif "tips" in parameters.modus:

combined_vcf = tips_vcf

elif "taps" in parameters.modus:

elif "taps" in parameters.modus and parameters.paired_end:
combined_vcf = taps_vcf


Expand Down Expand Up @@ -247,14 +249,13 @@ def main():
'TIPs summary:\n0/1:%i\n1/1:%i\n' % (tips_stats['0/1'], tips_stats['1/1'])
)

if 'taps' in parameters.modus:
if 'taps' in parameters.modus and parameters.paired_end:
print(
'TAPs summary:\n0/0:%i\n0/1:%i\n1/1:%i\n./.:%i\n' % (taps_stats['0/0'],taps_stats['0/1'], taps_stats['1/1'], taps_stats['./.'])
)

# with open(args.outname + '.stats.tsv', 'w') as f:


# Create log file
logging.basicConfig(
filename = args.outname +'.log',
Expand All @@ -263,7 +264,7 @@ def main():
level = logging.DEBUG)

log = logging.getLogger(args.outname +'.log')

log.info('readlength: %i, isize_mean: %i, isize_stdev: %i' % (
parameters.readlength, parameters.isize_mean, parameters.isize_stdev))

Expand Down
227 changes: 111 additions & 116 deletions scripts/bamstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
- insert size mean
- insert size standard deviation
Copyright (C) 2018 C. Stritt
Copyright (C) 2021 C. Stritt
License: GNU General Public License v3. See LICENSE.txt for details.
"""

Expand All @@ -27,16 +27,17 @@ def get_args():
parser = argparse.ArgumentParser(
description='Get basic coverage and insert size statistics for a bam file.')

parser.add_argument("bamfile",
help='bwa mem alignment to reference genome.')
parser.add_argument('bamfile')

args = parser.parse_args()
return args


def local_cov(region, bamfile):
""" Estimates mean coverage and standard deviation in the
region chromosome:start-end. Attencion: indexing in pybam is 0-based """
"""
Estimates mean coverage and standard deviation in the region
chromosome:start-end. Attencion: indexing in pybam is 0-based
"""

chromosome, start, end = region
pybam = pysam.AlignmentFile(bamfile, "rb")
Expand All @@ -52,10 +53,7 @@ def local_cov(region, bamfile):

coverage = []
for i in range(start, end):
try:
c = depth[i]
except KeyError:
c = 0
c = depth[i] if i in depth else 0
coverage.append(c)

if len(coverage) > 1:
Expand All @@ -71,68 +69,16 @@ def local_cov(region, bamfile):
return mean, stdev


def local_cov_filt(region, bamfile, filters):

""" Estimate mean coverage and standard deviation in the region of
interest. Filters for base and mapping quality are applied
"""
chromosome, start, end = region
baseq, mapq = filters

pybam = pysam.AlignmentFile(bamfile, "rb")
cov_dict = {}

crap_reads = set()

for pileupcolumn in pybam.pileup(chromosome, start, end,
**{"truncate": True}):
pos = pileupcolumn.pos
cov_dict[pos] = 0

for pileupread in pileupcolumn.pileups:

read = pileupread.alignment

read_id = read.query_name
if read_id in crap_reads:
continue

if pileupread.is_del or pileupread.is_refskip or\
not read.is_proper_pair or read.mapping_quality < mapq:

crap_reads.add(read_id)
continue

if read.query_qualities[pileupread.query_position] < baseq:
continue

cov_dict[pos] += 1

coverage = []
for i in range(start, end):
try:
c = cov_dict[i]
except KeyError:
c = 0
coverage.append(c)

if len(coverage) > 1:
mean = int(statistics.mean(coverage))
stdev = int(statistics.stdev(coverage))
else:
mean = int(coverage[0])
stdev = 'NA'

return mean, stdev


def core_dist_stats(dist):
"""
Get the core of a distributions and calculate its moments. The core
rather than the full distribution is used in order to mitigate the
influence if coverage and insert size outliers in the bam file.
"""

# Overall mean (for comparison...)
mean_tutto = int(statistics.mean(dist))

# Median absolute deviation
median = statistics.median(dist)
abs_deviations = [fabs(x - median) for x in dist]
Expand All @@ -146,18 +92,16 @@ def core_dist_stats(dist):

mean = int(statistics.mean(core_dist))
stdev = int(statistics.stdev(core_dist))
return mean, stdev

return mean_tutto, mean, stdev


def coverage_stats(bamfile):
"""
Create list with coverage at every single position. If the list is longer than
1e6, use random sample to estimate coverage statistics.
To do: only use first chromosome, traverse bam only once for coverage and isize stats
count_coverage function.
Better: only use 1e6 bp
Only traverses the first chromosome if there are multiple.
"""

Expand All @@ -168,17 +112,21 @@ def coverage_stats(bamfile):
pybam.close()

if len(cov) < 1e6:
stats = core_dist_stats(cov)
mean, mean_cd, stdev_cd = core_dist_stats(cov)
else:
stats = core_dist_stats(random.sample(cov, int(1e6)))
return stats
mean, mean_cd, stdev_cd = core_dist_stats(random.sample(cov, int(1e6)))

return {
'cov_mean' : mean,
'cov_mean_cd' : mean_cd,
'cov_stdev_cd' : stdev_cd
}


def isize_stats(bamfile):
def length_stats(bamfile):
"""
Get insert sizes of properly paired reads, calculate mean and standard
deviation of the core distribution. Using the core distribution instead of
all data mitigates the influence of outliers (source: Piccard toolbox).
deviation of the core distribution.
A second traversal of the bam file is required because here reads are traversed rather
than positions, as in the coverage_stats function.
Expand All @@ -199,57 +147,104 @@ def isize_stats(bamfile):
isizes.append(abs(read.isize))

pybam.close()


# Readlengths
readlengths = [x for x in readlengths if x]

read_d = {
'readlength_max' : max(readlengths),
'readlength_mean' : int(statistics.mean(readlengths)),
'readlength_stdev' : int(statistics.stdev(readlengths))
}

if not isizes:
sys.exit("\nERROR: No properly paired reads, check your bam file!")
return read_d

# Insert sizes
if len(isizes) < 1e6:
stats = core_dist_stats(isizes)
mean, mean_cd, stdev_cd = core_dist_stats(isizes)
else:
stats = core_dist_stats(random.sample(isizes, int(1e6)))

return stats, max([x for x in readlengths if x])


def write_output(bamfile, outfmt):

# cov_mean, cov_stdev = coverage_stats(bamfile)
isize, readlength = isize_stats(bamfile)

outlist = [
'readlength\t' + str(readlength),
#'coverage_mean\t' + str(cov_mean),
#'coverage_stdev\t' + str(cov_stdev),
'isize_mean\t' + str(isize[0]),
'isize_stdev\t' + str(isize[1])
]

if outfmt == 'file':
basename = bamfile.split('/')[-1].split('.')[0]
outfile = basename + '_bamstats.txt'
with open(outfile, 'w') as f:
mean, mean_cd, stdev_cd = core_dist_stats(random.sample(isizes, int(1e6)))

isize_d = {
'isize_mean' : mean,
'isize_mean_cd' : mean_cd,
'isize_stdev_cd' : stdev_cd
}

return read_d, isize_d

for line in outlist:
print(line)
f.write(line + '\n')
return outfile

elif outfmt == 'dict':
def return_dict(bamfile):
"""
Returns dictionary used by detettore. Coverage stats disabled, as
they require second traversal of bam file.
"""

length_statistiks = length_stats(bamfile)

# No paired end reads
if isinstance(length_statistiks, dict):

return {
'readlength' : length_statistiks['readlength_max']
}

else:
read_d, isize_d = length_statistiks

return {
'readlength' : readlength,
#'coverage_mean' : cov_mean,
#'coverage_stdev' : cov_stdev,
'isize_mean' : isize[0],
'isize_stdev' : isize[1]
'readlength' : read_d['readlength_mean'],
'isize_mean' : isize_d['isize_mean_cd'],
'isize_stdev' : isize_d['isize_stdev_cd']
}


def main():

"""
Write to stdout:
cov_mean, cov_mean_cd, cov_stdev_cd, readlength, PE/SE, isize_mean, isize_stdev
"""

args = get_args()
bamfile = args.bamfile
write_output(bamfile)


cov_d = coverage_stats(args.bamfile)
length_statistiks = length_stats(args.bamfile)

if isinstance(length_statistiks, dict):
rtype = 'SE'
read_d = length_statistiks
isize_d = {
'isize_mean_cd' : 'NA',
'isize_stdev_cd' : 'NA'
}

else:
rtype = 'PE'
read_d, isize_d = length_statistiks

outline = map( str, [
cov_d['cov_mean'],
cov_d['cov_mean_cd'],
cov_d['cov_stdev_cd'],
read_d['readlength_max'],
read_d['readlength_mean'],
read_d['readlength_stdev'],
rtype,
isize_d['isize_mean_cd'],
isize_d['isize_stdev_cd']
] )


sys.stdout.write('\t'.join([
'cov_mean', 'cov_mean_cd', 'cov_stdev_cd', 'readlength_max',
'readlength_mean', 'readlength_stdev', 'read_type', 'isize_mean_cd', 'isize_stdev_cd']) +'\n')
sys.stdout.write('\t'.join(outline)+'\n')


if __name__ == '__main__':
main()

Loading

0 comments on commit 2d43ff3

Please sign in to comment.