forked from ethanagb/pblibsim
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cleanfasta.py
189 lines (179 loc) · 6.39 KB
/
cleanfasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
from __future__ import division
import numpy as np
#Pull in list of base file names. Could implement as a flag at some point.
with open('namelist','r') as infile:
lines = infile.readlines()
names =[str(e.strip()) for e in lines]
infile.close()
with open('chrdist.td','w+') as outfile2:
for chrom in names:
#Strip header lines from fasta for processing
with open(str(chrom) + '.fa','r') as infile, open(str(chrom) + '.noheader.fa','w+') as outfile:
for i, line in enumerate(infile):
if i >= 0:
if not line.startswith('>'):
outfile.write(line)
infile.close()
outfile.close()
#Remove any newline chars, remove undefined nts, make all nts uppercase for processing.
with open(str(chrom) + '.noheader.fa','r') as infile, \
open(str(chrom) + '.clean.fa','w+') as outfile:
lines = infile.readlines()
x = map(str.strip,lines)
seq = ''
for line in x:
y = str(line)
z = y.upper()
w = z.replace('N','')
seq += w
outfile.write(seq)
outfile2.write(str(chrom) + '\t' + str(len(seq)) + '\n')
infile.close()
outfile.close()
outfile2.close()
#Calculate probability distribution of chromosomes.
with open('chrdist.td','r') as infile:
lengths = []
names = []
for x in infile:
length = x.split('\t')[1]
name = x.split('\t')[0]
lengths.append(int(length))
names.append(str(name))
infile.close()
d = dict(zip(names,lengths))
total=0
for n in lengths:
total += n
keys = []
probs = []
with open('chrdists.frac.td','w+') as outfile:
for key in d:
keys.append(key)
frac = float(d[key])/total
probs.append(frac)
outfile.write(str(key) + '\t' + str(frac) + '\n')
"""from numpy import random
#e = dict(zip(keys,probs))
counter = 0
while counter < 1000:
selected_chrom = random.choice(keys,p=probs)
print selected_chrom"""
# calculate mean reads required to achieve coverage (will eventually be specified parameter). Read lengths will eventually need to be modified to fit distribution.
required_reads = []
keys = []
desired_cov = 8
mean = 4076
std = 3076
sigma = (np.log(1+(float(mean)/(float(std))**2)))**0.5
mu = np.log(mean)-0.5*sigma**2
read_length = np.random.lognormal(mu,sigma)
for key in d:
req_reads = (desired_cov*d[key])/mean
required_reads.append(req_reads)
keys.append(key)
#print read_length, req_reads
req_read_dict = dict(zip(keys,required_reads))
#build list of read lengths
read_counts = {}
for key in req_read_dict:
n=[]
counter = 0
while counter < req_read_dict[key]:
n.append(int(round(np.random.lognormal(mu,sigma))))
counter += 1
read_counts[key] = n
#Build dictionary with simulated reads by chr, creates bed file for validation of coverage
sim_reads_dict = {}
for key in read_counts:
seq = ""
simreads = []
with open(str(key) + '.clean.fa', 'r') as infile:
for x in infile:
seq += str(x)
infile.close()
with open(str(key) + '_simreads.bed','w+') as bedout:
for length in read_counts[key]:
buf = length/2
x = np.random.randint(buf,d[key]-buf)
low = int(x-buf)
hi = int(x+buf)
bedout.write(str(low) + '\t' + str(hi) + '\n')
simread = seq[low:hi+1]
simreads.append(simread)
bedout.close()
seq = None
sim_reads_dict[key] = simreads
read_counts = None
#write outfiles of simulatied reads.
for key in sim_reads_dict:
with open(str(key)+'_simreads.fa','w+') as fastaout:
for i in xrange(0,len(sim_reads_dict[key])):
fastaout.write('>' + str(key)+'_'+str(i) + '\n' + str(sim_reads_dict[key][i]) + '\n')
fastaout.close()
sim_reads_dict = None
with open('terminal_dimer_dists.td','w+') as finalout:
finalout.write('Chr_Name' + '\t' + 'AA' + '\t'+ 'AT'+ '\t'+ 'AC'+ '\t'+ 'AG' + '\t'+ 'TA'+ '\t'+ 'TT' + '\t'+ 'TC' + '\t' + 'TG' + '\t'+'GA' + '\t'+ 'GT' + '\t'+ 'GG' + '\t'+ 'GC'+ '\t'+ 'CA' + '\t'+ 'CT' + '\t'+ 'CG' + '\t'+ 'CC' + '\t'+ 'Total_dimers' + '\n')
for key in d:
n=[]
AA = 0
AT = 0
AC = 0
AG = 0
TA = 0
TT = 0
TC = 0
TG = 0
CA = 0
CT = 0
CG = 0
CC = 0
GA = 0
GT = 0
GG = 0
GC = 0
with open(str(key) + '_simreads.fa', 'r') as fastain:
lines = fastain.readlines()
for l in lines:
if not l.startswith('>'):
n.append(l)
for i in xrange(0,len(n)-1):
x = str(n[i])
y=x.strip()
termdimer = y[-2:]
if termdimer == 'AA' :
AA += 1
elif termdimer == 'AT':
AG += 1
elif termdimer == 'AG':
AT += 1
elif termdimer == 'AC':
AC += 1
elif termdimer == 'TA':
TA += 1
elif termdimer == 'TT':
TT += 1
elif termdimer == 'TC':
TC += 1
elif termdimer == 'TG':
TG += 1
elif termdimer == 'CA':
CA += 1
elif termdimer == 'CT':
CT += 1
elif termdimer == 'CG':
CG += 1
elif termdimer == 'CC':
CC += 1
elif termdimer == 'GA':
GA += 1
elif termdimer == 'GT' :
GT += 1
elif termdimer == 'GC':
GC += 1
elif termdimer == 'GG':
GG += 1
fastain.close()
total = AA + AT + AC + AG + TA +TT +TC+TG+GA+GT+GG+GC+CA+CT+CG+CC
finalout.write(str(key) + '\t' + str(AA/total) + '\t'+ str(AT/total)+ '\t'+ str(AC/total)+ '\t'+ str(AG/total) + '\t'+ str(TA/total) + '\t'+ str(TT/total) + '\t'+ str(TC/total) + '\t' + str(TG/total) + '\t'+str(GA/total) + '\t'+ str(GT/total) + '\t'+ str(GG/total) + '\t'+ str(GC/total)+ '\t'+ str(CA/total) + '\t'+ str(CT/total) + '\t'+ str(CG/total) + '\t'+ str(CC/total) + '\t'+ str(total) + '\n')
finalout.close()