forked from ythuang0522/homopolish
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalign2df.py
executable file
·130 lines (114 loc) · 3.65 KB
/
align2df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import numpy as np
import pandas as pd
from Bio import SeqIO
def todf(draft, db_np, path, truth_np=None):
nuc_dict = {0:'A',1:'T',2:'C',3:'G'}
record = SeqIO.read(draft, "fasta")
genome_size = len(record)
db_arr = np.load(db_np)
arr, coverage, ins = db_arr['arr'], db_arr['coverage'], db_arr['ins']
if truth_np: # include label for output
truth_arr = np.load(truth_np)
truth_arr, truth_coverage, truth_ins = truth_arr['arr'], truth_arr['coverage'], truth_arr['ins']
label = []
position = []
draft = []
A, T, C , G = [], [], [], []
gap = []
ins_A, ins_T, ins_C, ins_G = [], [], [], []
homo = []
cov = []
insertion = np.where(ins != 0)[0]
for i in np.unique(insertion):
i = int(i)
for j in range(4):
count = 0
if ins[i][j].any() != 0:
position.append(i)
draft.append('-')
A.append(0)
T.append(0)
C.append(0)
G.append(0)
gap.append(0)
ins_A.append(ins[i][j][0])
ins_T.append(ins[i][j][1])
ins_C.append(ins[i][j][2])
ins_G.append(ins[i][j][3])
if coverage[i] != 0:
cov.append(coverage[i])
else:
cov.append(coverage[i+1])
index = np.argmax(ins[i][j])
temp = i
while i + 1 < genome_size and record[i+1] == nuc_dict.get(index):
count += 1
i += 1
i = temp
while record[i] and record[i] == nuc_dict.get(index):
count += 1
i -= 1
i = temp
homo.append(count)
if truth_np:
if not truth_ins[i][j].any(): #truth: no insertion
label.append(6)
else:
for k in range(4):
if truth_ins[i][j][k] != 0: # truth : insertion A/T/C/G
label.append(k)
deletion = np.where(arr[:,4]!=0)[0]
for i in deletion:
i = int(i)
count = 0
position.append(i)
draft.append(record[i])
A.append(arr[i][0])
T.append(arr[i][1])
C.append(arr[i][2])
G.append(arr[i][3])
gap.append(arr[i][4])
ins_A.append(0)
ins_T.append(0)
ins_C.append(0)
ins_G.append(0)
cov.append(coverage[i])
index = i
nuc = record[i]
while i + 1 < genome_size and record[i+1] == nuc:
count += 1
i += 1
i = index
while record[i] and record[i] == nuc:
count += 1
i -= 1
i = index
homo.append(count)
if truth_np:
if truth_arr[i][4] > 0:
label.append(4)
else:
label.append(5)
df_dict = {"position": position,
"draft": draft,
"A": A,
"T": T,
"C": C,
"G": G,
"gap": gap,
"Ins_A": ins_A,
"Ins_T": ins_T,
"Ins_C": ins_C,
"Ins_G": ins_G,
"coverage": cov,
"homopolymer": homo
}
if truth_np:
df_dict.update({'label': label})
df_path = path + '/{}.feather'.format(record.id)
df = pd.DataFrame(df_dict)
if df.empty:
print('DataFrame is empty!')
return False
df.to_feather(df_path)
return df_path