forked from ythuang0522/homopolish
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
59 lines (51 loc) · 2.24 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
def preprocessing(df):
new_draft = []
df['A'] = df['A'].astype(int) / df['coverage'].astype(int)
df['T'] = df['T'].astype(int) / df['coverage'].astype(int)
df['C'] = df['C'].astype(int) / df['coverage'].astype(int)
df['G'] = df['G'].astype(int) / df['coverage'].astype(int)
df['gap'] = df['gap'].astype(int) / df['coverage'].astype(int)
df['Ins_A'] = df['Ins_A'].astype(int) / df['coverage'].astype(int)
df['Ins_T'] = df['Ins_T'].astype(int) / df['coverage'].astype(int)
df['Ins_C'] = df['Ins_C'].astype(int) / df['coverage'].astype(int)
df['Ins_G'] = df['Ins_G'].astype(int) / df['coverage'].astype(int)
df.loc[df['homopolymer'] > 10, 'homopolymer'] = 10
for i in range(0,11):
if i not in df.homopolymer.value_counts():
name = 'homopolymer_' + str(i)
df[name] = 0
df = pd.DataFrame(df.drop(['draft'], axis=1))
df = pd.get_dummies(df, columns=['homopolymer'])
df = pd.DataFrame(df.drop(['position'], axis=1))
scaler = MinMaxScaler()
df['coverage'] = scaler.fit_transform(df[['coverage']])
return df
def haplotype(df):
haplotype = np.zeros((df.shape[0],), dtype=int)
ins_pos = df[df.draft == "-"].position.values
side = 10
dis = np.diff(ins_pos)
idxs = np.where((dis < 10) & (dis > 1))[0]
for i in idxs:
head = df[df['position'] == ins_pos[i]]
head_total = head[['Ins_A','Ins_T','Ins_C','Ins_G']].sum(axis=1).values[0]
tail = df[df['position'] == ins_pos[i+1]]
tail_total = tail[['Ins_A','Ins_T','Ins_C','Ins_G']].sum(axis=1).values[0]
head_idx=head.index.values[0]
tail_idx=tail.index.values[0]
coverage = head.coverage.values[0]
if (head_total + tail_total) == coverage:
if head_total > tail_total:
haplotype[head_idx] = 1
haplotype[tail_idx] = 2
elif head_total < tail_total:
haplotype[head_idx] = 2
haplotype[tail_idx] = 1
else:
haplotype[head_idx] = 1
haplotype[tail_idx] = 1
df['haplotype'] = haplotype
return df