forked from helengracehuang/MsCaviar
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathld_prune.py
152 lines (134 loc) · 6.11 KB
/
ld_prune.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# Given a list of MsCAVIAR files for a locus, LD prunes SNPs above a threshold
import argparse, sys
def ld_prune_parseargs(): # handle user arguments
parser = argparse.ArgumentParser(description="Given a list of MsCAVIAR " +
" LD files for a locus, LD prunes SNPs above a threshold.")
parser.add_argument('--infiles', required=True, nargs='+',
help = 'Space-separated list of MsCAVIAR LD files to LD prune.')
parser.add_argument('--output_prefix', required=True,
help = 'Prefix for pruned output files.')
parser.add_argument('--threshold', default=1.0, type=float,
help = 'LD prune SNPs above this threshold of LD with each other.')
#parser.add_argument('--any_study', action='store_true',
# help = 'LD prune SNPs above --threshold in ANY study (default is ALL).')
#parser.add_argument('--any_pick_criteria', default='max_overall',
# choices = ['max_overall', 'max_average', 'max_difference'],
# help = 'For any_study, pick SNP with "max" zscore according to this critera.')
parser.add_argument('--zscore_file_ext', default = '.zscores',
help = 'Filename extension for zscores files.')
parser.add_argument('--ld_file_ext', default = '.ld',
help = 'Filename extension for LD files.')
parser.add_argument('--processed_file_ext', default = 'NONE',
help = 'Use this if you want to include PAINTOR .processed files')
args = parser.parse_args()
return args
# Find all SNP pairs that are above threshold ("match") in any study
def find_snp_match_by_study(all_infiles, threshold):
snp_match_by_study = {} # SNP --> matching SNPs --> studies they match in
for ld_fname in all_infiles:
with(open(ld_fname, 'r')) as infile:
row_num = 0
for line in infile:
# symmetric matrix -- only need to check upper triangle
# store that any SNP pair above threshold matches for this file
all_corr_for_snp = [float(i) for i in line.strip().split()]
for col_num in range(row_num + 1, len(all_corr_for_snp)):
if all_corr_for_snp[col_num] >= threshold:
if row_num not in snp_match_by_study:
snp_match_by_study[row_num] = {}
if col_num not in snp_match_by_study[row_num]:
snp_match_by_study[row_num][col_num] = []
snp_match_by_study[row_num][col_num].append(ld_fname)
row_num += 1
return snp_match_by_study
# Find "clusters" of SNPs above LD threshold with each other in all studies
def make_clusters_all_study(snp_match_by_study, num_studies):
clusters = []
sorted_snp_list = sorted([i for i in snp_match_by_study])
for snp in sorted_snp_list:
this_snp_cluster = [snp]
# find snps that "match" (above LD threshold) in all studies
for match_snp in snp_match_by_study[snp]:
if len(snp_match_by_study[snp][match_snp]) == num_studies:
this_snp_cluster.append(match_snp)
if len(this_snp_cluster) > 1: # other SNPs match this one in all studies
clusters.append(sorted(this_snp_cluster))
return clusters
'''
# Find "clusters" of SNPs above LD threshold with each other in any study
def make_clusters_any_study(snp_match_by_study):
clusters = []
sorted_snp_list = sorted([i for i in snp_match_by_study])
# store all matching SNP pairs as clusters
for snp in sorted_snp_list:
this_snp_cluster = [snp]
for match_snp in snp_match_by_study[snp]:
this_snp_cluster.append(match_snp)
clusters.append(sorted(this_snp_cluster))
# now merge clusters that overlap, since SNPs in LD with each other
return clusters
'''
# Pick one SNP to keep from each cluster; mark the rest to be pruned out.
# In this case, we keep the first SNP per cluster, since the choice is arbitrary
def create_prune_list_all_study(clusters):
prune_list = {}
for i in range(len(clusters)):
for j in range(1, len(clusters[i])):
prune_list[clusters[i][j]] = True
return prune_list
# Rewrite MsCAVIAR locus file without the pruned SNPs
def rewrite_locus_file(output_prefix, zscore_fname, prune_list, header=False):
outname = output_prefix + zscore_fname.split('/')[-1]
with(open(zscore_fname, 'r')) as infile:
with(open(outname, 'w')) as outfile:
if header:
outfile.write(infile.readline())
# track which SNP we're on, write if not in list of pruned SNPs
snp_num = 0
for line in infile:
if snp_num not in prune_list:
outfile.write(line)
snp_num += 1
# Rewrite MsCAVIAR LD file without the pruned SNPs
def rewrite_ld_file(output_prefix, ld_fname, prune_list):
outname = output_prefix + ld_fname.split('/')[-1]
with(open(ld_fname, 'r')) as infile:
with(open(outname, 'w')) as outfile:
# track which SNP we're on, write if not in list of pruned SNPs
snp_num = 0
for line in infile:
if snp_num not in prune_list:
splits = line.strip().split(' ')
# exclude SNP columns in prune list as well
out_snps = [splits[i] for i in range(len(splits)) if i not in prune_list]
outfile.write(' '.join(out_snps) + '\n')
snp_num += 1
def ld_prune_main(args = {}):
if dir(args) == {}:
args = ld_prune_parseargs()
if not args.zscore_file_ext.startswith('.'):
args.zscore_file_ext = '.' + args.zscore_file_ext
if not args.ld_file_ext.startswith('.'):
args.ld_file_ext = '.' + args.ld_file_ext
if args.processed_file_ext != 'NONE' and not args.processed_file_ext.startswith('.'):
args.processed_file_ext = '.' + args.processed_file_ext
snp_match_by_study = find_snp_match_by_study(args.infiles, args.threshold)
#if args.any_study:
# clusters = make_clusters_any_study(snp_match_by_study)
# prune_list = create_prune_list_any_study(clusters, args.any_pick_criteria)
#else:
num_studies = len(args.infiles)
clusters = make_clusters_all_study(snp_match_by_study, num_studies)
prune_list = create_prune_list_all_study(clusters)
for ld_fname in args.infiles:
# infer zscore filename from LD filename
zscore_fname = ld_fname.split(args.ld_file_ext)[0] + args.zscore_file_ext
rewrite_locus_file(args.output_prefix, zscore_fname, prune_list)
rewrite_ld_file(args.output_prefix, ld_fname, prune_list)
if not args.processed_file_ext == 'NONE':
proc_fname = ld_fname.split(args.ld_file_ext)[0] + args.processed_file_ext
rewrite_locus_file(args.output_prefix, proc_fname, prune_list, header=True)
if __name__ == '__main__':
args = ld_prune_parseargs()
ld_prune_main(args)
#