forked from phbradley/tcr-dist
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfile_converter.py
389 lines (331 loc) · 14.1 KB
/
file_converter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
##
##
##
from basic import *
import util
import copy
import sys
from amino_acids import amino_acids
#import matplotlib
#if make_png: matplotlib.use('Agg')
#import matplotlib.pyplot as plt
#import numpy as np
with Parser(locals()) as p:
#p.str('args').unspecified_default().multiple().required()
p.str('organism').required()
p.str('input_file').required()
p.str('output_file').required()
p.str('input_format').required().described_as('Should be one of: parsed_seqs -OR- clones -OR cdrblast')
p.str('output_format').required().described_as('Should be one of: parsed_seqs -OR- clones')
p.str('single_chain').described_as( 'Only generate info for a single chain; should be one of: alpha -OR- beta')
p.str('epitope').described_as( 'Use this value for the epitope field')
p.str('subject').described_as( 'Use this value for the subject field')
p.flag('auto_ids').described_as('Auto-generate numbered TCR ids')
p.str('id_base').described_as('If using --auto_ids, you can specify a base name for the IDs.')
p.flag('clobber').shorthand('c')
p.flag('dont_check_genes')
p.flag('check_genes')
p.multiword('extra_fields').cast(lambda x:x.split())
p.set_help_prefix("""
This script is for converting old TCR-dist formats to the new format and for converted MIGEC output to an input format suitable for TCR-dist.
Usage examples:
python ../file_converter.py --input_format cdrblast --output_format clones --input_file MIGEC_d2_SH2_3.filtered.cdrblast.txt --output_file tmp.d2.tsv --organism human -c --auto_ids --epitope ring1 --subject frodo --single_chain beta
python ../file_converter.py --input_format parsed_seqs --output_format parsed_seqs --input_file JCC42_EpiMouse_AsEpi.tsv --output_file tmp.tsv --organism mouse -c
""")
check_genes = ( check_genes or not dont_check_genes ) # now make check_genes the default
warnings = set()
prob_warning = '[WARNING] Setting a value of 1.0 for missing TCR probabilities'
clone_size_warning = '[WARNING] Setting a default clone_size of 1 for lines without clone_size info'
cdrblast_clone_size_warning = '[WARNING] Using the cdrblast "Count" field as the TCR clone_size value'
rep_warning = '[WARNING] Blast results do not seem to be specific to allele'
if exists( output_file ) and not clobber:
print output_file,'already exists, use --clobber or remove it'
exit(1)
if check_genes:
from all_genes import all_genes
## these are the fields we are going to include in the output file
##
required_fields_parsed_seqs_file = """
id epitope subject
va_gene va_genes va_rep va_reps va_countreps
ja_gene ja_genes ja_rep ja_reps ja_countreps
vb_gene vb_genes vb_rep vb_reps vb_countreps
jb_gene jb_genes jb_rep jb_reps jb_countreps
cdr3a cdr3a_nucseq cdr3a_quals
cdr3b cdr3b_nucseq cdr3b_quals
""".split()
required_fields_clones_file = """
clone_id epitope subject
clone_size
va_gene va_genes va_rep va_reps va_countreps
ja_gene ja_genes ja_rep ja_reps ja_countreps
vb_gene vb_genes vb_rep vb_reps vb_countreps
jb_gene jb_genes jb_rep jb_reps jb_countreps
cdr3a cdr3a_nucseq
cdr3b cdr3b_nucseq
a_protseq_prob
cdr3a_protseq_prob
va_rep_prob
ja_rep_prob
a_nucseq_prob
b_protseq_prob
cdr3b_protseq_prob
vb_rep_prob
jb_rep_prob
b_nucseq_prob
""".split()
## not doing 'members' field
## these are the fields we are going to include in the output file
## now stored in a dictionary by file-type
all_required_fields = {
'parsed_seqs': required_fields_parsed_seqs_file,
'clones': required_fields_clones_file,
}
assert output_format in all_required_fields
outfields = all_required_fields[ output_format ]
if extra_fields:
outfields += extra_fields
if single_chain:
assert single_chain in ['alpha','beta']
# for pos in reversed( range( len(outfields ) ) ):
# field = outfields[pos]
# if single_chain=='alpha' and ( field[:3] in ['vb_','jb_'] or field.startswith('cdr3b') ):
# del outfields[pos]
# elif single_chain== 'beta' and ( field[:3] in ['va_','ja_'] or field.startswith('cdr3a') ):
# del outfields[pos]
listsep = ';' # for lists within fields
def remove_wonky_characters_from_the_end_of_line( inline ):
outline = inline[:]
while not ( outline[-1] == '\t' or outline[-1].split() ):
outline = outline[:-1]
return outline
# def exit_from_reconstruct_field_from_data( field, l ):
# exit(1)
def reconstruct_field_from_data( field, l, organism ):
try:
if field in l:
return l[ field ]
if field == 'subject':
if 'mouse' in l:
return l['mouse']
elif field.endswith('_prob'):
if prob_warning not in warnings:
print prob_warning
warnings.add( prob_warning )
return 1.0
elif field == 'clone_size':
if clone_size_warning not in warnings:
print clone_size_warning
warnings.add( clone_size_warning )
return 1
elif field[:3] in ['va_','ja_','vb_','jb_']: ## these are all pretty similar
prefix = field[:3]
tag = field[3:]
#print 'prefix:',prefix,tag,l.keys()
if tag == 'gene':
# only one place to get this
genes_field = prefix+'genes'
if genes_field not in l:
return None
genes = l[ genes_field ].split( listsep )
#print 'genes:',genes
return sorted( genes )[0]
elif tag == 'genes':
if prefix+'blast_hits' in l:
hits = l[ prefix+'blast_hits' ]
if hits == '-': #failed to find any hits
return None
else:
return listsep.join( sorted( util.get_top_genes( hits ) ) )
elif prefix+'gene' in l: ## just take the one gene we know
return l[ prefix+'gene' ]
elif tag == 'rep':
if prefix+'gene' in l:
if "*" in l[prefix + 'gene']:
return util.get_rep( l[ prefix+'gene' ], organism )
else:
if rep_warning not in warnings:
print rep_warning
warnings.add( rep_warning )
return l[prefix + 'gene']
elif tag == 'reps':
## we should already have hit 'genes' in the list of fields we are trying to fill !!!
if "*" in l[prefix + 'gene']:
return listsep.join( sorted( ( util.get_rep(x,organism) for x in l[prefix+'genes'].split(listsep) ) ) )
else:
if rep_warning not in warnings:
print rep_warning
warnings.add( rep_warning )
return l[prefix + 'gene']
elif tag == 'countreps':
if "*" in l[prefix + 'gene']:
return listsep.join( sorted( util.countreps_from_genes( l[prefix+'genes'].split(listsep), organism ) ) )
else:
if rep_warning not in warnings:
print rep_warning
warnings.add( rep_warning )
return l[prefix + 'gene']
elif field.endswith('_quals'):
seqfield = field[:5]+'_nucseq'
if seqfield not in l:
return None
return '.'.join( ['60']*len(l[seqfield] ) )
except Exception as inst: ## this is not the best way to handle it...
print 'Hit an exception trying to get field {} from line'.format(field),inst
print 'Failed to reconstruct {} from the input fields: {}'\
.format( field, ' '.join( sorted( l.keys() ) ) )
return None
def map_field_names( l, input_format, output_format ):
outl = copy.deepcopy( l )
if input_format == 'cdrblast':
## the problem is that this line could be either alpha or beta; they can occur in the same file
if 'V segments' in l.keys():
v_genestring = l['V segments']
j_genestring = l['J segments']
cdr3aa = l['CDR3 amino acid sequence']
cdr3nt = l['CDR3 nucleotide sequence']
clonecount = l['Count']
elif "cdr3nt" in l.keys():
v_genestring = l['v']
j_genestring = l['j']
cdr3aa = l['cdr3aa']
cdr3nt = l['cdr3nt']
clonecount = l['count']
else:
print "Bad format detected."
sys.exit(1)
outl = {} # ditch the old info
if 'TRA' in v_genestring or 'TRA' in j_genestring:
outl['va_genes'] = listsep.join( v_genestring.split(',') )
outl['ja_genes'] = listsep.join( j_genestring.split(',') )
outl['cdr3a'] = cdr3aa
outl['cdr3a_nucseq'] = cdr3nt
if single_chain=='alpha':
if organism == "mouse":
outl['vb_genes'] = "TRBV19*01"
outl['jb_genes'] = "TRBJ1-4*02"
outl['cdr3b'] = "CASSMGANERLFF"
outl['cdr3b_nucseq'] = "tgtgccagcagtatgggggccaacgaaagattatttttc"
else:
print "error: need to add organism info in script"
sys.exit()
# outl['cdr3a'] = l['CDR3 amino acid sequence']
# outl['cdr3a_nucseq'] = l['CDR3 nucleotide sequence']
elif 'TRB' in v_genestring or 'TRB' in j_genestring:
outl['vb_genes'] = listsep.join( v_genestring.split(',') )
outl['jb_genes'] = listsep.join( j_genestring.split(',') )
outl['cdr3b'] = cdr3aa
outl['cdr3b_nucseq'] = cdr3nt
if single_chain == 'beta':
if organism == "mouse":
outl['va_genes'] = "TRAV8D-1*02"
outl['ja_genes'] = "TRAJ33*01"
outl['cdr3a'] = "CATDMDSNYQLIW"
outl['cdr3a_nucseq'] = "tgtgctactgacatggatagcaactatcagttgatctgg"
#outl['cdr3b'] = l['CDR3 amino acid sequence']
#outl['cdr3b_nucseq'] = l['CDR3 nucleotide sequence']
else:
print 'cant determine whether this cdrblast line is alpha or beta:',l
return None ## NOTE EARLY RETURN
outl['clone_size'] = clonecount
if cdrblast_clone_size_warning not in warnings:
print cdrblast_clone_size_warning
warnings.add( cdrblast_clone_size_warning )
return outl
infields = []
print 'making:',output_file
out = open( output_file,'w')
out.write( '\t'.join( outfields ) + '\n' )
idcounter=0
for inline in open( input_file,'rU'):
line = remove_wonky_characters_from_the_end_of_line( inline )
if not infields:
if line[0] == '#':
line = line[1:]
infields = line.split('\t')
assert infields
if extra_fields:
for f in extra_fields:
assert f in infields
## we may want to add some fields to outfields
else:
l = parse_tsv_line( line, infields )
l = map_field_names( l, input_format, output_format )
if l is None: continue
if auto_ids:
if not id_base:
id_base = 'id'
idcounter += 1
if output_format == "clones":
l['id'] = '{}{}{}'.format(id_base,idcounter,".clone")
else:
l['id'] = '{}{}'.format(id_base,idcounter)
if epitope: ## danger if we are using this cmdline option as a variable...
l['epitope'] = epitope
if subject: ## somewhere else in the script...
l['subject'] = subject
badline = False
for field in outfields:
if field == "clone_id":
if 'clone_id' not in l:
l['clone_id'] = l['id']
val = reconstruct_field_from_data( field, l, organism )
if val is None:
if single_chain:
if single_chain=='alpha' and ( field[:3] in ['vb_','jb_'] or field.startswith('cdr3b') ):
val = "-"
l[ field ] = val
break
elif single_chain== 'beta' and ( field[:3] in ['va_','ja_'] or field.startswith('cdr3a') ):
val = "-"
l[ field ] = val
break
print 'ERROR bad line: field= {} failed, line= {}'.format( field, line[:-1] )
badline = True
break
if field in ["cdr3a", "cdr3b"]:
if len(l[field]) <= 5:
print 'bad line: length of ' + field + " is only " + str(len(l[field]))
badline = True
break
for aa in l[field]:
if aa not in amino_acids:
print 'bad cdr3 amino acid:',aa,l[field]
badline = True
break
if badline:
break
l[ field ] = val
if badline:
print 'badline:',inline[:-1]
continue
outl = {}
for field in outfields:
if not field in l.keys():
if "prob" in field:
outl[field] = 1.0
if prob_warning not in warnings:
print prob_warning
warnings.add( prob_warning )
else:
outl[field] = "-"
continue
outl[ field ] = l[ field ]
if field == 'cdr3a_nucseq' or field == 'cdr3b_nucseq':
outl[ field ] = outl[ field ].lower()
# check the genes
if check_genes:
bad_genes = False
for ab in 'ab':
for vj in 'vj':
tag = '{}{}_gene'.format(vj,ab)
if tag in outl:
gene = outl[tag]
if gene not in all_genes[organism]:
print 'bad gene:',tag,gene
bad_genes = True
if bad_genes:
print 'SKIPPING bad genes',outl
continue
out.write( make_tsv_line( outl, outfields ) + '\n' )
out.close()