forked from phbradley/tcr-dist
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_tsv.py
96 lines (80 loc) · 2.44 KB
/
parse_tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def parse_tsv_line(line,infields,sep='\t'):
    """Split one delimited line into a dict keyed by column name.

    Args:
        line: Text of a single row.  One trailing newline, if present, is
            removed before splitting.  An empty string is accepted and is
            split like any other line (yielding one empty value).
        infields: Column names, in the order the values appear on the line.
        sep: Field delimiter; a tab by default.

    Returns:
        A dict mapping each name in ``infields`` to the string found at the
        same position on the line.

    Raises:
        AssertionError: If the number of values on the line differs from
            the number of names in ``infields``.
    """
    # endswith() is safe on an empty string, unlike indexing line[-1].
    if line.endswith('\n'):
        line = line[:-1]

    values = line.split(sep)
    if len(values) != len(infields):
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is unchanged.
        raise AssertionError(
            'expected %d fields but found %d in line: %r'
            % (len(infields), len(values), line))

    return dict(zip(infields, values))
def make_tsv_line(vals,outfields,empty_string_replacement='',sep='\t'):
    """Join selected values of a row into one delimited line.

    The returned string has no trailing newline.

    Args:
        vals: Mapping from column name to value.
        outfields: Column names to write, in output order.
        empty_string_replacement: Text written in place of an empty string
            value.  When this is itself empty (the default) empty values
            are written unchanged.
        sep: Field delimiter; a tab by default.

    Returns:
        The values for ``outfields`` joined by ``sep``.  Values that are
        not strings are converted with ``str()``.

    Raises:
        KeyError: If a name in ``outfields`` is missing from ``vals``.
    """
    cells = []
    for tag in outfields:
        val = vals[tag]
        # isinstance() rather than an exact type match, so that str
        # subclasses are treated as strings and an empty one still gets
        # the replacement text.
        if not isinstance(val, str):
            cells.append(str(val))
        elif empty_string_replacement and not val:
            cells.append(empty_string_replacement)
        else:
            cells.append(val)
    return sep.join(cells)
def parse_tsv_file( filename, key_fields=[], store_fields=[], save_l=False, sep='\t' ):
    """Read a delimited text file with a header row into a list or nested dict.

    The first line of the file names the columns; a leading '#' on that
    line is dropped.  Every later line is split on ``sep`` and turned into
    a dict mapping column name to string value (the "row dict").

    Args:
        filename: Path of the file to read.
        key_fields: Column names used to index the result.  When non-empty
            the result is a nested dict with one level per key field.
        store_fields: Column names whose values are kept for each row, as a
            list in this order.
        save_l: If true, the row dict is kept as well: appended after the
            ``store_fields`` values, or stored on its own when
            ``store_fields`` is empty.  Forced on when both ``key_fields``
            and ``store_fields`` are empty.
        sep: Field delimiter for the header and the data rows; a tab by
            default.

    Returns:
        Without ``key_fields``: a list with one entry per data row.
        With ``key_fields``: nested dicts keyed by the values of the key
        columns; each innermost value is the list of entries for the rows
        sharing those key values.

    Raises:
        AssertionError: If a data row does not have as many fields as the
            header, or if a data row is read while ``store_fields`` is
            empty and ``save_l`` is false.
        KeyError: If a name in ``key_fields`` or ``store_fields`` is not a
            column of the file.
    """
    import sys  # function-scope import: this module has no top-level import block

    # The list defaults in the signature are never mutated below.
    if not key_fields and not store_fields:
        save_l = True

    # The 'U' flag was removed from open() in Python 3.11.  Text mode on
    # Python 3 already translates line endings, so the flag is only
    # requested on Python 2.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'

    D = {}
    L = []
    infields = None
    with open(filename, mode) as data:
        for line in data:
            # Remove only a real line terminator, so a final line without
            # one keeps its last character.
            if line.endswith('\n'):
                line = line[:-1]

            if infields is None:
                # First line is the header.
                if line.startswith('#'):
                    line = line[1:]
                infields = line.split(sep)
                continue

            # Data rows are split with the same separator as the header.
            values = line.split(sep)
            if len(values) != len(infields):
                raise AssertionError(
                    'expected %d fields but found %d in line: %r'
                    % (len(infields), len(values), line))
            row = dict(zip(infields, values))

            if store_fields:
                dats = [row[x] for x in store_fields]
                if save_l:
                    dats.append(row)
            elif save_l:
                dats = row
            else:
                raise AssertionError(
                    'nothing to store: store_fields is empty and save_l is false')

            if key_fields:
                # Walk (creating as needed) one dict level per leading key
                # field; the last key field maps to a list of entries.
                subd = D
                for k in key_fields[:-1]:
                    subd = subd.setdefault(row[k], {})
                subd.setdefault(row[key_fields[-1]], []).append(dats)
            else:
                L.append(dats)

    if key_fields:
        return D
    return L
def safely_split_csv_line( line, text_encapsulator='"' ):
    """Split a line on commas, leaving commas inside quoted text alone.

    Each occurrence of ``text_encapsulator`` toggles a "quoted" state, and a
    comma seen while quoted is kept as part of the current field.  The quote
    characters themselves are kept in the returned fields, and no escape
    sequences are interpreted.

    Args:
        line: The text to split.
        text_encapsulator: The single character that opens and closes
            quoted text; a double quote by default.

    Returns:
        The list of fields.  An empty ``line`` yields ``['']``.

    Raises:
        AssertionError: If the line ends while still inside quoted text.
    """
    fields = []
    current = []  # characters of the field being built
    in_quote = False
    for ch in line:
        if ch == text_encapsulator:
            in_quote = not in_quote
        elif ch == ',' and in_quote:
            # A quoted comma is ordinary text.
            current.append(ch)
            continue
        # Checked separately from the branch above so that a comma still
        # splits in the degenerate case where the encapsulator is itself ','.
        if ch == ',':
            fields.append(''.join(current))
            current = []
        else:
            current.append(ch)

    if in_quote:
        # Explicit raise (not ``assert``) so the check survives ``python -O``.
        raise AssertionError(
            'unbalanced %r in line: %r' % (text_encapsulator, line))

    fields.append(''.join(current))
    return fields