forked from emreg00/toolbox
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_orphanet.py
136 lines (125 loc) · 5.94 KB
/
parse_orphanet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import urllib2, os, cPickle
from bs4 import BeautifulSoup
import time
from xml.etree.ElementTree import iterparse, tostring
import parse_uniprot
NS = ""
def main():
base_dir = "/home/emre/data/disease/orphanet"
file_name = base_dir + "/gene_associations.xml"
out_file = base_dir + "/disease_to_genes.tsv"
disease_to_genes, gene_to_uniprot = get_disease_to_genes(file_name)
uniprot_file = base_dir + "/../../proteome/uniprot/idmapping.tab"
uniprot_ids = gene_to_uniprot.values() #reduce(lambda x,y: x+y, gene_to_uniprot.values())
uniprot_to_geneid = parse_uniprot.get_uniprot_to_geneid_from_idmapping_file(uniprot_file, uniprot_ids)
output_disease_genes(file_name, out_file, uniprot_to_geneid)
return
def output_disease_genes(file_name, out_file, uniprot_to_geneid=None):
#geneid_to_names, name_to_geneid = diseasome.get_geneid_symbol_mapping()
disease_to_genes, gene_to_uniprot = get_disease_to_genes(file_name)
f = open(out_file, 'w')
count = 0
all_geneids = set()
for disease, genes in disease_to_genes.iteritems():
#f.write("\t%s\t%s\n" % (disease, "\t".join(genes)))
if uniprot_to_geneid is None:
geneids = genes #set([ name_to_geneid[gene] for gene in genes if gene in name_to_geneid ])
else:
geneids = set()
for gene in genes:
if gene in gene_to_uniprot:
uniprot = gene_to_uniprot[gene]
if uniprot in uniprot_to_geneid:
geneid = uniprot_to_geneid[uniprot]
try:
geneid = int(geneid)
except:
print "Problem with:", geneid
geneids.add(geneid)
if len(geneids) > 5:
count += 1
all_geneids |= geneids
#print disease, geneids
f.write("\t%s\t%s\n" % (disease, "\t".join(map(str, geneids))))
f.close()
print "N disease > 5", count, "N all geneids", len(all_geneids)
return
def get_disease_to_genes(file_name):
disease_to_genes = {}
gene_to_synonyms = {}
gene_to_uniprot = {}
context = iterparse(file_name, ["start", "end"])
context = iter(context)
event, root = context.next()
state_stack = [ root.tag ]
phenotype, gene, source = None, None, None
for (event, elem) in context:
if event == "start":
state_stack.append(elem.tag)
if elem.tag == NS+"Name":
gene = None
if elem.tag == NS+"Source":
source = None
elif event == "end":
if elem.tag == NS+"Name":
if state_stack[-2] == NS+"Disorder":
phenotype = elem.text.encode("ascii", "replace")
elif elem.tag == NS+"Symbol":
if state_stack[-4] == NS+"DisorderGeneAssociationList" and state_stack[-2] == "Gene": # This is always the case
gene = elem.text
disease_to_genes.setdefault(phenotype, set()).add(gene)
elif elem.tag == NS+"Synonym":
if state_stack[-5] == NS+"DisorderGeneAssociationList" and state_stack[-3] == "Gene":
synonym = elem.text
gene_to_synonyms.setdefault(gene, set()).add(synonym)
elif elem.tag == NS+"Source":
if state_stack[-6] == NS+"DisorderGeneAssociationList" and state_stack[-4] == "Gene":
source = elem.text
elif elem.tag == NS+"Reference":
if state_stack[-6] == NS+"DisorderGeneAssociationList" and state_stack[-4] == "Gene":
if source == "SwissProt":
if gene in gene_to_uniprot:
if gene_to_uniprot[gene] != elem.text:
# Only GNAS has inconsistency but the geneids are the same
print "Uniprot inconsistency:", gene, gene_to_uniprot[gene], elem.text
gene_to_uniprot[gene] = elem.text
# Does not work
elif False and elem.tag == NS+"ExternalReference": #NS+"Source":
if state_stack[-5] == NS+"DisorderGeneAssociationList" and state_stack[-3] == "Gene":
elem_inner = elem.find(NS+"Source")
print tostring(elem_inner)
if elem_inner.text == "SwissProt":
elem_inner = elem.find(NS+"Reference")
gene_to_uniprot[gene] = elem.text
elem.clear()
state_stack.pop()
root.clear()
print len(disease_to_genes), len(gene_to_synonyms), len(gene_to_uniprot)
print disease_to_genes["Lymphangioleiomyomatosis"], gene_to_synonyms["TSC1"], gene_to_uniprot["TSC1"]
return disease_to_genes, gene_to_uniprot
# Does not work due to the unknown tags in xml file
def get_disease_to_genes_bs4(file_name):
disease_to_genes = {}
xml_doc = open(file_name)
soup = BeautifulSoup(xml_doc, "xml")
for tag in soup.find_all('Disorder'): #, recursive=False):
phenotype = None
for tag_inner in tag.children:
print tag_inner.name
if tag_inner.name == "DisorderGeneAssociationList":
for child in tag_inner.DisorderGeneAssociation.Gene.children:
#print child.name
if child.name == "Symbol":
gene = child.string.encode("ascii", "ignore")
if child.name == "ExternalReferenceList":
for child_inner in child.children:
if child_inner.Source.string == "SwissProt":
uniprot = child_inner.Reference.string
elif tag_inner.name == "Name":
phenotype = tag_inner.string.encode("ascii", "ignore")
if phenotype is not None:
disease_to_genes.setdefault(phenotype, set()).add(gene)
print len(disease_to_genes)
return disease_to_genes
if __name__ == "__main__":
main()