-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser_asbase.py
101 lines (81 loc) · 2.98 KB
/
parser_asbase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup
import h5py
# h5py UTF-8 string dtype; store_dat2h5 passes it as the dtype of every
# dataset it creates.
string = h5py.string_dtype(encoding='utf-8')
def punctuation_pic_to_text(elem):
    """Translate the status icon inside *elem* into "Yes", "No" or "Unknown".

    The first ``<img>`` under *elem* is looked up and its ``src`` value is
    searched for the substrings "None", "True" and "False", in that order.

    Args:
        elem: A BeautifulSoup tag, or any object whose ``img`` attribute is
            either ``None`` or supports ``.get("src")``.

    Returns:
        "Unknown" when ``src`` contains "None", "Yes" when it contains
        "True", "No" when it contains "False". "Unknown" is also returned
        when there is no image, no ``src`` or no marker matches, so callers
        always receive a string they can join or store.
    """
    img = elem.img
    pic_src = img.get("src") if img is not None else None
    if not pic_src:
        return "Unknown"
    # The lookup order is significant and mirrors the original if/elif chain.
    for marker, label in (("None", "Unknown"), ("True", "Yes"), ("False", "No")):
        if marker in pic_src:
            return label
    # No marker matched: fall back to a string instead of an implicit None.
    return "Unknown"
def get_photostable(elem):
    """Return the photostability flag of *elem* as text.

    Delegates to punctuation_pic_to_text and hands its result back unchanged.
    """
    flag_text = punctuation_pic_to_text(elem)
    return flag_text
def get_thermostable(elem):
    """Return the thermostability flag of *elem* as text.

    Delegates to punctuation_pic_to_text and hands its result back unchanged.
    """
    flag_text = punctuation_pic_to_text(elem)
    return flag_text
def get_solubility(elem):
    """Summarise the solubility columns under *elem* as a single string.

    Every ``<div>`` carrying the ``col`` class contributes one
    ``text:flag`` pair: the text is the div's stripped text content and the
    flag is whatever punctuation_pic_to_text returns for that div. The pairs
    are joined with commas; no matching div yields an empty string.
    """
    columns = elem.find_all("div", class_=["col"])
    return ",".join(
        ":".join((column.get_text().strip(), punctuation_pic_to_text(column)))
        for column in columns
    )
def get_pdb_structure(elem):
    """Return the text content of *elem* with surrounding whitespace removed.

    Only leading and trailing whitespace is stripped; line breaks inside the
    text are kept as they are.
    """
    return elem.get_text().strip()
# Greek symbols that may occur in a card header and the ASCII spelling that
# replaces each of them.
_HEADER_SYMBOLS = (
    ("Φ", "phi "),
    ("λ", "lambda "),
    ("τ", "tau "),
    ("ε", "epsilon "),
)


def _normalize_header(header):
    """Spell out Greek symbols in *header* and drop its parentheses."""
    for symbol, spelled in _HEADER_SYMBOLS:
        header = header.replace(symbol, spelled)
    # A closing parenthesis is only removed when an opening one is present.
    if "(" in header:
        header = header.replace("(", "").replace(")", "")
    return header


def _body_to_text(header, body):
    """Convert a card body to text with the parser selected by *header*."""
    # Checked in order; the first marker contained in the header wins.
    special_parsers = (
        ("Photostable", get_photostable),
        ("Thermostable", get_thermostable),
        ("Solubility", get_solubility),
        ("optimized structure", get_pdb_structure),
    )
    for marker, parser in special_parsers:
        if marker in header:
            return parser(body)
    # Default: flatten the body text onto a single line.
    return body.get_text().strip().replace("\n", " ")


def parse_html(text):
    """Parse one saved HTML page into a ``{header: body text}`` dict.

    Every ``div.card-group`` is scanned for ``card-header`` / ``card-body``
    divs in document order. A header sets the current key (after
    normalisation); each following body is converted to text and stored under
    that key. Bodies under the "Chemical Structure" header are skipped.

    Args:
        text: HTML markup handed to BeautifulSoup (parsed with "lxml").

    Returns:
        dict mapping normalised header strings to the extracted body strings.
        A later body with the same header overwrites the earlier value.
    """
    data = {}
    soup = BeautifulSoup(text, features="lxml")
    skipped_headers = ["Chemical Structure"]
    # Last header seen so far. It starts as None so that a body appearing
    # before any header is ignored instead of raising UnboundLocalError.
    header = None
    for card in soup.find_all("div", {"class": "card-group"}):
        for ele in card.find_all("div", class_=["card-header", "card-body"]):
            # Only the first CSS class decides whether this is a header or body.
            class_name = ele["class"][0]
            if "header" in class_name:
                header = _normalize_header(ele.get_text(strip=True))
            elif "body" in class_name:
                # NOTE(review): the store step is assumed to belong to the
                # body branch (one stored value per body) — confirm.
                if header is None or header in skipped_headers:
                    continue
                data[header] = _body_to_text(header, ele)
    return data
def store_dat2h5(data, handler):
    """Write one parsed record into *handler* as its own group.

    The group is named after ``data["ID"]``. Every entry of *data* (the "ID"
    entry included) becomes a dataset in that group, created with the
    module-level ``string`` dtype and shape 1, then filled with the value.
    """
    record_id = data["ID"]
    group = handler.create_group(record_id)
    for key, value in data.items():
        dataset = group.create_dataset(key, dtype=string, shape=1)
        dataset[...] = value
# Convert every saved page under ./pages into one group of testdb.hdf5.
# Both context managers guarantee the HDF5 file and each page file are closed
# even when parsing or writing raises part-way through.
files = os.listdir("./pages")
with h5py.File("testdb.hdf5", "w") as handler:
    for page_name in files:
        text_file = "./pages/" + page_name
        # NOTE(review): pages are assumed to be saved as UTF-8 (parse_html
        # matches Greek characters in the headers) — confirm against the
        # code that downloaded them.
        with open(text_file, encoding="utf-8") as page:
            text = page.read()
        data = parse_html(text)
        store_dat2h5(data, handler)
        print("Done on " + text_file)