-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathhtml_tables_to_dictionary.py
127 lines (103 loc) · 4.43 KB
/
html_tables_to_dictionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Extract HTML tables from a URL, file, or string and return a dictionary:
{
'table1': {
'heads': [ 'col_hdg1', 'col_hdg2', ... 'col_hdgN' ],
'rows': [
[ 'col_val1', 'col_val2', ... 'col_valN' ],
[ 'col_val1', 'col_val2', ... 'col_valN' ],
[ 'col_val1', 'col_val2', ... 'col_valN' ],
.
.
.
]
},
'table2': {
'heads': [ 'col_hdg1', 'col_hdg2', ... 'col_hdgN' ],
'rows': [
[ 'col_val1', 'col_val2', ... 'col_valN' ],
[ 'col_val1', 'col_val2', ... 'col_valN' ],
[ 'col_val1', 'col_val2', ... 'col_valN' ],
.
.
.
]
},
.
.
.
}
Note: Each table's name is taken from the heading immediately preceding the table
definition or, if no heading precedes the table definition, is generated as
'Unlabelled Table NN', where 'NN' is the table's sequence number. When several
tables follow the same heading, the second and later ones are named
'<heading> + NN'.
"""
from urllib.request import *
from html.parser import HTMLParser
import sys
def get_html_tables(html_file):
    """Extract the HTML tables found in *html_file* into a dictionary.

    Args:
        html_file: One of

            * a string longer than 7 characters starting with ``http://`` or
              ``https://`` -- the document is fetched with ``urlopen``;
            * a string longer than 7 characters starting with ``file://`` --
              the remainder is opened as a local path;
            * anything else -- treated as the HTML markup itself, either
              ``str`` or UTF-8 encoded ``bytes``.

    Returns:
        A dict mapping each table name to ``{'heads': [...], 'rows': [[...]]}``.
        The name is the text of the most recent ``<h1>``..``<h6>`` heading
        (``'<heading> + N'`` for the N-th table under the same heading, N >= 2)
        or ``'Unlabelled Table N'`` when no heading has been seen. A row whose
        last cell is a ``<th>`` becomes ``heads``; every other row is appended
        to ``rows``. Tables without a header row get
        ``'Unlabelled Column N'`` heads sized from their first row.

    Notes:
        * Runs of whitespace in the markup are collapsed to a single space
          before parsing, and cell/heading text is stripped of leading and
          trailing whitespace.
        * Every cell yields exactly one value (``''`` when empty), including
          text nested inside inline markup such as ``<a>`` or ``<b>``.
        * Nested tables are not tracked separately: rows are attributed to
          the most recently opened table.
    """
    heading_tags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    cell_tags = ('th', 'td')

    class MyHTMLParser(HTMLParser):
        """Collects table rows and the headings that name the tables."""

        def __init__(self):
            super().__init__()
            # Per-instance state (a class-level dict would be shared).
            self.tables = {}
            self.counter = 0             # tables opened since the last heading
            self.last_heading = None     # text of the most recent heading
            self.table_name = None       # key of the most recently opened table
            self.row = None              # cells of the row being read, if any
            self.row_is_heading = False  # True when the row's last cell is <th>
            self.heading_parts = None    # text chunks of the open heading
            self.cell_parts = None       # text chunks of the open cell

        def _close_heading(self):
            """Record the buffered heading text as the current heading."""
            if self.heading_parts is None:
                return
            text = ''.join(self.heading_parts).strip()
            self.heading_parts = None
            if text:
                self.last_heading = text
                self.counter = 0

        def _close_cell(self):
            """Append the buffered cell text (possibly '') to the open row."""
            if self.cell_parts is None:
                return
            text = ''.join(self.cell_parts).strip()
            self.cell_parts = None
            if self.row is not None:
                self.row.append(text)

        def _close_row(self):
            """Store the open row as the table's heads or as a data row."""
            self._close_cell()
            row, self.row = self.row, None
            # Rows that appear before any <table> have nowhere to go.
            if row is None or self.table_name is None:
                return
            table = self.tables[self.table_name]
            if self.row_is_heading:
                table['heads'] = row
            else:
                table['rows'].append(row)

        def _open_table(self):
            """Register a new table named after the current heading."""
            # Flush anything left open by markup that omits end tags.
            self._close_heading()
            self._close_row()
            self.counter += 1
            if not self.last_heading:
                name = 'Unlabelled Table %s' % self.counter
            elif self.counter < 2:
                name = self.last_heading
            else:
                name = '%s + %s' % (self.last_heading, self.counter)
            self.table_name = name
            self.tables[name] = {'rows': []}

        @staticmethod
        def _default_heads(table):
            """Give a header-less table 'Unlabelled Column N' heads."""
            if 'heads' in table:
                return
            rows = table['rows']
            width = len(rows[0]) if rows else 0
            table['heads'] = [
                'Unlabelled Column %s' % ix for ix in range(1, width + 1)
            ]

        def handle_starttag(self, tag, attrs):
            if tag in heading_tags:
                self._close_heading()
                self.heading_parts = []
            elif tag == 'table':
                self._open_table()
            elif tag == 'tr':
                # A new <tr> implicitly ends a row whose </tr> was omitted.
                self._close_row()
                self.row = []
                self.row_is_heading = False
            elif tag in cell_tags:
                self._close_cell()
                if self.row is None:
                    # Cell without an enclosing <tr>: start an implicit row.
                    self.row = []
                self.cell_parts = []
                self.row_is_heading = tag == 'th'
            # Any other tag (e.g. <a>, <b>) leaves the open buffers alone so
            # that text nested inside a cell or heading is still collected.

        def handle_endtag(self, tag):
            if tag in heading_tags:
                self._close_heading()
            elif tag in cell_tags:
                self._close_cell()
            elif tag == 'tr':
                self._close_row()
            elif tag == 'table':
                self._close_row()
                if self.table_name is not None:
                    self._default_heads(self.tables[self.table_name])

        def handle_data(self, data):
            # Text outside a heading or cell (typically the whitespace
            # between tags) is ignored.
            if self.heading_parts is not None:
                self.heading_parts.append(data)
            elif self.cell_parts is not None:
                self.cell_parts.append(data)

        def finish(self):
            """Flush an unterminated row and make sure every table has heads."""
            self._close_row()
            for table in self.tables.values():
                self._default_heads(table)

    def decode(obj):
        """Return *obj* as text, decoding bytes as UTF-8."""
        if isinstance(obj, str):
            return obj
        return obj.decode('utf-8')

    # The prefix checks only make sense for str; bytes are always markup.
    is_locator = isinstance(html_file, str) and len(html_file) > 7
    if is_locator and html_file.startswith(('http://', 'https://')):
        with urlopen(html_file) as response:
            markup = response.read()
    elif is_locator and html_file.startswith('file://'):
        with open(html_file[7:]) as response:
            markup = response.read()
    else:
        markup = html_file

    parser = MyHTMLParser()
    # Collapse all whitespace runs (including newlines) to single spaces.
    parser.feed(' '.join(decode(markup).split()))
    parser.close()
    parser.finish()
    return parser.tables
if __name__ == "__main__":
    # Command-line entry point: the first argument is an http(s) URL, a
    # file:// path, or a literal HTML string; the parsed tables are printed.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit('usage: %s <http(s)-url | file://path | html-string>' % sys.argv[0])
    from pprint import pprint

    pprint(get_html_tables(sys.argv[1]))