-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMyHTMLParser.py
126 lines (106 loc) · 3.74 KB
/
MyHTMLParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# !python3
# coding=UTF-8
from html.parser import HTMLParser
# 把HTMLParser向上封装一层,详见https://www.cnblogs.com/zhanghaohong/p/4562127.html
class Parser_loginurl(HTMLParser):
    """Extract the unified-authentication login URL from an HTML page.

    After ``feed()``, ``links`` contains the ``href`` value of every ``<a>``
    start tag whose ``id`` attribute equals ``login-unified-wrapper``.

    Example::

        parser = Parser_loginurl()
        parser.feed('<a type="button" id="login-unified-wrapper" '
                    'href="/ucas-sso/login">html</a>')
        parser.close()
        parser.links  # ['/ucas-sso/login']
    """

    # ``id`` attribute value that marks the login anchor.
    _TARGET_ID = 'login-unified-wrapper'

    def __init__(self):
        super().__init__()
        self.data = []   # public attribute; this class never writes to it
        self.links = []  # href values collected from matching anchors

    def handle_starttag(self, tag, attrs):
        """Record the href(s) of an ``<a>`` tag carrying the target id.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        if tag != 'a':
            return
        # Decide whether the tag matches before looking at href, so the
        # result does not depend on the order of the attributes in the markup.
        is_login_anchor = any(
            name == 'id' and value == self._TARGET_ID for name, value in attrs
        )
        if not is_login_anchor:
            return
        self.links.extend(value for name, value in attrs if name == 'href')
class Parser_lastdata(HTMLParser):
    """Extract the hidden form fields (e.g. ``_token``) needed for the check-in.

    After ``feed()``, ``data`` maps field name to field value for every
    ``<input>`` start tag that has exactly three attributes, one of which is
    ``type="hidden"``.
    """

    def __init__(self):
        super().__init__()
        self.data = {}  # field name -> field value

    def handle_starttag(self, tag, attrs):
        """Store the name/value pair of a three-attribute hidden ``<input>``.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        # Only inputs with exactly three attributes are considered; this
        # filter is kept so the set of collected fields does not grow.
        if tag != 'input' or len(attrs) != 3:
            return
        if ('type', 'hidden') not in attrs:
            return

        by_name = dict(attrs)
        if 'name' in by_name and 'value' in by_name:
            # Look the pair up by attribute name so the result does not
            # depend on the order of the attributes in the markup.
            key, field_value = by_name['name'], by_name['value']
        else:
            # Legacy fallback: treat the 2nd attribute's value as the key and
            # the 3rd attribute's value as the value.
            key, field_value = attrs[1][1], attrs[2][1]
        self.data[key] = field_value
class Parser_validateimg(HTMLParser):
    """Extract the unified-authentication captcha image address.

    After ``feed()``, ``links`` contains the ``src`` value of every ``<a>``
    start tag whose ``class`` attribute equals ``validate-img``.

    NOTE(review): the parser matches ``<a>`` tags, although ``src`` is
    normally found on ``<img>``; confirm against the real login page before
    changing the tag name.
    """

    # ``class`` attribute value that marks the captcha element.
    _TARGET_CLASS = 'validate-img'

    def __init__(self):
        super().__init__()
        self.data = []   # public attribute; this class never writes to it
        self.links = []  # captcha source URLs collected from matching tags

    def handle_starttag(self, tag, attrs):
        """Record the src value(s) of an ``<a>`` tag carrying the target class.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        if tag != 'a':
            return
        # Decide whether the tag matches before looking at src, so the
        # result does not depend on the order of the attributes in the markup.
        is_captcha = any(
            name == 'class' and value == self._TARGET_CLASS
            for name, value in attrs
        )
        if not is_captcha:
            return
        self.links.extend(value for name, value in attrs if name == 'src')