-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDealHtml.py
71 lines (52 loc) · 1.67 KB
/
DealHtml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import re
# 清洗HTML标签文本
# @param htmlstr HTML字符串.
def filter_tags(htmlstr):
# 过滤DOCTYPE
htmlstr = ' '.join(htmlstr.split()) # 去掉多余的空格
re_doctype = re.compile(r'<!DOCTYPE .*?> ', re.S)
s = re_doctype.sub('',htmlstr)
# 过滤CDATA
re_cdata = re.compile('//<!CDATA\[[ >]∗ //\] > ', re.I)
s = re_cdata.sub('', s)
# Script
re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)
s = re_script.sub('', s) # 去掉SCRIPT
# style
re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
s = re_style.sub('', s) # 去掉style
# 处理换行
re_br = re.compile('<br\s*?/?>')
s = re_br.sub('', s) # 将br转换为换行
# HTML标签
re_h = re.compile('</?\w+[^>]*>')
s = re_h.sub('', s) # 去掉HTML 标签
# HTML注释
re_comment = re.compile('<!--[^>]*-->')
s = re_comment.sub('', s)
# 多余的空行
blank_line = re.compile('\n+')
s = blank_line.sub('', s)
blank_line_l = re.compile('\n')
s = blank_line_l.sub('', s)
blank_kon = re.compile('\t')
s = blank_kon.sub('', s)
blank_one = re.compile('\r\n')
s = blank_one.sub('', s)
blank_two = re.compile('\r')
s = blank_two.sub('', s)
blank_three = re.compile(' ')
s = blank_three.sub('', s)
# 剔除超链接
http_link = re.compile(r'(http://.+.html)')
s = http_link.sub('', s)
return s
def readTxt(path):
res = ''
with open(path,'r',encoding='utf-8') as f:
res = f.read()
return res
if __name__=='__main__':
str_doc = readTxt(r'./clearText/htmldome.html')
s=filter_tags(str_doc)
print(s)