forked from downdawn/dzdp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlist_woff_encryption.py
144 lines (121 loc) · 5.21 KB
/
list_woff_encryption.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# -*- coding: utf-8 -*-
import os
import re

import requests
from fontTools.ttLib import TTFont
from lxml import etree

from settings import list_headers, file_path
class ListWoffEncryption(object):
    """Dianping list-page crawler that undoes the woff font obfuscation.

    The list page renders some digits as private-use code points inside
    ``<svgmtsi class="shopNum">`` tags.  This class downloads the page, the
    stylesheet and the ``shopNum`` woff font it references, dumps the font to
    XML, derives a "code point -> digit" table from that dump, substitutes the
    digits back into the HTML and finally parses the shop entries.
    """

    def __init__(self, food_url):
        """Store the list-page URL and the settings-derived paths/headers.

        Args:
            food_url: URL of the list page to crawl.
        """
        self.food_url = food_url
        self.list_html = ""
        # os.path.join copes with file_path both with and without a trailing
        # separator, so the XML dump and the font file land in the same place.
        self.xml_path = os.path.join(file_path, 'font.xml')
        self.headers = list_headers

    def get_list_html(self):
        """Fetch the list page and keep its HTML text in ``self.list_html``."""
        response = requests.get(url=self.food_url, headers=self.headers)
        self.list_html = response.text

    def get_woff_url(self):
        """Locate the ``shopNum`` woff font referenced by the list page.

        Returns:
            A ``(shop_woff_url, filepath)`` tuple: the font's URL and the local
            path (under ``file_path``) where it should be saved.

        Raises:
            ValueError: if the page has no s3plus stylesheet link, or the
                stylesheet has no ``shopNum`` font rule.
        """
        css_match = re.search(r'href="//s3plus(.*?)"', self.list_html)
        if not css_match:
            raise ValueError("no s3plus stylesheet link found in the list page HTML")

        css_url = "http://s3plus" + css_match.group(1)
        response = requests.get(url=css_url)

        # Pick the woff URL out of the shopNum font-family rule.
        font_match = re.search(
            r'font-family.*?shopNum.*?format.*?url\("(.*?)"\)', response.text)
        if not font_match:
            raise ValueError("no shopNum woff font rule found in the stylesheet")

        shop_woff_url = "http:" + font_match.group(1)
        print(shop_woff_url)
        filename = shop_woff_url.split('/')[-1]
        filepath = os.path.join(file_path, filename)
        return shop_woff_url, filepath

    @staticmethod
    def save_woff_xml(xml_path, shop_woff_url, filepath):
        """Download the woff font and dump it as XML.

        Nothing is written when the download does not answer with HTTP 200;
        in that case a previously saved ``xml_path`` (if any) is left as is.

        Args:
            xml_path: destination of the XML dump.
            shop_woff_url: URL of the woff font.
            filepath: local path where the raw font file is stored.
        """
        response = requests.get(url=shop_woff_url)
        if response.status_code != 200:
            return

        with open(filepath, 'wb') as f:
            f.write(response.content)

        font = TTFont(filepath)
        try:
            font.saveXML(xml_path)
        finally:
            # Release the reader that TTFont keeps on the font file.
            font.close()

    def get_shop_num(self, name_id_list):
        """Replace the obfuscated ``shopNum`` entities in ``self.list_html``.

        Every ``<svgmtsi class="shopNum">&#CODE;<`` whose CODE is present in
        ``name_id_list`` is rewritten to carry the plain digit instead; unknown
        codes are left untouched.  The decoded digits are printed in page order.

        Args:
            name_id_list: list of dicts mapping entity code (e.g. ``"xe1a2"``)
                to the digit string, as returned by ``class_name_id``.
        """
        # Flatten the list of small dicts into one lookup table; setdefault
        # keeps the first mapping should a code appear more than once.
        code_to_digit = {}
        for mapping in name_id_list:
            for code, digit in mapping.items():
                code_to_digit.setdefault(code, digit)

        shop_list = []  # decoded digits, in order of appearance

        def _decode(match):
            digit = code_to_digit.get(match.group(1))
            if digit is None:
                return match.group(0)
            shop_list.append(digit)
            return '<svgmtsi class="shopNum">{}<'.format(digit)

        # A single substitution pass with a callback: the scraped code is never
        # interpreted as a regular expression and each tag is visited once.
        self.list_html = re.sub(
            r'<svgmtsi class="shopNum">&#(.*?);<', _decode, self.list_html,
            flags=re.S)
        print(shop_list)

    def class_name_id(self):
        """Build the "entity code -> digit" table from the font's XML dump.

        The cmap table (platformID 0) links each code to a glyph name and the
        glyph order links each glyph name to an id; the digit is taken to be
        the last digit of ``id - 1``.

        Returns:
            A list with one dict per cmap entry, ``{code: digit}``, or an empty
            dict when the entry's glyph name has no glyph id.

        Raises:
            ValueError: if the dump has no ``cmap_format_4`` table with
                ``platformID="0"``.
        """
        # The XML dump is read as UTF-8 rather than the platform default.
        with open(self.xml_path, 'r', encoding='utf-8') as f:
            data = f.read()

        # code -> glyph name
        cmap = re.search(
            r'<cmap_format_4 platformID="0".*?>(.*?)</cmap_format_4>', data, re.S)
        if cmap is None:
            raise ValueError(
                'no cmap_format_4 table with platformID="0" in ' + self.xml_path)
        name_list = [
            {code: glyph_name}
            for code, glyph_name in re.findall(
                r'<map code="0(.*?)" name="(.*?)"/>', cmap.group(1))
        ]
        print('name_list', name_list)

        # glyph name -> digit
        id_list = [
            {glyph_name: str(int(glyph_id) - 1)[-1]}
            for glyph_id, glyph_name in re.findall(
                r'<GlyphID id="(.*?)" name="(.*?)"/>', data, re.S)
        ]
        print('id_list', id_list)

        # One lookup table instead of rescanning id_list for every code; a
        # later duplicate glyph name overrides an earlier one.
        digit_by_glyph = {}
        for entry in id_list:
            digit_by_glyph.update(entry)

        name_id_list = []
        for entry in name_list:
            name_id_list.append({
                code: digit_by_glyph[glyph_name]
                for code, glyph_name in entry.items()
                if glyph_name in digit_by_glyph
            })
        print(name_id_list)
        return name_id_list

    def parse_data(self):
        """Parse the de-obfuscated HTML into a list of shop records.

        Returns:
            A list of dicts with keys ``title`` (list of matched title
            attributes), ``review_num`` and ``mean_price`` (strings) and
            ``comment_list`` (list of strings).
        """
        html = etree.HTML(self.list_html)
        if html is None:
            # NOTE(review): guards against the parser yielding no root element
            # (e.g. for empty markup) -- confirm against the lxml version used.
            return []

        result = []
        for txt in html.xpath("//div[@class='txt']"):
            comments = txt.xpath(".//span[@class='comment-list']/span")
            result.append({
                "title": txt.xpath(".//div[@class='tit']/a/@title"),
                "review_num": ''.join(
                    txt.xpath(".//a[@class='review-num']//b//text()")),
                "mean_price": ''.join(
                    txt.xpath(".//a[@class='mean-price']//b//text()")).replace('¥', ''),
                "comment_list": [
                    ''.join(comment.xpath("./b//text()")) for comment in comments
                ],
            })
        return result

    def run(self):
        """Run the whole pipeline and return the parsed shop records."""
        self.get_list_html()
        shop_woff_url, filepath = self.get_woff_url()
        self.save_woff_xml(self.xml_path, shop_woff_url, filepath)
        name_id_list = self.class_name_id()
        self.get_shop_num(name_id_list)
        result = self.parse_data()
        return result
if __name__ == '__main__':
    # Script entry point: crawl one sample list page and print the parsed records.
    crawler = ListWoffEncryption(food_url="http://www.dianping.com/xiamen/ch10/g112")
    print(crawler.run())