Skip to content

Commit 601fd91

Browse files
committed
add one_update.py
1 parent dd2c6c7 commit 601fd91

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed

spiderFile/one_update.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import re
2+
import requests as rq
3+
4+
# Base address of the pages to fetch; yield_url() appends a page number to it.
ROOT_URL = "http://wufazhuce.com/one/"
5+
# Page number handed to main() as the starting point when run as a script.
URL_NUM = 14
6+
7+
def yield_url(ROOT_URL, URL_NUM):
    """Build the address of a single page.

    Despite the name, this is a plain function that returns a string; it is
    not a generator.

    Args:
        ROOT_URL: Base address string that the page number is appended to.
        URL_NUM: Page number; converted with ``str`` before being appended.

    Returns:
        ``ROOT_URL`` immediately followed by the text form of ``URL_NUM``.
    """
    page_suffix = str(URL_NUM)
    return ROOT_URL + page_suffix
9+
10+
def get_html(url, timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    response = rq.get(url, timeout=timeout)
    # Fail here on an error status instead of handing an error page to the
    # parser, where it would surface as an unrelated "no match" failure.
    response.raise_for_status()
    return response.content.decode("utf-8")
12+
13+
def get_data(html):
    """Extract the image address and the quote text from a page.

    The first ``<img src="..." alt="" />`` tag supplies the image address and
    the first ``<div class="one-cita">`` element supplies the quote, which is
    returned with surrounding whitespace stripped.

    Args:
        html: Page markup as a string.

    Returns:
        A ``(img_url, cite)`` tuple of strings.

    Raises:
        IndexError: If either piece of markup is missing from ``html``. The
            exception type is unchanged from the previous implementation;
            the message now names the missing element.
    """
    # search() stops at the first match; only the first match is ever used,
    # so there is no need to collect every match on the page.
    img_match = re.search(r'<img src="(.*?)" alt="" />', html)
    if img_match is None:
        raise IndexError('no <img src="..." alt="" /> tag found in page')

    # re.S lets the quote span several lines inside the div.
    cite_match = re.search(r'<div class="one-cita">(.*?)</div>', html, re.S)
    if cite_match is None:
        raise IndexError('no <div class="one-cita"> element found in page')

    return img_match.group(1), cite_match.group(1).strip()
19+
20+
def save_data(img_url, cite, URL_NUM, timeout=10):
    """Download the image and write it and the quote to the current directory.

    Writes ``./<URL_NUM>.jpg`` (raw image bytes) and ``./cite<URL_NUM>.txt``
    (the quote text, UTF-8 encoded).

    Args:
        img_url: Address of the image to download.
        cite: Quote text to store.
        URL_NUM: Page number used to name both output files.
        timeout: Seconds to wait for the image server before giving up.

    Returns:
        ``URL_NUM + 1``, the number of the next page to process.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status while downloading the image.
        OSError: If either output file cannot be written.
    """
    # Download before opening the file so a failed request does not leave an
    # empty or truncated .jpg behind.
    response = rq.get(img_url, timeout=timeout)
    response.raise_for_status()

    with open("./{}.jpg".format(URL_NUM), "wb") as fp:
        fp.write(response.content)

    # Explicit encoding: the quote may contain non-ASCII text, and the
    # platform default encoding is not guaranteed to be able to represent it.
    with open("./cite{}.txt".format(URL_NUM), "w", encoding="utf-8") as fp:
        fp.write(cite)

    return URL_NUM + 1
26+
27+
def main(ROOT_URL, URL_NUM, number):
    """Fetch and save ``number`` consecutive pages, starting at ``URL_NUM``.

    For each page: build its address, download the markup, extract the image
    address and quote, and save both. The page number for the next round is
    whatever ``save_data`` returns.

    Args:
        ROOT_URL: Base address passed to ``yield_url``.
        URL_NUM: Number of the first page to process.
        number: How many pages to process.

    Any exception raised by a step propagates to the caller and ends the
    loop; later pages are not attempted.
    """
    next_num = URL_NUM
    for _ in range(number):
        page = get_html(yield_url(ROOT_URL, next_num))
        image_address, quote = get_data(page)
        next_num = save_data(image_address, quote, next_num)
33+
34+
if __name__ == "__main__":
    import sys

    # Script entry point: process 20 pages starting from URL_NUM.
    try:
        main(ROOT_URL, URL_NUM, 20)
    except Exception as exc:
        # Remain best-effort (no traceback, normal exit), but report why the
        # run stopped instead of discarding the error. Catching Exception
        # rather than everything also lets Ctrl-C interrupt the script.
        sys.stderr.write("one_update: stopped early: {!r}\n".format(exc))

0 commit comments

Comments
 (0)