-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path4chan.py
executable file
·86 lines (74 loc) · 2.05 KB
/
4chan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/python
import requests
import os
import time
import urllib
from lxml import etree
from requests.packages import urllib3
""" 批量下载4chan页面中的图片
"""
def timeft(t, fs="%Y-%m-%d %H:%M:%S"):
    """Render a time tuple as a formatted string.

    Args:
        t: A ``time.struct_time`` (or equivalent 9-item tuple); it is passed
            straight to ``time.strftime``.
        fs: ``strftime`` format string. Defaults to ``%Y-%m-%d %H:%M:%S``.

    Returns:
        The formatted timestamp text.
    """
    formatted = time.strftime(fs, t)
    return formatted
def geturls():
    """Read the list of page URLs from the script's config file.

    The config file is looked up by relative path (so, in the current
    working directory) and is named after this script's base name with
    everything from the first ``.`` onward removed, e.g. ``4chan`` for
    ``4chan.py``. Each non-blank line holds one URL.

    Returns:
        list: The URLs in file order with surrounding whitespace stripped.
        Empty when the config file does not exist.
    """
    config_name = os.path.basename(__file__).split('.')[0]
    # isfile rather than exists: a directory that happens to share the name
    # is treated as "no config" instead of crashing inside open().
    if not os.path.isfile(config_name):
        return []
    with open(config_name) as f:
        stripped = (line.strip() for line in f)
        # Blank lines are dropped; an empty "URL" would otherwise be handed
        # to the caller, which can neither split nor fetch it.
        return [url for url in stripped if url]
def writeimg(img, path):
    """Download one image and save it to disk.

    Args:
        img: Image address. A value that already starts with ``http://`` or
            ``https://`` is fetched unchanged; anything else (typically a
            protocol-relative ``//host/file`` reference) gets ``http:``
            prefixed.
        path: Destination file path; overwritten if it already exists.

    Raises:
        OSError: Propagated from the download (``urllib.error.URLError`` is
            a subclass) or from writing the file.
    """
    # A bare `import urllib` does not load the `request` submodule; import
    # it explicitly so this function does not rely on another package having
    # loaded it as a side effect.
    import urllib.request

    if img.startswith(("http://", "https://")):
        url = img
    else:
        url = "http:" + img
    # Fetch the full body before touching the destination so a failed or
    # timed-out download never leaves an empty file behind.
    with urllib.request.urlopen(url, timeout=30) as conn:
        data = conn.read()
    with open(path, 'wb') as f:
        f.write(data)
def getimg(link, path):
    """Download the image a thumbnail link points to, unless already saved.

    The target file name is the last ``/``-separated segment of the link's
    ``href``. If a file of that name already exists in ``path`` nothing is
    downloaded. Otherwise up to three download attempts are made, and a
    successful attempt is followed by a three-second pause.

    Args:
        link: Element-like object exposing an ``attrib`` mapping whose
            ``href`` entry is the image address.
        path: Directory the image is saved into.
    """
    img = link.attrib.get("href")
    if not img:
        # No href on the element: nothing to download.
        return
    filename = img.split('/')[-1]
    if not filename:
        # href ends with "/", so there is no file name to save under.
        return
    filepath = "%s/%s" % (path, filename)
    if os.path.isfile(filepath):
        return
    print(filename)
    for attempt in range(1, 4):
        try:
            writeimg(img, filepath)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit still
            # stop the script instead of being counted as a failed attempt.
            try:
                # Remove any partial file; otherwise the isfile() check
                # above would treat it as a finished download next run.
                os.remove(filepath)
            except OSError:
                pass
            print("retry %d" % attempt)
        else:
            # Pause between successful downloads.
            time.sleep(3)
            break
def main(save_root="/home/public/Pictures/4chan"):
    """Download every thumbnail-linked image from the configured pages.

    For each URL returned by ``geturls()`` the page is fetched, every
    ``<a class="fileThumb">`` element is collected, and each one is handed
    to ``getimg``. Images are stored under
    ``<save_root>/<third-from-last URL segment>/<last URL segment>``.

    A URL that cannot be split, fetched or parsed is reported and skipped so
    the remaining URLs are still processed.

    Args:
        save_root: Base directory for downloaded images.
    """
    # Silences the warning that verify=False would otherwise emit on every
    # request; doing it once up front is enough.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    for url in geturls():
        print(url)
        ua = url.split('/')
        if len(ua) < 3:
            # Too few segments to derive the two directory names.
            print("skip malformed url: %s" % url)
            continue
        path = "%s/%s/%s" % (save_root, ua[-3], ua[-1])
        try:
            # Creates both directory levels in one call.
            os.makedirs(path, exist_ok=True)
            # NOTE(review): verify=False disables TLS certificate checks.
            # Kept as in the original; confirm this is intentional.
            r = requests.get(
                url=url, allow_redirects=False, verify=False, timeout=30)
        except (OSError, requests.RequestException) as err:
            print("skip %s: %s" % (url, err))
            continue
        # ISO-8859-1 maps every byte value, so this decode cannot fail.
        content = r.content.decode('ISO-8859-1')
        if not content.strip():
            # Empty body (e.g. a bare redirect response): nothing to parse.
            continue
        html = etree.HTML(content)
        if html is None:
            # Presumably lxml yields None when no document could be built;
            # guard rather than crash on .xpath.
            continue
        for link in html.xpath("//a[@class='fileThumb']"):
            getimg(link, path)
# Start the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()