
Commit a7c36ff

first commit
1 parent 8419329 commit a7c36ff

File tree

14 files changed: +622 additions, -0 deletions

Crawler_Exercise/GetPhotos2.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
import requests
from bs4 import BeautifulSoup

url = 'http://tieba.baidu.com/p/4178314700'

# Fetch the raw HTML of the target page
def GetHtml(url):
    html = requests.get(url).text
    return html

# Collect every <img> src on the page and write the images to disk
def GetImg(html):
    soup = BeautifulSoup(html, 'html.parser')
    imglist = []
    for photourl in soup.find_all('img'):
        imglist.append(photourl.get('src'))
    x = 0
    for imgurl in imglist:
        with open('E:/Pic/%s.jpg' % x, 'wb') as file:
            file.write(requests.get(imgurl).content)
        x += 1

if __name__ == '__main__':
    html = GetHtml(url)
    GetImg(html)
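As an aside, the script above assumes E:/Pic already exists and that every <img> tag carries an absolute src. A small hardening sketch (the function name and behaviour are mine, not part of the commit):

import os
import requests
from bs4 import BeautifulSoup

def save_images(html, out_dir='E:/Pic'):
    os.makedirs(out_dir, exist_ok=True)               # create the target folder if missing
    soup = BeautifulSoup(html, 'html.parser')
    for x, img in enumerate(soup.find_all('img')):
        src = img.get('src')
        if not src or not src.startswith('http'):     # skip missing or relative URLs
            continue
        with open(os.path.join(out_dir, '%s.jpg' % x), 'wb') as f:
            f.write(requests.get(src).content)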
File renamed without changes.
1.66 KB
Binary file not shown.
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
# -*- coding:utf-8 -*-
import sys,os,pdb,re,time,random
from spider import SpiderHTML
from bs4 import BeautifulSoup

class QiubaiSpider(SpiderHTML):
    def __init__(self,contentType,pageStart=1, pageEnd=1):
        #super.__init__(self)
        self._contentType = contentType
        self._pageStart = int(pageStart)
        self._pageEnd = int(pageEnd)+1
        self.__url = {'new':'http://www.qiushibaike.com/textnew/page/','hot':'http://www.qiushibaike.com/text/page/'}

    def getJokes(self):
        reqUrl = ''

        if self._contentType in self.__url:
            reqUrl = self.__url[self._contentType]
        else:
            reqUrl = self.__url['new']
        for i in range(self._pageStart,self._pageEnd):
            pageUrl = reqUrl+str(i)+'/'
            jokes = self.getUrl(pageUrl)
            jokes = jokes.find_all('div',id=re.compile('qiushi_tag_\d+'))
            filepath = os.path.join('E:\\','qiubai','page_'+self._contentType+str(i))
            info = 'Saving the jokes from page {page} to file {file}.txt'
            print(info.format(page=i,file=filepath))
            for joke in jokes:
                # strip the wrapping markup, keeping only the joke text
                jokeContent = str(joke.find('div',attrs={'class':'content'}))
                jokeContent = re.sub('<div class="content">','',jokeContent)
                jokeContent = re.sub('</div>','',jokeContent)
                jokeContent = re.sub('<!--\d+-->','',jokeContent)
                jokeContent = re.sub('<br>','\n',jokeContent)
                jokeContent = re.sub('<br/>','\n',jokeContent)
                try:
                    author = joke.find(attrs={'class':'author clearfix'}).find('h2').string
                    upvote = joke.find(attrs={'class':'stats'}).span.i.string
                except AttributeError:
                    author, upvote = 'Anonymous', '0'   # some entries lack author or vote info

                joke = '-----------------------------\r\nAuthor: {author}\r\n{joke}\r\n\r\n{upvote} people found this funny\r\n'.format(joke=jokeContent.strip(),author=author,upvote=upvote)

                self.saveText(filepath+'.txt',joke,'a')
            if i%2 == 0: # pause between pages to avoid getting banned
                time.sleep(random.random()*3)

if __name__ == '__main__':
    contentType = 'new'
    page = 5
    paramsNum = len(sys.argv)

    # argument 1 selects the 'new' or 'hot' feed
    # arguments 2 and 3 are the start and end page
    if paramsNum>=4:
        contentType = sys.argv[1]
        page = sys.argv[2]
        pageEnd = sys.argv[3]
    elif paramsNum>=3:
        contentType = sys.argv[1]
        page = sys.argv[2]
        pageEnd = page
    elif paramsNum == 2:
        contentType = sys.argv[1]
        page,pageEnd = 1,1
    else:
        contentType = 'new'
        page,pageEnd = 1,1

    qiubai = QiubaiSpider(contentType,page,pageEnd)
    qiubai.getJokes()
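Assuming this file is saved as qiubai.py next to spider.py (the diff omits the filename), typical invocations would look like:

python qiubai.py hot 1 3    # pages 1-3 of the 'hot' feed
python qiubai.py new 2      # page 2 of the 'new' feed
python qiubai.py            # defaults to page 1 of the 'new' feed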
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
# -*- coding: utf-8 -*-
__author__ = 'yhf'
import os,re,codecs,urllib
from urllib import request
from bs4 import BeautifulSoup

class SpiderHTML(object):
    # open a page and return it as a BeautifulSoup document
    def getUrl(self, url, coding='utf-8'):
        req = request.Request(url)
        req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 UBrowser/5.5.9703.2 Safari/537.36')
        with request.urlopen(req) as response:
            return BeautifulSoup(response.read().decode(coding),'html.parser')

    # save text content to a local file
    def saveText(self,filename,content,mode='w'):
        self._checkPath(filename)
        with codecs.open(filename, encoding='utf-8', mode=mode) as f:
            f.write(content)


    # save an image to a local file
    def saveImg(self, imgUrl, imgName):
        data=urllib.request.urlopen(imgUrl).read()
        self._checkPath(imgName)
        with open(imgName,'wb') as f:
            f.write(data)

    # create the target directory if it does not exist
    def _checkPath(self, path):
        dirname = os.path.dirname(path.strip())
        if not os.path.exists(dirname):
            os.makedirs(dirname)
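A minimal usage sketch of this helper class (the URL and output paths below are placeholders, not from the repo):

from spider import SpiderHTML

spider = SpiderHTML()
page = spider.getUrl('http://example.com')          # returns a BeautifulSoup document
spider.saveText('E:\\demo\\title.txt', str(page.title))
# spider.saveImg(img_url, 'E:\\demo\\img.jpg')      # same pattern for images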
Lines changed: 81 additions & 0 deletions
@@ -0,0 +1,81 @@
from spider import SpiderHTML
import re,os,sys,time,urllib,random,http
'''
Scrape photos of Taobao models.
'''

class TaobaommSpider(SpiderHTML):
    # start page, end page, and how many images to grab per model
    def __init__(self,pageStart, pageEnd,limit_img):
        self._pageStart = int(pageStart)
        self._pageEnd = int(pageEnd)+1
        self._limit = limit_img
        self.__url = 'https://mm.taobao.com/json/request_top_list.htm?page='
        self.__dir = r'E:\taobaomm'

    def start(self):
        for page in range(self._pageStart,self._pageEnd):
            url = self.__url + str(page)
            contents = self.getUrl(url,'gbk')
            lists = contents.find_all('div',class_='personal-info')
            for girl in lists:
                info = girl.find('a',attrs={'class':'lady-name'})
                avatar = girl.find('a',class_='lady-avatar')

                girlinfo = {}
                girlinfo['name'] = info.string
                girlinfo['age'] = info.find_next_sibling('em').strong.string
                girlinfo['city'] = info.find_next('span').string
                girlinfo['url'] = 'https:'+avatar['href']
                # strip the thumbnail size suffix to get the full-size image
                girlinfo['avatar'] = 'https:'+re.sub('_\d+x\d+\.\w+$','',avatar.img['src'])
                imgType = os.path.splitext(girlinfo['avatar'])[1]
                logInfo = 'Found a model: {name}, {age} years old, from {city}'.format(**girlinfo)
                print(logInfo)
                tmpDir = os.path.join(self.__dir,girlinfo['name']+'-'+girlinfo['age']+'-'+girlinfo['city'])
                if(os.path.exists(tmpDir)):
                    print('Already scraped, moving on to the next one')
                    continue
                # save the avatar and basic info in a folder named after her
                self.saveImg(girlinfo['avatar'],os.path.join(tmpDir,'avatar'+imgType))
                print('Entering her profile page to fetch more photos')

                girlsCenter = self.getUrl(girlinfo['url'],'gbk')
                imgs = girlsCenter.find('div',class_='mm-aixiu-content').find_all('img')
                i = 0
                for img in imgs:
                    i = i + 1
                    if i % 5 == 0:
                        print('Fetching image #{i}'.format(i=i))
                    try:
                        imgurl = 'https:'+img['src']
                        extend_name = os.path.splitext(img['src'])[1]
                        if extend_name == '.gif':
                            continue # usually emoticons, skip them
                        self.saveImg(imgurl,os.path.join(tmpDir,str(i)+extend_name))
                    except urllib.error.HTTPError as e:
                        pass
                    except KeyError as e:
                        pass
                    except http.client.IncompleteRead:
                        pass

                    if i >= self._limit:
                        pass # change this to break to cap the number of images per model
                    time.sleep(random.random()*2)


if __name__ == '__main__':
    page, limit, paramsNum= 1, 0, len(sys.argv)
    if paramsNum>=4:
        page, pageEnd, limit = sys.argv[1], sys.argv[2], int(sys.argv[3])
    elif paramsNum == 3:
        page, pageEnd = sys.argv[1], sys.argv[2]
    elif paramsNum == 2:
        page = sys.argv[1]
        pageEnd = page
    else:
        page,pageEnd = 1,1

    if limit <5:
        limit = 20
    spider = TaobaommSpider(page,pageEnd,limit)
    spider.start()
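The size-suffix substitution above is what turns a thumbnail URL into the full-size one; a standalone check of that regex (the sample URL is made up for illustration):

import re

thumb = 'https://img.example.com/uploads/abc/photo.jpg_240x240.jpg'
full = re.sub(r'_\d+x\d+\.\w+$', '', thumb)
print(full)   # https://img.example.com/uploads/abc/photo.jpg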
Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
# -*- coding:utf-8 -*-

from spider import SpiderHTML
import sys,urllib,http,os,random,re,time
__author__ = 'waiting'
'''
Uses the third-party library BeautifulSoup4 -- install it yourself.
Requires the spider.py file in the same directory.
Tested on Python 3.4, Windows 7.
'''

# URL of the collection (the page parameter is appended in code)
url = 'https://www.zhihu.com/collection/69135664'

# local save path; created automatically if it does not exist
store_path = 'E:\\zhihu\\收藏夹\\攻不可破的大美妞阵线联盟'

class zhihuCollectionSpider(SpiderHTML):
    def __init__(self,pageStart, pageEnd, url):
        self._url = url
        self._pageStart = int(pageStart)
        self._pageEnd = int(pageEnd)+1
        self.downLimit = 0 # answers with fewer upvotes than this are not collected

    def start(self):
        for page in range(self._pageStart,self._pageEnd): # pages of the collection
            url = self._url + '?page='+str(page)
            content = self.getUrl(url)
            questionList = content.find_all('div',class_='zm-item')
            for question in questionList: # each question in the collection
                Qtitle = question.find('h2',class_='zm-item-title')
                if Qtitle is None: # the question has been removed
                    continue

                questionStr = Qtitle.a.string
                Qurl = 'https://www.zhihu.com'+Qtitle.a['href'] # link to the question
                Qtitle = re.sub(r'[\\/:*?"<>]','#',Qtitle.a.string) # replace characters not allowed in Windows file/directory names
                print('-----Fetching question: '+Qtitle+'-----') # got the question link and title, start scraping
                Qcontent = self.getUrl(Qurl)
                answerList = Qcontent.find_all('div',class_='zm-item-answer zm-item-expanded')
                self._processAnswer(answerList,Qtitle) # process the answers to this question
                time.sleep(5)


    def _processAnswer(self,answerList,Qtitle):
        j = 0
        for answer in answerList:
            j = j + 1

            upvoted = int(answer.find('span',class_='count').string.replace('K','000')) # upvote count of this answer
            if upvoted < 100:
                continue
            authorInfo = answer.find('div',class_='zm-item-answer-author-info') # author information
            author = {'introduction':'','link':''}
            try:
                author['name'] = authorInfo.find('a',class_='author-link').string # the author's name
                author['introduction'] = str(authorInfo.find('span',class_='bio')['title']) # the author's bio
            except AttributeError:
                author['name'] = 'Anonymous user '+str(j)
            except TypeError: # the bio is empty
                pass

            try:
                author['link'] = authorInfo.find('a',class_='author-link')['href']
            except TypeError: # anonymous users have no profile link
                pass

            file_name = os.path.join(store_path,Qtitle,'info',author['name']+'_info.txt')
            if os.path.exists(file_name): # already scraped
                continue

            self.saveText(file_name,'{introduction}\r\n{link}'.format(**author)) # save the author's info
            print('Fetching answers by user `{name}`'.format(**author))
            answerContent = answer.find('div',class_='zm-editable-content clearfix')
            if answerContent is None: # reported answers have no content
                continue

            imgs = answerContent.find_all('img')
            if len(imgs) == 0: # the answer has no images
                pass
            else:
                self._getImgFromAnswer(imgs,Qtitle,**author)


    # collect the images
    def _getImgFromAnswer(self,imgs,Qtitle,**author):
        i = 0
        for img in imgs:
            if 'inline-image' in (img.get('class') or []): # skip Zhihu's small inline images
                continue
            i = i + 1
            imgUrl = img['src']
            extension = os.path.splitext(imgUrl)[1]
            path_name = os.path.join(store_path,Qtitle,author['name']+'_'+str(i)+extension)
            try:
                self.saveImg(imgUrl,path_name) # swallow image errors so the run is not interrupted
            except ValueError:
                pass
            except urllib.error.HTTPError as e:
                pass
            except KeyError as e:
                pass
            except http.client.IncompleteRead:
                pass

    # collect the text (not implemented yet)
    def _getTextFromAnswer(self):
        pass

# example: zhihu.py 1 5 fetches pages 1 through 5
if __name__ == '__main__':
    page, limit, paramsNum= 1, 0, len(sys.argv)
    if paramsNum>=3:
        page, pageEnd = sys.argv[1], sys.argv[2]
    elif paramsNum == 2:
        page = sys.argv[1]
        pageEnd = page
    else:
        page,pageEnd = 1,1

    spider = zhihuCollectionSpider(page,pageEnd,url)
    spider.start()
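One caveat: replace('K','000') only works for whole-number counts such as '3K'; a value like '1.2K' becomes '1.2000' and int() raises a ValueError. A more defensive parser, as a sketch (the helper name is mine, not from the repo):

def parse_votes(text):
    # convert vote counts such as '856' or '1.2K' to an int
    text = text.strip()
    if text.upper().endswith('K'):
        return int(float(text[:-1]) * 1000)
    return int(text)

# parse_votes('856')  -> 856
# parse_votes('1.2K') -> 1200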

Crawler_Exercise/baiduImg.py

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
import requests
import re

url = 'http://image.baidu.com/search/index'
headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0',
    'Accept-Language' : 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding' : 'gzip, deflate',
    'Referer' : 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&fm=detail&lm=-1&st=-1&sf=2&fmq=&pv=&ic=0&nc=1&z=&se=&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&oq=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&rsp=-1',
    'Cookie' : 'HOSUPPORT=1; UBI=fi_PncwhpxZ%7ETaMMzY0i9qXJ9ATcu3rvxFIc-a7KI9byBcYk%7EjBVmPGIbL3LTKKJ2D17mh5VfJ5yjlCncAb2yhPI5sZM51Qo7tpCemygM0VNUzuTBJwYF8OYmi3nsCCzbpo5U9tLSzkZfcQ1rxUcJSzaipThg__; HISTORY=fec845b215cd8e8be424cf320de232722d0050; PTOKEN=ff58b208cc3c16596889e0a20833991d; STOKEN=1b1f4b028b5a4415aa1dd9794ff061d312ad2a822d52418f3f1ffabbc0ac6142; SAVEUSERID=0868a2b4c9d166dc85e605f0dfd153; USERNAMETYPE=3; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_18205_18559_17001_17073_15479_12166_18086_10634; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm',
}

# request one page of Baidu image search results
def get_html(url, headers):
    data = {
        'cl' : '2',
        'ct' : '201326592',
        'face' : '0',
        'fp' : 'result',
        'gsm' : '200001e',
        'ic' : '0',
        'ie' : 'utf-8',
        'ipn' : 'rj',
        'istype' : '2',
        'lm' : '-1',
        'nc' : '1',
        'oe' : 'utf-8',
        'pn' : '30',
        'queryword' : '高清摄影',
        'rn' : '30',
        'st' : '-1',
        'tn' : 'resultjson_com',
        'word' : '高清摄影'   # search keyword: "high-definition photography"
    }

    page = requests.get(url, params = data, headers = headers).text
    return page

# pull the .jpg URLs out of the response and download them
def get_img(page, headers):
    # img_url_list = []
    reg = re.compile('http://.*?\.jpg')
    imglist1 = re.findall(reg, page)
    imglist2 = imglist1[0 : len(imglist1) : 3]   # keep every third matched URL
    # [img_url_list.append(i) for i in imglist if not i in img_url_list]
    x = 0
    for imgurl in imglist2:
        img_data = requests.get(imgurl, headers = headers).content
        with open('E:/Pic2/%s.jpg' % x, 'wb') as file:
            file.write(img_data)
        x += 1

if __name__ == '__main__':
    page = get_html(url, headers)
    get_img(page, headers)
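The commented-out list comprehension hints at de-duplicating the matched URLs instead of slicing every third one; an order-preserving version, as a sketch (the helper name is mine):

def dedupe(urls):
    # drop duplicate URLs while keeping first-seen order
    seen = set()
    unique = []
    for u in urls:
        if u not in seen:
            seen.add(u)
            unique.append(u)
    return unique

# imglist2 = dedupe(imglist1)   # possible alternative to the [0::3] slicing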

0 commit comments