diff --git "a/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\346\226\227\351\261\274API\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\344\277\235\345\255\230\345\210\260Mongodb.py" "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\346\226\227\351\261\274API\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\344\277\235\345\255\230\345\210\260Mongodb.py"
new file mode 100644
index 0000000..79aae0a
--- /dev/null
+++ "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\346\226\227\351\261\274API\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\344\277\235\345\255\230\345\210\260Mongodb.py"
@@ -0,0 +1,41 @@
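+# Fetch every Douyu room through the public Room API and store it in MongoDB:
+# first pull the game-category list, build one API URL per category, then insert
+# each category's room data into the "Roominfo" collection of the "DouyuTV" database.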
+import requests
+from pymongo import MongoClient
+
+client = MongoClient('localhost')
+db = client["DouyuTV"]
+col = db["Roominfo"]
+host = 'http://api.douyutv.com/api/v1/live/'
+all_game = 'http://open.douyucdn.cn/api/RoomApi/game'
+sort = []
+
+
+def parser(url):
+    # The Douyu API returns JSON, so parse the response body directly.
+    return requests.get(url).json()
+
+
+def get_room_sort(url):
+ jn = parser(url)
+ data = jn['data']
+ for item in data:
+ sort.append(host + item['short_name'])
+
+
+def get_room_info():
+ for item in sort:
+ jn = parser(item)
+ data = jn['data']
+        try:
+            # insert() is deprecated in pymongo 3.x; data is a list of room dicts.
+            col.insert_many(data)
+        except Exception as e:
+            print(e)
+
+
+if __name__ == '__main__':
+ get_room_sort(all_game)
+ get_room_info()
diff --git "a/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\347\275\221\345\235\200\346\236\204\351\200\240\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\345\210\260Mongodb.py" "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\347\275\221\345\235\200\346\236\204\351\200\240\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\345\210\260Mongodb.py"
new file mode 100644
index 0000000..360c08a
--- /dev/null
+++ "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\347\275\221\345\235\200\346\236\204\351\200\240\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\345\210\260Mongodb.py"
@@ -0,0 +1,73 @@
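+# Crawl every Douyu room by requesting each category page with the isAjax query string,
+# parse the returned HTML with BeautifulSoup, and save one document per room into the
+# "Roominfo" collection of the "Douyu2" database.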
+import re
+from datetime import datetime
+import requests
+from bs4 import BeautifulSoup
+from pymongo import MongoClient
+
+HOST = "http://www.douyu.com"
+Directory_url = "http://www.douyu.com/directory?isAjax=1"
+Qurystr = "/?page=1&isAjax=1"
+
+client = MongoClient('localhost')
+db = client["Douyu2"]
+col = db["Roominfo"]
+
+
+def get_roominfo(data):
+ if data:
+ firstpage = BeautifulSoup(data, 'lxml')
+ roomlist = firstpage.select('li')
+ print(len(roomlist))
+ if roomlist:
+ for room in roomlist:
+ try:
+ roomid = room["data-rid"]
+                    roomtitle = room.a["title"]
+ roomowner = room.select("p > span")
+ roomtag = room.select("div > span")
+ roomimg = room.a
+ roomtag = roomtag[0].string
+ date = datetime.now()
+                    if len(roomowner) == 2:
+                        zbname = roomowner[0].string
+                        audience = roomowner[1].get_text()
+                        image = roomimg.span.img["data-original"]
+                        # Convert viewer counts such as "1.2万" (万 = 10,000) to integers.
+                        if u"万" in audience:
+                            r = re.compile(r'(\d+)(\.?)(\d*)')
+                            num = r.match(audience).group(0)
+                            audience = int(float(num) * 10000)
+                        else:
+                            audience = int(audience)
+ roominfo = {
+ "roomid": int(roomid),
+ "roomtitle": roomtitle,
+ "anchor": zbname,
+ "audience": audience,
+ "tag": roomtag,
+ "date": date,
+ "img": image
+ }
+ col.insert_one(roominfo)
+ except Exception as e:
+ print(e)
+
+
+def insert_info():
+ session = requests.session()
+ pagecontent = session.get(Directory_url).text
+ pagesoup = BeautifulSoup(pagecontent, 'lxml')
+ games = pagesoup.select('a')
+ # col.drop()
+ for game in games:
+ links = game["href"]
+ gameurl = HOST + links + Qurystr
+ print(gameurl)
+ gamedata = session.get(gameurl).text
+ get_roominfo(gamedata)
+
+
+if __name__ == '__main__':
+ insert_info()
diff --git "a/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\346\226\227\351\261\274\347\233\264\346\222\255\346\210\277\351\227\264\346\225\260\346\215\256\345\210\206\346\236\220.py" "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\346\226\227\351\261\274\347\233\264\346\222\255\346\210\277\351\227\264\346\225\260\346\215\256\345\210\206\346\236\220.py"
new file mode 100644
index 0000000..e69de29
diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2531.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2531.py"
new file mode 100644
index 0000000..707784f
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2531.py"
@@ -0,0 +1,23 @@
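+# Download the first ten images linked from a Yidian Zixun article page with urllib
+# and BeautifulSoup, saving them into a local "photo" directory.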
+import urllib.request
+from bs4 import BeautifulSoup
+import os
+
+# Download the page
+url = 'http://www.yidianzixun.com/home?page=article&id=0G5zThN8&up=0'
+res = urllib.request.urlopen(url)
+html = res.read().decode('utf-8')
+# Parse the page
+soup = BeautifulSoup(html, 'html.parser')
+result = soup.find_all('img', limit=10)
+links = []
+for content in result:
+ links.append(content.get('src'))
+# Download and save the images; urlretrieve writes each file itself,
+# so wrapping it in open() is unnecessary.
+if not os.path.exists('photo'):
+    os.makedirs('photo')
+for i, link in enumerate(links, 1):
+    filename = os.path.join('photo', 'photo' + str(i) + '.gif')
+    urllib.request.urlretrieve(link, filename)
diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2532.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2532.py"
new file mode 100644
index 0000000..a196dae
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2532.py"
@@ -0,0 +1,22 @@
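+# Download up to fifteen "aligncenter" images from an article on 8she.com into a
+# fixed directory on the E: drive.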
+import urllib.request
+from bs4 import BeautifulSoup
+import os
+
+url = 'http://www.8she.com/31988.html'
+res = urllib.request.urlopen(url)
+html = res.read().decode('utf-8')
+soup = BeautifulSoup(html, 'html.parser')
+result = soup.find_all(class_='aligncenter', limit=15)
+# print(result)
+links = []
+for content in result:
+ links.append(content.get('src'))
+# Download and save the images; use a raw string for the Windows path and let
+# urlretrieve write each file directly instead of wrapping it in open().
+save_dir = r'E:\rieuse\爬虫图片\photo2'
+if not os.path.exists(save_dir):
+    os.makedirs(save_dir)
+for i, link in enumerate(links, 1):
+    filename = os.path.join(save_dir, 'photo' + str(i) + '.jpg')
+    urllib.request.urlretrieve(link, filename)
diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2533\347\210\254\347\256\200\344\271\2467\346\227\245\347\203\255\351\227\250.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2533\347\210\254\347\256\200\344\271\2467\346\227\245\347\203\255\351\227\250.py"
new file mode 100644
index 0000000..4d0f58e
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2533\347\210\254\347\256\200\344\271\2467\346\227\245\347\203\255\351\227\250.py"
@@ -0,0 +1,37 @@
+# -*-coding:utf-8-*-
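+# Crawl pages 1-6 of Jianshu's weekly trending list with requests + BeautifulSoup and
+# write title, author, time, read/comment/like counts, reward count and link to jianshu.csv.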
+import csv
+import requests
+from bs4 import BeautifulSoup
+
+base_url = 'http://www.jianshu.com/trending/weekly'
+
+articles = []
+data_list = []
+for i in range(1, 7):
+ url = base_url + '?page={}'.format(i)
+ r = requests.get(url)
+ html = r.text
+ soup = BeautifulSoup(html, 'html.parser')
+ for article in soup.find_all(class_='content'):
+ title = article.find(class_='title').get_text()
+ link = 'http://www.jianshu.com' + article.find(class_='title').get('href')
+ author = article.find(class_='blue-link').get_text()
+ time = article.span['data-shared-at']
+ meta = article.find(class_='meta').find_all(['a', 'span'])
+ metas = []
+ for item in meta:
+ metas.append(item.get_text().strip())
+ read = metas[0]
+ comment = metas[1]
+ like = metas[2]
+ try:
+ money = metas[3]
+        except IndexError:
+            # Articles without a reward have only three meta fields.
+            money = '0'
+ articles.append([title, author, time, read, comment, like, money, link])
+
+with open('jianshu.csv', 'w', newline='', encoding='utf-8') as f:
+ writer = csv.writer(f)
+ writer.writerow(['文章标题', '作者', '时间', '阅读量', '评论', '喜欢', '赞赏数', '文章地址'])
+ for row in articles:
+ writer.writerow(row)
diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2534\344\275\277\347\224\250bs\347\232\204select\351\200\211\345\217\226\345\233\276\347\211\207\344\272\214\350\277\233\345\210\266\344\277\235\345\255\230\345\233\276\347\211\207\346\226\207\344\273\266.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2534\344\275\277\347\224\250bs\347\232\204select\351\200\211\345\217\226\345\233\276\347\211\207\344\272\214\350\277\233\345\210\266\344\277\235\345\255\230\345\233\276\347\211\207\346\226\207\344\273\266.py"
new file mode 100644
index 0000000..52a4dae
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2534\344\275\277\347\224\250bs\347\232\204select\351\200\211\345\217\226\345\233\276\347\211\207\344\272\214\350\277\233\345\210\266\344\277\235\345\255\230\345\233\276\347\211\207\346\226\207\344\273\266.py"
@@ -0,0 +1,32 @@
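+# Use BeautifulSoup's select() with a CSS selector to pick the image tags on a wmpic.me
+# page and save each image by writing the binary response content to disk.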
+import requests
+from bs4 import BeautifulSoup
+import os
+
+'''
+Images or files can also be downloaded with urllib's urlretrieve:
+from urllib import request
+request.urlretrieve('', '1.jpg')
+'''
+# proxies = {
+# "http": "http://175.155.240.127:808",
+# "https": "http://114.239.149.110:808",
+# }
+headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
+ 'Connection': 'keep-alive'}
+url = 'http://www.wmpic.me/86253'
+r = requests.get(url, headers=headers)
+soup = BeautifulSoup(r.text, 'html.parser')
+result = soup.select('#content > div.content-c > center > img')
+links = []
+for content in result:
+ links.append(content.get('src'))
+if not os.path.exists('花瓶'):
+ os.makedirs('花瓶')
+for i, link in enumerate(links, 1):
+    filename = os.path.join('花瓶', '花瓶' + str(i) + '.jpg')
+ ir = requests.get(link)
+ with open(filename, 'wb') as fo:
+ fo.write(ir.content)
diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2535\345\244\232\347\272\277\347\250\213\347\210\254\345\217\226\347\263\227\344\272\213\347\231\276\347\247\221\350\257\204\350\256\272.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2535\345\244\232\347\272\277\347\250\213\347\210\254\345\217\226\347\263\227\344\272\213\347\231\276\347\247\221\350\257\204\350\256\272.py"
new file mode 100644
index 0000000..d18c0cc
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2535\345\244\232\347\272\277\347\250\213\347\210\254\345\217\226\347\263\227\344\272\213\347\231\276\347\247\221\350\257\204\350\256\272.py"
@@ -0,0 +1,46 @@
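+# Crawl qiushibaike.com joke pages with two threads: class One walks the odd pages and
+# class Two the even pages, extracting the joke text with a regular expression.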
+import urllib.request
+import threading
+import re
+import urllib.error
+
+headers = ("User-Agent",
+ "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
+opener = urllib.request.build_opener()
+opener.addheaders = [headers]
+urllib.request.install_opener(opener)
+
+
+class One(threading.Thread):
+ def __init__(self):
+ threading.Thread.__init__(self)
+
+ def run(self):
+ for i in range(1, 36, 2):
+ url = "http://www.qiushibaike.com/8hr/page/" + str(i)
+ pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
+            # Capture the joke text inside each content block on the page.
+            pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
+ datalist = re.compile(pat, re.S).findall(pagedata)
+ for j in range(0, len(datalist)):
+ print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
+ print(datalist[j])
+
+
+class Two(threading.Thread):
+ def __init__(self):
+ threading.Thread.__init__(self)
+
+ def run(self):
+ for i in range(0, 36, 2):
+ url = "http://www.qiushibaike.com/8hr/page/" + str(i)
+ pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
+            # Capture the joke text inside each content block on the page.
+            pat = '<div class="content">.*?<span>(.*?)</span>.*?</div>'
+ datalist = re.compile(pat, re.S).findall(pagedata)
+ for j in range(0, len(datalist)):
+ print("第" + str(i) + "页第" + str(j) + "个段子的内容是:")
+ print(datalist[j])
+
+
+one = One()
+one.start()
+two = Two()
+two.start()
diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/\347\210\254\345\217\226\350\212\261\347\223\243\345\246\271\345\255\220\347\274\251\347\225\245\345\233\276.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/\347\210\254\345\217\226\350\212\261\347\223\243\345\246\271\345\255\220\347\274\251\347\225\245\345\233\276.py"
new file mode 100644
index 0000000..7b381c7
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/\347\210\254\345\217\226\350\212\261\347\223\243\345\246\271\345\255\220\347\274\251\347\225\245\345\233\276.py"
@@ -0,0 +1,58 @@
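+# Use a headless PhantomJS browser driven by Selenium to open the huaban.com "beauty"
+# boards, collect each board's link and name from the waterfall container, and download
+# every board's thumbnail images into a per-board folder under image2\.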
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium import webdriver
+import requests
+import lxml.html
+import os
+
+SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
+browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
+# browser = webdriver.Firefox()
+wait = WebDriverWait(browser, 15)
+browser.set_window_size(1400, 900)
+
+
+def get_url():
+ print('打开主页搜寻链接中...')
+ try:
+ browser.get('http://huaban.com/boards/favorite/beauty/')
+ wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall')))
+ html = browser.page_source
+ doc = lxml.html.fromstring(html)
+ name = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
+ u = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
+ for item, fileName in zip(u, name):
+ url = 'http://huaban.com' + item
+ print('主链接已找到:' + url)
+ if '*' in fileName:
+ fileName = fileName.replace('*', '')
+            download(url, fileName)
+ except Exception as e:
+ print(e)
+
+
+def download(url, fileName):
+ try:
+ browser.get(url)
+ wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall')))
+ html = browser.page_source
+ doc = lxml.html.fromstring(html)
+ if not os.path.exists('image2\\' + fileName):
+ os.makedirs('image2\\' + fileName)
+ link = doc.xpath('//*[@id="waterfall"]/div/a/img/@src')
+ i = 0
+ for item in link:
+ i += 1
+ ur = 'http:' + item
+ print('正在下载第' + str(i) + '张图片,地址:' + ur)
+ r = requests.get(ur)
+ filename = 'image2\\{}\\'.format(fileName) + str(i) + '.jpg'
+ with open(filename, 'wb') as fo:
+ fo.write(r.content)
+ except Exception:
+ print('本次出错了')
+
+
+if __name__ == '__main__':
+ get_url()
diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\200\357\274\232\347\210\254\345\217\226\350\261\206\347\223\243\347\224\265\345\275\261\344\270\255\351\200\237\345\272\246\344\270\216\346\277\200\346\203\2058\346\274\224\345\221\230\345\233\276\347\211\207.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\200\357\274\232\347\210\254\345\217\226\350\261\206\347\223\243\347\224\265\345\275\261\344\270\255\351\200\237\345\272\246\344\270\216\346\277\200\346\203\2058\346\274\224\345\221\230\345\233\276\347\211\207.py"
new file mode 100644
index 0000000..12241ea
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\200\357\274\232\347\210\254\345\217\226\350\261\206\347\223\243\347\224\265\345\275\261\344\270\255\351\200\237\345\272\246\344\270\216\346\277\200\346\203\2058\346\274\224\345\221\230\345\233\276\347\211\207.py"
@@ -0,0 +1,26 @@
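+# Download the cast photos from a Douban movie's celebrities page: regex out the image
+# URLs and matching names, then save each photo as <name>.jpg under douban\.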
+import urllib.request
+import os
+import re
+
+
+def douban(url):
+ r = urllib.request.urlopen(url)
+ html = r.read().decode('utf-8')
+    result = re.findall(r'https://img\d\.doubanio\.com/img/celebrity/medium/.*?\.jpg', html)
+ result2 = re.findall(r'(?<=title=").\S+', html)
+ result2.pop()
+ result3 = sorted(set(result2), key=result2.index)
+ result3.pop(-3)
+ if not os.path.exists('douban'):
+ os.makedirs('douban')
+    # urlretrieve writes each file itself, so no explicit open() is needed.
+    for i, link in enumerate(result):
+        filename = os.path.join('douban', str(result3[i]) + '.jpg')
+        urllib.request.urlretrieve(link, filename)
+
+
+url = 'https://movie.douban.com/subject/26260853/celebrities'
+if __name__ == '__main__':
+ douban(url)
diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\203\357\274\232\346\211\271\351\207\217\346\212\223\345\217\226\350\212\261\347\223\243\347\275\221\351\253\230\346\270\205\347\276\216\345\233\276\345\271\266\344\277\235\345\255\230.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\203\357\274\232\346\211\271\351\207\217\346\212\223\345\217\226\350\212\261\347\223\243\347\275\221\351\253\230\346\270\205\347\276\216\345\233\276\345\271\266\344\277\235\345\255\230.py"
new file mode 100644
index 0000000..d505575
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\203\357\274\232\346\211\271\351\207\217\346\212\223\345\217\226\350\212\261\347\223\243\347\275\221\351\253\230\346\270\205\347\276\216\345\233\276\345\271\266\344\277\235\345\255\230.py"
@@ -0,0 +1,74 @@
+__author__ = '布咯咯_rieuse'
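+# Batch-download full-size images from huaban.com boards: parser() drives a headless
+# PhantomJS browser, get_main_url() collects the board links, and download() follows
+# every pin to its detail page and saves the original image under image\<board name>\.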
+
+import os
+import lxml.html
+import requests
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
+browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
+# browser = webdriver.Firefox()
+wait = WebDriverWait(browser, 5)
+browser.set_window_size(1400, 900)
+
+
+def parser(url, param):
+    # Parsing helper: load the URL, wait for the CSS selector, then return an lxml document.
+ browser.get(url)
+ wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, param)))
+ html = browser.page_source
+ doc = lxml.html.fromstring(html)
+ return doc
+
+
+def get_main_url():
+ print('打开主页搜寻链接中...')
+ try:
+ doc = parser('http://huaban.com/boards/favorite/beauty/', '#waterfall')
+ name = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
+ u = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
+ for item, fileName in zip(u, name):
+ main_url = 'http://huaban.com' + item
+ print('主链接已找到' + main_url)
+ if '*' in fileName:
+ fileName = fileName.replace('*', '')
+ download(main_url, fileName)
+ except Exception as e:
+ print(e)
+
+
+def download(main_url, fileName):
+ print('-------准备下载中-------')
+ try:
+ doc = parser(main_url, '#waterfall')
+ if not os.path.exists('image\\' + fileName):
+ print('创建文件夹...')
+ os.makedirs('image\\' + fileName)
+ link = doc.xpath('//*[@id="waterfall"]/div/a/@href')
+ # print(link)
+ i = 0
+ for item in link:
+ i += 1
+ minor_url = 'http://huaban.com' + item
+ doc = parser(minor_url, '#pin_view_page')
+ img_url = doc.xpath('//*[@id="baidu_image_holder"]/a/img/@src')
+ img_url2 = doc.xpath('//*[@id="baidu_image_holder"]/img/@src')
+ img_url += img_url2
+ try:
+ url = 'http:' + str(img_url[0])
+ print('正在下载第' + str(i) + '张图片,地址:' + url)
+ r = requests.get(url)
+ filename = 'image\\{}\\'.format(fileName) + str(i) + '.jpg'
+ with open(filename, 'wb') as fo:
+ fo.write(r.content)
+ except Exception:
+ print('出错了!')
+ except Exception:
+ print('出错啦!')
+
+
+if __name__ == '__main__':
+ get_main_url()
diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\211\357\274\232\347\210\254\345\217\226v2ex\346\225\260\346\215\256\347\224\250csv\344\277\235\345\255\230.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\211\357\274\232\347\210\254\345\217\226v2ex\346\225\260\346\215\256\347\224\250csv\344\277\235\345\255\230.py"
new file mode 100644
index 0000000..675afcf
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\211\357\274\232\347\210\254\345\217\226v2ex\346\225\260\346\215\256\347\224\250csv\344\277\235\345\255\230.py"
@@ -0,0 +1,20 @@
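+# Scrape the "all" tab of v2ex.com and save each topic's title, node, author and link
+# to document\v2ex.csv.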
+import csv, requests, re
+from bs4 import BeautifulSoup
+
+url = 'https://www.v2ex.com/?tab=all'
+html = requests.get(url).text
+soup = BeautifulSoup(html, 'html.parser')
+articles = []
+for article in soup.find_all(class_='cell item'):
+ title = article.find(class_='item_title').get_text()
+ category = article.find(class_='node').get_text()
+    author = re.findall(r'(?<=<a href="/member/)\w+', str(article))[0]
+    u = article.select('.item_title > a')
+ link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+(?=")', str(u))[0]
+ articles.append([title, category, author, link])
+
+with open(r'document\v2ex.csv', 'w', newline='', encoding='utf-8') as f:
+ writer = csv.writer(f)
+ writer.writerow(['文章标题', '分类', '作者', '文章地址'])
+ for row in articles:
+ writer.writerow(row)
diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\214\357\274\232\344\275\277\347\224\250lxml\350\247\243\346\236\220HTML\357\274\214\350\276\223\345\207\272\345\257\271\345\272\224\345\200\274.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\214\357\274\232\344\275\277\347\224\250lxml\350\247\243\346\236\220HTML\357\274\214\350\276\223\345\207\272\345\257\271\345\272\224\345\200\274.py"
new file mode 100644
index 0000000..2ed2425
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\214\357\274\232\344\275\277\347\224\250lxml\350\247\243\346\236\220HTML\357\274\214\350\276\223\345\207\272\345\257\271\345\272\224\345\200\274.py"
@@ -0,0 +1,16 @@
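+# Parse an ifeng.com news list with lxml XPath and print each title together with its link.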
+import requests
+import lxml.html
+
+url = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml'
+html = requests.get(url).text
+doc = lxml.html.fromstring(html)
+titles = doc.xpath('//div[@class="newsList"]/ul/li/a/text()')
+href = doc.xpath('//div[@class="newsList"]/ul/li/a/@href')
+for title, link in zip(titles, href):
+    results = {
+        '标题': title,
+        '链接': link
+    }
+    print(results)
diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\224\357\274\232\344\275\277\347\224\250Selenium\347\210\254\345\217\226\344\270\200\347\202\271\350\265\204\350\256\257\345\212\250\346\200\201\346\225\260\346\215\256.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\224\357\274\232\344\275\277\347\224\250Selenium\347\210\254\345\217\226\344\270\200\347\202\271\350\265\204\350\256\257\345\212\250\346\200\201\346\225\260\346\215\256.py"
new file mode 100644
index 0000000..7c1855a
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\224\357\274\232\344\275\277\347\224\250Selenium\347\210\254\345\217\226\344\270\200\347\202\271\350\265\204\350\256\257\345\212\250\346\200\201\346\225\260\346\215\256.py"
@@ -0,0 +1,26 @@
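+# Drive Firefox with Selenium to open the Yidian Zixun channel page, trigger lazy loading
+# to pull in more articles, then parse the page source with BeautifulSoup and save title,
+# source, comment count and link to document\yidian.csv.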
+from selenium.webdriver.common.keys import Keys
+from selenium import webdriver
+from bs4 import BeautifulSoup
+import csv
+
+driver = webdriver.Firefox()
+driver.implicitly_wait(3)
+first_url = 'http://www.yidianzixun.com/channel/c6'
+driver.get(first_url)
+driver.find_element_by_class_name('icon-refresh').click()
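+# Send the DOWN key repeatedly so the feed keeps lazy-loading more articles.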
+for i in range(1, 90):
+ driver.find_element_by_class_name('icon-refresh').send_keys(Keys.DOWN)
+soup = BeautifulSoup(driver.page_source, 'lxml')
+articles = []
+for article in soup.find_all(class_='item doc style-small-image style-content-middle'):
+ title = article.find(class_='doc-title').get_text()
+ source = article.find(class_='source').get_text()
+ comment = article.find(class_='comment-count').get_text()
+ link = 'http://www.yidianzixun.com' + article.get('href')
+ articles.append([title, source, comment, link])
+driver.quit()
+with open(r'document\yidian.csv', 'w', newline='', encoding='utf-8') as f:
+ writer = csv.writer(f)
+ writer.writerow(['文章标题', '作者', '评论数', '文章地址'])
+ for row in articles:
+ writer.writerow(row)
diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\205\255\357\274\232Selenium+xpath+bs4\347\210\254\345\217\226\344\272\232\351\251\254\351\200\212\346\225\260\346\215\256\344\277\235\345\255\230\345\210\260mongodb.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\205\255\357\274\232Selenium+xpath+bs4\347\210\254\345\217\226\344\272\232\351\251\254\351\200\212\346\225\260\346\215\256\344\277\235\345\255\230\345\210\260mongodb.py"
new file mode 100644
index 0000000..c946b6f
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\205\255\357\274\232Selenium+xpath+bs4\347\210\254\345\217\226\344\272\232\351\251\254\351\200\212\346\225\260\346\215\256\344\277\235\345\255\230\345\210\260mongodb.py"
@@ -0,0 +1,101 @@
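+# Search amazon.cn for KEYWORD with a headless PhantomJS browser, read the total page
+# count, click through the result pages, and store each item's title, image, price and
+# date in the "amazon-python" collection in MongoDB.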
+from selenium.common.exceptions import TimeoutException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium import webdriver
+from bs4 import BeautifulSoup
+import lxml.html
+import pymongo
+import re
+
+MONGO_URL = 'localhost'
+MONGO_DB = 'amazon'
+MONGO_TABLE = 'amazon-python'
+SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
+KEYWORD = 'python'
+client = pymongo.MongoClient(MONGO_URL)
+db = client[MONGO_DB]
+
+browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
+# browser = webdriver.Firefox()
+wait = WebDriverWait(browser, 10)
+browser.set_window_size(1400, 900)
+
+
+def search():
+ print('正在搜索')
+ try:
+ browser.get('https://www.amazon.cn/')
+ input = wait.until(
+ EC.presence_of_element_located((By.CSS_SELECTOR, '#twotabsearchtextbox'))
+ )
+ submit = wait.until(
+ EC.element_to_be_clickable((By.CSS_SELECTOR, '#nav-search > form > div.nav-right > div > input')))
+ input.send_keys(KEYWORD)
+ submit.click()
+ total = wait.until(
+ EC.presence_of_element_located((By.CSS_SELECTOR, '#pagn > span.pagnDisabled')))
+ get_products()
+ print('一共' + total.text + '页')
+ return total.text
+ except TimeoutException:
+ return search()
+
+
+def next_page(number):
+ print('正在翻页', number)
+ try:
+ wait.until(EC.text_to_be_present_in_element(
+ (By.CSS_SELECTOR, '#pagnNextString'), '下一页'))
+ submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pagnNextString')))
+ submit.click()
+ wait.until(EC.text_to_be_present_in_element(
+ (By.CSS_SELECTOR, '.pagnCur'), str(number)))
+ get_products()
+ except TimeoutException:
+ next_page(number)
+
+
+def get_products():
+ try:
+ wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#s-results-list-atf')))
+ html = browser.page_source
+ soup = BeautifulSoup(html, 'lxml')
+ doc = lxml.html.fromstring(html)
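+        # XPath @class comparison is an exact string match, so the trailing space is deliberate.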
+ date = doc.xpath('//*[@class="s-result-item celwidget "]/div/div[2]/div[1]/span[2]/text()')
+ content = soup.find_all(attrs={"id": re.compile(r'result_\d+')})
+ for item, time in zip(content, date):
+ product = {
+ 'title': item.find(class_='s-access-title').get_text(),
+ 'image': item.find(class_='s-access-image cfMarker').get('src'),
+ 'price': item.find(class_='a-size-base a-color-price s-price a-text-bold').get_text(),
+ 'date': time
+ }
+ save_to_mongo(product)
+ print(product)
+ except Exception as e:
+ print(e)
+
+
+def save_to_mongo(result):
+ try:
+        # insert() is deprecated in pymongo 3.x; insert_one() returns an InsertOneResult.
+        if db[MONGO_TABLE].insert_one(result):
+ print('存储到mongodb成功', result)
+ except Exception:
+ print('存储到mongodb失败', result)
+
+
+def main():
+ try:
+ total = search()
+        total = int(re.compile(r'(\d+)').search(total).group(1))
+ for i in range(2, total + 1):
+ next_page(i)
+ except Exception as e:
+ print('出错啦', e)
+ finally:
+ browser.close()
+
+
+if __name__ == '__main__':
+ main()
diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\233\233\357\274\232\350\216\267\345\217\226\351\273\221\345\244\247\351\252\214\350\257\201\347\240\201\345\271\266\347\231\273\345\275\225.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\233\233\357\274\232\350\216\267\345\217\226\351\273\221\345\244\247\351\252\214\350\257\201\347\240\201\345\271\266\347\231\273\345\275\225.py"
new file mode 100644
index 0000000..9ad7735
--- /dev/null
+++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\233\233\357\274\232\350\216\267\345\217\226\351\273\221\345\244\247\351\252\214\350\257\201\347\240\201\345\271\266\347\231\273\345\275\225.py"
@@ -0,0 +1,27 @@
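+# Log in to the HLJU portal: fetch the captcha image inside a requests session, show it
+# with PIL so it can be typed in manually, then post the login form and print the result.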
+import requests
+from PIL import Image
+from bs4 import BeautifulSoup
+
+url1 = 'http://my.hlju.edu.cn/captchaGenerate.portal?'
+url2 = 'http://my.hlju.edu.cn/userPasswordValidate.portal'
+url3 = 'http://my.hlju.edu.cn/index.portal'
+headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
+}
+s = requests.session()
+response = s.get(url1, headers=headers)
+html = response.text
+soup = BeautifulSoup(html, 'html.parser')
+# Save the captcha image (the img\ directory must already exist), then display it with PIL.
+with open(r'img\code.jpg', 'wb') as f:
+    f.write(response.content)
+img = Image.open(r'img\code.jpg')
+img.show()
+data = {}
+data['Login.Token1'] = '20154433'
+data['Login.Token2'] = '134868'
+data['captcha'] = input('输入验证码:')
+data['goto'] = 'http://my.hlju.edu.cn/loginSuccess.portal'
+data['gotoOnFail'] = 'http://my.hlju.edu.cn/loginFailure.portal'
+response2 = s.post(url=url2, data=data, headers=headers)
+response3 = s.get(url3, headers=headers)
+print(response3.text)