diff --git "a/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\346\226\227\351\261\274API\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\344\277\235\345\255\230\345\210\260Mongodb.py" "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\346\226\227\351\261\274API\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\344\277\235\345\255\230\345\210\260Mongodb.py" new file mode 100644 index 0000000..79aae0a --- /dev/null +++ "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\346\226\227\351\261\274API\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\344\277\235\345\255\230\345\210\260Mongodb.py" @@ -0,0 +1,41 @@ +import json + +import requests +from bs4 import BeautifulSoup +from pymongo import MongoClient + +client = MongoClient('localhost') +db = client["DouyuTV"] +col = db["Roominfo"] +host = 'http://api.douyutv.com/api/v1/live/' +all_game = 'http://open.douyucdn.cn/api/RoomApi/game' +sort = [] + + +def parser(url): + html = requests.get(url).text + soup = BeautifulSoup(html, 'lxml') + jn = json.loads(soup.text) + return jn + + +def get_room_sort(url): + jn = parser(url) + data = jn['data'] + for item in data: + sort.append(host + item['short_name']) + + +def get_room_info(): + for item in sort: + jn = parser(item) + data = jn['data'] + try: + col.insert(data) + except Exception as e: + pass + + +if __name__ == '__main__': + get_room_sort(all_game) + get_room_info() diff --git "a/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\347\275\221\345\235\200\346\236\204\351\200\240\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\345\210\260Mongodb.py" "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\347\275\221\345\235\200\346\236\204\351\200\240\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\345\210\260Mongodb.py" new file mode 100644 index 0000000..360c08a --- /dev/null +++ "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\345\210\251\347\224\250\347\275\221\345\235\200\346\236\204\351\200\240\347\210\254\345\217\226\346\226\227\351\261\274\345\205\250\351\203\250\346\210\277\351\227\264\344\277\241\346\201\257\345\210\260Mongodb.py" @@ -0,0 +1,73 @@ +import re +from datetime import datetime +import requests +from bs4 import BeautifulSoup +from pymongo import MongoClient + +HOST = "http://www.douyu.com" +Directory_url = "http://www.douyu.com/directory?isAjax=1" +Qurystr = "/?page=1&isAjax=1" + +client = MongoClient('localhost') +db = 
client["Douyu2"] +col = db["Roominfo"] + + +def get_roominfo(data): + if data: + firstpage = BeautifulSoup(data, 'lxml') + roomlist = firstpage.select('li') + print(len(roomlist)) + if roomlist: + for room in roomlist: + try: + roomid = room["data-rid"] + roomtitle = room.a["title"] + roomtitle = roomtitle.encode('utf-8') + roomowner = room.select("p > span") + roomtag = room.select("div > span") + roomimg = room.a + roomtag = roomtag[0].string + date = datetime.now() + if len(roomowner) == 2: + zbname = roomowner[0].string + audience = roomowner[1].get_text() + audience = audience.encode('utf-8').decode('utf-8') + image = roomimg.span.img["data-original"] + word = u"万" + if word in audience: + r = re.compile(r'(\d+)(\.?)(\d*)') + data = r.match(audience).group(0) + audience = int(float(data) * 10000) + else: + audience = int(audience) + roominfo = { + "roomid": int(roomid), + "roomtitle": roomtitle, + "anchor": zbname, + "audience": audience, + "tag": roomtag, + "date": date, + "img": image + } + col.insert_one(roominfo) + except Exception as e: + print(e) + + +def insert_info(): + session = requests.session() + pagecontent = session.get(Directory_url).text + pagesoup = BeautifulSoup(pagecontent, 'lxml') + games = pagesoup.select('a') + # col.drop() + for game in games: + links = game["href"] + gameurl = HOST + links + Qurystr + print(gameurl) + gamedata = session.get(gameurl).text + get_roominfo(gamedata) + + +if __name__ == '__main__': + insert_info() diff --git "a/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\346\226\227\351\261\274\347\233\264\346\222\255\346\210\277\351\227\264\346\225\260\346\215\256\345\210\206\346\236\220.py" "b/Python\347\210\254\345\217\226\346\226\227\351\261\274\346\210\277\351\227\264\344\277\241\346\201\257\345\222\214\346\225\260\346\215\256\345\210\206\346\236\220/\346\226\227\351\261\274\347\233\264\346\222\255\346\210\277\351\227\264\346\225\260\346\215\256\345\210\206\346\236\220.py" new file mode 100644 index 0000000..e69de29 diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2531.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2531.py" new file mode 100644 index 0000000..707784f --- /dev/null +++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2531.py" @@ -0,0 +1,23 @@ +import urllib.request +from bs4 import BeautifulSoup +import os + +# 下载网页 +url = 'http://www.yidianzixun.com/home?page=article&id=0G5zThN8&up=0' +res = urllib.request.urlopen(url) +html = res.read().decode('utf-8') +# 解析网页 +soup = BeautifulSoup(html, 'html.parser') +result = soup.find_all('img', limit=10) +links = [] +for content in result: + links.append(content.get('src')) +# 下载并存储图片 +if not os.path.exists('photo'): + os.makedirs('photo') +i = 0 +for link in links: + i += 1 + filename = 'photo\\' + 'photo' + str(i) + '.gif' + with open(filename, 'w') as file: + urllib.request.urlretrieve(link, filename) diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2532.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2532.py" new file mode 100644 index 0000000..a196dae --- /dev/null +++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2532.py" @@ -0,0 +1,22 @@ +import urllib.request +from bs4 import BeautifulSoup 
+import os + +url = 'http://www.8she.com/31988.html' +res = urllib.request.urlopen(url) +html = res.read().decode('utf-8') +soup = BeautifulSoup(html, 'html.parser') +result = soup.find_all(class_='aligncenter', limit=15) +# print(result) +links = [] +for content in result: + links.append(content.get('src')) +# 下载并存储图片 +if not os.path.exists('E:\\rieuse\爬虫图片\photo2'): + os.makedirs('E:\\rieuse\爬虫图片\photo2') +i = 0 +for link in links: + i += 1 + filename = 'E:\\rieuse\爬虫图片\photo2\\' + 'photo' + str(i) + '.jpg' + with open(filename, 'w') as file: + urllib.request.urlretrieve(link, filename) diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2533\347\210\254\347\256\200\344\271\2467\346\227\245\347\203\255\351\227\250.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2533\347\210\254\347\256\200\344\271\2467\346\227\245\347\203\255\351\227\250.py" new file mode 100644 index 0000000..4d0f58e --- /dev/null +++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2533\347\210\254\347\256\200\344\271\2467\346\227\245\347\203\255\351\227\250.py" @@ -0,0 +1,37 @@ +# -*-coding:utf-8-*- +import csv +import requests +from bs4 import BeautifulSoup + +base_url = 'http://www.jianshu.com/trending/weekly' + +articles = [] +data_list = [] +for i in range(1, 7): + url = base_url + '?page={}'.format(i) + r = requests.get(url) + html = r.text + soup = BeautifulSoup(html, 'html.parser') + for article in soup.find_all(class_='content'): + title = article.find(class_='title').get_text() + link = 'http://www.jianshu.com' + article.find(class_='title').get('href') + author = article.find(class_='blue-link').get_text() + time = article.span['data-shared-at'] + meta = article.find(class_='meta').find_all(['a', 'span']) + metas = [] + for item in meta: + metas.append(item.get_text().strip()) + read = metas[0] + comment = metas[1] + like = metas[2] + try: + money = metas[3] + except: + money = '0' + articles.append([title, author, time, read, comment, like, money, link]) + +with open('jianshu.csv', 'w') as f: + writer = csv.writer(f) + writer.writerow(['文章标题', '作者', '时间', '阅读量', '评论', '喜欢', '赞赏数', '文章地址']) + for row in articles: + writer.writerow(row) diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2534\344\275\277\347\224\250bs\347\232\204select\351\200\211\345\217\226\345\233\276\347\211\207\344\272\214\350\277\233\345\210\266\344\277\235\345\255\230\345\233\276\347\211\207\346\226\207\344\273\266.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2534\344\275\277\347\224\250bs\347\232\204select\351\200\211\345\217\226\345\233\276\347\211\207\344\272\214\350\277\233\345\210\266\344\277\235\345\255\230\345\233\276\347\211\207\346\226\207\344\273\266.py" new file mode 100644 index 0000000..52a4dae --- /dev/null +++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2534\344\275\277\347\224\250bs\347\232\204select\351\200\211\345\217\226\345\233\276\347\211\207\344\272\214\350\277\233\345\210\266\344\277\235\345\255\230\345\233\276\347\211\207\346\226\207\344\273\266.py" @@ -0,0 +1,32 @@ +import requests +from bs4 import BeautifulSoup +import os + +''' +下载图片或者文件也可以使用urlretrieve模块 +from urllib import request +request.urlretrieve('','1.jpg') +''' +# proxies = { +# "http": "http://175.155.240.127:808", +# "https": "http://114.239.149.110:808", +# } +headers = { + 
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', + 'Connection': 'keep-alive'} +url = 'http://www.wmpic.me/86253' +r = requests.get(url, headers=headers) +soup = BeautifulSoup(r.text, 'html.parser') +result = soup.select('#content > div.content-c > center > img') +links = [] +for content in result: + links.append(content.get('src')) +if not os.path.exists('花瓶'): + os.makedirs('花瓶') +i = 0 +for link in links: + i += 1 + filename = '花瓶\\' + '花瓶' + str(i) + '.jpg' + ir = requests.get(link) + with open(filename, 'wb') as fo: + fo.write(ir.content) diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2535\345\244\232\347\272\277\347\250\213\347\210\254\345\217\226\347\263\227\344\272\213\347\231\276\347\247\221\350\257\204\350\256\272.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2535\345\244\232\347\272\277\347\250\213\347\210\254\345\217\226\347\263\227\344\272\213\347\231\276\347\247\221\350\257\204\350\256\272.py" new file mode 100644 index 0000000..d18c0cc --- /dev/null +++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/Python\347\210\254\350\231\2535\345\244\232\347\272\277\347\250\213\347\210\254\345\217\226\347\263\227\344\272\213\347\231\276\347\247\221\350\257\204\350\256\272.py" @@ -0,0 +1,46 @@ +import urllib.request +import threading +import re +import urllib.error + +headers = ("User-Agent", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0") +opener = urllib.request.build_opener() +opener.addheaders = [headers] +urllib.request.install_opener(opener) + + +class One(threading.Thread): + def __init__(self): + threading.Thread.__init__(self) + + def run(self): + for i in range(1, 36, 2): + url = "http://www.qiushibaike.com/8hr/page/" + str(i) + pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore") + pat = '
<div class="content">.*?<span>(.*?)</span>.*?</div>
' + datalist = re.compile(pat, re.S).findall(pagedata) + for j in range(0, len(datalist)): + print("第" + str(i) + "页第" + str(j) + "个段子的内容是:") + print(datalist[j]) + + +class Two(threading.Thread): + def __init__(self): + threading.Thread.__init__(self) + + def run(self): + for i in range(0, 36, 2): + url = "http://www.qiushibaike.com/8hr/page/" + str(i) + pagedata = urllib.request.urlopen(url).read().decode("utf-8", "ignore") + pat = '
<div class="content">.*?<span>(.*?)</span>.*?</div>
' + datalist = re.compile(pat, re.S).findall(pagedata) + for j in range(0, len(datalist)): + print("第" + str(i) + "页第" + str(j) + "个段子的内容是:") + print(datalist[j]) + + +one = One() +one.start() +two = Two() +two.start() diff --git "a/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/\347\210\254\345\217\226\350\212\261\347\223\243\345\246\271\345\255\220\347\274\251\347\225\245\345\233\276.py" "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/\347\210\254\345\217\226\350\212\261\347\223\243\345\246\271\345\255\220\347\274\251\347\225\245\345\233\276.py" new file mode 100644 index 0000000..7b381c7 --- /dev/null +++ "b/Python\347\210\254\350\231\253\345\260\217\347\232\204demo/\347\210\254\345\217\226\350\212\261\347\223\243\345\246\271\345\255\220\347\274\251\347\225\245\345\233\276.py" @@ -0,0 +1,58 @@ +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from selenium import webdriver +import requests +import lxml.html +import os + +SERVICE_ARGS = ['--load-images=false', '--disk-cache=true'] +browser = webdriver.PhantomJS(service_args=SERVICE_ARGS) +# browser = webdriver.Firefox() +wait = WebDriverWait(browser, 15) +browser.set_window_size(1400, 900) + + +def get_url(): + print('打开主页搜寻链接中...') + try: + browser.get('http://huaban.com/boards/favorite/beauty/') + wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall'))) + html = browser.page_source + doc = lxml.html.fromstring(html) + name = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()') + u = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href') + for item, fileName in zip(u, name): + url = 'http://huaban.com' + item + print('主链接已找到:' + url) + if '*' in fileName: + fileName = fileName.replace('*', '') + dowload(url, fileName) + except Exception as e: + print(e) + + +def dowload(url, fileName): + try: + browser.get(url) + wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall'))) + html = browser.page_source + doc = lxml.html.fromstring(html) + if not os.path.exists('image2\\' + fileName): + os.makedirs('image2\\' + fileName) + link = doc.xpath('//*[@id="waterfall"]/div/a/img/@src') + i = 0 + for item in link: + i += 1 + ur = 'http:' + item + print('正在下载第' + str(i) + '张图片,地址:' + ur) + r = requests.get(ur) + filename = 'image2\\{}\\'.format(fileName) + str(i) + '.jpg' + with open(filename, 'wb') as fo: + fo.write(r.content) + except Exception: + print('本次出错了') + + +if __name__ == '__main__': + get_url() diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\200\357\274\232\347\210\254\345\217\226\350\261\206\347\223\243\347\224\265\345\275\261\344\270\255\351\200\237\345\272\246\344\270\216\346\277\200\346\203\2058\346\274\224\345\221\230\345\233\276\347\211\207.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\200\357\274\232\347\210\254\345\217\226\350\261\206\347\223\243\347\224\265\345\275\261\344\270\255\351\200\237\345\272\246\344\270\216\346\277\200\346\203\2058\346\274\224\345\221\230\345\233\276\347\211\207.py" new file mode 100644 index 0000000..12241ea --- /dev/null +++ 
"b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\200\357\274\232\347\210\254\345\217\226\350\261\206\347\223\243\347\224\265\345\275\261\344\270\255\351\200\237\345\272\246\344\270\216\346\277\200\346\203\2058\346\274\224\345\221\230\345\233\276\347\211\207.py" @@ -0,0 +1,26 @@ +import urllib.request +import os +import re + + +def douban(url): + r = urllib.request.urlopen(url) + html = r.read().decode('utf-8') + result = re.findall(r'https://img\d.doubanio.com/img/celebrity/medium/.*.jpg', html) + result2 = re.findall(r'(?<=title=").\S+', html) + result2.pop() + result3 = sorted(set(result2), key=result2.index) + result3.pop(-3) + if not os.path.exists('douban'): + os.makedirs('douban') + i = 0 + for link in result: + filename = 'douban\\' + str(result3[i]) + '.jpg' + i += 1 + with open(filename, 'w') as file: + urllib.request.urlretrieve(link, filename) + + +url = 'https://movie.douban.com/subject/26260853/celebrities' +if __name__ == '__main__': + douban(url) diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\203\357\274\232\346\211\271\351\207\217\346\212\223\345\217\226\350\212\261\347\223\243\347\275\221\351\253\230\346\270\205\347\276\216\345\233\276\345\271\266\344\277\235\345\255\230.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\203\357\274\232\346\211\271\351\207\217\346\212\223\345\217\226\350\212\261\347\223\243\347\275\221\351\253\230\346\270\205\347\276\216\345\233\276\345\271\266\344\277\235\345\255\230.py" new file mode 100644 index 0000000..d505575 --- /dev/null +++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\203\357\274\232\346\211\271\351\207\217\346\212\223\345\217\226\350\212\261\347\223\243\347\275\221\351\253\230\346\270\205\347\276\216\345\233\276\345\271\266\344\277\235\345\255\230.py" @@ -0,0 +1,74 @@ +__author__ = '布咯咯_rieuse' + +import os +import lxml.html +import requests +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait + +SERVICE_ARGS = ['--load-images=false', '--disk-cache=true'] +browser = webdriver.PhantomJS(service_args=SERVICE_ARGS) +# browser = webdriver.Firefox() +wait = WebDriverWait(browser, 5) +browser.set_window_size(1400, 900) + + +def parser(url, param): + # 解析模块 + browser.get(url) + wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, param))) + html = browser.page_source + doc = lxml.html.fromstring(html) + return doc + + +def get_main_url(): + print('打开主页搜寻链接中...') + try: + doc = parser('http://huaban.com/boards/favorite/beauty/', '#waterfall') + name = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()') + u = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href') + for item, fileName in zip(u, name): + main_url = 'http://huaban.com' + item + print('主链接已找到' + main_url) + if '*' in fileName: + fileName = fileName.replace('*', '') + download(main_url, fileName) + except Exception as e: + print(e) + + +def download(main_url, fileName): + print('-------准备下载中-------') + try: + doc = parser(main_url, '#waterfall') + if not os.path.exists('image\\' + fileName): + print('创建文件夹...') + 
os.makedirs('image\\' + fileName) + link = doc.xpath('//*[@id="waterfall"]/div/a/@href') + # print(link) + i = 0 + for item in link: + i += 1 + minor_url = 'http://huaban.com' + item + doc = parser(minor_url, '#pin_view_page') + img_url = doc.xpath('//*[@id="baidu_image_holder"]/a/img/@src') + img_url2 = doc.xpath('//*[@id="baidu_image_holder"]/img/@src') + img_url += img_url2 + try: + url = 'http:' + str(img_url[0]) + print('正在下载第' + str(i) + '张图片,地址:' + url) + r = requests.get(url) + filename = 'image\\{}\\'.format(fileName) + str(i) + '.jpg' + with open(filename, 'wb') as fo: + fo.write(r.content) + except Exception: + print('出错了!') + except Exception: + print('出错啦!') + + +if __name__ == '__main__': + get_main_url() diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\211\357\274\232\347\210\254\345\217\226v2ex\346\225\260\346\215\256\347\224\250csv\344\277\235\345\255\230.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\211\357\274\232\347\210\254\345\217\226v2ex\346\225\260\346\215\256\347\224\250csv\344\277\235\345\255\230.py" new file mode 100644 index 0000000..675afcf --- /dev/null +++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\270\211\357\274\232\347\210\254\345\217\226v2ex\346\225\260\346\215\256\347\224\250csv\344\277\235\345\255\230.py" @@ -0,0 +1,20 @@ +import csv, requests, re +from bs4 import BeautifulSoup + +url = 'https://www.v2ex.com/?tab=all' +html = requests.get(url).text +soup = BeautifulSoup(html, 'html.parser') +articles = [] +for article in soup.find_all(class_='cell item'): + title = article.find(class_='item_title').get_text() + category = article.find(class_='node').get_text() + author = re.findall(r'(?<= a') + link = 'https://www.v2ex.com' + re.findall(r'(?<=href=").+(?=")', str(u))[0] + articles.append([title, category, author, link]) + +with open(r'document\v2ex.csv', 'w') as f: + writer = csv.writer(f) + writer.writerow(['文章标题', '分类', '作者', '文章地址']) + for row in articles: + writer.writerow(row) diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\214\357\274\232\344\275\277\347\224\250lxml\350\247\243\346\236\220HTML\357\274\214\350\276\223\345\207\272\345\257\271\345\272\224\345\200\274.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\214\357\274\232\344\275\277\347\224\250lxml\350\247\243\346\236\220HTML\357\274\214\350\276\223\345\207\272\345\257\271\345\272\224\345\200\274.py" new file mode 100644 index 0000000..2ed2425 --- /dev/null +++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\214\357\274\232\344\275\277\347\224\250lxml\350\247\243\346\236\220HTML\357\274\214\350\276\223\345\207\272\345\257\271\345\272\224\345\200\274.py" @@ -0,0 +1,16 @@ +import requests +import lxml.html + +url = 'http://news.ifeng.com/listpage/11502/0/1/rtlist.shtml' +html = requests.get(url).text +doc = lxml.html.fromstring(html) +titles = doc.xpath('//div[@class="newsList"]/ul/li/a/text()') +href = doc.xpath('//div[@class="newsList"]/ul/li/a/@href') +i = 0 +for content in titles: + 
results = { + '标题': titles[i], + '链接': href[i] + } + i += 1 + print(results) diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\224\357\274\232\344\275\277\347\224\250Selenium\347\210\254\345\217\226\344\270\200\347\202\271\350\265\204\350\256\257\345\212\250\346\200\201\346\225\260\346\215\256.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\224\357\274\232\344\275\277\347\224\250Selenium\347\210\254\345\217\226\344\270\200\347\202\271\350\265\204\350\256\257\345\212\250\346\200\201\346\225\260\346\215\256.py" new file mode 100644 index 0000000..7c1855a --- /dev/null +++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\344\272\224\357\274\232\344\275\277\347\224\250Selenium\347\210\254\345\217\226\344\270\200\347\202\271\350\265\204\350\256\257\345\212\250\346\200\201\346\225\260\346\215\256.py" @@ -0,0 +1,26 @@ +from selenium.webdriver.common.keys import Keys +from selenium import webdriver +from bs4 import BeautifulSoup +import csv + +driver = webdriver.Firefox() +driver.implicitly_wait(3) +first_url = 'http://www.yidianzixun.com/channel/c6' +driver.get(first_url) +driver.find_element_by_class_name('icon-refresh').click() +for i in range(1, 90): + driver.find_element_by_class_name('icon-refresh').send_keys(Keys.DOWN) +soup = BeautifulSoup(driver.page_source, 'lxml') +articles = [] +for article in soup.find_all(class_='item doc style-small-image style-content-middle'): + title = article.find(class_='doc-title').get_text() + source = article.find(class_='source').get_text() + comment = article.find(class_='comment-count').get_text() + link = 'http://www.yidianzixun.com' + article.get('href') + articles.append([title, source, comment, link]) +driver.quit() +with open(r'document\yidian.csv', 'w') as f: + writer = csv.writer(f) + writer.writerow(['文章标题', '作者', '评论数', '文章地址']) + for row in articles: + writer.writerow(row) diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\205\255\357\274\232Selenium+xpath+bs4\347\210\254\345\217\226\344\272\232\351\251\254\351\200\212\346\225\260\346\215\256\344\277\235\345\255\230\345\210\260mongodb.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\205\255\357\274\232Selenium+xpath+bs4\347\210\254\345\217\226\344\272\232\351\251\254\351\200\212\346\225\260\346\215\256\344\277\235\345\255\230\345\210\260mongodb.py" new file mode 100644 index 0000000..c946b6f --- /dev/null +++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\205\255\357\274\232Selenium+xpath+bs4\347\210\254\345\217\226\344\272\232\351\251\254\351\200\212\346\225\260\346\215\256\344\277\235\345\255\230\345\210\260mongodb.py" @@ -0,0 +1,101 @@ +from selenium.common.exceptions import TimeoutException +from selenium.webdriver.common.by import By +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.support.ui import WebDriverWait +from selenium import webdriver +from bs4 import BeautifulSoup +import lxml.html +import pymongo +import re + +MONGO_URL = 'localhost' +MONGO_DB = 'amazon' 
+MONGO_TABLE = 'amazon-python' +SERVICE_ARGS = ['--load-images=false', '--disk-cache=true'] +KEYWORD = 'python' +client = pymongo.MongoClient(MONGO_URL) +db = client[MONGO_DB] + +browser = webdriver.PhantomJS(service_args=SERVICE_ARGS) +# browser = webdriver.Firefox() +wait = WebDriverWait(browser, 10) +browser.set_window_size(1400, 900) + + +def search(): + print('正在搜索') + try: + browser.get('https://www.amazon.cn/') + input = wait.until( + EC.presence_of_element_located((By.CSS_SELECTOR, '#twotabsearchtextbox')) + ) + submit = wait.until( + EC.element_to_be_clickable((By.CSS_SELECTOR, '#nav-search > form > div.nav-right > div > input'))) + input.send_keys(KEYWORD) + submit.click() + total = wait.until( + EC.presence_of_element_located((By.CSS_SELECTOR, '#pagn > span.pagnDisabled'))) + get_products() + print('一共' + total.text + '页') + return total.text + except TimeoutException: + return search() + + +def next_page(number): + print('正在翻页', number) + try: + wait.until(EC.text_to_be_present_in_element( + (By.CSS_SELECTOR, '#pagnNextString'), '下一页')) + submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#pagnNextString'))) + submit.click() + wait.until(EC.text_to_be_present_in_element( + (By.CSS_SELECTOR, '.pagnCur'), str(number))) + get_products() + except TimeoutException: + next_page(number) + + +def get_products(): + try: + wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#s-results-list-atf'))) + html = browser.page_source + soup = BeautifulSoup(html, 'lxml') + doc = lxml.html.fromstring(html) + date = doc.xpath('//*[@class="s-result-item celwidget "]/div/div[2]/div[1]/span[2]/text()') + content = soup.find_all(attrs={"id": re.compile(r'result_\d+')}) + for item, time in zip(content, date): + product = { + 'title': item.find(class_='s-access-title').get_text(), + 'image': item.find(class_='s-access-image cfMarker').get('src'), + 'price': item.find(class_='a-size-base a-color-price s-price a-text-bold').get_text(), + 'date': time + } + save_to_mongo(product) + print(product) + except Exception as e: + print(e) + + +def save_to_mongo(result): + try: + if db[MONGO_TABLE].insert(result): + print('存储到mongodb成功', result) + except Exception: + print('存储到mongodb失败', result) + + +def main(): + try: + total = search() + total = int(re.compile('(\d+)').search(total).group(1)) + for i in range(2, total + 1): + next_page(i) + except Exception as e: + print('出错啦', e) + finally: + browser.close() + + +if __name__ == '__main__': + main() diff --git "a/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\233\233\357\274\232\350\216\267\345\217\226\351\273\221\345\244\247\351\252\214\350\257\201\347\240\201\345\271\266\347\231\273\345\275\225.py" "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\233\233\357\274\232\350\216\267\345\217\226\351\273\221\345\244\247\351\252\214\350\257\201\347\240\201\345\271\266\347\231\273\345\275\225.py" new file mode 100644 index 0000000..9ad7735 --- /dev/null +++ "b/Python\347\210\254\350\231\253\346\227\245\350\256\260\347\263\273\345\210\227/Python\347\210\254\350\231\253\346\227\245\350\256\260\345\233\233\357\274\232\350\216\267\345\217\226\351\273\221\345\244\247\351\252\214\350\257\201\347\240\201\345\271\266\347\231\273\345\275\225.py" @@ -0,0 +1,27 @@ +import requests +from PIL import Image +from bs4 import BeautifulSoup + +url1 = 
'http://my.hlju.edu.cn/captchaGenerate.portal?'
+url2 = 'http://my.hlju.edu.cn/userPasswordValidate.portal'
+url3 = 'http://my.hlju.edu.cn/index.portal'
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
+}
+s = requests.session()
+response = s.get(url1, headers=headers)
+html = response.text
+soup = BeautifulSoup(html, 'html.parser')
+with open('img\\code.jpg', 'wb') as f:
+    f.write(response.content)
+img = Image.open('img\\code.jpg')
+img.show()
+data = {}
+data['Login.Token1'] = '20154433'
+data['Login.Token2'] = '134868'
+data['captcha'] = input('输入验证码:')
+data['goto'] = 'http://my.hlju.edu.cn/loginSuccess.portal'
+data['gotoOnFail'] = 'http://my.hlju.edu.cn/loginFailure.portal'
+response2 = s.post(url=url2, data=data, headers=headers)
+response3 = s.get(url3, headers=headers)
+print(response3.text)
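
Several of the scripts above pair with open(filename, 'w') with urllib.request.urlretrieve(...), leaving the opened text-mode handle unused, and some call pymongo's Collection.insert(), which is deprecated in pymongo 3 and removed in pymongo 4. Below is a minimal sketch of the equivalent idiomatic calls, assuming requests and pymongo 3.x are available; the helper names download_image and save_rooms are illustrative and not part of the repository.

import os
import requests
from pymongo import MongoClient


def download_image(url, filename):
    # Fetch the image with requests and write the raw bytes; there is no need
    # to wrap urlretrieve in an extra text-mode open() call.
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    r = requests.get(url, timeout=10)
    r.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(r.content)


def save_rooms(col, data):
    # pymongo 3.x: insert_many() stores a list of documents, insert_one() a
    # single document; both replace the deprecated Collection.insert().
    if isinstance(data, list):
        col.insert_many(data)
    else:
        col.insert_one(data)


# Usage sketch (database and collection names taken from the first script above;
# the URL and document fields are placeholders):
col = MongoClient('localhost')['DouyuTV']['Roominfo']
download_image('http://example.com/a.jpg', 'photo/photo1.jpg')
save_rooms(col, {'room_id': 1})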