forked from rieuse/learnPython
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
16 changed files
with
622 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import json | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
from pymongo import MongoClient | ||
|
||
# MongoDB handles: room documents land in DouyuTV.Roominfo.
client = MongoClient('localhost')
db = client["DouyuTV"]
col = db["Roominfo"]

# REST endpoints: per-category room list host and the category (game) index.
host = 'http://api.douyutv.com/api/v1/live/'
all_game = 'http://open.douyucdn.cn/api/RoomApi/game'

# Filled by get_room_sort(): one room-list API URL per game category.
sort = []
|
||
|
||
def parser(url):
    """Fetch *url* and return its body decoded as JSON."""
    body = requests.get(url).text
    # BeautifulSoup strips any markup wrapper before the JSON decode.
    return json.loads(BeautifulSoup(body, 'lxml').text)
|
||
|
||
def get_room_sort(url):
    """Populate the module-level ``sort`` list with one API URL per game category."""
    categories = parser(url)['data']
    sort.extend(host + category['short_name'] for category in categories)
|
||
|
||
def get_room_info():
    """Fetch every category URL collected in ``sort`` and store the rooms in MongoDB.

    Best-effort: a failed insert for one category is reported and skipped so the
    crawl continues with the remaining categories.
    """
    for url in sort:
        data = parser(url)['data']
        try:
            # BUG FIX: Collection.insert() was deprecated and removed in
            # PyMongo 3+; the API returns a list of room dicts, so
            # insert_many() is the matching bulk call.
            col.insert_many(data)
        except Exception as e:
            # BUG FIX: was `pass`, which silently dropped every failure;
            # report it so bad categories are at least visible.
            print('insert failed for', url, ':', e)
|
||
|
||
if __name__ == '__main__':
    # Build the category URL list first, then crawl each category's rooms.
    get_room_sort(all_game)
    get_room_info()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import re | ||
from datetime import datetime | ||
import requests | ||
from bs4 import BeautifulSoup | ||
from pymongo import MongoClient | ||
|
||
# Douyu directory endpoints: the Ajax directory index plus the per-game
# room-list query string (first page only).
HOST = "http://www.douyu.com"
Directory_url = "http://www.douyu.com/directory?isAjax=1"
Qurystr = "/?page=1&isAjax=1"

# MongoDB target collection: Douyu2.Roominfo.
client = MongoClient('localhost')
db = client["Douyu2"]
col = db["Roominfo"]
|
||
|
||
def get_roominfo(data):
    """Parse one category page (Ajax HTML fragment) and insert each room into MongoDB.

    ``data`` is the HTML returned by the Douyu directory Ajax endpoint; every
    ``<li>`` element is one live room. Rooms whose owner/audience spans do not
    come in the expected pair are skipped.
    """
    if not data:
        return
    page = BeautifulSoup(data, 'lxml')
    rooms = page.select('li')
    print(len(rooms))
    for room in rooms:
        try:
            roomid = room["data-rid"]
            # BUG FIX: the title was stored via .encode('utf-8'), a Python 2
            # leftover that put *bytes* in MongoDB; keep it as str.
            roomtitle = room.a["title"]
            roomowner = room.select("p > span")
            roomtag = room.select("div > span")[0].string
            if len(roomowner) == 2:
                anchor = roomowner[0].string
                audience = roomowner[1].get_text()
                image = room.a.span.img["data-original"]
                # "1.2万"-style counts: 万 means ten thousand.
                if u"万" in audience:
                    # BUG FIX: the match result used to be assigned to `data`,
                    # shadowing the function parameter.
                    count = re.compile(r'(\d+)(\.?)(\d*)').match(audience).group(0)
                    audience = int(float(count) * 10000)
                else:
                    audience = int(audience)
                # BUG FIX: the document is now built *inside* the length
                # check, so `anchor`/`image` can never be undefined or stale
                # values left over from a previous iteration.
                col.insert_one({
                    "roomid": int(roomid),
                    "roomtitle": roomtitle,
                    "anchor": anchor,
                    "audience": audience,
                    "tag": roomtag,
                    "date": datetime.now(),
                    "img": image,
                })
        except Exception as e:
            # Best-effort scrape: report and continue with the next room.
            print(e)
|
||
|
||
def insert_info():
    """Walk every category link on the directory page and scrape its room list."""
    session = requests.session()
    directory = BeautifulSoup(session.get(Directory_url).text, 'lxml')
    # col.drop()
    for anchor in directory.select('a'):
        category_url = HOST + anchor["href"] + Qurystr
        print(category_url)
        get_roominfo(session.get(category_url).text)
|
||
|
||
if __name__ == '__main__':
    # Entry point: crawl the whole directory once.
    insert_info()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import urllib.request
from bs4 import BeautifulSoup
import os

# Download the article page.
url = 'http://www.yidianzixun.com/home?page=article&id=0G5zThN8&up=0'
html = urllib.request.urlopen(url).read().decode('utf-8')

# Parse it and collect the first 10 image sources.
soup = BeautifulSoup(html, 'html.parser')
links = [img.get('src') for img in soup.find_all('img', limit=10)]

# Save the images.
if not os.path.exists('photo'):
    os.makedirs('photo')
for i, link in enumerate(links, start=1):
    # BUG FIX: the original wrapped urlretrieve in `with open(filename, 'w')`,
    # which truncated the very file urlretrieve was writing (and held a
    # conflicting text-mode handle on Windows). urlretrieve opens and writes
    # the target file itself.  os.path.join replaces the hard-coded '\\'
    # separator so the script also works outside Windows.
    filename = os.path.join('photo', 'photo{}.gif'.format(i))
    urllib.request.urlretrieve(link, filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import urllib.request
from bs4 import BeautifulSoup
import os

# Fetch and parse the gallery page, collecting up to 15 centered images.
url = 'http://www.8she.com/31988.html'
html = urllib.request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
links = [tag.get('src') for tag in soup.find_all(class_='aligncenter', limit=15)]

# Save the images (same target directory value as before, escapes normalized).
save_dir = 'E:\\rieuse\\爬虫图片\\photo2'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
for i, link in enumerate(links, start=1):
    filename = os.path.join(save_dir, 'photo{}.jpg'.format(i))
    # BUG FIX: dropped the surrounding `with open(filename, 'w')` — it
    # truncated the file urlretrieve was writing; urlretrieve manages the
    # target file itself.
    urllib.request.urlretrieve(link, filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# -*-coding:utf-8-*-
"""Scrape six pages of jianshu's weekly trending list into jianshu.csv."""
import csv
import requests
from bs4 import BeautifulSoup

base_url = 'http://www.jianshu.com/trending/weekly'

articles = []
for page in range(1, 7):
    html = requests.get(base_url + '?page={}'.format(page)).text
    soup = BeautifulSoup(html, 'html.parser')
    for article in soup.find_all(class_='content'):
        title_tag = article.find(class_='title')
        title = title_tag.get_text()
        link = 'http://www.jianshu.com' + title_tag.get('href')
        author = article.find(class_='blue-link').get_text()
        time = article.span['data-shared-at']
        metas = [item.get_text().strip()
                 for item in article.find(class_='meta').find_all(['a', 'span'])]
        read, comment, like = metas[0], metas[1], metas[2]
        # Not every article carries a reward ("赞赏") count.
        try:
            money = metas[3]
        except IndexError:  # BUG FIX: was a bare `except:` hiding all errors
            money = '0'
        articles.append([title, author, time, read, comment, like, money, link])

# BUG FIX: newline='' is required by the csv module (avoids blank rows on
# Windows) and an explicit utf-8 encoding keeps the Chinese headers portable.
with open('jianshu.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['文章标题', '作者', '时间', '阅读量', '评论', '喜欢', '赞赏数', '文章地址'])
    writer.writerows(articles)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import requests
from bs4 import BeautifulSoup
import os

# Note: urllib.request.urlretrieve('', '1.jpg') is an alternative way to
# download a file without requests.
# proxies = {
#     "http": "http://175.155.240.127:808",
#     "https": "http://114.239.149.110:808",
# }
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Connection': 'keep-alive'}
url = 'http://www.wmpic.me/86253'

# Fetch the gallery page and pull every image inside the centered content div.
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
links = [img.get('src') for img in soup.select('#content > div.content-c > center > img')]

# Save each image as 花瓶\花瓶<n>.jpg.
if not os.path.exists('花瓶'):
    os.makedirs('花瓶')
for index, link in enumerate(links, start=1):
    target = '花瓶\\' + '花瓶' + str(index) + '.jpg'
    image = requests.get(link)
    with open(target, 'wb') as fo:
        fo.write(image.content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import urllib.request | ||
import threading | ||
import re | ||
import urllib.error | ||
|
||
# Install a global opener so every urllib.request.urlopen() call carries a
# browser-like User-Agent header.
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
|
||
|
||
class One(threading.Thread):
    """Thread that scrapes the odd-numbered joke pages (1, 3, ..., 35)."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Compile once; the same pattern is reused for every page.
        pattern = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        for page in range(1, 36, 2):
            page_url = "http://www.qiushibaike.com/8hr/page/" + str(page)
            body = urllib.request.urlopen(page_url).read().decode("utf-8", "ignore")
            for index, joke in enumerate(pattern.findall(body)):
                print("第" + str(page) + "页第" + str(index) + "个段子的内容是:")
                print(joke)
|
||
|
||
class Two(threading.Thread):
    """Thread that scrapes the even-numbered joke pages (0, 2, ..., 34).

    NOTE(review): the range starts at page 0 — confirm the site actually
    serves a page 0 rather than starting at page 1.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        pattern = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        for page in range(0, 36, 2):
            page_url = "http://www.qiushibaike.com/8hr/page/" + str(page)
            body = urllib.request.urlopen(page_url).read().decode("utf-8", "ignore")
            for index, joke in enumerate(pattern.findall(body)):
                print("第" + str(page) + "页第" + str(index) + "个段子的内容是:")
                print(joke)
|
||
|
||
# Run both scrapers concurrently: one covers odd pages, the other even pages.
one = One()
one.start()
two = Two()
two.start()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import requests
import lxml.html
import os

# Headless PhantomJS session shared by get_url() and dowload().
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 15)
browser.set_window_size(1400, 900)
|
||
|
||
def get_url():
    """Open the favourites board page, extract each pin's name and link, and download it."""
    print('打开主页搜寻链接中...')
    try:
        browser.get('http://huaban.com/boards/favorite/beauty/')
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall')))
        doc = lxml.html.fromstring(browser.page_source)
        names = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
        hrefs = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
        for href, board_name in zip(hrefs, names):
            board_url = 'http://huaban.com' + href
            print('主链接已找到:' + board_url)
            # '*' is not a legal character in a Windows directory name.
            dowload(board_url, board_name.replace('*', ''))
    except Exception as e:
        print(e)
|
||
|
||
def dowload(url, fileName):
    """Visit one board page and save every pinned image under image2\\<fileName>."""
    try:
        browser.get(url)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall')))
        doc = lxml.html.fromstring(browser.page_source)
        folder = 'image2\\' + fileName
        if not os.path.exists(folder):
            os.makedirs(folder)
        sources = doc.xpath('//*[@id="waterfall"]/div/a/img/@src')
        for index, src in enumerate(sources, start=1):
            image_url = 'http:' + src
            print('正在下载第' + str(index) + '张图片,地址:' + image_url)
            response = requests.get(image_url)
            target = 'image2\\{}\\'.format(fileName) + str(index) + '.jpg'
            with open(target, 'wb') as fo:
                fo.write(response.content)
    except Exception:
        print('本次出错了')
|
||
|
||
if __name__ == '__main__':
    # Crawl the whole favourites board once.
    get_url()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import urllib.request | ||
import os | ||
import re | ||
|
||
|
||
def douban(url):
    """Download every cast-member portrait from a Douban movie "celebrities" page.

    Portraits are saved to douban\\<name>.jpg, where <name> is taken from the
    page's ``title="..."`` attributes.
    """
    html = urllib.request.urlopen(url).read().decode('utf-8')
    # BUG FIX: the original pattern used unescaped '.' and a greedy '.*', so a
    # line containing two portrait URLs collapsed into a single (invalid)
    # match; escape the dots and match lazily up to the first '.jpg'.
    portraits = re.findall(r'https://img\d\.doubanio\.com/img/celebrity/medium/.*?\.jpg', html)
    # Names come from title="..." attributes; trim non-name matches.
    titles = re.findall(r'(?<=title=").\S+', html)
    # NOTE(review): these pops assume a fixed page layout (a trailing
    # non-name match and the page title at index -3 of the deduped list) —
    # fragile against site changes; verify if the page template moves.
    titles.pop()
    names = sorted(set(titles), key=titles.index)
    names.pop(-3)
    if not os.path.exists('douban'):
        os.makedirs('douban')
    for i, link in enumerate(portraits):
        filename = 'douban\\' + str(names[i]) + '.jpg'
        # BUG FIX: the original opened `filename` in 'w' mode around
        # urlretrieve, truncating the very file urlretrieve writes;
        # urlretrieve manages the target file itself.
        urllib.request.urlretrieve(link, filename)
|
||
|
||
# Target movie: its "celebrities" page lists the cast portraits to fetch.
url = 'https://movie.douban.com/subject/26260853/celebrities'
if __name__ == '__main__':
    douban(url)
Oops, something went wrong.