Skip to content

Commit

Permalink
sort
Browse files Browse the repository at this point in the history
  • Loading branch information
rieuse committed May 29, 2017
1 parent d586847 commit 3e2d8f4
Show file tree
Hide file tree
Showing 16 changed files with 622 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import json

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

# MongoDB connection: room documents go into DouyuTV.Roominfo on localhost.
client = MongoClient('localhost')
db = client["DouyuTV"]
col = db["Roominfo"]
# Per-category room-list endpoint; a category short name is appended to it.
host = 'http://api.douyutv.com/api/v1/live/'
# Endpoint returning the list of all game categories.
all_game = 'http://open.douyucdn.cn/api/RoomApi/game'
# Accumulates one room-list URL per category (NOTE: shadows builtin `sort`).
sort = []


def parser(url):
    """Fetch *url* and decode its body as JSON.

    The endpoint returns plain JSON, so it is decoded directly.  The
    previous BeautifulSoup round-trip added overhead and could corrupt
    JSON payloads containing '<' (bs4 would treat them as markup).

    Returns the decoded object (typically a dict with a 'data' key).
    """
    html = requests.get(url).text
    return json.loads(html)


def get_room_sort(url):
    """Populate the module-level `sort` list with one room-list URL per
    game category returned by the category API at *url*."""
    categories = parser(url)['data']
    sort.extend(host + category['short_name'] for category in categories)


def get_room_info():
    """Fetch every room-list URL collected in `sort` and store the
    returned room data in MongoDB.

    Best-effort: a failed insert is reported but does not abort the
    crawl of the remaining categories.
    """
    for url in sort:
        data = parser(url)['data']
        try:
            # Collection.insert() is deprecated in pymongo 3.x; use the
            # explicit single/bulk insert APIs instead.
            if isinstance(data, list):
                col.insert_many(data)
            else:
                col.insert_one(data)
        except Exception as e:
            # Keep the original best-effort behaviour, but surface the
            # reason instead of silently passing.
            print(e)


if __name__ == '__main__':
    # First build the list of per-category URLs, then crawl each one.
    get_room_sort(all_game)
    get_room_info()
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import re
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

HOST = "http://www.douyu.com"
# isAjax=1 makes the server return just the HTML fragment of the listing.
Directory_url = "http://www.douyu.com/directory?isAjax=1"
Qurystr = "/?page=1&isAjax=1"

# Rooms scraped from the directory pages go into Douyu2.Roominfo.
client = MongoClient('localhost')
db = client["Douyu2"]
col = db["Roominfo"]


def get_roominfo(data):
    """Parse one directory-page HTML fragment (*data*) and insert one
    document per live room into MongoDB.

    Each <li> is expected to carry a data-rid attribute plus a title,
    owner/audience spans, a tag span and a lazy-loaded cover image —
    assumes Douyu's 2017 page layout; verify against a live page.
    Rooms that don't match this shape are skipped, with the exception
    printed so parsing regressions stay visible.
    """
    if not data:
        return
    firstpage = BeautifulSoup(data, 'lxml')
    roomlist = firstpage.select('li')
    print(len(roomlist))
    if not roomlist:
        return
    # Compile the "N万" audience pattern once instead of per room.
    wan_pattern = re.compile(r'(\d+)(\.?)(\d*)')
    for room in roomlist:
        try:
            roomid = room["data-rid"]
            # The original stored the title as UTF-8 *bytes*; keep it a
            # str so all stored fields are consistent text.
            roomtitle = room.a["title"]
            roomowner = room.select("p > span")
            roomtag = room.select("div > span")[0].string
            date = datetime.now()
            if len(roomowner) == 2:
                zbname = roomowner[0].string
                audience = roomowner[1].get_text()
                image = room.a.span.img["data-original"]
                if u"万" in audience:
                    # e.g. "3.5万" -> 35000
                    matched = wan_pattern.match(audience).group(0)
                    audience = int(float(matched) * 10000)
                else:
                    audience = int(audience)
                col.insert_one({
                    "roomid": int(roomid),
                    "roomtitle": roomtitle,
                    "anchor": zbname,
                    "audience": audience,
                    "tag": roomtag,
                    "date": date,
                    "img": image,
                })
        except Exception as e:
            print(e)


def insert_info():
    """Fetch the Douyu directory page, follow every category link and
    hand each category's first listing page to get_roominfo()."""
    session = requests.session()
    pagecontent = session.get(Directory_url).text
    pagesoup = BeautifulSoup(pagecontent, 'lxml')
    games = pagesoup.select('a')
    # col.drop()
    for game in games:
        # Not every <a> has an href; indexing would raise KeyError.
        links = game.get("href")
        if not links:
            continue
        gameurl = HOST + links + Qurystr
        print(gameurl)
        gamedata = session.get(gameurl).text
        get_roominfo(gamedata)


# Crawl the whole directory when run as a script.
if __name__ == '__main__':
    insert_info()
Empty file.
23 changes: 23 additions & 0 deletions Python爬虫小的demo/Python爬虫1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import urllib.request
from bs4 import BeautifulSoup
import os

# Download the article page.
url = 'http://www.yidianzixun.com/home?page=article&id=0G5zThN8&up=0'
res = urllib.request.urlopen(url)
html = res.read().decode('utf-8')
# Parse it and collect the first ten image URLs (skip imgs without src).
soup = BeautifulSoup(html, 'html.parser')
result = soup.find_all('img', limit=10)
links = [content.get('src') for content in result if content.get('src')]
# Download and save the images.
if not os.path.exists('photo'):
    os.makedirs('photo')
for i, link in enumerate(links, 1):
    # urlretrieve creates the target file itself; the original wrapped
    # it in an unrelated text-mode open() that only truncated the file.
    # os.path.join keeps the path portable (the '\\' literal was
    # Windows-only).
    filename = os.path.join('photo', 'photo{}.gif'.format(i))
    urllib.request.urlretrieve(link, filename)
22 changes: 22 additions & 0 deletions Python爬虫小的demo/Python爬虫2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import urllib.request
from bs4 import BeautifulSoup
import os

# Download the gallery page and collect the centered images' URLs.
url = 'http://www.8she.com/31988.html'
res = urllib.request.urlopen(url)
html = res.read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
result = soup.find_all(class_='aligncenter', limit=15)
# print(result)
links = [content.get('src') for content in result if content.get('src')]
# Download and save the images.  Raw-ish escaping: the original relied
# on '\爬' not being an escape sequence; spell the backslashes out.
save_dir = 'E:\\rieuse\\爬虫图片\\photo2'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
for i, link in enumerate(links, 1):
    # urlretrieve creates the file itself; the original also opened the
    # target in text mode for no reason, truncating it first.
    filename = os.path.join(save_dir, 'photo{}.jpg'.format(i))
    urllib.request.urlretrieve(link, filename)
37 changes: 37 additions & 0 deletions Python爬虫小的demo/Python爬虫3爬简书7日热门.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# -*-coding:utf-8-*-
import csv
import requests
from bs4 import BeautifulSoup

base_url = 'http://www.jianshu.com/trending/weekly'

articles = []
# Scrape pages 1-6 of the weekly trending list.
for i in range(1, 7):
    url = base_url + '?page={}'.format(i)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    for article in soup.find_all(class_='content'):
        title = article.find(class_='title').get_text()
        link = 'http://www.jianshu.com' + article.find(class_='title').get('href')
        author = article.find(class_='blue-link').get_text()
        time = article.span['data-shared-at']
        meta = article.find(class_='meta').find_all(['a', 'span'])
        metas = [item.get_text().strip() for item in meta]
        read = metas[0]
        comment = metas[1]
        like = metas[2]
        # Articles without a reward have only three meta entries.
        try:
            money = metas[3]
        except IndexError:
            money = '0'
        articles.append([title, author, time, read, comment, like, money, link])

# newline='' prevents blank rows on Windows, and an explicit encoding
# keeps the Chinese header from depending on the platform default.
with open('jianshu.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['文章标题', '作者', '时间', '阅读量', '评论', '喜欢', '赞赏数', '文章地址'])
    writer.writerows(articles)
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import requests
from bs4 import BeautifulSoup
import os

'''
下载图片或者文件也可以使用urlretrieve模块
from urllib import request
request.urlretrieve('','1.jpg')
'''
# proxies = {
#     "http": "http://175.155.240.127:808",
#     "https": "http://114.239.149.110:808",
# }
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Connection': 'keep-alive'}
url = 'http://www.wmpic.me/86253'
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, 'html.parser')
# Collect the src of every centered image in the article body.
links = [img.get('src')
         for img in soup.select('#content > div.content-c > center > img')]
if not os.path.exists('花瓶'):
    os.makedirs('花瓶')
# Fetch each image and write it out as 花瓶\花瓶<n>.jpg.
for i, link in enumerate(links, 1):
    filename = '花瓶\\' + '花瓶' + str(i) + '.jpg'
    ir = requests.get(link)
    with open(filename, 'wb') as fo:
        fo.write(ir.content)
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import urllib.request
import threading
import re
import urllib.error

# Install a desktop-browser User-Agent globally so every urlopen() call
# in both threads sends it.
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)


class One(threading.Thread):
    """Thread that prints the jokes found on the odd-numbered pages."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Compile the joke-extracting pattern once, outside the loop.
        pattern = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        for page in range(1, 36, 2):
            url = "http://www.qiushibaike.com/8hr/page/" + str(page)
            html = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            for idx, joke in enumerate(pattern.findall(html)):
                print("第" + str(page) + "页第" + str(idx) + "个段子的内容是:")
                print(joke)


class Two(threading.Thread):
    """Thread that prints the jokes found on the even-numbered pages
    (its sibling One handles the odd ones)."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Site pages are 1-indexed; the original started at 0 and
        # requested a non-existent page before the real ones.
        pattern = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        for page in range(2, 36, 2):
            url = "http://www.qiushibaike.com/8hr/page/" + str(page)
            html = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
            for idx, joke in enumerate(pattern.findall(html)):
                print("第" + str(page) + "页第" + str(idx) + "个段子的内容是:")
                print(joke)


# Run both halves of the crawl concurrently.
one = One()
one.start()
two = Two()
two.start()
58 changes: 58 additions & 0 deletions Python爬虫小的demo/爬取花瓣妹子缩略图.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import requests
import lxml.html
import os

# Headless PhantomJS: skip image loading, keep the disk cache warm.
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
# Explicit waits time out after 15 seconds.
wait = WebDriverWait(browser, 15)
browser.set_window_size(1400, 900)


def get_url():
    """Open the beauty boards index, collect every board's link and
    title, and hand each pair to dowload()."""
    print('打开主页搜寻链接中...')
    try:
        browser.get('http://huaban.com/boards/favorite/beauty/')
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall')))
        doc = lxml.html.fromstring(browser.page_source)
        names = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
        hrefs = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
        for href, fileName in zip(hrefs, names):
            url = 'http://huaban.com' + href
            print('主链接已找到:' + url)
            # '*' is not allowed in Windows directory names.
            if '*' in fileName:
                fileName = fileName.replace('*', '')
            dowload(url, fileName)
    except Exception as e:
        print(e)


def dowload(url, fileName):
    """Open one board page and save every pin thumbnail on it under
    image2/<fileName>/<n>.jpg."""
    try:
        browser.get(url)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall')))
        html = browser.page_source
        doc = lxml.html.fromstring(html)
        # os.path.join keeps the path portable ('image2\\' was
        # Windows-only).
        folder = os.path.join('image2', fileName)
        if not os.path.exists(folder):
            os.makedirs(folder)
        link = doc.xpath('//*[@id="waterfall"]/div/a/img/@src')
        for i, item in enumerate(link, 1):
            # Thumbnails are scheme-relative ("//img.hb..."), so prefix
            # the protocol.
            ur = 'http:' + item
            print('正在下载第' + str(i) + '张图片,地址:' + ur)
            r = requests.get(ur)
            filename = os.path.join(folder, str(i) + '.jpg')
            with open(filename, 'wb') as fo:
                fo.write(r.content)
    except Exception as e:
        # Report what actually went wrong instead of hiding it.
        print('本次出错了')
        print(e)


# Crawl all boards when run as a script.
if __name__ == '__main__':
    get_url()
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import urllib.request
import os
import re


def douban(url):
    """Download the celebrity headshots from one Douban movie
    "celebrities" page into douban/<name>.jpg.

    Images and names are scraped from the raw HTML with regexes; the
    i-th image URL is paired with the i-th deduplicated title.
    """
    r = urllib.request.urlopen(url)
    html = r.read().decode('utf-8')
    # Escape the dots so '.' can't match arbitrary characters, and make
    # the .jpg part non-greedy so one line can't swallow two URLs.
    result = re.findall(r'https://img\d\.doubanio\.com/img/celebrity/medium/.*?\.jpg', html)
    result2 = re.findall(r'(?<=title=").\S+', html)
    # NOTE(review): these pops drop page-chrome titles the regex also
    # matches; the positions are layout-specific — verify if Douban's
    # template changes.
    result2.pop()
    result3 = sorted(set(result2), key=result2.index)
    result3.pop(-3)
    if not os.path.exists('douban'):
        os.makedirs('douban')
    for i, link in enumerate(result):
        # urlretrieve creates the file itself; the original wrapped it
        # in an unrelated text-mode open() that truncated the file.
        filename = os.path.join('douban', str(result3[i]) + '.jpg')
        urllib.request.urlretrieve(link, filename)


# Default target: the cast page of one specific movie.
url = 'https://movie.douban.com/subject/26260853/celebrities'
if __name__ == '__main__':
    douban(url)
Loading

0 comments on commit 3e2d8f4

Please sign in to comment.