forked from rieuse/learnPython
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
16 changed files
with
622 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import json | ||
|
||
import requests | ||
from bs4 import BeautifulSoup | ||
from pymongo import MongoClient | ||
|
||
# MongoDB handles: room documents land in DouyuTV.Roominfo.
client = MongoClient('localhost')
db = client["DouyuTV"]
col = db["Roominfo"]

# REST endpoints: per-category room list host and the category (game) index.
host = 'http://api.douyutv.com/api/v1/live/'
all_game = 'http://open.douyucdn.cn/api/RoomApi/game'

# Filled by get_room_sort(): one room-list API URL per game category.
sort = []
|
||
|
||
def parser(url):
    """Fetch *url* and return its body decoded as JSON."""
    body = requests.get(url).text
    # BeautifulSoup strips any markup wrapper before the JSON decode.
    return json.loads(BeautifulSoup(body, 'lxml').text)
|
||
|
||
def get_room_sort(url):
    """Populate the module-level ``sort`` list with one API URL per game category."""
    categories = parser(url)['data']
    sort.extend(host + category['short_name'] for category in categories)
|
||
|
||
def get_room_info():
    """Fetch every category URL collected in ``sort`` and store the rooms in MongoDB.

    Best-effort: a failed insert for one category is reported and skipped so the
    crawl continues with the remaining categories.
    """
    for url in sort:
        data = parser(url)['data']
        try:
            # BUG FIX: Collection.insert() was deprecated and removed in
            # PyMongo 3+; the API returns a list of room dicts, so
            # insert_many() is the matching bulk call.
            col.insert_many(data)
        except Exception as e:
            # BUG FIX: was `pass`, which silently dropped every failure;
            # report it so bad categories are at least visible.
            print('insert failed for', url, ':', e)
|
||
|
||
if __name__ == '__main__':
    # Build the category URL list first, then crawl each category's rooms.
    get_room_sort(all_game)
    get_room_info()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import re | ||
from datetime import datetime | ||
import requests | ||
from bs4 import BeautifulSoup | ||
from pymongo import MongoClient | ||
|
||
# Douyu directory endpoints: the Ajax directory index plus the per-game
# room-list query string (first page only).
HOST = "http://www.douyu.com"
Directory_url = "http://www.douyu.com/directory?isAjax=1"
Qurystr = "/?page=1&isAjax=1"

# MongoDB target collection: Douyu2.Roominfo.
client = MongoClient('localhost')
db = client["Douyu2"]
col = db["Roominfo"]
|
||
|
||
def get_roominfo(data):
    """Parse one category page (Ajax HTML fragment) and insert each room into MongoDB.

    ``data`` is the HTML returned by the Douyu directory Ajax endpoint; every
    ``<li>`` element is one live room. Rooms whose owner/audience spans do not
    come in the expected pair are skipped.
    """
    if not data:
        return
    page = BeautifulSoup(data, 'lxml')
    rooms = page.select('li')
    print(len(rooms))
    for room in rooms:
        try:
            roomid = room["data-rid"]
            # BUG FIX: the title was stored via .encode('utf-8'), a Python 2
            # leftover that put *bytes* in MongoDB; keep it as str.
            roomtitle = room.a["title"]
            roomowner = room.select("p > span")
            roomtag = room.select("div > span")[0].string
            if len(roomowner) == 2:
                anchor = roomowner[0].string
                audience = roomowner[1].get_text()
                image = room.a.span.img["data-original"]
                # "1.2万"-style counts: 万 means ten thousand.
                if u"万" in audience:
                    # BUG FIX: the match result used to be assigned to `data`,
                    # shadowing the function parameter.
                    count = re.compile(r'(\d+)(\.?)(\d*)').match(audience).group(0)
                    audience = int(float(count) * 10000)
                else:
                    audience = int(audience)
                # BUG FIX: the document is now built *inside* the length
                # check, so `anchor`/`image` can never be undefined or stale
                # values left over from a previous iteration.
                col.insert_one({
                    "roomid": int(roomid),
                    "roomtitle": roomtitle,
                    "anchor": anchor,
                    "audience": audience,
                    "tag": roomtag,
                    "date": datetime.now(),
                    "img": image,
                })
        except Exception as e:
            # Best-effort scrape: report and continue with the next room.
            print(e)
|
||
|
||
def insert_info():
    """Walk every category link on the directory page and scrape its room list."""
    session = requests.session()
    directory = BeautifulSoup(session.get(Directory_url).text, 'lxml')
    # col.drop()
    for anchor in directory.select('a'):
        category_url = HOST + anchor["href"] + Qurystr
        print(category_url)
        get_roominfo(session.get(category_url).text)
|
||
|
||
if __name__ == '__main__':
    # Entry point: crawl the whole directory once.
    insert_info()
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import urllib.request
from bs4 import BeautifulSoup
import os

# Download the article page.
url = 'http://www.yidianzixun.com/home?page=article&id=0G5zThN8&up=0'
html = urllib.request.urlopen(url).read().decode('utf-8')

# Parse it and collect the first 10 image sources.
soup = BeautifulSoup(html, 'html.parser')
links = [img.get('src') for img in soup.find_all('img', limit=10)]

# Save the images.
if not os.path.exists('photo'):
    os.makedirs('photo')
for i, link in enumerate(links, start=1):
    # BUG FIX: the original wrapped urlretrieve in `with open(filename, 'w')`,
    # which truncated the very file urlretrieve was writing (and held a
    # conflicting text-mode handle on Windows). urlretrieve opens and writes
    # the target file itself.  os.path.join replaces the hard-coded '\\'
    # separator so the script also works outside Windows.
    filename = os.path.join('photo', 'photo{}.gif'.format(i))
    urllib.request.urlretrieve(link, filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import urllib.request
from bs4 import BeautifulSoup
import os

# Fetch and parse the gallery page, collecting up to 15 centered images.
url = 'http://www.8she.com/31988.html'
html = urllib.request.urlopen(url).read().decode('utf-8')
soup = BeautifulSoup(html, 'html.parser')
links = [tag.get('src') for tag in soup.find_all(class_='aligncenter', limit=15)]

# Save the images (same target directory value as before, escapes normalized).
save_dir = 'E:\\rieuse\\爬虫图片\\photo2'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
for i, link in enumerate(links, start=1):
    filename = os.path.join(save_dir, 'photo{}.jpg'.format(i))
    # BUG FIX: dropped the surrounding `with open(filename, 'w')` — it
    # truncated the file urlretrieve was writing; urlretrieve manages the
    # target file itself.
    urllib.request.urlretrieve(link, filename)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
# -*-coding:utf-8-*-
"""Scrape six pages of jianshu's weekly trending list into jianshu.csv."""
import csv
import requests
from bs4 import BeautifulSoup

base_url = 'http://www.jianshu.com/trending/weekly'

articles = []
for page in range(1, 7):
    html = requests.get(base_url + '?page={}'.format(page)).text
    soup = BeautifulSoup(html, 'html.parser')
    for article in soup.find_all(class_='content'):
        title_tag = article.find(class_='title')
        title = title_tag.get_text()
        link = 'http://www.jianshu.com' + title_tag.get('href')
        author = article.find(class_='blue-link').get_text()
        time = article.span['data-shared-at']
        metas = [item.get_text().strip()
                 for item in article.find(class_='meta').find_all(['a', 'span'])]
        read, comment, like = metas[0], metas[1], metas[2]
        # Not every article carries a reward ("赞赏") count.
        try:
            money = metas[3]
        except IndexError:  # BUG FIX: was a bare `except:` hiding all errors
            money = '0'
        articles.append([title, author, time, read, comment, like, money, link])

# BUG FIX: newline='' is required by the csv module (avoids blank rows on
# Windows) and an explicit utf-8 encoding keeps the Chinese headers portable.
with open('jianshu.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['文章标题', '作者', '时间', '阅读量', '评论', '喜欢', '赞赏数', '文章地址'])
    writer.writerows(articles)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import requests
from bs4 import BeautifulSoup
import os

# Note: urllib.request.urlretrieve('', '1.jpg') is an alternative way to
# download a file without requests.
# proxies = {
#     "http": "http://175.155.240.127:808",
#     "https": "http://114.239.149.110:808",
# }
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
    'Connection': 'keep-alive'}
url = 'http://www.wmpic.me/86253'

# Fetch the gallery page and pull every image inside the centered content div.
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
links = [img.get('src') for img in soup.select('#content > div.content-c > center > img')]

# Save each image as 花瓶\花瓶<n>.jpg.
if not os.path.exists('花瓶'):
    os.makedirs('花瓶')
for index, link in enumerate(links, start=1):
    target = '花瓶\\' + '花瓶' + str(index) + '.jpg'
    image = requests.get(link)
    with open(target, 'wb') as fo:
        fo.write(image.content)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import urllib.request | ||
import threading | ||
import re | ||
import urllib.error | ||
|
||
# Install a global opener so every urllib.request.urlopen() call carries a
# browser-like User-Agent header.
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)
|
||
|
||
class One(threading.Thread):
    """Thread that scrapes the odd-numbered joke pages (1, 3, ..., 35)."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        # Compile once; the same pattern is reused for every page.
        pattern = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        for page in range(1, 36, 2):
            page_url = "http://www.qiushibaike.com/8hr/page/" + str(page)
            body = urllib.request.urlopen(page_url).read().decode("utf-8", "ignore")
            for index, joke in enumerate(pattern.findall(body)):
                print("第" + str(page) + "页第" + str(index) + "个段子的内容是:")
                print(joke)
|
||
|
||
class Two(threading.Thread):
    """Thread that scrapes the even-numbered joke pages (0, 2, ..., 34).

    NOTE(review): the range starts at page 0 — confirm the site actually
    serves a page 0 rather than starting at page 1.
    """

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        pattern = re.compile('<div class="content">.*?<span>(.*?)</span>.*?</div>', re.S)
        for page in range(0, 36, 2):
            page_url = "http://www.qiushibaike.com/8hr/page/" + str(page)
            body = urllib.request.urlopen(page_url).read().decode("utf-8", "ignore")
            for index, joke in enumerate(pattern.findall(body)):
                print("第" + str(page) + "页第" + str(index) + "个段子的内容是:")
                print(joke)
|
||
|
||
# Run both scrapers concurrently: one covers odd pages, the other even pages.
one = One()
one.start()
two = Two()
two.start()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
import requests
import lxml.html
import os

# Headless PhantomJS session shared by get_url() and dowload().
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
# browser = webdriver.Firefox()
wait = WebDriverWait(browser, 15)
browser.set_window_size(1400, 900)
|
||
|
||
def get_url():
    """Open the favourites board page, extract each pin's name and link, and download it."""
    print('打开主页搜寻链接中...')
    try:
        browser.get('http://huaban.com/boards/favorite/beauty/')
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall')))
        doc = lxml.html.fromstring(browser.page_source)
        names = doc.xpath('//*[@id="waterfall"]/div/a[1]/div[2]/h3/text()')
        hrefs = doc.xpath('//*[@id="waterfall"]/div/a[1]/@href')
        for href, board_name in zip(hrefs, names):
            board_url = 'http://huaban.com' + href
            print('主链接已找到:' + board_url)
            # '*' is not a legal character in a Windows directory name.
            dowload(board_url, board_name.replace('*', ''))
    except Exception as e:
        print(e)
|
||
|
||
def dowload(url, fileName):
    """Visit one board page and save every pinned image under image2\\<fileName>."""
    try:
        browser.get(url)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#waterfall')))
        doc = lxml.html.fromstring(browser.page_source)
        folder = 'image2\\' + fileName
        if not os.path.exists(folder):
            os.makedirs(folder)
        sources = doc.xpath('//*[@id="waterfall"]/div/a/img/@src')
        for index, src in enumerate(sources, start=1):
            image_url = 'http:' + src
            print('正在下载第' + str(index) + '张图片,地址:' + image_url)
            response = requests.get(image_url)
            target = 'image2\\{}\\'.format(fileName) + str(index) + '.jpg'
            with open(target, 'wb') as fo:
                fo.write(response.content)
    except Exception:
        print('本次出错了')
|
||
|
||
if __name__ == '__main__':
    # Crawl the whole favourites board once.
    get_url()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import urllib.request | ||
import os | ||
import re | ||
|
||
|
||
def douban(url):
    """Download every cast-member portrait from a Douban movie "celebrities" page.

    Portraits are saved to douban\\<name>.jpg, where <name> is taken from the
    page's ``title="..."`` attributes.
    """
    html = urllib.request.urlopen(url).read().decode('utf-8')
    # BUG FIX: the original pattern used unescaped '.' and a greedy '.*', so a
    # line containing two portrait URLs collapsed into a single (invalid)
    # match; escape the dots and match lazily up to the first '.jpg'.
    portraits = re.findall(r'https://img\d\.doubanio\.com/img/celebrity/medium/.*?\.jpg', html)
    # Names come from title="..." attributes; trim non-name matches.
    titles = re.findall(r'(?<=title=").\S+', html)
    # NOTE(review): these pops assume a fixed page layout (a trailing
    # non-name match and the page title at index -3 of the deduped list) —
    # fragile against site changes; verify if the page template moves.
    titles.pop()
    names = sorted(set(titles), key=titles.index)
    names.pop(-3)
    if not os.path.exists('douban'):
        os.makedirs('douban')
    for i, link in enumerate(portraits):
        filename = 'douban\\' + str(names[i]) + '.jpg'
        # BUG FIX: the original opened `filename` in 'w' mode around
        # urlretrieve, truncating the very file urlretrieve writes;
        # urlretrieve manages the target file itself.
        urllib.request.urlretrieve(link, filename)
|
||
|
||
# Target movie: its "celebrities" page lists the cast portraits to fetch.
url = 'https://movie.douban.com/subject/26260853/celebrities'
if __name__ == '__main__':
    douban(url)
Oops, something went wrong.