Skip to content

Commit

Permalink
知乎信息抓取
Browse files Browse the repository at this point in the history
  • Loading branch information
Ehco1996 committed Feb 13, 2018
1 parent 9a96f20 commit da5bebf
Show file tree
Hide file tree
Showing 6 changed files with 105 additions and 0 deletions.
3 changes: 3 additions & 0 deletions zhihu/zhihu_hard/.vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"python.pythonPath": "/Users/ehco/.pyenv/versions/venv-spider/bin/python"
}
Empty file.
12 changes: 12 additions & 0 deletions zhihu/zhihu_hard/src/configs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from lazyspider.lazyheaders import LazyHeaders

# The cURL command string copied after logging in (left empty here;
# presumably meant to be pasted in by the user -- confirm).
CURL = ''''''
# URL of the activities page of the Zhihu user "excited-vczh".
VZCH = 'https://www.zhihu.com/people/excited-vczh/activities'

# Obtain your cookies and headers from the cURL string.
lz = LazyHeaders(CURL)
COOKIES = lz.getCookies()
# NOTE: the name is misspelled ("HEDADERS"), but other modules import it
# under exactly this spelling, so it must stay as is.
HEDADERS = lz.getHeaders()
# print(COOKIES, HEDADERS)
19 changes: 19 additions & 0 deletions zhihu/zhihu_hard/src/parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from bs4 import BeautifulSoup


def to_soup(page):
    """Parse *page* into a BeautifulSoup tree.

    Args:
        page: Markup to parse; it is handed to ``BeautifulSoup`` unchanged.

    Returns:
        The ``BeautifulSoup`` object built with the ``lxml`` parser.
    """
    soup = BeautifulSoup(page, 'lxml')
    return soup


if __name__ == '__main__':
    # Run this demo only when the module is executed as a script.
    # spider.py imports to_soup from this module, and without the guard that
    # import would read 1.html and print its entries as a side effect.
    from urllib.parse import urljoin

    # Read the saved page with an explicit encoding instead of the
    # platform default, which may not be able to decode the page text.
    with open('1.html', 'r', encoding='utf-8') as f:
        html = f.read()

    soup = to_soup(html)

    for item in soup.find_all('div', class_="List-item"):
        ele = item.find('h2', class_='ContentItem-title')
        # Skip list items that have no title heading or no link in it,
        # instead of failing on a None lookup.
        if ele is None or ele.a is None or not ele.a.get('href'):
            continue
        title = ele.text
        # urljoin gives the same result as plain concatenation for
        # site-relative paths, and also handles hrefs that are already
        # absolute or protocol-relative.
        url = urljoin('https://www.zhihu.com', ele.a['href'])
        print(title, url)
30 changes: 30 additions & 0 deletions zhihu/zhihu_hard/src/spider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@


from configs import COOKIES, HEDADERS
from tools import get_driver
from parse import to_soup


class UserActivities:
    '''
    Activity information of a user.

    Attributes set by the constructor:
        driver: the browser driver returned by ``get_driver()``.
        peopple_url: the URL passed to the constructor (the attribute
            name is spelled this way in the code).
        url_list: an initially empty set; no method shown here fills it.
    '''

    def __init__(self, url):
        self.peopple_url = url
        self.url_list = set()
        self.driver = get_driver()

    def get_page_source(self):
        '''
        Load the user's page in the driver and return its HTML text.
        '''
        browser = self.driver
        browser.get(self.peopple_url)
        return browser.page_source

    def parse_user_html(self):
        '''
        Parse the user's activity page.

        The page is fetched and turned into a soup object; the result is
        currently neither stored nor returned.
        '''
        page = self.get_page_source()
        soup = to_soup(page)

41 changes: 41 additions & 0 deletions zhihu/zhihu_hard/src/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import shutil

import requests
from selenium.webdriver import PhantomJS
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

from configs import COOKIES, HEDADERS


def my_session():
    """Create a requests session that carries the configured login state.

    The configured headers and cookies are stored on the session itself.
    Arguments passed to a single ``session.get(..., cookies=, headers=)``
    call are used for that request only and are not kept on the session,
    so a session built that way would not contain the login cookies.

    Returns:
        A ``requests.Session`` holding ``HEDADERS`` and ``COOKIES``, plus
        any cookies the site set in response to the initial request.
    """
    session = requests.Session()
    session.headers.update(HEDADERS)
    # Assumes COOKIES is a name -> value mapping -- TODO confirm against
    # LazyHeaders.getCookies(). The cookies are scoped to the site so that
    # code reading session.cookies sees a usable domain on each entry.
    for name, value in dict(COOKIES).items():
        session.cookies.set(name, value, domain='.zhihu.com', path='/')
    # Visit the site once so cookies set by the server are collected too.
    session.get('https://www.zhihu.com')
    return session


def get_image(url, path):
    """Download the resource at *url* and write it to *path*.

    Args:
        url: Address of the image to download.
        path: Destination file path; the file is opened in binary mode
            and overwritten.

    Raises:
        requests.HTTPError: If the server answers with an error status,
            so that an error page is never saved as an image.
    """
    res = requests.get(url, stream=True)
    try:
        res.raise_for_status()
        # The raw stream skips requests' content decoding; ask it to undo
        # gzip/deflate so the file holds the image bytes, not the
        # compressed transfer body.
        res.raw.decode_content = True
        with open(path, 'wb') as f:
            shutil.copyfileobj(res.raw, f)
    finally:
        # A streamed response keeps its connection until it is closed.
        res.close()


def save_html(text, name):
    """Write *text* to the file *name*, replacing any existing content.

    Args:
        text: The HTML text to save.
        name: Path of the file to write.

    The file is always written as UTF-8, so the result does not depend on
    the platform's default encoding and non-ASCII page text cannot fail
    to encode.
    """
    with open(name, 'w', encoding='utf-8') as f:
        f.write(text)


def get_driver():
    """Build a PhantomJS driver configured with the user agent and cookies.

    The driver's user agent is taken from the ``User-Agent`` entry of
    ``HEDADERS``. Every cookie held by the session from ``my_session()``
    is added to the driver, and the window is placed at (0, 0) with a
    size of 1920x1080.

    Returns:
        The configured ``PhantomJS`` driver instance.
    """
    # Request headers: copy the default capabilities and set the user agent.
    capabilities = DesiredCapabilities.PHANTOMJS.copy()
    user_agent = HEDADERS.get("User-Agent")
    capabilities["phantomjs.page.settings.userAgent"] = user_agent

    # Start the driver.
    browser = PhantomJS(desired_capabilities=capabilities)

    # Copy the session's cookies into the driver.
    for cookie in my_session().cookies:
        cookie_fields = {
            'name': cookie.name,
            'value': cookie.value,
            'path': cookie.path,
            'expiry': cookie.expires,
            'domain': cookie.domain,
        }
        browser.add_cookie(cookie_fields)

    # Window geometry.
    browser.set_window_position(0, 0)
    browser.set_window_size(1920, 1080)
    return browser

0 comments on commit da5bebf

Please sign in to comment.