知乎信息抓取

zhuyoucai168 · Feb 13, 2018 · da5bebf · da5bebf
1 parent 9a96f20
commit da5bebf
Show file tree

Hide file tree

Showing 6 changed files with 105 additions and 0 deletions.
diff --git a/zhihu/zhihu_hard/.vscode/settings.json b/zhihu/zhihu_hard/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+ "python.pythonPath": "/Users/ehco/.pyenv/versions/venv-spider/bin/python"
+}
diff --git a/zhihu/zhihu_hard/src/__init__.py b/zhihu/zhihu_hard/src/__init__.py
diff --git a/zhihu/zhihu_hard/src/configs.py b/zhihu/zhihu_hard/src/configs.py
@@ -0,0 +1,12 @@
+from lazyspider.lazyheaders import LazyHeaders
+
+# The curl command string copied from a logged-in browser session.
+# It is empty here and has to be filled in; LazyHeaders is built from it below.
+CURL = ''''''
+# Activities page of the Zhihu user "excited-vczh".
+VZCH = 'https://www.zhihu.com/people/excited-vczh/activities'
+
+# Extract the cookies and request headers from the curl string.
+lz = LazyHeaders(CURL)
+COOKIES = lz.getCookies()
+HEADERS = lz.getHeaders()
+# Misspelled alias kept for backward compatibility: other modules import
+# the headers under this name.
+HEDADERS = HEADERS
diff --git a/zhihu/zhihu_hard/src/parse.py b/zhihu/zhihu_hard/src/parse.py
@@ -0,0 +1,19 @@
+from bs4 import BeautifulSoup
+
+
+def to_soup(page):
+    '''
+    Parse an HTML document with BeautifulSoup using the lxml parser.
+
+    :param page: HTML markup to parse.
+    :return: the BeautifulSoup object built from ``page``.
+    '''
+    parser = 'lxml'
+    soup = BeautifulSoup(page, parser)
+    return soup
+
+
+def main(path='1.html'):
+    '''
+    Print the title and absolute URL of every list item in a saved page.
+
+    :param path: path of the HTML file to read.
+    '''
+    # Local import so this block does not depend on the file's import header.
+    from urllib.parse import urljoin
+
+    # Read as UTF-8 explicitly rather than relying on the platform's
+    # locale default (assumes the saved page is UTF-8 -- TODO confirm).
+    with open(path, 'r', encoding='utf-8') as f:
+        html = f.read()
+
+    soup = to_soup(html)
+    for item in soup.find_all('div', class_="List-item"):
+        ele = item.find('h2', class_='ContentItem-title')
+        # Skip items without a linked title instead of failing on None.
+        if ele is None or ele.a is None or not ele.a.get('href'):
+            continue
+        title = ele.text
+        # urljoin also handles hrefs that are already absolute or
+        # protocol-relative, which plain concatenation would corrupt.
+        url = urljoin('https://www.zhihu.com', ele.a['href'])
+        print(title, url)
+
+
+# Run the demo only when executed as a script, so that importing
+# to_soup from this module has no side effects.
+if __name__ == '__main__':
+    main()
diff --git a/zhihu/zhihu_hard/src/spider.py b/zhihu/zhihu_hard/src/spider.py
@@ -0,0 +1,30 @@
+
+
+from configs import COOKIES, HEDADERS
+from tools import get_driver
+from parse import to_soup
+
+
+class UserActivities():
+    '''
+    Activity information of a user.
+    '''
+
+    def __init__(self, url):
+        '''
+        Store the user's page URL and create the browser driver.
+
+        :param url: URL of the user's page.
+        '''
+        self.url_list = set()
+        self.peopple_url = url
+        self.driver = get_driver()
+
+    def get_page_source(self):
+        '''
+        Load the user's page in the driver and return its HTML text.
+        '''
+        browser = self.driver
+        browser.get(self.peopple_url)
+        return browser.page_source
+
+    def parse_user_html(self):
+        '''
+        Fetch the user's page and parse it into a soup object.
+
+        The parsed result is not used or returned yet.
+        '''
+        markup = self.get_page_source()
+        soup = to_soup(markup)
+
diff --git a/zhihu/zhihu_hard/src/tools.py b/zhihu/zhihu_hard/src/tools.py
@@ -0,0 +1,41 @@
+import shutil
+
+import requests
+from selenium.webdriver import PhantomJS
+from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+
+from configs import COOKIES, HEDADERS
+
+
+def my_session():
+    '''
+    Create a requests session and issue one GET to the Zhihu home page
+    with the configured cookies and headers.
+
+    :return: the requests.Session that performed the request.
+    :raises requests.exceptions.RequestException: on network errors,
+        including a timeout.
+    '''
+    session = requests.Session()
+    # NOTE(review): cookies/headers passed per request are not persisted on
+    # the session; only cookies set by the response end up in
+    # session.cookies -- confirm this is what callers expect.
+    # Without a timeout the call can block forever on an unresponsive server.
+    session.get('https://www.zhihu.com',
+                cookies=COOKIES, headers=HEDADERS, timeout=10)
+    return session
+
+
+def get_image(url, path):
+    '''
+    Download ``url`` and write the response body to ``path``.
+
+    :param url: address of the image to download.
+    :param path: destination file path; it is overwritten.
+    :raises requests.exceptions.HTTPError: if the server answers with an
+        error status.
+    :raises requests.exceptions.RequestException: on network errors,
+        including a timeout.
+    '''
+    # The context manager releases the streamed connection when done.
+    with requests.get(url, stream=True, timeout=10) as res:
+        # Fail loudly instead of saving an HTTP error page as the image.
+        res.raise_for_status()
+        # The raw stream is not content-decoded by default; enable decoding
+        # so a gzip/deflate-encoded body is not written compressed.
+        res.raw.decode_content = True
+        with open(path, 'wb') as f:
+            shutil.copyfileobj(res.raw, f)
+
+
+def save_html(text, name):
+    '''
+    Write ``text`` to the file ``name`` encoded as UTF-8.
+
+    :param text: the HTML text to save.
+    :param name: destination file path; it is overwritten.
+    '''
+    # Explicit UTF-8: the locale default encoding may be unable to encode
+    # non-ASCII page content and would raise UnicodeEncodeError.
+    with open(name, 'w', encoding='utf-8') as f:
+        f.write(text)
+
+
+def get_driver():
+    '''
+    Start a PhantomJS driver configured with the project's User-Agent,
+    the cookies obtained through my_session(), and a 1920x1080 window.
+
+    :return: the configured PhantomJS driver; the caller is responsible
+        for calling ``quit()`` on it.
+    '''
+    # Set the User-Agent from the configured request headers.
+    dcap = dict(DesiredCapabilities.PHANTOMJS)
+    dcap["phantomjs.page.settings.userAgent"] = (HEDADERS.get("User-Agent"))
+    # Start the driver.
+    driver = PhantomJS(desired_capabilities=dcap)
+    try:
+        # Copy the session's cookies into the driver.
+        for c in my_session().cookies:
+            cookie = {'name': c.name, 'value': c.value,
+                      'path': c.path, 'expiry': c.expires, 'domain': c.domain}
+            # Cookies without an expiry report None; leave unset fields out
+            # rather than sending null values to the driver.
+            driver.add_cookie(
+                {k: v for k, v in cookie.items() if v is not None})
+        # Set the window position and size.
+        driver.set_window_position(0, 0)
+        driver.set_window_size(1920, 1080)
+    except Exception:
+        # Do not leave the browser process running if setup fails.
+        driver.quit()
+        raise
+    return driver