first commit

cjhayes16 · Mar 17, 2017 · 5763efd · 5763efd
commit 5763efd
Show file tree

Hide file tree

Showing 24 changed files with 1,757 additions and 0 deletions.
diff --git a/.idea/Zhihu.iml b/.idea/Zhihu.iml
diff --git a/.idea/dictionaries/CQC.xml b/.idea/dictionaries/CQC.xml
diff --git a/.idea/jsLinters/jscs.xml b/.idea/jsLinters/jscs.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/workspace.xml b/.idea/workspace.xml
diff --git a/scrapy.cfg b/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.org/en/latest/deploy.html
+
+[settings]
+default = zhihu.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = zhihu
diff --git a/zhihu/__init__.py b/zhihu/__init__.py
diff --git a/zhihu/__init__.pyc b/zhihu/__init__.pyc
diff --git a/zhihu/__pycache__/__init__.cpython-35.pyc b/zhihu/__pycache__/__init__.cpython-35.pyc
diff --git a/zhihu/__pycache__/items.cpython-35.pyc b/zhihu/__pycache__/items.cpython-35.pyc
diff --git a/zhihu/__pycache__/pipelines.cpython-35.pyc b/zhihu/__pycache__/pipelines.cpython-35.pyc
diff --git a/zhihu/__pycache__/settings.cpython-35.pyc b/zhihu/__pycache__/settings.cpython-35.pyc
diff --git a/zhihu/items.py b/zhihu/items.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+from scrapy import Item, Field
+
+
+class UserItem(Item):
+    """Scrapy item declaring every field stored for one scraped user.
+
+    Each attribute is a plain ``Field()`` declaration with no metadata.
+    The field names presumably mirror the keys of the upstream user
+    payload -- TODO confirm against the spider that populates this item.
+    """
+
+    # Identity / profile fields. ``url_token`` is the key that
+    # MongoPipeline.process_item (zhihu/pipelines.py) filters on when
+    # upserting. NOTE: ``id`` and ``type`` shadow builtins only inside this
+    # class body; renaming them would change the item's keys.
+    id = Field()
+    name = Field()
+    avatar_url = Field()
+    headline = Field()
+    description = Field()
+    url = Field()
+    url_token = Field()
+    gender = Field()
+    cover_url = Field()
+    type = Field()
+    badge = Field()
+
+    # Counter fields (all named ``*_count``); presumably integers --
+    # verify against the data source.
+    answer_count = Field()
+    articles_count = Field()
+    commercial_question_count = Field()
+    favorite_count = Field()
+    favorited_count = Field()
+    follower_count = Field()
+    following_columns_count = Field()
+    following_count = Field()
+    pins_count = Field()
+    question_count = Field()
+    thank_from_count = Field()
+    thank_to_count = Field()
+    thanked_count = Field()
+    vote_from_count = Field()
+    vote_to_count = Field()
+    voteup_count = Field()
+    following_favlists_count = Field()
+    following_question_count = Field()
+    following_topic_count = Field()
+    marked_answers_count = Field()
+    mutual_followees_count = Field()
+    hosted_live_count = Field()
+    participated_live_count = Field()
+
+    # Plural-named fields; presumably collections of nested records --
+    # TODO confirm their shape against the spider.
+    locations = Field()
+    educations = Field()
+    employments = Field()
+
diff --git a/zhihu/middlewares.py b/zhihu/middlewares.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class ZhihuSpiderMiddleware(object):
+    """Pass-through spider middleware that logs when a spider is opened.
+
+    Not all methods need to be defined. If a method is not defined,
+    Scrapy acts as if the spider middleware does not modify the
+    passed objects.
+    """
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the middleware and subscribe it to ``spider_opened``.
+
+        :param crawler: object exposing a ``signals`` manager.
+        :returns: the new middleware instance.
+        """
+        middleware = cls()
+        crawler.signals.connect(
+            middleware.spider_opened, signal=signals.spider_opened
+        )
+        return middleware
+
+    # The four hooks below are instance methods, so they must accept
+    # ``self``. Without it, a bound call such as
+    # ``mw.process_spider_input(response=..., spider=...)`` raises
+    # ``TypeError`` (multiple values for the first argument).
+
+    def process_spider_input(self, response, spider):
+        """Called for each response heading into the spider.
+
+        :returns: always ``None``, i.e. the response is left untouched.
+        """
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        """Called with the results the spider produced for ``response``.
+
+        :param result: iterable of whatever the spider returned.
+        :returns: a generator yielding every element of ``result`` unchanged.
+        """
+        for element in result:
+            yield element
+
+    def process_spider_exception(self, response, exception, spider):
+        """Called when the spider or another middleware raises.
+
+        :returns: ``None``; the exception is not handled here.
+        """
+        return None
+
+    def process_start_requests(self, start_requests, spider):
+        """Called with the spider's start requests (no response attached).
+
+        :param start_requests: iterable of the spider's initial requests.
+        :returns: a generator yielding every start request unchanged.
+        """
+        for request in start_requests:
+            yield request
+
+    def spider_opened(self, spider):
+        """Log the name of the spider that has just been opened."""
+        # Lazy %-style arguments: the message is only formatted if emitted.
+        spider.logger.info('Spider opened: %s', spider.name)
diff --git a/zhihu/pipelines.py b/zhihu/pipelines.py
@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+import pymongo
+
+
+class ZhihuPipeline(object):
+    """No-op item pipeline: every item passes through untouched."""
+
+    def process_item(self, item, spider):
+        """Return ``item`` exactly as received.
+
+        :param item: the item handed to this pipeline stage.
+        :param spider: the spider that produced the item (unused).
+        :returns: the very same ``item`` object.
+        """
+        return item
+
+
+class MongoPipeline(object):
+    """Item pipeline that upserts each item into a MongoDB collection.
+
+    Items are keyed on their ``url_token`` field, so an item whose token
+    is already stored replaces the stored document instead of adding a
+    duplicate.
+    """
+
+    # Name of the collection every item is written to.
+    collection_name = 'users'
+
+    def __init__(self, mongo_uri, mongo_db):
+        """Remember the connection settings; no connection is opened yet.
+
+        :param mongo_uri: URI handed to ``pymongo.MongoClient``.
+        :param mongo_db: name of the database to use on that client.
+        """
+        self.mongo_uri = mongo_uri
+        self.mongo_db = mongo_db
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        """Build the pipeline from ``MONGO_URI`` / ``MONGO_DATABASE`` settings.
+
+        :param crawler: object whose ``settings.get`` returns those values.
+        :returns: a new, not yet connected, pipeline instance.
+        """
+        return cls(
+            mongo_uri=crawler.settings.get('MONGO_URI'),
+            mongo_db=crawler.settings.get('MONGO_DATABASE')
+        )
+
+    def open_spider(self, spider):
+        """Open the MongoDB client and select the configured database."""
+        self.client = pymongo.MongoClient(self.mongo_uri)
+        self.db = self.client[self.mongo_db]
+
+    def close_spider(self, spider):
+        """Close the client opened by ``open_spider``."""
+        self.client.close()
+
+    def process_item(self, item, spider):
+        """Upsert ``item`` into the collection and pass it on.
+
+        :param item: mapping-like item; must contain ``url_token``.
+        :param spider: the spider that produced the item (unused).
+        :returns: the unchanged ``item`` for the next pipeline stage.
+        :raises KeyError: if ``item`` has no ``url_token``.
+        """
+        # ``Collection.update(filter, doc, True)`` is the legacy API that
+        # PyMongo deprecated in 3.x and removed in 4.0. ``replace_one`` with
+        # ``upsert=True`` is its direct equivalent for a whole-document write.
+        self.db[self.collection_name].replace_one(
+            {'url_token': item['url_token']}, dict(item), upsert=True
+        )
+        return item