From d3c7084cf748f7d47f841a57c0fd8da26714ded3 Mon Sep 17 00:00:00 2001
From: gz51837844 <2486516421@qq.com>
Date: Mon, 18 Jul 2016 17:17:01 +0800
Subject: [PATCH] =?UTF-8?q?=E4=B8=8A=E4=BC=A0douban.py,=20=E4=BF=AE?=
 =?UTF-8?q?=E6=94=B9=E4=B9=8B=E5=89=8D=E7=9A=84=E5=B0=8Fbug?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (not part of the commit message):
  * crawler/douban.py: import time (getContent called time.sleep without
    importing it), quit PhantomJS after every page, close the output file
    through a "with" block, fetch all totalpages pages, and translate the
    comments to English.
  * The blob id on the crawler/douban.py "index" line predates these
    revisions.
  * The whitespace of this patch had been flattened; the indentation of
    context lines was reconstructed and may differ from the tree, so
    applying may need "git apply --ignore-whitespace".

 README.md                                     |  1 +
 crawler/README                                |  1 +
 crawler/douban.py                             | 53 +++++++++++++++++++
 .../simpleSpider/spiders/simplespider.py      |  2 +-
 crawler/tmSpider/tmSpider/spiders/tmall.py    |  4 +-
 5 files changed, 58 insertions(+), 3 deletions(-)
 create mode 100644 crawler/douban.py

diff --git a/README.md b/README.md
index 12b67b1..07b494f 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ gooseeker
  - crawler/result2.xml 安居客房产经纪人结果文件2
  - crawler/crawl_gooseeker_bbs.py 采集集搜客论坛内容
  - crawler/xslt_bbs.xml 集搜客论坛内容提取本地xslt文件
+ - crawler/douban.py 采集豆瓣小组讨论话题
  - crawler/simpleSpider 一个小爬虫(基于Scrapy开源框架)
  - crawler/tmSpider 采集天猫商品信息(基于Scrapy开源框架)
 
diff --git a/crawler/README b/crawler/README
index df7611a..d130381 100644
--- a/crawler/README
+++ b/crawler/README
@@ -10,6 +10,7 @@ crawler
  - result2.xml 安居客房产经纪人结果文件2
  - crawl_gooseeker_bbs.py 采集集搜客论坛内容
  - xslt_bbs.xml 集搜客论坛内容提取本地xslt文件
+ - douban.py 采集豆瓣小组讨论话题
  - simpleSpider 一个小爬虫(基于Scrapy开源框架)
  - tmSpider 采集天猫商品信息(基于Scrapy开源框架)
 
diff --git a/crawler/douban.py b/crawler/douban.py
new file mode 100644
index 0000000..2b77978
--- /dev/null
+++ b/crawler/douban.py
@@ -0,0 +1,53 @@
+# _*_coding:utf8_*_
+# douban.py
+# Crawl discussion topics from a Douban group.
+
+import time
+from urllib import request
+from lxml import etree
+from gooseeker import GsExtractor
+from selenium import webdriver
+
+class PhantomSpider:
+    """Fetch pages through a PhantomJS browser and save results to disk."""
+
+    def getContent(self, url):
+        """Load url in PhantomJS and return its outer HTML parsed by etree.HTML."""
+        browser = webdriver.PhantomJS(executable_path='C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
+        try:
+            browser.get(url)
+            # Pause before reading the DOM (presumably so page scripts can finish).
+            time.sleep(3)
+            html = browser.execute_script("return document.documentElement.outerHTML")
+        finally:
+            # Quit on every path so each page does not leave a PhantomJS process behind.
+            browser.quit()
+        return etree.HTML(html)
+
+    def saveContent(self, filepath, content):
+        """Write content to filepath as UTF-8 text, replacing any existing file."""
+        with open(filepath, 'w', encoding='UTF-8') as file_obj:
+            file_obj.write(content)
+
+doubanExtra = GsExtractor()
+# The call below uses the GooSeeker API to set the XSLT extraction rule.
+# First argument: the app key (apply for one in the GooSeeker member center).
+# Second argument: the rule name, generated with GooSeeker's graphical tool MS.
+doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b" , "豆瓣小组讨论话题")
+
+url = "https://www.douban.com/group/haixiuzu/discussion?start="
+totalpages = 5
+doubanSpider = PhantomSpider()
+print("爬取开始")
+
+# range() excludes its stop value, so add 1 to fetch all totalpages pages.
+for pagenumber in range(1, totalpages + 1):
+    # The "start" query parameter advances by 25 for each page.
+    currenturl = url + str((pagenumber-1)*25)
+    print("正在爬取", currenturl)
+    content = doubanSpider.getContent(currenturl)
+    outputxml = doubanExtra.extract(content)
+    outputfile = "result" + str(pagenumber) +".xml"
+    doubanSpider.saveContent(outputfile , str(outputxml))
+
+print("爬取结束")
diff --git a/crawler/simpleSpider/simpleSpider/spiders/simplespider.py b/crawler/simpleSpider/simpleSpider/spiders/simplespider.py
index 1306ada..51eda46 100644
--- a/crawler/simpleSpider/simpleSpider/spiders/simplespider.py
+++ b/crawler/simpleSpider/simpleSpider/spiders/simplespider.py
@@ -31,7 +31,7 @@ def parse(self, response):
         time.sleep(3)
         #get xslt
         extra=GsExtractor()
-        extra.setXsltFromAPI("0a3898683f265e7b28991e0615228baa", "淘宝天猫_商品详情30474")
+        extra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "淘宝天猫_商品详情30474")
         # get doc
         html = self.browser.execute_script("return document.documentElement.outerHTML")
         doc = etree.HTML(html)
diff --git a/crawler/tmSpider/tmSpider/spiders/tmall.py b/crawler/tmSpider/tmSpider/spiders/tmall.py
index 4318129..70a7ed0 100644
--- a/crawler/tmSpider/tmSpider/spiders/tmall.py
+++ b/crawler/tmSpider/tmSpider/spiders/tmall.py
@@ -2,7 +2,7 @@
 import time
 import scrapy
 
-import tmSpider.gooseeker.gsextractor as gsextractor
+import tmSpider.gooseeker.Gsextractor as gsextractor
 
 class TmallSpider(scrapy.Spider):
     name = "tmall"
@@ -22,7 +22,7 @@ def parse(self, response):
         html = response.body
         print("----------------------------------------------------------------------------")
         extra=gsextractor.GsExtractor()
-        extra.setXsltFromAPI("0a3898683f265e7b28991e0615228baa", "淘宝天猫_商品详情30474","tmall","list")
+        extra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "淘宝天猫_商品详情30474","tmall","list")
         result = extra.extract(html)
         print(str(result).encode('gbk','ignore').decode('gbk'))
 