Upload douban.py; fix small bugs from earlier commits
gz51837844 committed Jul 18, 2016
1 parent e0cde03 commit d3c7084
Showing 5 changed files with 48 additions and 3 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -45,6 +45,7 @@ gooseeker
- crawler/result2.xml Anjuke real-estate agent result file 2
- crawler/crawl_gooseeker_bbs.py scrapes GooSeeker forum content
- crawler/xslt_bbs.xml local XSLT file for extracting GooSeeker forum content
- crawler/douban.py scrapes Douban group discussion topics

- crawler/simpleSpider a small spider (based on the Scrapy open-source framework)
- crawler/tmSpider scrapes Tmall product information (based on the Scrapy open-source framework)
1 change: 1 addition & 0 deletions crawler/README
@@ -10,6 +10,7 @@ crawler
- result2.xml Anjuke real-estate agent result file 2
- crawl_gooseeker_bbs.py scrapes GooSeeker forum content
- xslt_bbs.xml local XSLT file for extracting GooSeeker forum content
- douban.py scrapes Douban group discussion topics

- simpleSpider a small spider (based on the Scrapy open-source framework)
- tmSpider scrapes Tmall product information (based on the Scrapy open-source framework)
43 changes: 43 additions & 0 deletions crawler/douban.py
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
# douban.py
# Scrape Douban group discussion topics

import time

from lxml import etree
from gooseeker import GsExtractor
from selenium import webdriver

class PhantomSpider:
    def getContent(self, url):
        # Render the page with PhantomJS so JavaScript-generated content is included
        browser = webdriver.PhantomJS(executable_path='C:\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe')
        browser.get(url)
        time.sleep(3)  # wait for the page to finish rendering
        html = browser.execute_script("return document.documentElement.outerHTML")
        return etree.HTML(html)

    def saveContent(self, filepath, content):
        # Write the extraction result to a UTF-8 encoded file
        with open(filepath, 'w', encoding='UTF-8') as file_obj:
            file_obj.write(content)

doubanExtra = GsExtractor()
# The next line calls the GooSeeker API to set the XSLT extraction rule.
# The first parameter is the app key; apply for one in the GooSeeker member center.
# The second parameter is the rule name, generated with GooSeeker's visual tool 谋数台MS.
doubanExtra.setXsltFromAPI("ffd5273e213036d812ea298922e2627b", "豆瓣小组讨论话题")

url = "https://www.douban.com/group/haixiuzu/discussion?start="
totalpages = 5
doubanSpider = PhantomSpider()
print("爬取开始")

for pagenumber in range(1 , totalpages):
currenturl = url + str((pagenumber-1)*25)
print("正在爬取", currenturl)
content = doubanSpider.getContent(currenturl)
outputxml = doubanExtra.extract(content)
outputfile = "result" + str(pagenumber) +".xml"
doubanSpider.saveContent(outputfile , str(outputxml))

print("爬取结束")
2 changes: 1 addition & 1 deletion crawler/simpleSpider/simpleSpider/spiders/simplespider.py
@@ -31,7 +31,7 @@ def parse(self, response):
time.sleep(3)
#get xslt
extra=GsExtractor()
extra.setXsltFromAPI("0a3898683f265e7b28991e0615228baa", "淘宝天猫_商品详情30474")
extra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "淘宝天猫_商品详情30474")
# get doc
html = self.browser.execute_script("return document.documentElement.outerHTML")
doc = etree.HTML(html)
4 changes: 2 additions & 2 deletions crawler/tmSpider/tmSpider/spiders/tmall.py
@@ -2,7 +2,7 @@
import time
import scrapy

-import tmSpider.gooseeker.gsextractor as gsextractor
+import tmSpider.gooseeker.Gsextractor as gsextractor

class TmallSpider(scrapy.Spider):
name = "tmall"
@@ -22,7 +22,7 @@ def parse(self, response):
html = response.body
print("----------------------------------------------------------------------------")
extra=gsextractor.GsExtractor()
extra.setXsltFromAPI("0a3898683f265e7b28991e0615228baa", "淘宝天猫_商品详情30474","tmall","list")
extra.setXsltFromAPI("31d24931e043e2d5364d03b8ff9cc77e", "淘宝天猫_商品详情30474","tmall","list")

result = extra.extract(html)
print(str(result).encode('gbk','ignore').decode('gbk'))
