Skip to content

Commit

Permalink
Merge pull request nange#62 from ThunderLT/master
Browse files Browse the repository at this point in the history
add a example about stackoverflow
  • Loading branch information
nange authored Jul 21, 2019
2 parents e75388f + 0b4e6ed commit b6bf2a1
Show file tree
Hide file tree
Showing 4 changed files with 165 additions and 11 deletions.
1 change: 1 addition & 0 deletions _example/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
_ "github.com/nange/gospider/_example/rule/baidunews"
_ "github.com/nange/gospider/_example/rule/dianping"
_ "github.com/nange/gospider/_example/rule/mojitianqi"
_ "github.com/nange/gospider/_example/rule/stackoverflow"
log "github.com/sirupsen/logrus"
)

Expand Down
24 changes: 19 additions & 5 deletions _example/rule/dianping/dianping.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,32 @@ import (

"github.com/nange/gospider/spider"
log "github.com/sirupsen/logrus"

)

// init registers this package's task rule with the spider framework when the
// package is imported.
func init() {
spider.Register(rule)
}

var (
	// outputFields names the columns produced for each scraped shop, in
	// output order.
	outputFields = []string{
		"city",
		"adname",
		"big_category",
		"sub_category",
		"shop_name",
		"address",
		"tel",
		"photos",
	}

	// constraints supplies one column definition per entry of outputFields,
	// in the same order. NOTE(review): the strings look like SQL column
	// definitions; confirm against spider.NewConstraints.
	constraints = spider.NewConstraints(
		outputFields,
		"VARCHAR(16) NOT NULL DEFAULT ''",  // city
		"VARCHAR(16) NOT NULL DEFAULT ''",  // adname
		"VARCHAR(128) NOT NULL DEFAULT ''", // big_category
		"VARCHAR(128) NOT NULL DEFAULT ''", // sub_category
		"VARCHAR(128) NOT NULL DEFAULT ''", // shop_name
		"VARCHAR(128) NOT NULL DEFAULT ''", // address
		"VARCHAR(32) NOT NULL DEFAULT ''",  // tel
		"VARCHAR(256) NOT NULL DEFAULT ''", // photos
	)
)
// NOTICE: for now this example only implements crawling of food (restaurant) merchants.
var rule = &spider.TaskRule{
Name: "大众点评商家数据",
Description: "抓取大众点评上全国各大城市所有类型的商家详情数据",
Namespace: "dianping_shop",
OutputFields: []string{"city", "adname", "big_category", "sub_category", "shop_name", "address", "tel", "photos"},
Name: "大众点评商家数据",
Description: "抓取大众点评上全国各大城市所有类型的商家详情数据",
Namespace: "dianping_shop",
OutputFields: outputFields,
OutputConstraints: constraints,
AllowURLRevisit: true,
Rule: &spider.Rule{
Head: func(ctx *spider.Context) error { // 定义入口
Expand Down
30 changes: 24 additions & 6 deletions _example/rule/mojitianqi/moji_tianqi.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,31 @@ import (
// init registers this package's task rule with the spider framework when the
// package is imported.
func init() {
spider.Register(rule)
}

var (
	// outputFields names the columns produced for each air-quality record,
	// in output order.
	outputFields = []string{
		"province",
		"area",
		"aqi",
		"quality_grade",
		"pm10",
		"pm25",
		"no2",
		"so2",
		"o3",
		"co",
		"tip",
		"publish_time",
	}

	// constraints supplies exactly one column definition per entry of
	// outputFields, in the same order. The previous list carried a 13th
	// definition (a third trailing VARCHAR(256)) with no matching field, so
	// the two lists did not line up.
	// NOTE(review): how spider.NewConstraints reacts to a length mismatch is
	// not visible here; confirm against its implementation.
	constraints = spider.NewConstraints(
		outputFields,
		"VARCHAR(16) NOT NULL DEFAULT ''",  // province
		"VARCHAR(16) NOT NULL DEFAULT ''",  // area
		"VARCHAR(128) NOT NULL DEFAULT ''", // aqi
		"VARCHAR(8) NOT NULL DEFAULT ''",   // quality_grade
		"VARCHAR(16) NOT NULL DEFAULT ''",  // pm10
		"VARCHAR(16) NOT NULL DEFAULT ''",  // pm25
		"VARCHAR(16) NOT NULL DEFAULT ''",  // no2
		"VARCHAR(16) NOT NULL DEFAULT ''",  // so2
		"VARCHAR(16) NOT NULL DEFAULT ''",  // o3
		"VARCHAR(16) NOT NULL DEFAULT ''",  // co
		"VARCHAR(256) NOT NULL DEFAULT ''", // tip
		"VARCHAR(256) NOT NULL DEFAULT ''", // publish_time
	)
)
var rule = &spider.TaskRule{
Name: "墨迹天气全国空气质量",
Description: "抓取墨迹天气全国各个城市区县空气质量数据",
Namespace: "moji_tianqi",
DisableCookies: true,
OutputFields: []string{"province", "area", "aqi", "quality_grade", "pm10", "pm25", "no2", "so2", "o3", "co", "tip", "publish_time"},
Name: "墨迹天气全国空气质量",
Description: "抓取墨迹天气全国各个城市区县空气质量数据",
Namespace: "moji_tianqi",
DisableCookies: true,
OutputFields: outputFields,
OutputConstraints: constraints,
Rule: &spider.Rule{
Head: func(ctx *spider.Context) error { // 定义入口
return ctx.VisitForNext("https://tianqi.moji.com/aqi/china")
Expand Down
121 changes: 121 additions & 0 deletions _example/rule/stackoverflow/stackoverflow.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
package stackoverflow

import (
"github.com/nange/gospider/spider"
log "github.com/sirupsen/logrus"
)

// init registers the StackOverflow task rule with the spider framework when
// the package is imported.
func init() {
spider.Register(rule)
}

// outputFields names the columns produced for each scraped question, in
// output order.
var outputFields = []string{
	"question_title",
	"question_detail",
	"question_tags",
	"answer_list",
}

// constraints supplies one column definition per entry of outputFields, in the
// same order. NOTE(review): the strings look like SQL column definitions;
// confirm against spider.NewConstraints.
var constraints = spider.NewConstraints(
	outputFields,
	"VARCHAR(512) NOT NULL DEFAULT ''", // question_title
	"TEXT",                             // question_detail
	"VARCHAR(512) NOT NULL DEFAULT ''", // question_tags
	"TEXT",                             // answer_list
)
// rule is the task definition registered by init. It starts at the
// StackOverflow question listing sorted by votes and wires the two scraping
// steps together.
var rule = &spider.TaskRule{
Name: "StackOverFlow",
Description: "StackOverFlow Highly Quality QA",
Namespace: "stackoverflow_en",
OutputFields: outputFields,
OutputConstraints: constraints,
Rule: &spider.Rule{
// Head is the entry point: it requests page 1 of the vote-sorted listing.
Head: func(ctx *spider.Context) error {
return ctx.VisitForNext("https://stackoverflow.com/questions?tab=votes&page=1")
},
// Nodes maps a step index to its handler: 0 is the listing page (step1),
// 1 is the question page (step2).
// NOTE(review): presumably each VisitForNext call hands the fetched page to
// the node with the next index; confirm against the spider package.
Nodes: map[int]*spider.Node{
0: step1,
1: step2,
},
},
}
// step1 handles the question listing page: it logs each request, reports
// failed requests and delegates them to Retry, and follows every question
// link found in the listing.
var step1 = &spider.Node{
	// OnRequest logs the URL that is about to be fetched.
	OnRequest: func(ctx *spider.Context, req *spider.Request) {
		target := req.URL.String()
		log.Infof("Visiting %s", target)
	},
	// OnError logs the failing URL together with the error, then returns
	// whatever Retry returns.
	OnError: func(ctx *spider.Context, res *spider.Response, err error) error {
		failedURL := res.Request.URL.String()
		reason := err.Error()
		log.Errorf("Visiting failed! url:%s,err:%s", failedURL, reason)
		return Retry(ctx, 3)
	},
	OnHTML: map[string]func(*spider.Context, *spider.HTMLElement) error{
		// Each match is a question title anchor; its href is appended to the
		// site root and visited.
		`.summary h3 a`: func(ctx *spider.Context, anchor *spider.HTMLElement) error {
			const siteRoot = "https://stackoverflow.com"
			return ctx.VisitForNext(siteRoot + anchor.Attr("href"))
		},
	},
}
// step2 handles a single question page. For every `.inner-content.clearfix`
// element it extracts the question title, the question body HTML, the tag
// links and one answer body (the accepted answer when there is one, otherwise
// the first answer), and emits them as one output row.
var step2 = &spider.Node{
	// OnRequest logs the URL that is about to be fetched.
	OnRequest: func(ctx *spider.Context, req *spider.Request) {
		log.Println("Visting", req.URL.String())
	},
	OnHTML: map[string]func(*spider.Context, *spider.HTMLElement) error{
		`.inner-content.clearfix`: func(ctx *spider.Context, element *spider.HTMLElement) error {
			const (
				questionBodySel   = "#mainbar .question .post-layout .postcell.post-layout--right .post-text"
				questionTagSel    = "#mainbar .question .post-layout .postcell.post-layout--right .post-taglist.grid.gs4.gsy.fd-column .grid.ps-relative.d-block a"
				acceptedAnswerSel = "#mainbar #answers .answer.accepted-answer"
				anyAnswerSel      = "#mainbar #answers .answer"
				answerBodySel     = ".post-layout .answercell.post-layout--right .post-text"
			)

			questionTitle := element.ChildText("#question-header h1 a")

			questionDetail, err := element.DOM.Find(questionBodySel).Html()
			if err != nil {
				log.Errorf("step2 question_detail canot find the quesion: %s, err: %v", questionTitle, err)
				questionDetail = "No Descrption"
			}

			// Tags are stored as their hrefs, each followed by the "T^T"
			// separator (so the list ends with a trailing separator).
			questionTagList := ""
			element.ForEach(questionTagSel, func(i int, tagEle *spider.HTMLElement) {
				tag := tagEle.Attr("href")
				if tag == "" {
					log.Errorf("step2 question_tag canot find the question: %s", questionTitle)
					return
				}
				questionTagList += tag + "T^T"
			})

			// NOTE(review): element.DOM is assumed to be a goquery selection.
			// Its Find reports "no match" as an empty selection rather than
			// nil, so emptiness is tested with Length(). The previous
			// `!= nil` checks were always true, which made the first-answer
			// fallback unreachable.
			answer := element.DOM.Find(acceptedAnswerSel)
			accepted := answer.Length() > 0
			if !accepted {
				// No accepted answer: fall back to the first answer on the page.
				answer = element.DOM.Find(anyAnswerSel).First()
			}

			answerDetail := ""
			if answer.Length() == 0 {
				log.Errorf("step2 canot find anser first ;quesionTitle is: %s", questionTitle)
			} else {
				answerBody := answer.Find(answerBodySel)
				switch {
				case answerBody.Length() == 0 && accepted:
					log.Errorf("mainbar #answers .answer.accepted-answer find but .post-layout .answercell.post-layout--right .post-text not find ; question title: %s", questionTitle)
				case answerBody.Length() == 0:
					log.Errorf("step2 canot find post-layout .answercell.post-layout--right .post-text ;quesionTitle is: %s", questionTitle)
				default:
					answerDetail, err = answerBody.Html()
					if err != nil && accepted {
						log.Errorf("step2 acceptAnswer.Html() error ,quesionTitle is: %s, err: %v", questionTitle, err)
					} else if err != nil {
						log.Errorf("step2 Answer.Html() error ,quesionTitle is: %s, err: %v", questionTitle, err)
					}
				}
			}

			// Keys are positions into outputFields.
			return ctx.Output(map[int]interface{}{
				0: questionTitle,
				1: questionDetail,
				2: questionTagList,
				3: answerDetail,
			})
		},
	},
}

// Retry is the error hook called by step1 when a request fails. It is
// currently a stub: it logs that a retry is needed and returns nil. Neither
// ctx nor count is used yet.
func Retry(ctx *spider.Context, count int) error {
	log.Error("need to retry")
	return nil
}

0 comments on commit b6bf2a1

Please sign in to comment.