Skip to content

Commit

Permalink
Merge pull request gocolly#537 from ysung6/master
Browse files Browse the repository at this point in the history
update coursera example
  • Loading branch information
asciimoo authored Aug 31, 2020
2 parents b545326 + 6bfbce0 commit 1d5d257
Showing 1 changed file with 28 additions and 20 deletions.
48 changes: 28 additions & 20 deletions _examples/coursera_courses/coursera_courses.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,18 @@ type Course struct {
URL string
Language string
Commitment string
HowToPass string
Rating string
}

func main() {
fName := "courses.json"
file, err := os.Create(fName)
if err != nil {
log.Fatalf("Cannot create file %q: %s\n", fName, err)
return
}
defer file.Close()

// Instantiate default collector
c := colly.NewCollector(
// Visit only domains: coursera.org, www.coursera.org
Expand All @@ -38,7 +45,7 @@ func main() {

courses := make([]Course, 0, 200)

// On every a element which has href attribute call callback
// On every <a> element which has "href" attribute call callback
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
// If the class attribute is this long string, return from the callback,
// as this <a> element is irrelevant
Expand All @@ -59,8 +66,8 @@ func main() {
log.Println("visiting", r.URL.String())
})

// On every a HTML element which has name attribute call callback
c.OnHTML(`a[name]`, func(e *colly.HTMLElement) {
// On every <a> element with collection-product-card class call callback
c.OnHTML(`a.collection-product-card`, func(e *colly.HTMLElement) {
// Activate detailCollector if the link contains "coursera.org/learn"
courseURL := e.Request.AbsoluteURL(e.Attr("href"))
if strings.Index(courseURL, "coursera.org/learn") != -1 {
Expand All @@ -71,30 +78,31 @@ func main() {
// Extract details of the course
detailCollector.OnHTML(`div[id=rendered-content]`, func(e *colly.HTMLElement) {
log.Println("Course found", e.Request.URL)
title := e.ChildText(".course-title")
title := e.ChildText(".banner-title")
if title == "" {
log.Println("No title found", e.Request.URL)
}
course := Course{
Title: title,
URL: e.Request.URL.String(),
Description: e.ChildText("div.content"),
Creator: e.ChildText("div.creator-names > span"),
Creator: e.ChildText("li.banner-instructor-info > a > div > div > span"),
Rating: e.ChildText("span.number-rating"),
}
// Iterate over rows of the table which contains different information
// about the course
e.ForEach("table.basic-info-table tr", func(_ int, el *colly.HTMLElement) {
switch el.ChildText("td:first-child") {
case "Language":
course.Language = el.ChildText("td:nth-child(2)")
// Iterate over div components and add details to course
e.ForEach(".AboutCourse .ProductGlance > div", func(_ int, el *colly.HTMLElement) {
svgTitle := strings.Split(el.ChildText("div:nth-child(1) svg title"), " ")
lastWord := svgTitle[len(svgTitle)-1]
switch lastWord {
// svg Title: Available Languages
case "languages":
course.Language = el.ChildText("div:nth-child(2) > div:nth-child(1)")
// svg Title: Mixed/Beginner/Intermediate/Advanced Level
case "Level":
course.Level = el.ChildText("td:nth-child(2)")
case "Commitment":
course.Commitment = el.ChildText("td:nth-child(2)")
case "How To Pass":
course.HowToPass = el.ChildText("td:nth-child(2)")
case "User Ratings":
course.Rating = el.ChildText("td:nth-child(2) div:nth-of-type(2)")
course.Level = el.ChildText("div:nth-child(2) > div:nth-child(1)")
// svg Title: Hours to complete
case "complete":
course.Commitment = el.ChildText("div:nth-child(2) > div:nth-child(1)")
}
})
courses = append(courses, course)
Expand All @@ -103,7 +111,7 @@ func main() {
// Start scraping on https://coursera.org/browse
c.Visit("https://coursera.org/browse")

enc := json.NewEncoder(os.Stdout)
enc := json.NewEncoder(file)
enc.SetIndent("", " ")

// Dump json to the output file
Expand Down

0 comments on commit 1d5d257

Please sign in to comment.