Skip to content

Commit

Permalink
Refactored collector to support secondary crawling
Browse files Browse the repository at this point in the history
  • Loading branch information
MewX committed Dec 25, 2022
1 parent 4ec209a commit d2a898c
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 8 deletions.
32 changes: 24 additions & 8 deletions task/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,21 +68,23 @@ func (task *Collector) Execute() error {
for _, c := range task.categories {
switch c {
case proto.Category_broadcast.String():
task.crawlBroadcasts()
//task.crawlBroadcastLists()
task.crawlBroadcastDetail()
case proto.Category_book.String():
task.crawlBooks()
task.crawlBookLists()
case proto.Category_movie.String():
task.crawlMovies()
task.crawlMovieLists()
case proto.Category_game.String():
task.crawlGames()
task.crawlGameLists()
default:
return errors.New("Category not implemented " + c)
}
}
return nil
}

func (task *Collector) crawlBroadcasts() error {
// crawlBroadcastLists downloads the list of broadcasts.
func (task *Collector) crawlBroadcastLists() error {
page := startingPage
q := util.NewQueue()
c := util.NewColly()
Expand Down Expand Up @@ -118,17 +120,31 @@ func (task *Collector) crawlBroadcasts() error {
return q.Run(c)
}

func (task *Collector) crawlBooks() error {
// crawlBroadcastDetail is intended to download the detail of every broadcast
// referenced by the broadcast list pages already saved under task.outputDir.
//
// The parsing step is not written yet: the method only looks up the saved
// list pages, logs each one, and then always returns a non-nil error.
func (task *Collector) crawlBroadcastDetail() error {
	// Saved list pages are located by name: "*_<broadcast category>_p*.html".
	listPagePattern := fmt.Sprintf("*_%s_p*.html", proto.Category_broadcast)
	for _, listPage := range util.GetFilePathListWithPattern(task.outputDir, listPagePattern) {
		log.Println("Found file:", listPage)
		// TODO: parse the page with goquery.
	}

	// TODO: handle each type of broadcast.

	return errors.New("update the implementation")
}

// crawlBookLists is a placeholder for the book category crawler
// (presumably the counterpart of crawlBroadcastLists; confirm once written).
// It performs no work and always returns a non-nil error.
func (task *Collector) crawlBookLists() error {
	// TODO: update the implementation.
	notImplemented := errors.New("update the implementation")
	return notImplemented
}

func (task *Collector) crawlMovies() error {
// crawlMovieLists is a placeholder for the movie category crawler
// (presumably the counterpart of crawlBroadcastLists; confirm once written).
// It performs no work and always returns a non-nil error.
func (task *Collector) crawlMovieLists() error {
	// TODO: update the implementation.
	notImplemented := errors.New("update the implementation")
	return notImplemented
}

func (task *Collector) crawlGames() error {
// crawlGameLists is a placeholder for the game category crawler
// (presumably the counterpart of crawlBroadcastLists; confirm once written).
// It performs no work and always returns a non-nil error.
func (task *Collector) crawlGameLists() error {
	// TODO: update the implementation.
	notImplemented := errors.New("update the implementation")
	return notImplemented
}
Expand Down
26 changes: 26 additions & 0 deletions util/files.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"github.com/its-my-data/doubak/proto"
"github.com/mengzhuo/cookiestxt"
"html"
"io/fs"
"log"
"net/http"
"os"
Expand Down Expand Up @@ -37,6 +38,31 @@ func GetPathWithCreationWithBase(base, subdirs string) (string, error) {
return newPath, os.MkdirAll(newPath, os.ModePerm)
}

// ReadEntireFile loads the whole file at fullPath and returns its contents
// as a string.
//
// There is no error return: if the file cannot be read, the error is logged
// with log.Fatal, which terminates the process.
func ReadEntireFile(fullPath string) string {
	content, readErr := os.ReadFile(fullPath)
	if readErr == nil {
		return string(content)
	}
	log.Fatal(readErr)
	// Not reached: log.Fatal exits. The return only satisfies the compiler.
	return ""
}

// GetFilePathListWithPattern walks basePath recursively and returns the path
// of every entry whose base name matches fileNamePattern.
//
// fileNamePattern uses filepath.Match syntax and is compared against the
// entry name only (d.Name()), never the full path. Any entry whose name
// matches is included, whether it is a file or a directory. Paths are
// returned as reported by filepath.WalkDir, in walk order.
//
// The function has no error return. If the walk fails (for example basePath
// does not exist) or fileNamePattern is malformed, the walk stops, the cause
// is logged, and the entries collected so far are returned (possibly none).
func GetFilePathListWithPattern(basePath, fileNamePattern string) []string {
	var files []string
	walkErr := filepath.WalkDir(basePath, func(path string, d fs.DirEntry, err error) error {
		if err != nil {
			// d may be nil here (e.g. the root could not be stat'ed), so bail
			// out before touching it.
			return err
		}
		matched, matchErr := filepath.Match(fileNamePattern, d.Name())
		if matchErr != nil {
			// A malformed pattern can never match; stop instead of silently
			// visiting the whole tree for nothing.
			return matchErr
		}
		if matched {
			files = append(files, path)
		}
		return nil
	})
	if walkErr != nil {
		// Previously this error was dropped, making a bad base path or bad
		// pattern indistinguishable from "no matching files".
		log.Printf("Walking %q with pattern %q stopped early: %v", basePath, fileNamePattern, walkErr)
	}
	log.Println("Found", len(files), "matched files with pattern:", fileNamePattern)
	return files
}

// LoadCookiesFile loads the external cookies file.
func LoadCookiesFile(filePath string) ([]*http.Cookie, error) {
f, err := os.Open(filePath)
Expand Down

0 comments on commit d2a898c

Please sign in to comment.