From a1cedc44a1ca91418c86eaa6b5f6a8d1d8a68eeb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A1szl=C3=B3=20Hammerl?=
Date: Wed, 1 Aug 2018 15:30:46 +0200
Subject: [PATCH] [fix] instagram example scraper fixed

---
 _examples/instagram/instagram.go | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/_examples/instagram/instagram.go b/_examples/instagram/instagram.go
index 38a83f544..3a1ae14aa 100644
--- a/_examples/instagram/instagram.go
+++ b/_examples/instagram/instagram.go
@@ -1,27 +1,25 @@
 package main
 
 import (
-	"bytes"
 	"crypto/md5"
 	"encoding/json"
 	"fmt"
 	"log"
 	"net/url"
 	"os"
+	"regexp"
 	"strings"
 
 	"github.com/gocolly/colly"
 )
 
-// found in https://www.instagram.com/static/bundles/en_US_Commons.js/68e7390c5938.js
-// included from profile page
-const instagramQueryId = "42323d64886122307be10013ad2dcc45"
-
 // "id": user id, "after": end cursor
 const nextPageURL string = `https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s`
 const nextPagePayload string = `{"id":"%s","first":50,"after":"%s"}`
 
 var requestID string
+var requestIds [][]byte
+var queryIdPattern = regexp.MustCompile(`queryId:".{32}"`)
 
 type pageInfo struct {
 	EndCursor string `json:"end_cursor"`
@@ -108,8 +106,8 @@ func main() {
 	c.OnHTML("html", func(e *colly.HTMLElement) {
 		d := c.Clone()
 		d.OnResponse(func(r *colly.Response) {
-			idStart := bytes.Index(r.Body, []byte(`:n},queryId:"`))
-			requestID = string(r.Body[idStart+13 : idStart+45])
+			requestIds = queryIdPattern.FindAll(r.Body, -1)
+			requestID = string(requestIds[1][9:41])
 		})
 		requestIDURL := e.Request.AbsoluteURL(e.ChildAttr(`link[as="script"]`, "href"))
 		d.Visit(requestIDURL)