support crawl twitter

wzdnzd committed May 5, 2023
1 parent 1fac16e commit 7dd5785
Showing 4 changed files with 193 additions and 5 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -109,6 +109,7 @@ venv/
ENV/
env.bak/
venv.bak/
.vscode/

# Spyder project settings
.spyderproject
@@ -129,4 +130,5 @@ dmypy.json
.pyre/
cache.db
config.yaml
generate.ini
subscribe/config/config.json
15 changes: 15 additions & 0 deletions aggregate/subscribe/config/config.default.json
@@ -80,6 +80,21 @@
"exclude": "",
"spams": []
},
"twitter": {
"enable": true,
"users": {
"username": {
"enable": true,
"num": 30,
"exclude": "",
"config": {
"rename": "",
"xxxxxx": ""
},
"push_to": []
}
}
},
"repositories": [
{
"enable": false,
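The new "twitter" block is keyed by screen name: each entry carries its own enable switch, a num cap on how many media tweets to fetch, an exclude pattern, extra options under config, and the push_to targets. A minimal filled-in task, written as the Python dict that process.py ends up handing to crawl_twitter, might look like the sketch below; the screen name and target name are illustrative.

# Hypothetical spider config after parse_config has run: the per-user
# "enable" flag has already been popped and "push_to" de-duplicated.
twitter_tasks = {
    "some_screen_name": {            # Twitter screen name to crawl (made up)
        "num": 30,                   # media tweets to fetch; crawl.py clamps this to 1..100
        "exclude": "",               # pattern of results to skip
        "config": {"rename": ""},    # extra per-task options passed through to crawl_pages
        "push_to": ["free"],         # storage targets for found subscriptions (illustrative)
    }
}

# crawl.crawl_twitter(tasks=twitter_tasks) resolves each screen name to a numeric
# uid and then scrapes that user's media timeline for subscription links.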
160 changes: 156 additions & 4 deletions aggregate/subscribe/crawl.py
@@ -106,6 +106,10 @@ def batch_crawl(conf: dict, thread: int = 50) -> list:
if repositories:
tasks.update(crawl_github_repo(repos=repositories))

twitter_spider = conf.get("twitter", {})
if twitter_spider:
tasks.update(crawl_twitter(tasks=twitter_spider))

pages = conf.get("pages", {})
if pages:
tasks.update(crawl_pages(pages=pages))
@@ -625,13 +629,17 @@ def crawl_github(


def crawl_single_page(
url: str,
push_to: list = [],
exclude: str = "",
config: dict = {},
headers: dict = None,
) -> dict:
if not url or not push_to:
logger.error(f"[PageCrawl] cannot crawl from page: {url}")
return {}

content = utils.http_get(url=url, headers=headers)
if content == "":
return {}

@@ -644,7 +652,7 @@ def crawl_single_page(
)


def crawl_pages(pages: dict, silent: bool = False, headers: dict = None) -> dict:
if not pages:
return {}

@@ -661,7 +669,7 @@ def crawl_single_page(
exclude = v.get("exclude", "").strip()
config = v.get("config", {})

params.append([k, push_to, exclude, config, headers])

subscribes = multi_thread_crawl(func=crawl_single_page, params=params)
endtime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
Expand All @@ -672,6 +680,150 @@ def crawl_pages(pages: dict, silent: bool = False) -> dict:
return subscribes


def get_guest_token() -> str:
content = utils.http_get(url="https://twitter.com")
if not content:
return ""

matcher = re.findall("gt=([0-9]{19})", content, flags=re.I)
return matcher[0] if matcher else ""


def username_to_id(username: str, headers: dict) -> str:
if utils.isblank(username):
return ""

if not headers or "X-Guest-Token" not in headers:
guest_token = get_guest_token()
if not guest_token:
return ""

headers = {
"User-Agent": utils.USER_AGENT,
"Authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"X-Guest-Token": guest_token,
"Content-Type": "application/json",
}

variables = {
"screen_name": username.lower().strip(),
"withSafetyModeUserFields": True,
}
features = {
"blue_business_profile_image_shape_enabled": True,
"responsive_web_graphql_exclude_directive_enabled": True,
"verified_phone_label_enabled": False,
"responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
"responsive_web_graphql_timeline_navigation_enabled": True,
}

payload = urllib.parse.urlencode(
{"variables": json.dumps(variables), "features": json.dumps(features)}
)
url = f"https://twitter.com/i/api/graphql/sLVLhk0bGj3MVFEKTdax1w/UserByScreenName?{payload}"
try:
content = utils.http_get(url=url, headers=headers)
if not content:
return ""

data = json.loads(content).get("data", {}).get("user", {}).get("result", {})
return data.get("rest_id", "")
except:
logger.error(f"[TwitterCrawl] cannot query uid by username=[{username}]")
return ""


def crawl_twitter(tasks: dict) -> dict:
if not tasks:
return {}

starttime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
logger.info(f"[TwitterCrawl] start crawl from Twitter, time: {starttime}")

# extract X-Guest-Token
guest_token = get_guest_token()
if not guest_token:
logger.error(f"[TwitterCrawl] cannot extract X-Guest-Token from twitter")
return {}

headers = {
"User-Agent": utils.USER_AGENT,
"Authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"X-Guest-Token": guest_token,
"Content-Type": "application/json",
}

features = {
"blue_business_profile_image_shape_enabled": True,
"responsive_web_graphql_exclude_directive_enabled": True,
"verified_phone_label_enabled": False,
"responsive_web_graphql_timeline_navigation_enabled": True,
"responsive_web_graphql_skip_user_profile_image_extensions_enabled": False,
"tweetypie_unmention_optimization_enabled": True,
"vibe_api_enabled": True,
"responsive_web_edit_tweet_api_enabled": True,
"graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
"view_counts_everywhere_api_enabled": True,
"longform_notetweets_consumption_enabled": True,
"tweet_awards_web_tipping_enabled": False,
"freedom_of_speech_not_reach_fetch_enabled": True,
"standardized_nudges_misinfo": True,
"tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": False,
"interactive_text_enabled": True,
"responsive_web_text_conversations_enabled": False,
"longform_notetweets_rich_text_read_enabled": True,
"responsive_web_enhance_cards_enabled": False,
}

candidates, pages = {}, {}
for k, v in tasks.items():
if utils.isblank(k) or not v or type(v) != dict:
continue
candidates[k] = v

if not candidates:
return {}

# username to uid
params = [[k, headers] for k in candidates.keys()]
cpu_count = multiprocessing.cpu_count()
count = len(params) if len(params) <= cpu_count else cpu_count
pool = multiprocessing.Pool(count)
uids = pool.starmap(username_to_id, params)
pool.close()

for i in range(len(uids)):
uid = uids[i]
if not uid:
continue

config = candidates.get(params[i][0])
count = config.pop("num", 10)
variables = {
"userId": uid,
"count": min(max(count, 1), 100),
"includePromotedContent": False,
"withClientEventToken": False,
"withBirdwatchNotes": False,
"withVoice": True,
"withV2Timeline": True,
}

payload = urllib.parse.urlencode(
{"variables": json.dumps(variables), "features": json.dumps(features)}
)
url = f"https://twitter.com/i/api/graphql/P7qs2Sf7vu1LDKbzDW9FSA/UserMedia?{payload}"
pages[url] = config

subscribes = crawl_pages(pages=pages, silent=True, headers=headers)
endtime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
logger.info(
f"[TwitterCrawl] finished crawl from Twitter, found {len(subscribes)} subscriptions, time: {endtime}"
)

return subscribes


def extract_subscribes(
content: str,
push_to: list = [],
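Taken together, the new helpers implement a three-step flow: scrape a guest token from the twitter.com landing page, exchange a screen name for a numeric rest_id through the UserByScreenName GraphQL endpoint, and build a UserMedia query whose response is handed to the existing crawl_pages pipeline. The sketch below reruns the two purely local pieces of that flow, the token regex and the URL construction, without touching the network; the HTML fragment and the uid are made-up stand-ins.

import json
import re
import urllib.parse

# Made-up fragment standing in for the twitter.com landing page, which embeds
# the guest token as gt=<19 digits> somewhere in an inline script.
sample_html = 'document.cookie = "gt=1234567890123456789; Max-Age=10800";'
matcher = re.findall(r"gt=([0-9]{19})", sample_html, flags=re.I)
guest_token = matcher[0] if matcher else ""
print(guest_token)  # 1234567890123456789

# UserMedia query assembled the same way crawl_twitter does, for a made-up uid.
variables = {
    "userId": "1234567890",        # rest_id returned by username_to_id (made up)
    "count": 30,
    "includePromotedContent": False,
    "withClientEventToken": False,
    "withBirdwatchNotes": False,
    "withVoice": True,
    "withV2Timeline": True,
}
features = {"responsive_web_graphql_timeline_navigation_enabled": True}  # trimmed for brevity
payload = urllib.parse.urlencode(
    {"variables": json.dumps(variables), "features": json.dumps(features)}
)
url = f"https://twitter.com/i/api/graphql/P7qs2Sf7vu1LDKbzDW9FSA/UserMedia?{payload}"
# crawl_pages then fetches this URL with the guest-token headers and runs the
# normal subscription-link extraction over the JSON it returns.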
19 changes: 19 additions & 0 deletions aggregate/subscribe/process.py
@@ -88,6 +88,25 @@ def parse_config(config: dict) -> None:
github_conf["spams"] = spams
params["github"] = github_conf

# spider's config for twitter
twitter_conf = spiders.get("twitter", {})
users = twitter_conf.pop("users", {})
if twitter_conf.pop("enable", True) and users:
enabled_users = {}
for k, v in users.items():
if (
utils.isblank(k)
or not v
or type(v) != dict
or not v.pop("enable", True)
):
continue

v["push_to"] = list(set(v.get("push_to", [])))
enabled_users[k] = v

params["twitter"] = enabled_users

# spider's config for github's repositories
repo_conf, repositories = spiders.get("repositories", []), {}
for repo in repo_conf:
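A quick worked example of the normalization above: blank or disabled entries are dropped, the per-user enable flag is popped, and push_to is de-duplicated before the remainder is stored under params["twitter"]. The user names below are invented, and utils.isblank is approximated with str.strip.

users = {
    "alice": {"enable": True, "num": 30, "push_to": ["free", "free"]},
    "bob": {"enable": False, "num": 10, "push_to": ["paid"]},
    "": {"enable": True},
}

enabled_users = {}
for k, v in users.items():
    # same filter as parse_config: skip blank keys, empty or non-dict values,
    # and users whose "enable" flag is False
    if not k.strip() or not v or not isinstance(v, dict) or not v.pop("enable", True):
        continue
    v["push_to"] = list(set(v.get("push_to", [])))
    enabled_users[k] = v

print(enabled_users)  # {'alice': {'num': 30, 'push_to': ['free']}}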
