crawl from specified pages
wzdnzd committed Aug 17, 2022
1 parent 2657264 commit 140674b
Showing 2 changed files with 135 additions and 61 deletions.
166 changes: 106 additions & 60 deletions aggregate/subscribe/crawl.py
@@ -83,6 +83,10 @@ def batch_crawl(conf: dict, thread: int = 50) -> list:
if repositories:
tasks.update(crawl_github_repo(repos=repositories))

pages = conf.get("pages", {})
if pages:
tasks.update(crawl_pages(pages=pages))

if not tasks:
print("cannot find any subscribe url from crawler")
return []
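For orientation, here is a minimal sketch of the `pages` block that the new branch above consumes. The dict shape (page URL as key, `push_to`/`exclude` in the value) follows `crawl_pages` further down and the parsing added to `process.py`; the URL, group name, and import path are placeholders, not part of the repository:

```python
from crawl import batch_crawl  # assuming crawl.py is importable as a module

# Illustrative crawler config; only the "pages" entry is new in this commit.
conf = {
    "pages": {
        # page to scrape -> where to push any subscriptions found on it,
        # plus an optional exclude regex kept as a plain string
        "https://example.com/free-nodes.html": {
            "push_to": ["free"],
            "exclude": "speedtest|expired",
        },
    },
}

tasks = batch_crawl(conf=conf, thread=50)  # now also collects crawl_pages() results
```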
@@ -120,13 +124,6 @@ def crawl_single_telegram(
if not userid:
return {}

pattern = None
try:
if exclude and exclude.strip() != "":
pattern = re.compile(exclude.strip())
except:
print(f"compile regex error, exclude: {exclude}")

now = time.time() - 3600 * 8
crawl_time = datetime.fromtimestamp(now).strftime("%Y-%m-%dT%H:%M:%SZ")
url = f"https://telemetr.io/post-list-ajax/{userid}/with-date-panel?period={period}&date={crawl_time}"
@@ -138,7 +135,7 @@
include=include,
limits=limits,
source=Origin.TELEGRAM.name,
exclude=pattern,
exclude=exclude,
)


@@ -151,7 +148,7 @@ def crawl_telegram(users: dict, period: int, limits: int = 20) -> dict:
params = []
for k, v in users.items():
include = v.get("include", "")
exclude = v.get("exclude", "")
exclude = v.get("exclude", "").strip()
pts = v.get("push_to", [])
params.append([k, period, pts, include, exclude, limits])

@@ -166,7 +163,7 @@ def crawl_telegram(users: dict, period: int, limits: int = 20) -> dict:


def crawl_single_repo(
username: str, repo: str, push_to: list = [], limits: int = 5
username: str, repo: str, push_to: list = [], limits: int = 5, exclude: str = ""
) -> dict:
if not username or not repo:
print(f"cannot crawl from github, username: {username}\trepo: {repo}")
@@ -196,7 +193,10 @@ def crawl_single_repo(
patch = file.get("patch", "")
collections.update(
extract_subscribes(
content=patch, push_to=push_to, source=Origin.REPO.name
content=patch,
push_to=push_to,
source=Origin.REPO.name,
exclude=exclude,
)
)
return collections
@@ -217,10 +217,11 @@ def crawl_github_repo(repos: dict):
repo_name = v.get("repo_name", "").strip()
push_to = v.get("push_to", [])
limits = max(v.get("commits", 2), 1)
exclude = v.get("exclude", "").strip()

if not username or not repo_name or not push_to:
continue
params.append([username, repo_name, push_to, limits])
params.append([username, repo_name, push_to, limits, exclude])

subscribes = multi_thread_crawl(fun=crawl_single_repo, params=params)
endtime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@@ -266,7 +267,7 @@ def crawl_google(


def crawl_github_page(
page: int, cookie: str, push_to: list = [], exclude: re.Pattern = None
page: int, cookie: str, push_to: list = [], exclude: str = ""
) -> dict:
if page <= 0 or not cookie:
return {}
@@ -285,21 +286,15 @@


def crawl_github(limits: int = 3, push_to: list = [], exclude: str = "") -> dict:
regex = None
try:
if exclude and exclude.strip() != "":
regex = re.compile(exclude.strip())
except:
print(f"compile regex error, exclude: {exclude}")

cookie = os.environ.get("GH_COOKIE", "").strip()
if not cookie:
print("[GithubCrawl] cannot start crawl from github because cookie is missing")
return {}

# GitHub code search is flaky, so crawl each results page twice
pages = list(range(1, limits + 1)) * 2
params = [[x, cookie, push_to, regex] for x in pages]
exclude = "" if not exclude else exclude.strip()
params = [[x, cookie, push_to, exclude] for x in pages]
starttime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"[GithubCrawl] start crawl from Github, time: {starttime}")
subscribes = multi_thread_crawl(fun=crawl_github_page, params=params)
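`crawl_github` now hands the raw `exclude` string to each page worker instead of a pre-compiled pattern, so calling it is unchanged apart from the cookie requirement. A hedged usage sketch (cookie value, group name, and import path are placeholders):

```python
import os

from crawl import crawl_github  # assuming crawl.py is importable as a module

# GH_COOKIE is read inside crawl_github at call time.
os.environ["GH_COOKIE"] = "<cookie copied from a logged-in GitHub session>"  # placeholder
tasks = crawl_github(limits=3, push_to=["free"], exclude="expired|invalid")
```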
@@ -313,37 +308,48 @@ def crawl_github(limits: int = 3, push_to: list = [], exclude: str = "") -> dict

def extract_subscribes(
content: str,
exclude: re.Pattern = None,
push_to: list = [],
include: str = "",
exclude: str = "",
limits: int = sys.maxsize,
source: str = Origin.OWNED.name,
) -> dict:
if not content:
return {}
try:
limits, collections = max(1, limits), {}
# regex = "https?://\S+/api/v1/client/subscribe\?token=[a-zA-Z0-9]+|https?://\S+/link/[a-zA-Z0-9]+\?sub=\d"
# subscribes = re.findall(regex, content)
regex = "https?://(?:[a-zA-Z0-9_\u4e00-\u9fa5\-]+\.)+[a-zA-Z0-9_\u4e00-\u9fa5\-]+(?:(?:(?:/index.php)?/api/v1/client/subscribe\?token=[a-zA-Z0-9]{16,32})|(?:/link/[a-zA-Z0-9]+\?(?:sub|mu)=\d))"

pattern = "https?://(?:[a-zA-Z0-9_\u4e00-\u9fa5\-]+\.)+[a-zA-Z0-9_\u4e00-\u9fa5\-]+(?:(?:(?:/index.php)?/api/v1/client/subscribe\?token=[a-zA-Z0-9]{16,32})|(?:/link/[a-zA-Z0-9]+\?(?:sub|mu)=\d))"
if include:
if not include.startswith("|"):
pattern = f"{pattern}|{include}"
else:
pattern = f"{pattern}{include}"
try:
if not include.startswith("|"):
pattern = f"{regex}|{include}"
else:
pattern = f"{regex}{include}"

subscribes = re.findall(pattern, content)
except:
print(
f"[ExtractError] the 'include' pattern may be malformed, include: {include}"
)
subscribes = re.findall(regex, content)
else:
subscribes = re.findall(regex, content)

subscribes = re.findall(pattern, content)
for s in subscribes:
if include and not re.match(
"https?://(?:[a-zA-Z0-9_\u4e00-\u9fa5\-]+\.)+[a-zA-Z0-9_\u4e00-\u9fa5\-]+.*",
s,
):
continue

if exclude and exclude.search(s):
# print(f"ignore url: {s} because matched exclude")
continue
try:
if include and not re.match(
"https?://(?:[a-zA-Z0-9_\u4e00-\u9fa5\-]+\.)+[a-zA-Z0-9_\u4e00-\u9fa5\-]+.*",
s,
):
continue

if exclude and re.search(exclude, s):
continue
except:
print(
f"[ExtractError] the 'include' or 'exclude' pattern may be malformed, include: {include}\texclude: {exclude}"
)

# force the https protocol
s = s.replace("http://", "https://", 1).strip()
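The net effect of this hunk is that `exclude` now travels as a raw string and is validated only at match time, so a bad user-supplied pattern is reported rather than crashing extraction. A standalone sketch of that behaviour; `matches_exclude` is a hypothetical helper, not part of the repository:

```python
import re


def matches_exclude(url: str, exclude: str) -> bool:
    """Apply a user-supplied exclude pattern lazily, swallowing malformed
    regexes the way the new try/except in extract_subscribes does."""
    if not exclude:
        return False
    try:
        return re.search(exclude, url) is not None
    except re.error:
        print(f"[ExtractError] invalid 'exclude' pattern: {exclude}")
        return False


print(matches_exclude("https://foo.example/link/abcd?sub=1", r"foo\."))     # True
print(matches_exclude("https://foo.example/link/abcd?sub=1", "[unclosed"))  # False, no crash
```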
@@ -387,23 +393,6 @@ def naming_task(url):
)


def get_pages(chanel_id: str) -> int:
if not chanel_id or chanel_id.strip() == "":
return 0

url = f"https://telemetr.io/en/channels/{chanel_id}/posts"
content = utils.http_get(url=url)
before = 0
try:
regex = f"https://telemetr.io/en/channels/{chanel_id}/posts\?before=(\d+)"
groups = re.findall(regex, content)
before = int(groups[0]) + 100 if groups else before
except:
print(f"[CrawlError] cannot count page num, chanel: {chanel_id}")

return before


def collect_airport_page(url: str) -> list:
if not url:
return []
@@ -423,18 +412,34 @@ def collect_airport_page(url: str) -> list:
return []


def collect_airport(chanel_id: int, group_name: str, thread_num: int = 50) -> list:
if not chanel_id or not group_name:
def collect_airport(channel_id: int, group_name: str, thread_num: int = 50) -> list:
def get_pages(channel: str) -> int:
if not channel or channel.strip() == "":
return 0

url = f"https://telemetr.io/en/channels/{channel}/posts"
content = utils.http_get(url=url)
before = 0
try:
regex = f"https://telemetr.io/en/channels/{channel}/posts\?before=(\d+)"
groups = re.findall(regex, content)
before = int(groups[0]) + 100 if groups else before
except:
print(f"[CrawlError] cannot count page num, channel: {channel}")

return before

if not channel_id or not group_name:
return []

count = get_pages(chanel_id=f"{chanel_id}-{group_name}")
count = get_pages(channel=f"{channel_id}-{group_name}")
if count == 0:
return []

pages = range(count, -1, -100)

urls = [
f"https://telemetr.io/post-list-ajax/{chanel_id}?sort=-date&postType=all&period=365&before={x}"
f"https://telemetr.io/post-list-ajax/{channel_id}?sort=-date&postType=all&period=365&before={x}"
for x in pages
]

@@ -487,3 +492,44 @@ def validate_domain(

availables.append(url)
semaphore.release()


def crawl_single_page(url: str, push_to: list = [], exclude: str = "") -> dict:
if not url or not push_to:
print(f"cannot crawl from page: {url}")
return {}

content = utils.http_get(url=url)
if content == "":
return {}

return extract_subscribes(
content=content, push_to=push_to, exclude=exclude, source=Origin.TEMPORARY.name
)


def crawl_pages(pages: dict):
if not pages:
return {}

params = []
starttime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(f"[PageCrawl] start crawl from Page, time: {starttime}")
for k, v in pages.items():
if not re.match(
"https?://(?:[a-zA-Z0-9_\u4e00-\u9fa5\-]+\.)+[a-zA-Z0-9_\u4e00-\u9fa5\-]+.*",
k,
):
continue

push_to = v.get("push_to", [])
exclude = v.get("exclude", "").strip()

params.append([k, push_to, exclude])

subscribes = multi_thread_crawl(fun=crawl_single_page, params=params)
endtime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print(
f"[PageCrawl] finished crawl from Page, time: {endtime}, subscribes: {list(subscribes.keys())}"
)
return subscribes
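A hedged usage sketch of the new page crawler; the URL and group name are placeholders, and in normal operation `crawl_pages` is driven from `batch_crawl` rather than called directly:

```python
from crawl import crawl_pages  # assuming crawl.py is importable as a module

pages = {
    "https://example.org/airports.html": {
        "push_to": ["free"],
        "exclude": "expired|invalid",
    },
}

subscribes = crawl_pages(pages=pages)
for link in subscribes:  # keys are the discovered subscription links
    print(link)
```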
30 changes: 29 additions & 1 deletion aggregate/subscribe/process.py
@@ -84,7 +84,9 @@ def parse_config(config: dict) -> None:
if not disable and push_to:
github_conf = params.get("github", {})
github_conf["pages"] = max(pages, github_conf.get("pages", 3))
github_conf["exclude"] = "|".join([exclude, github_conf.get("exclude", "")])
github_conf["exclude"] = "|".join(
[github_conf.get("exclude", ""), exclude]
).removeprefix("|")
pts = github_conf.get("push_to", [])
pts.extend(push_to)
github_conf["push_to"] = list(set(pts))
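Reordering the join and stripping a leading `|` keeps the merged pattern a valid regex alternation when the stored `exclude` starts out empty (`str.removeprefix` requires Python 3.9+). A quick illustration with made-up values:

```python
# Merging behaviour of the new exclude handling in parse_config.
stored, extra = "", "speedtest"
print("|".join([stored, extra]).removeprefix("|"))  # speedtest  (no leading "|")

stored, extra = "expired", "speedtest"
print("|".join([stored, extra]).removeprefix("|"))  # expired|speedtest
```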
@@ -98,6 +100,7 @@ def parse_config(config: dict) -> None:
repo_name = repo.get("repo_name", "").strip()
push_to = list(set(repo.get("push_to", [])))
commits = max(repo.get("commits", 3), 1)
exclude = repo.get("exclude", "").strip()

if disable or not username or not repo_name:
continue
@@ -106,13 +109,38 @@ def parse_config(config: dict) -> None:
item["username"] = username
item["repo_name"] = repo_name
item["commits"] = max(commits, item.get("commits", 3))
item["exclude"] = "|".join([item.get("exclude", ""), exclude]).removeprefix(
"|"
)
pts = item.get("push_to", [])
pts.extend(push_to)
item["push_to"] = list(set(pts))

repo_conf[key] = item
params["repositories"] = repo_conf

pages = spiders.get("pages", [])
pages_conf = params.get("pages", {})
for page in pages:
disable = page.get("disable", False)
url = page.get("url", "")
if disable or not url:
continue

push_to = list(set(page.get("push_to", [])))
exclude = page.get("exclude", "").strip()

item = pages_conf.get(url, {})
item["exclude"] = "|".join([item.get("exclude", ""), exclude]).removeprefix(
"|"
)
pts = item.get("push_to", [])
pts.extend(push_to)
item["push_to"] = list(set(pts))
pages_conf[url] = item

params["pages"] = pages_conf

sites, delay = [], sys.maxsize
params, push_conf, crawl_conf, update_conf = {}, {}, {}, {}
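For completeness, a sketch of the raw `pages` spider entry that the parsing loop added above expects; the field names mirror the keys read there, while the URL and group name are placeholders, and the real config file may well be JSON/YAML rather than a Python literal:

```python
# Hypothetical crawl config fragment before parse_config() normalises it.
spiders = {
    "pages": [
        {
            "url": "https://example.net/daily-nodes.html",
            "push_to": ["free"],
            "exclude": "expired",
            "disable": False,
        },
    ],
}
```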
