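"""Collect decoded Clash subscription payloads.

Fetch a published list of candidate subscription URLs, download each one,
keep responses that look like Base64-encoded node lists, and write the
decoded content of the first MAX_SUCCESS successes to OUTPUT_FILE.
"""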
import base64
import binascii
import sys

import requests
from urllib.parse import urlparse

# Browser-like request headers to reduce the chance of being blocked as a bot
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept-Encoding': 'gzip, deflate'
}
# Configuration
MAX_SUCCESS = 50   # number of valid payloads to collect
TIMEOUT = 15       # per-request timeout in seconds
OUTPUT_FILE = 'valid_content.txt'
def is_valid_url(url):
    """Return True if the URL has an http/https scheme and a network location."""
    try:
        result = urlparse(url)
        return all([result.scheme in ('http', 'https'), result.netloc])
    except ValueError:
        return False
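# Example behaviour: is_valid_url("https://example.com/sub") -> True, while
# is_valid_url("ftp://host/path") and is_valid_url("not a url") -> False,
# since only http/https URLs with a network location are accepted.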
# Fetch the raw URL list
sub_all_clash_url = 'https://raw.githubusercontent.com/maimengmeng/collectSub/main/sub/sub_all_clash.txt'
try:
    response = requests.get(sub_all_clash_url, headers=headers, timeout=10)
    response.raise_for_status()
    raw_urls = response.text.splitlines()
    print(f"Raw list contains {len(raw_urls)} URLs")
except Exception as e:
    print(f"Failed to fetch the URL list: {e}")
    sys.exit(1)
# Pre-filter the list down to syntactically valid URLs
valid_urls = [url.strip() for url in raw_urls if is_valid_url(url.strip())]
print(f"URLs passing format validation: {len(valid_urls)}")
success_count = 0
processed_count = 0
with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_file:
    for url in valid_urls:
        if success_count >= MAX_SUCCESS:
            break
        processed_count += 1
        try:
            # Progress report (every 10 URLs, or once fewer than 5 successes remain)
            if processed_count % 10 == 0 or (MAX_SUCCESS - success_count) <= 5:
                print(f"[progress] processed {processed_count} | succeeded {success_count} | target {MAX_SUCCESS}")
            resp = requests.get(url, headers=headers, timeout=TIMEOUT)
            resp.raise_for_status()
            # Let requests pick up the detected encoding before reading resp.text
            resp.encoding = resp.apparent_encoding
            # Content sanity check: reject bodies that are too short or that contain
            # plaintext Clash markers ("DOMAIN", "port", "proxies"); only
            # Base64-encoded subscription payloads should pass.
            if len(resp.text.strip()) < 10 or "DOMAIN" in resp.text or "port" in resp.text or "proxies" in resp.text:
                raise ValueError("content too short or looks like a plaintext Clash config")
            # Base64-decode the body before writing it to the output file
            try:
                decoded_content = base64.b64decode(resp.text).decode('utf-8')
                out_file.write(decoded_content)
            except binascii.Error as e:
                print(f"Base64 decoding failed: {e}")
                continue  # skip responses that are not valid Base64
            except UnicodeDecodeError as e:
                print(f"Decoded bytes are not valid UTF-8: {e}")
                continue  # skip payloads that do not decode to text
            success_count += 1
        except requests.exceptions.RequestException:
            continue  # silently skip network errors
        except Exception:
            continue  # skip any other per-URL failure
# Final report
print("\n" + "=" * 50)
print("Final results:")
print(f"Total URLs processed: {processed_count}")
print(f"Successful downloads: {success_count}")
if processed_count:
    print(f"Valid-content rate: {success_count / processed_count:.1%}")
if success_count < MAX_SUCCESS:
    print("Warning: target count not reached; the raw list may not contain enough valid URLs")
print(f"Results saved to: {OUTPUT_FILE}")