#!/usr/bin/env python3
# Copyright 2022 The KServe Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This script finds MD-style links and URLs in Markdown files and verifies
# that the referenced resources exist.

import concurrent.futures
import itertools
import re

from datetime import datetime, timedelta
from glob import glob
from os import environ as env
from os.path import abspath, dirname, exists, relpath
from time import sleep
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse
from urllib.request import Request, urlopen
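
# the GitHub repository and branch against which relative Markdown links are resolved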
GITHUB_REPO = env.get("GITHUB_REPO", "https://github.com/kserve/kserve/")
BRANCH = "master"

# glob expressions to find markdown files in the project
md_file_path_expressions = [
"/**/*.md",
"/.github/**/*.md",
]

# don't process .md files in excluded folders
excluded_paths = [
"/node_modules/",
"/temp/",
"/.venv/",
]

# don't analyze URLs with variables to be replaced by the user
url_excludes = ["<", ">", "$", "{", "}"]
# also exclude non-public or local URLs in a <host>:<port> style
url_excludes.extend(
[
"0.0.0.0",
":80",
":90",
":port",
":predict",
".default",
"blob.core.windows.net",
"customdomain.com",
"example.com",
"localhost",
"somecluster",
"sslip.io",
"svc.cluster.local",
"xip.io",
]
)

# GitHub rate-limiting allows about 60 requests per minute, so when we hit
# the limit we sleep for a while before retrying
parallel_requests = 60  # use no more than 60 parallel requests
retry_wait = 60  # wait 1 minute before retrying GitHub requests after a 429 error
extra_wait = 5  # additional wait time before retrying GitHub requests

script_folder = abspath(dirname(__file__))
project_root_dir = abspath(dirname(script_folder))
github_repo_master_path = "{}/blob/{}".format(GITHUB_REPO.rstrip("/"), BRANCH)
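
# cache the HTTP status of each unique URL so repeated links are only checked once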
url_status_cache = dict()


def find_md_files() -> [str]:
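    """Return a sorted list of Markdown files in the project, minus excluded paths."""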
list_of_lists = [
glob(project_root_dir + path_expr, recursive=True)
for path_expr in md_file_path_expressions
]
flattened_list = list(itertools.chain(*list_of_lists))
filtered_list = [
path for path in flattened_list if not any(s in path for s in excluded_paths)
]
return sorted(filtered_list)


def get_links_from_md_file(
md_file_path: str,
) -> [(int, str, str)]: # -> [(line, link_text, URL)]
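    """Collect (line_number, link_text, URL) for every link in one Markdown file,
    after rewriting relative links into absolute URLs on the GitHub repo."""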
with open(md_file_path, "r") as f:
try:
md_file_content = f.read()
except ValueError as e:
print(f"Error trying to load file {md_file_path}")
raise e

    folder = relpath(dirname(md_file_path), project_root_dir)

    # replace relative links that are siblings to the README, e.g. [link text](FEATURES.md)
md_file_content = re.sub(
r"\[([^]]+)\]\((?!http|#|/)([^)]+)\)",
r"[\1]({}/{}/\2)".format(github_repo_master_path, folder).replace("/./", "/"),
md_file_content,
)

    # replace links that are relative to the project root, e.g. [link text](/sdk/FEATURES.md)
md_file_content = re.sub(
r"\[([^]]+)\]\(/([^)]+)\)",
r"[\1]({}/\2)".format(github_repo_master_path),
md_file_content,
)

    # find all the links
line_text_url = []
for line_number, line_text in enumerate(md_file_content.splitlines()):
all_urls_in_this_line = set()
# find markdown-styled links [text](url)
        for link_text, url in re.findall(r"\[([^][]+)\]\((http[^)]+)\)", line_text):
if not any(s in url for s in url_excludes):
line_text_url.append((line_number + 1, link_text, url))
all_urls_in_this_line.add(url)

        # find plain http(s)-style links
for url in re.findall(r"https?://[a-zA-Z0-9./?=_&%${}<>:-]+", line_text):
if url not in all_urls_in_this_line and not any(
s in url for s in url_excludes
):
                try:
                    # urlparse raises ValueError for malformed URLs
                    urlparse(url)
                    line_text_url.append((line_number + 1, "", url.rstrip(".")))
                except ValueError:
                    pass

    # return the collected links
return line_text_url


def test_url(
file: str, line: int, text: str, url: str
) -> (str, int, str, str, int): # (file, line, text, url, status)
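    """Check one URL (minus any #fragment) and return (file, line, text, url, status).

    Links into this repository are verified against the local file tree instead of
    being requested over HTTP, and results are memoized in url_status_cache.
    """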
short_url = url.split("#", maxsplit=1)[0]
status = 0
if short_url not in url_status_cache:
# mind GitHub rate-limiting, use local files to verify link
if short_url.startswith(github_repo_master_path):
local_path = short_url.replace(github_repo_master_path, "")
if exists(abspath(project_root_dir + local_path)):
status = 200
else:
status = 404
else:
status = request_url(short_url, method="HEAD")
if status == 403: # forbidden, try with web browser header
headers = {
"User-Agent": "Mozilla/5.0", # most pages want User-Agent
"Accept-Encoding": "gzip, deflate, br", # GitHub wants Accept-Encoding
}
status = request_url(short_url, method="GET", headers=headers)
if status == 405: # method not allowed, use GET instead of HEAD
status = request_url(short_url, method="GET")
if status in [
429,
503,
]: # GitHub rate-limiting or service unavailable, try again after 1 minute
sleep(retry_wait + extra_wait)
status = request_url(short_url, method="GET")
if status in [
444,
555,
]: # other URLError or Exception, retry with longer timeout
status = request_url(short_url, method="GET", timeout=15)
# if we keep getting the same error, mark it as 404 to be reported at the end
if status in [444, 555]:
status = 404
url_status_cache[short_url] = status
status = url_status_cache[short_url]
return file, line, text, url, status


# datetime to wait until before attempting requests to github.com
next_time_for_github_request = datetime.now()


def wait_before_retry(url):
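    """Block until the backoff deadline has passed before requesting github.com again."""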
global next_time_for_github_request
if "github.com" in url:
if datetime.now() < next_time_for_github_request:
sleep((next_time_for_github_request - datetime.now()).seconds + extra_wait)


def set_retry_time(url, status):
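    """After a 429 response from github.com, schedule when requests may resume."""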
global next_time_for_github_request
if "github.com" in url and status == 429:
next_time_for_github_request = datetime.now() + timedelta(
seconds=retry_wait + extra_wait
)


def request_url(url, method="HEAD", timeout=5, headers=None) -> int:
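    """Request url and return the HTTP status code (444/555 are script-internal
    codes for URLError and unexpected exceptions, respectively)."""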
wait_before_retry(url)
try:
        req = Request(url, method=method, headers=headers or {})
resp = urlopen(req, timeout=timeout)
status = resp.code
if any(s in url for s in ["youtube.com", "youtu.be"]):
html = resp.read().decode("utf8")
if "This video isn't available anymore" in html:
status = 404
resp.close()
except HTTPError as e:
status = e.code
set_retry_time(url, status)
except URLError:
status = 444 # custom code used script-internally
except Exception:
status = 555 # custom code used script-internally
return status


def verify_urls_concurrently(
    file_line_text_url: [(str, int, str, str)],
) -> [(str, int, str, str, int)]:
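    """Check all collected URLs in parallel threads, printing progress along the way."""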
file_line_text_url_status = []
with concurrent.futures.ThreadPoolExecutor(
max_workers=parallel_requests
) as executor:
        future_to_args = {
            executor.submit(test_url, file, line, text, url): (file, line, text, url)
            for (file, line, text, url) in file_line_text_url
        }
        for url_check in concurrent.futures.as_completed(future_to_args):
            try:
                file, line, text, url, status = url_check.result()
                file_line_text_url_status.append((file, line, text, url, status))
            except Exception:
                # result() raised before the unpacking above could run, so recover
                # the submitted arguments and record 555, the script-internal code
                # for unexpected exceptions
                file, line, text, url = future_to_args[url_check]
                file_line_text_url_status.append((file, line, text, url, 555))
finally:
print(
"{}/{}".format(
len(file_line_text_url_status), len(file_line_text_url)
),
end="\r",
)
return file_line_text_url_status


def verify_doc_links() -> None:
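    """Find all Markdown files, verify each link, and exit(1) if any are broken."""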
# 1. find all relevant Markdown files
md_file_paths = find_md_files()
# 2. extract all links with text and URL
file_line_text_url = [
(file, line, text, url)
for file in md_file_paths
for (line, text, url) in get_links_from_md_file(file)
]
# 3. validate the URLs
file_line_text_url_status = verify_urls_concurrently(file_line_text_url)
# 4. filter for the invalid URLs (status 404: "Not Found") to be reported
file_line_text_url_404 = [
(f, l, t, u, s) for (f, l, t, u, s) in file_line_text_url_status if s == 404
]
# 5. print some stats for confidence
print(
"{} {} links ({} unique URLs) in {} Markdown files.\n".format(
"Checked" if file_line_text_url_404 else "Verified",
len(file_line_text_url_status),
len(url_status_cache),
len(md_file_paths),
)
)
# 6. report invalid links, exit with error for CI/CD
if file_line_text_url_404:
for file, line, text, url, status in sorted(file_line_text_url_404):
print(
"{}:{}: {} -> {}".format(
relpath(file, project_root_dir),
line,
url.replace(github_repo_master_path, ""),
status,
)
)
        # print a summary line for clear error discovery at the bottom of the Travis job log
print(
"\nERROR: Found {} invalid Markdown links".format(
len(file_line_text_url_404)
)
)
exit(1)


def apply_monkey_patch_to_force_ipv4_connections():
    # Monkey-patch socket.getaddrinfo to force IPv4 connections, since some older
    # routers and some internet providers don't support IPv6. In that case Python
    # would first try an IPv6 connection, hang until it times out, and only then
    # attempt the IPv4 connection that succeeds.
    import socket

    # keep a reference to the original getaddrinfo function
    getaddrinfo_original = socket.getaddrinfo

    # create a patched getaddrinfo function which uses the original function
    # but filters out IPv6 (socket.AF_INET6) entries of host and port address infos
    def getaddrinfo_patched(*args, **kwargs):
        res = getaddrinfo_original(*args, **kwargs)
        return [r for r in res if r[0] == socket.AF_INET]

    # replace the original socket.getaddrinfo function with our patched version
    socket.getaddrinfo = getaddrinfo_patched


if __name__ == "__main__":
apply_monkey_patch_to_force_ipv4_connections()
verify_doc_links()