Skip to content

Commit cc182a4

Browse files
authored
Merge pull request #49 from jklymak/fix-do-all-reditrects
Fix do all redirects
2 parents 4689935 + 55bc6e3 commit cc182a4

File tree

1 file changed

+242
-0
lines changed

1 file changed

+242
-0
lines changed

_websiteutils/make_redirects_links.py

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import functools
5+
import logging
6+
import multiprocessing
7+
import os
8+
import pathlib
9+
import re
10+
import tempfile
11+
import shutil
12+
13+
"""
14+
This script does three things that improve the website organization.
15+
16+
First, we used to host in the root of the webpage, but have now moved to
17+
``/stable/``. We do not want obsolete links to link to nothing (or that has
18+
been our policy), so we currently just keep the old version at the top level.
19+
Here, instead, we either softlink to the newest version, or replace the file by
20+
an html refresh redirect.
21+
22+
Second, it changes the canonical link in each html file to the newest version
23+
found of the html file (including stable if it's in the latest version).
24+
25+
Third, the script adds a new div to the top of all the old webpages with
26+
tag ``olddocs-message`` to warn users that the page is obsolete.
27+
28+
This script takes a while, and is destructive, so should probably be run on a
29+
branch and pushed as a PR so it can easily be reverted.
30+
"""
31+
32+
_log = logging.getLogger("make_redirect_links")


# Directories that may hold a copy of a page, newest first: ``stable``
# followed by every ``major.minor.micro`` combination from 6.6.6 down to
# 0.0.0.  The script entry point trims this to the directories that exist.
tocheck = [pathlib.Path("stable")] + [
    pathlib.Path(f"{major}.{minor}.{micro}")
    for major in range(6, -1, -1)
    for minor in range(6, -1, -1)
    for micro in range(6, -1, -1)
]

# Top-level entries that must never be replaced by a redirect or a link:
# the versioned docs themselves plus unrelated modules and site files.
toignore = tocheck + [pathlib.Path(p) for p in [
    "mpl-probscale",
    "mpl_examples",
    "mpl_toolkits",
    "_webpageutils",
    # This script lives in ``_websiteutils/``; the entry above does not match
    # that name, so list the real directory too (the old one is kept for
    # backward compatibility).
    "_websiteutils",
    "xkcd",
    "_sitemap",
    "robots.txt",
    "CNAME",
    ".git",
]]

logging.basicConfig(level=logging.DEBUG)
55+
56+
57+
@functools.cache
def _findlast_cached(fname, tocheck):
    """Memoized worker for `findlast`; *tocheck* must be a hashable tuple."""
    for t in tocheck:
        if (t / fname).exists():
            return t
    return None


def findlast(fname, tocheck):
    """
    Check the directories listed in ``tocheck`` to see if they have
    ``fname`` in them.  Return the first one found, or None.

    Parameters
    ----------
    fname : pathlib.Path
        Path of the file relative to each candidate directory.
    tocheck : iterable of pathlib.Path
        Candidate directories, in priority order.  Any iterable is accepted
        (list or tuple); it is converted to a tuple so the lookup can be
        memoized even when the caller passes an unhashable list.

    Returns
    -------
    pathlib.Path or None
        The first directory in *tocheck* that contains *fname*.

    Notes
    -----
    Results are cached per ``(fname, tocheck)`` pair, so files created or
    removed inside the candidate directories after the first lookup are not
    noticed.
    """
    return _findlast_cached(fname, tuple(tocheck))
68+
69+
70+
# Template for the page that replaces an obsolete top-level html file.
# ``{newurl}`` is the refresh/link target and ``{canonical}`` is the path
# appended to https://matplotlib.org/ for the canonical link.
html_redirect = """<!DOCTYPE HTML>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="refresh" content="0;url={newurl}" />
<link rel="canonical" href="https://matplotlib.org/{canonical}" />
</head>
<body>
<h1>
The page has been moved <a href="{newurl}">here</a>!
</h1>
</body>
</html>
"""
84+
85+
# The banner templates are deliberately a single line of html each: that
# keeps them easy to search for and replace inside the html files (a
# multi-line banner would require tracking the closing tag).
#
# Banner used when the page also exists at ``{url}`` in the newest docs.
warn_banner_exists = "".join([
    '<div id="unreleased-message"> ',
    'You are reading an old version of the documentation (v{version}). ',
    'For the latest version see ',
    '<a href="{url}">{url}</a>',
    '</div>\n',
])
91+
92+
93+
# Banner used when only the generic ``/stable/`` landing page can be offered
# as the latest version.  Kept on a single line, like the other banner.
warn_banner_old = "".join([
    '<div id="unreleased-message"> ',
    'You are reading an old version of the documentation (v{version}). ',
    'For the latest version see ',
    '<a href="/stable/">https://matplotlib.org/stable/</a>',
    ' </div>\n',
])
97+
98+
99+
def do_links(root0):
    """
    Replace every file below *root0* that also exists in a versioned
    directory: html files become refresh redirects, anything else becomes
    a relative soft link to the copy found by `findlast`.

    Files with no counterpart in any directory of ``tocheck`` are left alone.
    """

    _log.info(f"Doing links on {root0}")
    for dirpath, _subdirs, filenames in os.walk(root0):
        for name in filenames:
            stale = pathlib.Path(dirpath, name)
            newest_dir = findlast(stale, tocheck)
            _log.debug(f"Checking: {stale} found {newest_dir}")
            if newest_dir is None:
                continue

            # Remove the old entry first so that whatever is written below
            # is a fresh file or link at this path.
            stale.unlink()
            target = newest_dir / stale
            # The link/redirect has to be relative to the directory holding
            # the stale file.  `Path.relative_to` does not allow '.' as a
            # common path prefix, hence `os.path.relpath`.
            rel_target = os.path.relpath(target, start=stale.parent)

            if not name.endswith((".htm", ".html")):
                # soft link
                _log.info(f"Linking {stale} to {target}")
                stale.symlink_to(rel_target)
                continue

            # make an html redirect.
            _log.info(f"Rewriting HTML: {stale} in {newest_dir}")
            with stale.open("w") as fout:
                fout.write(html_redirect.format(
                    newurl=rel_target,
                    canonical=target,
                ))
131+
132+
133+
def do_canonicals(dname):
    """
    Walk the versioned docs directory *dname* and point the canonical link
    of each ``*.html`` file at the newest version that has the same page.
    """
    _log.debug(f"Walking {dname}")
    for page in dname.rglob("*.html"):
        _log.debug(f"Checking {page}")
        # Drop the leading version directory to get the version-independent
        # path of the page.
        relative = pathlib.Path(*page.parts[1:])
        newest_dir = findlast(relative, tocheck)
        if newest_dir is None:
            continue
        # ``tocheck[1]`` is the entry right after ``tocheck[0]``; pages in
        # that directory are flagged as newest for `update_canonical`.
        is_newest = dname == tocheck[1]
        update_canonical(page, newest_dir, is_newest)
145+
146+
147+
def update_canonical(fullname, last, newest):
    """
    Change the canonical link in *fullname* to the same link in the
    version given by *last*.  We do this with a regexp to prevent
    removing any other content on a line that has the canonical link.

    Also add a banner (div) in the body if an old version of the docs.

    Parameters
    ----------
    fullname : pathlib.Path
        Html file to rewrite; its first path component is used as the
        version shown in the banner.
    last : pathlib.Path
        Directory whose copy of the page becomes the canonical target.
    newest : bool
        If true, no "old version" banner is inserted.

    Notes
    -----
    Only the first line containing a canonical link is rewritten (every
    canonical link on that line is updated); later lines are left alone.

    The file is rewritten through a temporary file that then replaces
    *fullname*.  A banner left directly under ``<body>`` by an earlier run
    is replaced rather than duplicated.
    """
    pre = "https://matplotlib.org/"
    pnew = last.joinpath(*fullname.parts[1:])
    newcanon = f"{pre}{str(pnew)}"
    _log.info(f"{fullname} to {pre}{str(pnew)}")
    # ``[^"]*`` stops at the closing quote of the href; a greedy ``.*`` would
    # run to the last quote on the line and eat any following attributes/tags.
    rec = re.compile(rb'<link rel="canonical" href="[^"]*"')
    canon = f'<link rel="canonical" href="{newcanon}"'.encode("utf-8")
    with tempfile.NamedTemporaryFile(delete=False) as fout:
        found = False
        with fullname.open("rb") as fin:
            for line in fin:
                if not found and b'<link rel="canonical"' in line:
                    # A callable replacement inserts the bytes verbatim (no
                    # backslash-escape processing of the url).
                    ll = rec.sub(lambda match: canon, line)
                    _log.debug(f"new {line}->{ll}")
                    fout.write(ll)
                    found = True
                elif b'<body>' in line and not newest:
                    # add a warning right under:
                    fout.write(line)
                    if last == tocheck[0]:
                        new = warn_banner_exists.format(
                            version=fullname.parts[0],
                            url=newcanon)
                    else:
                        new = warn_banner_old.format(version=fullname.parts[0])
                    fout.write(new.encode("utf-8"))

                    # Look at the line after <body>; ``None`` when <body> was
                    # the last line of the file (a bare ``next`` would raise
                    # StopIteration there).
                    following = next(fin, None)
                    if following is None:
                        continue
                    # A banner from a previous run carries either the
                    # ``olddocs-message`` id or the ``unreleased-message`` id
                    # used by the banner templates; for the latter also
                    # require the banner text so that an unrelated div with
                    # that id is not dropped.
                    stale_banner = (
                        b'<div id="olddocs-message">' in following
                        or (b'<div id="unreleased-message">' in following
                            and b'You are reading an old version of the '
                                b'documentation' in following)
                    )
                    if not stale_banner:
                        # write the line out if it wasn't an old banner:
                        fout.write(following)
                else:
                    fout.write(line)

    shutil.move(fout.name, fullname)
192+
193+
194+
if __name__ == "__main__":

    cli = argparse.ArgumentParser()
    cli.add_argument("--np", type=int, help="Number of processors to use")
    cli.add_argument("--no-canonicals", action="store_true",
                     help="do not do canonical links")
    cli.add_argument("--no-redirects", action="store_true",
                     help="do not do redirects links")
    args = cli.parse_args()

    # A missing (or zero) --np means "run serially".
    nproc = args.np or None

    # Keep only the version directories that exist; this both trims the
    # search list and leaves the newest available versions at the front.
    tocheck = tuple(filter(lambda p: p.exists(), tocheck))
    print(tocheck)

    # Replace top-level content that is not another module or a versioned
    # docs directory: directories are handed to `do_links`, top-level html
    # files are turned into redirects here.
    if not args.no_redirects:
        for entry in os.scandir("."):
            top = pathlib.Path(entry.name)
            if top in toignore:
                continue
            if entry.is_dir():
                do_links(entry.name)
                continue
            if top.suffix != ".html":
                continue
            newest_dir = findlast(top, tocheck)
            _log.debug(f"Checking: {top} found {newest_dir}")
            if newest_dir is None:
                continue
            top.unlink()
            _log.info(f"Rewriting HTML: {top} in {newest_dir}")
            with top.open("w") as fout:
                target = newest_dir / top
                fout.write(html_redirect.format(newurl=target,
                                                canonical=target))
        _log.info("Done links and redirects")

    # Point the canonical url of every html page in the versioned docs at
    # the newest version that has it, optionally in a process pool.
    if not args.no_canonicals:
        versions = tocheck[1:]
        if nproc is None:
            for version_dir in versions:
                do_canonicals(version_dir)
        else:
            with multiprocessing.Pool(nproc) as pool:
                pool.map(do_canonicals, versions)

0 commit comments

Comments
 (0)